Example #1
File: cross.py  Project: lxb1991/NER_TOOL
    def __init__(self, hyper_param, word_embedding, char_embedding, vocabs,
                 task_vocab_size, domain_vocab_size, param_types, device):
        super().__init__()

        word_embeddings_weight = torch.FloatTensor(word_embedding)
        self.word_matrix = Embedding.from_pretrained(word_embeddings_weight, freeze=False)

        char_embeddings_weight = torch.FloatTensor(char_embedding)
        self.char_matrix = Embedding.from_pretrained(char_embeddings_weight, freeze=False)
        self.char_cnn = CharCNN(hyper_param.drop_out, hyper_param.char_embed_dim, hyper_param.char_cnn_kernels)

        self.task_embeddings = Embedding.from_pretrained(
            torch.from_numpy(self.random_embedding(task_vocab_size, 8)), freeze=False).float()
        self.domain_embeddings = Embedding.from_pretrained(
            torch.from_numpy(self.random_embedding(domain_vocab_size, 8)), freeze=False).float()

        self.lstm_input_size = hyper_param.word_embed_dim + hyper_param.char_cnn_kernels * 3
        self.rnn = NoParamLSTM(bidirectional=True, num_layers=1, input_size=self.lstm_input_size,
                               hidden_size=hyper_param.lstm_hidden, batch_first=True)

        self.drop_out = Dropout(p=hyper_param.drop_out)
        self.fc = {}
        for key in param_types:
            self.fc[key] = Linear(hyper_param.lstm_hidden * 2, len(vocabs[key]))
            setattr(self, 'fc_' + '_'.join(key), self.fc[key])
        self.device = device
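The loop at the end of Example #1 keeps the per-type heads in a plain dict and then calls setattr so their parameters get registered on the module. A minimal sketch of the same idea using torch.nn.ModuleDict (sizes are assumed for illustration; ModuleDict keys must be strings, whereas the keys above may be tuples that would need joining first):

from torch import nn

# assumed, illustrative sizes, not taken from the project above
hidden_size = 128
vocab_sizes = {"ner": 9, "pos": 17}
fc = nn.ModuleDict({key: nn.Linear(hidden_size * 2, size)
                    for key, size in vocab_sizes.items()})
print(sum(p.numel() for p in fc.parameters()))  # heads are registered as submodules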
Example #2
def get_featurized_embedding(features: List, featurizer, featurizer_dim, dtype=torch.float,
                             device=None, verbose=True,
                             vector_file="data/cn_w2v_glove_features_main_additional_filtered.tch",
                             gen_data=False):
    embeddings_list = np.empty((len(features), featurizer_dim))
    additional_embeddings_list = np.empty((len(features), featurizer_dim))
    index_map = np.empty((len(features)), dtype=np.int64)
    iterator = range(len(features))
    count = 0
    neg_count = 0
    if not gen_data:
        raw_data = load(vector_file)
        embedding = Embedding.from_pretrained(torch.as_tensor(raw_data["main"]["weight"]).to("cpu"))
        additional_embedding = Embedding.from_pretrained(
            torch.as_tensor(raw_data["additional"]["weight"]).to(device))
        index_map = raw_data["index_map"].to(device)

        return embedding, index_map, additional_embedding
    else:
        if verbose:
            print("Processing features of dataset...")
            iterator = tqdm(iterator)
        for i in iterator:
            featurized, oov = featurizer(features[i])
            if oov:
                embeddings_list[count] = featurized
                index_map[i] = count
                count += 1
            else:
                additional_embeddings_list[neg_count] = featurized
                index_map[i] = -1
                neg_count += 1

        embeddings_list = embeddings_list[0:count]
        additional_embeddings_list = additional_embeddings_list[0:neg_count]

        embedding = Embedding.from_pretrained(
            torch.as_tensor(np.array(embeddings_list), dtype=dtype, device=device))
        index_map = torch.as_tensor(index_map, dtype=torch.long, device=device)
        additional_embedding = Embedding.from_pretrained(
            torch.as_tensor(np.array(additional_embeddings_list), dtype=dtype, device=device),
            sparse=True)

        raw_data = {
            "main": embedding.state_dict(),
            "additional": additional_embedding.state_dict(),
            "index_map": index_map
        }

        save_data(vector_file, raw_data)

        return embedding, index_map, additional_embedding
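With gen_data=False the function above simply deserializes the cached state dicts and rebuilds the Embedding modules; with gen_data=True it featurizes everything from scratch and caches the result. A minimal, self-contained sketch of the same save/load round trip (file name and shapes are illustrative, not from the project):

import torch
from torch.nn import Embedding

emb = Embedding.from_pretrained(torch.randn(5, 4))
torch.save({"main": emb.state_dict()}, "demo_vectors.tch")     # cache to disk
state = torch.load("demo_vectors.tch")                         # reload later
restored = Embedding.from_pretrained(state["main"]["weight"])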
Example #3
File: base.py  Project: lxb1991/NER_TOOL
    def __init__(self, hyper_param, word_embedding, char_embedding, label_vocab_size, device):
        super(BaseLM, self).__init__()
        self.device = device

        word_embeddings_weight = torch.FloatTensor(word_embedding)
        self.word_matrix = Embedding.from_pretrained(word_embeddings_weight, freeze=False)

        char_embeddings_weight = torch.FloatTensor(char_embedding)
        self.char_matrix = Embedding.from_pretrained(char_embeddings_weight, freeze=False)
        self.char_cnn = CharCNN(hyper_param.drop_out, hyper_param.char_embed_dim, hyper_param.char_cnn_kernels)

        self.lstm_input_size = hyper_param.word_embed_dim + hyper_param.char_cnn_kernels * 3
        self.lstm = LSTM(self.lstm_input_size, hyper_param.lstm_hidden,
                         batch_first=True, bidirectional=True)
        self.drop_out = Dropout(p=hyper_param.drop_out)
Example #4
def setup_simple_classifier():
    BATCH_SIZE = 1
    NUM_TOKENS = 3
    EMB_DIM = 300
    VOCAB_SIZE = 10
    NUM_CLASSES = 3
    embedding = Embedding.from_pretrained(torch.zeros([VOCAB_SIZE, EMB_DIM]))
    embedder = VanillaEmbedder(embedding_dim=EMB_DIM, embedding=embedding)
    labels = torch.LongTensor([[1]])
    encoder = BOW_Encoder(emb_dim=EMB_DIM,
                          embedder=embedder,
                          dropout_value=0,
                          aggregation_type="sum")
    tokens = np.random.randint(0,
                               VOCAB_SIZE - 1,
                               size=(BATCH_SIZE, NUM_TOKENS))
    tokens = torch.LongTensor(tokens)
    simple_classifier = SimpleClassifier(
        encoder=encoder,
        encoding_dim=EMB_DIM,
        num_classes=NUM_CLASSES,
        classification_layer_bias=False,
    )
    iter_dict = {"tokens": tokens, "label": labels}
    return iter_dict, simple_classifier, BATCH_SIZE, NUM_CLASSES
Example #5
    def __init__(self, embedding_weights, emb_size):
        self.emb_size = emb_size

        super(CaptionModel, self).__init__()
        self.embedding = Embedding.from_pretrained(
            torch.FloatTensor(embedding_weights))
        self.gru = GRU(embedding_weights.shape[1], emb_size)
Example #6
File: ner_v1.py  Project: cuilunan/easytext
    def __init__(self, token_vocabulary: Union[Vocabulary,
                                               PretrainedVocabulary],
                 label_vocabulary: LabelVocabulary, word_embedding_dim: int,
                 hidden_size: int, num_layer: int, dropout: float):

        super().__init__()

        self.word_embedding_dim = word_embedding_dim
        self.token_vocabulary = token_vocabulary
        self.label_vocabulary = label_vocabulary

        if isinstance(token_vocabulary, Vocabulary):
            self.embedding: Embedding = Embedding(
                num_embeddings=token_vocabulary.size,
                embedding_dim=word_embedding_dim,
                padding_idx=token_vocabulary.padding_index)

        elif isinstance(token_vocabulary, PretrainedVocabulary):
            self.embedding: Embedding = Embedding.from_pretrained(
                token_vocabulary.embedding_matrix,
                freeze=True,
                padding_idx=token_vocabulary.padding_index)

        self.hidden_size = hidden_size

        self.lstm = LSTM(input_size=word_embedding_dim,
                         hidden_size=hidden_size,
                         num_layers=num_layer,
                         bidirectional=True,
                         dropout=dropout)

        self.liner = Linear(in_features=hidden_size * 2,
                            out_features=label_vocabulary.label_size)
        self.reset_parameters()
Example #7
def get_featurized_embedding(features: List,
                             featurizer,
                             featurizer_dim,
                             dtype=torch.float,
                             device=None,
                             verbose=True):
    embeddings_list = np.empty((len(features), featurizer_dim))
    index_map = np.empty((len(features)), dtype=np.int64)
    iterator = range(len(features))
    count = 0
    if verbose:
        print("Processing features of dataset...")
        iterator = tqdm(iterator)
    for i in iterator:
        featurized = featurizer(features[i])
        if featurized is not None:
            embeddings_list[count] = featurized
            index_map[i] = count
            count += 1
        else:
            index_map[i] = -1
    embeddings_list = embeddings_list[0:count]
    return Embedding.from_pretrained(
        torch.as_tensor(np.array(embeddings_list), dtype=dtype,
                        device=device)), torch.as_tensor(index_map,
                                                         dtype=torch.long,
                                                         device=device)
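The function returns an Embedding built only from the features the featurizer could handle, plus an index_map whose i-th entry is the embedding row for feature i, or -1 when the featurizer returned None. A small sketch (illustrative values) of how a caller might resolve the map while masking the -1 entries:

import torch

index_map = torch.tensor([0, -1, 1, 2], dtype=torch.long)          # -1 marks unfeaturized items
embedding = torch.nn.Embedding.from_pretrained(torch.randn(3, 8))

valid = index_map >= 0
vectors = torch.zeros(len(index_map), 8)
vectors[valid] = embedding(index_map[valid])   # fill only rows that have a featurized vector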
Example #8
 def __init__(self, input_embeddings: str, input_vocabulary_size: int,
              input_embeddings_size: int, clear_text: bool,
              tokenize_model: str):
     super().__init__()
     if clear_text:
         assert tokenize_model is not None
         from pytorch_pretrained_bert import BertTokenizer
         self.bert_tokenizer = BertTokenizer.from_pretrained(
             tokenize_model, do_lower_case=False)
         input_vocabulary_size = len(self.bert_tokenizer.vocab)
         self.lut_embeddings = Embedding(
             num_embeddings=input_vocabulary_size,
             embedding_dim=input_embeddings_size,
             padding_idx=pad_token_index)
         self._is_fixed = False
     else:
         self.bert_tokenizer = None
         if input_embeddings is not None:
             self.lut_embeddings = Embedding.from_pretrained(
                 embeddings=input_embeddings, freeze=True)
             self._is_fixed = True
         else:
             self.lut_embeddings = Embedding(
                 num_embeddings=input_vocabulary_size,
                 embedding_dim=input_embeddings_size,
                 padding_idx=pad_token_index)
             self._is_fixed = False
     self._output_dim = input_embeddings_size
Example #9
def evaluate_outer(args: argparse.Namespace) -> dict:
    # log namespace arguments and model directory
    LOGGER.info(args)
    LOGGER.info("Model log directory: %s" % args.model_log_directory)

    # set gpu and cpu hardware
    gpu_device = set_hardware(args)

    # get relevant patterns
    pattern_specs = get_pattern_specs(args)

    # load vocab and embeddings
    vocab_file = os.path.join(args.model_log_directory, "vocab.txt")
    if os.path.exists(vocab_file):
        vocab = Vocab.from_vocab_file(vocab_file)
    else:
        raise FileNotFoundError("File not found: %s" % vocab_file)

    # generate embeddings to fill up correct dimensions
    embeddings = torch.zeros(len(vocab), args.word_dim)
    embeddings = Embedding.from_pretrained(embeddings,
                                           freeze=args.static_embeddings,
                                           padding_idx=PAD_TOKEN_INDEX)

    # load evaluation data here
    eval_input, eval_text = read_docs(args.eval_data, vocab)
    LOGGER.info("Sample evaluation text: %s" % eval_text[:10])
    eval_input = cast(List[List[int]], eval_input)
    eval_labels = read_labels(args.eval_labels)
    num_classes = len(set(eval_labels))
    eval_data = list(zip(eval_input, eval_labels))

    # get semiring
    semiring = get_semiring(args)

    # create SoftPatternClassifier
    model = SoftPatternClassifier(
        pattern_specs,
        num_classes,
        embeddings,  # type:ignore
        vocab,
        semiring,
        args.tau_threshold,
        args.no_wildcards,
        args.bias_scale,
        args.wildcard_scale,
        0.)

    # log information about model
    LOGGER.info("Model: %s" % model)

    # execute inner evaluation workflow
    clf_report = evaluate_inner(eval_data, model, args.model_checkpoint,
                                args.model_log_directory, args.batch_size,
                                args.output_prefix, gpu_device,
                                args.max_doc_len)
    return clf_report
Example #10
    def concat_embeddings(embedding_1, embedding_2):
        if (embedding_1.num_embeddings != embedding_2.num_embeddings):
            raise ValueError(
                "Number of embeddings (num_embeddings) should match")

        embedded_weight = torch.cat((embedding_1.weight, embedding_2.weight),
                                    dim=1)
        embedding_concat = Embedding.from_pretrained(embedded_weight)
        return embedding_concat
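A quick usage sketch for the helper above (toy weights; it assumes concat_embeddings is callable as shown, e.g. as a staticmethod): two tables with the same number of rows are joined feature-wise, so each index now looks up the concatenated vector.

import torch
from torch.nn import Embedding

e1 = Embedding.from_pretrained(torch.randn(4, 3))
e2 = Embedding.from_pretrained(torch.randn(4, 2))
e_cat = concat_embeddings(e1, e2)
print(e_cat(torch.tensor([0])).shape)   # torch.Size([1, 5])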
Example #11
 def __init__(self,
              embeddings,
              max_word=32,
              multi_image=1,
              multi_merge='att',
              labels=None,
              aete_s=2000,
              aete_r=5,
              lstm_dim=256,
              lambda_a=0.85,
              teacher_forcing=None,
              image_model=None,
              image_pretrained=None,
              finetune_image=False,
              image_finetune_epoch=None,
              rl_opts=None,
              word_idxs=None,
              device='gpu',
              verbose=False):
     super(TieNet, self).__init__(max_word, multi_image, multi_merge,
                                  teacher_forcing, image_finetune_epoch,
                                  rl_opts, word_idxs, verbose)
     # Label statistics
     self.chexpert_labels, self.lp, self.ln, self.lq = self._load_labels(
         labels)
     # Various NN parameters
     self.feat_dim = lstm_dim
     self.lstm_dim = lstm_dim
     self.lambda_a = lambda_a
     self.dropout = Dropout(0.5)
     # Image processes
     if image_model is None:
         image_model = 'resnet50'
     self.image_feats, image_dim = ImageClassification.image_features(
         image_model, not finetune_image, True, image_pretrained, device)
     self._init_multi_image(image_dim, self.VISUAL_NUM, lstm_dim)
     self.image_proj = Linear(image_dim, lstm_dim)
     # Word processes
     self.init_h = Linear(lstm_dim, lstm_dim)
     self.init_c = Linear(lstm_dim, lstm_dim)
     self.att_v = Linear(image_dim, image_dim)
     self.att_h = Linear(lstm_dim, image_dim)
     self.att_a = Linear(image_dim, 1)
     self.gate = Linear(lstm_dim, image_dim)
     input_dim = image_dim + embeddings.shape[1]
     self.lstm_word = LSTMCell(input_dim, lstm_dim)
     self.embeddings = Embedding.from_pretrained(
         embeddings,
         freeze=False,
         padding_idx=PretrainedEmbeddings.INDEX_PAD)
     self.embed_num = self.embeddings.num_embeddings
     self.word_dense = Linear(lstm_dim, embeddings.shape[0], bias=False)
     # AETE processes
     self.aete1 = Linear(lstm_dim, aete_s)
     self.aete2 = Linear(aete_s, aete_r)
     # Joint
     self.joint = Linear(lstm_dim + image_dim, self.DISEASE_NUM * 2)
Example #12
 def __init__(self, config, stroke_embedding=None):
     super(LPN, self).__init__()
     self.config = config
     self.init_predictors()
     self.init_base_lstm()
     if stroke_embedding is not None:
         self.stroke_embedding = Embedding.from_pretrained(
             torch.from_numpy(stroke_embedding), freeze=True)
     else:
         self.stroke_embedding = None
Example #13
 def __init__(
     self,
     embeddings: torch.Tensor,
     hidden_size: int,
     num_layers: int,
     dropout: float,
     bidirectional: bool,
     num_class: int,
 ) -> None:
     super(SeqClassifier, self).__init__()
     self.embed = Embedding.from_pretrained(embeddings, freeze=False)
Example #14
 def __init__(self, config: Dict, vocab: Vocab, emb_matrix):
     """
     Baseline classifier, hyperparameters are passed in `config`.
      Consists of a recurrent part and a classifier (Multilayer Perceptron) part.
     Keys are:
         - freeze: whether word embeddings should be frozen
         - cell_type: one of: RNN, GRU, LSTM, which recurrent cell model should use
         - hidden_size: size of hidden state for recurrent cell
          - num_layers: number of stacked recurrent layers in the model
          - cell_dropout: dropout rate between recurrent layers (not applied if the model has only one layer!)
          - bidirectional: boolean, whether to use a unidirectional or bidirectional model
         - out_activation: one of: "sigmoid", "tanh", "relu", "elu". Activation in classifier part
         - out_dropout: dropout rate in classifier part
         - out_sizes: List[int], hidden size of each layer in classifier part. Empty list means that final
             layer is attached directly to recurrent part output
     :param config: configuration of model
     :param vocab: vocabulary
     :param emb_matrix: embeddings matrix from `prepare_emb_matrix`
     """
     super().__init__()
     self.config = config
     self.vocab = vocab
     self.emb_matrix = emb_matrix
     self.embeddings = Embedding.from_pretrained(emb_matrix,
                                                 freeze=config["freeze"],
                                                 padding_idx=vocab.PAD_IDX)
     cell_types = {"RNN": RNN, "GRU": GRU, "LSTM": LSTM}
     cell_class = cell_types[config["cell_type"]]
     self.cell = cell_class(
         input_size=emb_matrix.size(1),
         batch_first=True,
         hidden_size=config["hidden_size"],
         num_layers=config["num_layers"],
         dropout=config["cell_dropout"],
         bidirectional=config["bidirectional"],
     )
     activation_types = {
         "sigmoid": sigmoid,
         "tanh": tanh,
         "relu": relu,
         "elu": elu,
     }
     self.out_activation = activation_types[config["out_activation"]]
     self.out_dropout = Dropout(config["out_dropout"])
     cur_out_size = config["hidden_size"] * config["num_layers"]
     if config["bidirectional"]:
         cur_out_size *= 2
     out_layers = []
     for cur_hidden_size in config["out_sizes"]:
         out_layers.append(Linear(cur_out_size, cur_hidden_size))
         cur_out_size = cur_hidden_size
     out_layers.append(Linear(cur_out_size, 6))
     self.out_proj = Sequential(*out_layers)
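A hedged example of the config dict described in the docstring above; the values are illustrative only, not taken from the original project:

config = {
    "freeze": True,
    "cell_type": "LSTM",
    "hidden_size": 128,
    "num_layers": 2,
    "cell_dropout": 0.3,
    "bidirectional": True,
    "out_activation": "relu",
    "out_dropout": 0.2,
    "out_sizes": [64],   # [] would attach the final layer directly to the recurrent output
}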
Example #15
    def __init__(self,
                 token_vocabulary: Union[Vocabulary, PretrainedVocabulary],
                 token_embedding_dim: int,
                 category_vocabulary: LabelVocabulary,
                 category_embedding_dim: int,
                 label_vocabulary: LabelVocabulary
                 ):
        super().__init__()

        self._token_vocabulary = token_vocabulary

        if isinstance(self._token_vocabulary, Vocabulary):
            self.token_embedding = Embedding(num_embeddings=self._token_vocabulary.size,
                                             embedding_dim=token_embedding_dim,
                                             padding_idx=self._token_vocabulary.padding_index)
        elif isinstance(self._token_vocabulary, PretrainedVocabulary):
            self.token_embedding = Embedding.from_pretrained(
                embeddings=self._token_vocabulary.embedding_matrix,
                padding_idx=self._token_vocabulary.padding_index,
                freeze=False
            )
        else:
            raise RuntimeError(
                f"token_vocabulary type: {type(token_vocabulary)} 不是 Vocabulary 或 PretrainedVocabulary")

        self._category_vocabulary = category_vocabulary
        self.category_embedding = Embedding(num_embeddings=self._category_vocabulary.label_size,
                                            embedding_dim=category_embedding_dim,
                                            padding_idx=self._category_vocabulary.padding_index)

        lstm_hidden_size = token_embedding_dim
        lstm_input_size = token_embedding_dim + category_embedding_dim
        self.lstm = LSTM(input_size=lstm_input_size,
                         hidden_size=lstm_hidden_size,
                         num_layers=1,
                         bidirectional=False,
                         batch_first=True,
                         dropout=0.4)

        attention_input_size = (category_embedding_dim + lstm_hidden_size)
        attention_value_hidden_size = None
        self.attention_seq2vec = AttentionSeq2Vec(input_size=attention_input_size,
                                                  query_hidden_size=lstm_input_size,
                                                  value_hidden_size=attention_value_hidden_size)

        attention_output_size = \
            attention_input_size if attention_value_hidden_size is None else attention_value_hidden_size

        fc_input_size = attention_output_size + lstm_hidden_size
        self.fc = Linear(in_features=fc_input_size,
                         out_features=label_vocabulary.label_size)

        self.reset_parameters()
Example #16
File: sat.py  Project: TCBpenta8/ifcc-1
    def __init__(self,
                 embeddings,
                 max_word=32,
                 multi_image=1,
                 multi_merge='att',
                 context_dim=512,
                 lstm_dim=1000,
                 lambda_a=1.0,
                 teacher_forcing=None,
                 image_model=None,
                 image_pretrained=None,
                 finetune_image=False,
                 image_finetune_epoch=None,
                 rl_opts=None,
                 word_idxs=None,
                 device='gpu',
                 verbose=False):
        super(ShowAttendAndTell,
              self).__init__(max_word, multi_image, multi_merge,
                             teacher_forcing, image_finetune_epoch, rl_opts,
                             word_idxs, verbose)
        self.feat_dim = context_dim
        self.lstm_dim = lstm_dim
        self.lambda_a = lambda_a

        self.dropout = Dropout(0.5)
        # Image processes
        if image_model is None:
            image_model = 'vgg'
        self.image_feats, image_dim = ImageClassification.image_features(
            image_model, not finetune_image, True, image_pretrained, device)
        self._init_multi_image(image_dim, self.VISUAL_NUM, lstm_dim)
        self.image_proj = Linear(image_dim, context_dim)
        # Word processes
        self.init_h = Linear(context_dim, lstm_dim)
        self.init_c = Linear(context_dim, lstm_dim)
        self.att_v = Linear(image_dim, image_dim)
        self.att_h = Linear(lstm_dim, image_dim)
        self.att_a = Linear(image_dim, 1)
        self.gate = Linear(lstm_dim, image_dim)
        input_dim = image_dim + embeddings.shape[1]
        self.lstm_word = LSTMCell(input_dim, lstm_dim)
        self.embeddings = Embedding.from_pretrained(
            embeddings,
            freeze=False,
            padding_idx=PretrainedEmbeddings.INDEX_PAD)
        self.embed_num = self.embeddings.num_embeddings
        # Deep output
        self.lh = Linear(lstm_dim, embeddings.shape[1])
        self.lz = Linear(image_dim, embeddings.shape[1])
        self.lo = Linear(embeddings.shape[1], embeddings.shape[0], bias=False)
Example #17
    def __init__(self, char_embedding, char_embed_dim, char_hidden_dim,
                 dropout_rate):
        super(CharacterCNN, self).__init__()
        char_embeddings_weight = torch.FloatTensor(char_embedding)
        self.char_matrix = Embedding.from_pretrained(char_embeddings_weight,
                                                     freeze=False)

        # self.char_matrix = nn.Embedding(char_embedding.shape[0], char_embedding.shape[1])
        # self.char_matrix.weight.data.copy_(torch.from_numpy(char_embedding))
        self.drop_out = Dropout(p=dropout_rate)
        self.char_cnn_layer = nn.Conv1d(char_embed_dim,
                                        char_hidden_dim,
                                        kernel_size=3,
                                        padding=1)
Example #18
    def __init__(self, embedding_matrix, padding_idx, static=True):
        """Construct GloveEmbedding.

        Args:
            embedding_matrix (torch.Tensor): The matrix containing the embedding weights
            padding_idx (int): The padding index in the tokenizer.
            static (bool): Whether or not to freeze embeddings.
        """
        super(GloveEmbedding, self).__init__()
        self.embedding = Embedding.from_pretrained(embedding_matrix)
        self.embedding.padding_idx = padding_idx
        if static:
            self.embedding.weight.requires_grad = False
        self.flatten = Flatten(start_dim=1)
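For reference, the freezing and padding handled manually above can also be expressed directly through from_pretrained; a minimal sketch with assumed shapes, not the original class:

import torch
from torch.nn import Embedding

embedding_matrix = torch.randn(100, 50)   # assumed vocabulary size x embedding dim
embedding = Embedding.from_pretrained(embedding_matrix,
                                      freeze=True,      # same effect as requires_grad = False
                                      padding_idx=0)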
Example #19
    def __init__(self, input_size, hidden_size, bilstm_layers, weights_matrix, cam_type, device, context='art',
                 pos_dim=100, src_dim=100, pos_quartiles=4, nr_srcs=3):
        super(ContextAwareModel, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size # + pos_dim + src_dim
        self.bilstm_layers = bilstm_layers
        self.device = device
        self.cam_type = cam_type
        self.context = context

        # Store pretrained embeddings to use as representations of sentences
        self.weights_matrix = torch.tensor(weights_matrix, dtype=torch.float, device=self.device)
        self.embedding = Embedding.from_pretrained(self.weights_matrix)
        self.embedding_pos = Embedding(pos_quartiles, pos_dim)  # option to embed position of target sentence in article
        self.embedding_src = Embedding(nr_srcs, src_dim)
        self.emb_size = weights_matrix.shape[1]

        # Initialise LSTMS for article and event context
        self.lstm_art = LSTM(self.input_size, self.hidden_size, num_layers=self.bilstm_layers, bidirectional=True, dropout=0.2)
        self.lstm_ev1 = LSTM(self.input_size, self.hidden_size, num_layers=self.bilstm_layers, bidirectional=True, dropout=0.2)
        self.lstm_ev2 = LSTM(self.input_size, self.hidden_size, num_layers=self.bilstm_layers, bidirectional=True, dropout=0.2)

        # Attention-related attributes
        # self.attention = BahdanauAttention(self.hidden_size, key_size=self.hidden_size * 2, query_size=self.emb_size)
        # self.rob_squeezer = nn.Linear(self.emb_size, self.hidden_size)

        self.dropout = Dropout(0.6)
        self.num_labels = 2
        self.pad_index = 0

        if self.context == 'art':
            self.context_rep_dim = self.emb_size + self.hidden_size * 2  # size of target sentences + 1 article
        else:
            self.context_rep_dim = self.emb_size + self.hidden_size * 6  # size of target sentences + 3 articles

        if self.cam_type == 'cim*':
            self.context_rep_dim += src_dim  # add representation of source

        self.half_context_rep_dim = int(self.context_rep_dim*0.5)
        self.dense = nn.Linear(self.context_rep_dim, self.half_context_rep_dim)

        if self.cam_type == 'cnm':
            # optional Context Naive setting
            self.classifier = Linear(self.emb_size, self.num_labels)
        else:
            self.classifier = Linear(self.half_context_rep_dim, self.num_labels) # + self.emb_size + src_dim, 2) #

        self.sigm = Sigmoid()
Example #20
 def __init__(self, input_embeddings, input_vocabulary_size,
              input_embeddings_size, clear_text):
     super().__init__()
     assert not clear_text
     if input_embeddings is not None:
         self.lut_embeddings = Embedding.from_pretrained(
             embeddings=input_embeddings, freeze=True)
         self._is_fixed = True
     else:
         self.lut_embeddings = Embedding(
             num_embeddings=input_vocabulary_size,
             embedding_dim=input_embeddings_size,
             padding_idx=pad_token_index)
         self._is_fixed = False
     self._output_dim = input_embeddings_size
Example #21
    def __init__(self,
                 embeddings,
                 feat_dim=512,
                 max_word=32,
                 multi_image=1,
                 multi_merge='att',
                 teacher_forcing=False,
                 image_model=None,
                 image_pretrained=None,
                 finetune_image=False,
                 image_finetune_epoch=None,
                 rl_opts=None,
                 word_idxs=None,
                 device='gpu',
                 verbose=False):
        super(KnowingWhenToLook,
              self).__init__(max_word, multi_image, multi_merge,
                             teacher_forcing, image_finetune_epoch, rl_opts,
                             word_idxs, verbose)
        self.feat_dim = feat_dim

        self.dropout = Dropout(0.5)
        # Image processes
        if image_model is None:
            image_model = 'resnet'
        self.image_feats, image_dim = ImageClassification.image_features(
            image_model, not finetune_image, True, image_pretrained, device)
        self._init_multi_image(image_dim, self.VISUAL_NUM, feat_dim)
        self.image_proj_l = Linear(image_dim, feat_dim)
        self.image_proj_g = Linear(image_dim, feat_dim)
        # Visual sentinel
        input_dim = feat_dim + embeddings.shape[1]
        self.vs_att_h = Linear(self.VISUAL_NUM, 1, bias=False)
        self.vs_att_v = Linear(feat_dim, self.VISUAL_NUM, bias=False)
        self.vs_att_g = Linear(feat_dim, self.VISUAL_NUM, bias=False)
        self.vs_att_s = Linear(feat_dim, self.VISUAL_NUM, bias=False)
        self.vs_dense1 = Linear(input_dim, feat_dim, bias=False)
        self.vs_dense2 = Linear(feat_dim, feat_dim, bias=False)
        # Word processes
        self.lstm_word = LSTMCell(input_dim, feat_dim)
        self.embeddings = Embedding.from_pretrained(
            embeddings,
            freeze=False,
            padding_idx=PretrainedEmbeddings.INDEX_PAD)
        self.embed_num = self.embeddings.num_embeddings
        self.word_dense = Linear(feat_dim, embeddings.shape[0], bias=False)
Example #22
    def merge_embeddings(embedding_1, embedding_2, mapping):

        weights = []
        for i in range(embedding_1.num_embeddings):  # loop over each embedding in the dictionary file
            # Map the correct tag by mapping given in 'tags' list
            input_1 = torch.LongTensor([i])  # for embedding_1
            input_2 = torch.LongTensor(
                [mapping[i]])  # for embedding_2 (one of the tags is selected via the mapping)

            weights.append(
                torch.cat((embedding_1(input_1), embedding_2(input_2)), dim=1))

        cat_weights = torch.cat(weights, dim=0)
        final_embedding = Embedding.from_pretrained(cat_weights)

        return final_embedding
Example #23
 def __init__(self,
              embedding_vecs,
              lstm_hidden,
              out_dim,
              padding_idx=1,
              projected_dim=200,
              bidirectional=True,
              dropout=0):
     super().__init__()
     self.embedding = Embedding.from_pretrained(embedding_vecs,
                                                padding_idx=padding_idx,
                                                freeze=True)
     self.projection = Linear(embedding_vecs.shape[1], projected_dim)
     self.lstm2out = Linear(
         lstm_hidden * 2 if bidirectional else lstm_hidden, out_dim)
     self.lstm = LSTMAcceptor(projected_dim,
                              lstm_hidden,
                              bidirectional=bidirectional,
                              dropout=dropout)
Example #24
 def get_affect_intensity_embedding(self, fname):
     unique_emotions = self.read_unique_emotions(fname, 2)
     emotion_to_id = {
         emotion: i
         for i, emotion in enumerate(unique_emotions)
     }
     max_tokens = max(self.index_to_token.keys())
     embedding = np.zeros((max_tokens + 1, len(emotion_to_id)))
     with open(fname) as f:
         _ = f.readline()
         for line in f:
             if len(line.split("\t")) == 3:
                 word, score, emotion = line.split("\t")
                 if word in self.token_to_index:
                     embedding[self.token_to_index[word],
                               emotion_to_id[emotion]] = float(
                                   score.strip())
     return Embedding.from_pretrained(
         torch.tensor(embedding.astype(np.float32)))
Example #25
File: base_model.py  Project: WEYAI/MTSL
 def __init__(self, word_dim, num_words, char_dim, num_chars, num_labels, num_filters,
              kernel_size, rnn_mode, hidden_size, num_layers, embedd_word=None, p_in=0.33, p_out=0.5,
              p_rnn=(0.5, 0.5), lm_loss=0.05, bigram=True, use_crf=True, use_lm=True, use_elmo=False):
     super(BaseModel, self).__init__()
     self.lm_loss = lm_loss
     self.use_elmo = use_elmo
     if self.use_elmo:
         option_file, weight_file = embedd_word
         self.elmo = Elmo(option_file, weight_file, 2, dropout=0)
         word_dim = 1024
         num_filters = 0
     else:
         if isinstance(embedd_word, torch.Tensor):
             self.word_embedd = Embedding.from_pretrained(embedd_word, freeze=False)
         else:
             self.word_embedd = Embedding(num_words, word_dim)
         self.char_embedd = Embedding(num_chars, char_dim)
         self.conv1d = nn.Conv1d(char_dim, num_filters, kernel_size, padding=kernel_size - 1)
         self.dropout_in = nn.Dropout(p=p_in)
     self.dropout_rnn_in = nn.Dropout(p=p_rnn[0])
     self.dropout_out = nn.Dropout(p_out)
     self.use_crf = use_crf
     self.use_lm = use_lm
     if rnn_mode == 'RNN':
         RNN = nn.RNN
     elif rnn_mode == 'LSTM':
         RNN = nn.LSTM
     elif rnn_mode == 'GRU':
         RNN = nn.GRU
     else:
         raise ValueError('Unknown RNN mode: %s' % rnn_mode)
     self.rnn = RNN(word_dim + num_filters, hidden_size, num_layers=num_layers, batch_first=True,
                    bidirectional=True, dropout=p_rnn[1])
     if self.use_crf:
         self.crf = ChainCRF(hidden_size * 2, num_labels, bigram=bigram)
     else:
         self.dense_softmax = nn.Linear(hidden_size * 2, num_labels)
     if self.use_lm:
         self.dense_fw = nn.Linear(hidden_size, num_words)
         self.dense_bw = nn.Linear(hidden_size, num_words)
     self.logsoftmax = nn.LogSoftmax(dim=1)
      self.nll_loss = nn.NLLLoss(reduction='none')  # per-element loss, no reduction
Example #26
    def __init__(self,
                 embeddings,
                 feat_dim=512,
                 max_word=32,
                 multi_image=1,
                 image_pe=True,
                 layer_norm=False,
                 teacher_forcing=False,
                 image_model=None,
                 image_pretrained=None,
                 finetune_image=False,
                 image_finetune_epoch=None,
                 rl_opts=None,
                 word_idxs=None,
                 device='gpu',
                 verbose=False):
        super(_TransformerCaptioner,
              self).__init__(max_word, multi_image, None, teacher_forcing,
                             image_finetune_epoch, rl_opts, word_idxs, verbose)
        self.feat_dim = feat_dim

        self.dropout = Dropout(0.1)
        self.layer_norm = LayerNorm(feat_dim) if layer_norm else None
        # Image processes
        if image_model is None:
            image_model = 'densenet'
        self.image_feats, image_dim = ImageClassification.image_features(
            image_model, not finetune_image, True, image_pretrained, device)
        self.image_proj_l = Linear(image_dim, feat_dim)
        image_len = int(math.sqrt(self.VISUAL_NUM))
        self.image_weight = math.sqrt(image_len)
        if image_pe:
            self.image_pe = PositionalEncoding2D(feat_dim, image_len)
        # Word processes
        self.embeddings = Embedding.from_pretrained(
            embeddings,
            freeze=False,
            padding_idx=PretrainedEmbeddings.INDEX_PAD)
        self.embed_num = self.embeddings.num_embeddings
        self.word_weight = math.sqrt(max_word)
        self.word_pe = PositionalEncoding(feat_dim, max_len=max_word + 1)
        self.word_dense = Linear(feat_dim, embeddings.shape[0], bias=False)
Example #27
    def reload(self, embedding_filename, encoding="utf-8", gpu=False):
        self.gpu = gpu
        words_and_vectors = read_embedding(embedding_filename, encoding)
        self.output_dim = len(words_and_vectors[0][1])
        # noinspection PyCallingNonCallable
        words_and_vectors.insert(0, ("*UNK*", [0.0] * self.output_dim))

        words, vectors_py = zip(*words_and_vectors)
        self.lookup = {word: idx for idx, word in enumerate(words)}
        # noinspection PyCallingNonCallable
        vectors = torch.tensor(vectors_py, dtype=torch.float32)

        # prevent .cuda()
        # noinspection PyReturnFromInit
        self.embedding_ = [
            NoPickle(Embedding.from_pretrained(vectors, freeze=True))
        ]

        if self.project_to:
            self.projection = Linear(self.output_dim, self.project_to)
            self.output_dim = self.project_to
Example #28
    def get_emotion_embedding(self, fname):
        unique_emotions = self.read_unique_emotions(fname, 1)
        emotion_to_id = {
            emotion: i
            for i, emotion in enumerate(unique_emotions)
        }
        max_tokens = max(self.index_to_token.keys())
        embedding = np.zeros((max_tokens + 1, len(emotion_to_id)))
        all_words = set()
        with open(fname) as f:
            for line in f:
                if len(line.split("\t")) == 3:
                    current_word, emotion, association = line.split("\t")
                    if current_word in self.token_to_index:
                        embedding[self.token_to_index[current_word],
                                  emotion_to_id[emotion]] = int(
                                      association.strip())
                    all_words.add(current_word)

        with open("emotion_not_found.txt", "w") as f:
            for word in set(self.token_to_index.keys()).difference(all_words):
                f.write(word + "\n")
        return Embedding.from_pretrained(
            torch.tensor(embedding.astype(np.float32)))
Example #29
 def __init__(self, word_embedding):
     super(WordEmbed, self).__init__()
     word_embeddings_weight = torch.FloatTensor(word_embedding)
     self.word_matrix = Embedding.from_pretrained(word_embeddings_weight,
                                                  freeze=False)
Example #30
 def from_pretrained(self, embedding_matrix):
     if not isinstance(embedding_matrix, torch.Tensor):
         embedding_matrix = torch.tensor(embedding_matrix).float()
     self.embedding = Embedding.from_pretrained(embedding_matrix)
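To close, a minimal standalone sketch of the call every example above relies on: Embedding.from_pretrained copies a weight matrix into an nn.Embedding and freezes it by default, so each index looks up the supplied row.

import torch
from torch.nn import Embedding

weights = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
emb = Embedding.from_pretrained(weights)      # freeze=True by default
print(emb(torch.tensor([1])))                 # tensor([[3., 4.]])
print(emb.weight.requires_grad)               # False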