Example #1
import numpy as np
import torch
import torch.nn as nn
from torchcrf import CRF
from transformers import BertModel


class opinionBERT(nn.Module):
    def __init__(self, bert_name: str, num_labels: int, num_layers: int,
                 hidden_size: int, dropout_prob: float, rnn_type: str,
                 bidirectional: bool, use_crf: bool, freeze_bert: bool):

        super().__init__()
        self.bert = BertModel.from_pretrained(bert_name)
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        if num_layers > 0:
            if rnn_type == "gru":
                self.rnn = nn.GRU(self.bert.config.hidden_size,
                                  hidden_size,
                                  num_layers=num_layers,
                                  bidirectional=bidirectional,
                                  batch_first=True)
            else:
                self.rnn = nn.LSTM(self.bert.config.hidden_size,
                                   hidden_size,
                                   num_layers=num_layers,
                                   bidirectional=bidirectional,
                                   batch_first=True)
        else:
            # no recurrent layer: BERT outputs go straight to the classifier
            self.rnn = nn.Identity()
        classifier_input_size = ((1 + bidirectional) * hidden_size
                                 if num_layers > 0
                                 else self.bert.config.hidden_size)
        self.classifier = nn.Linear(classifier_input_size, num_labels)
        self.dropout = nn.Dropout(dropout_prob)
        self.use_crf = use_crf
        if self.use_crf:
            self.crf = CRF(num_labels, batch_first=True)

    def forward(self,
                input_ids,
                attn_mask,
                crf_attn_mask,
                tags=None,
                class_weights=None):
        bert_output = self.bert(input_ids, attn_mask)
        bert_output = bert_output.last_hidden_state
        bert_output = self.dropout(bert_output)

        rnn_output = self.rnn(bert_output)
        if isinstance(rnn_output, tuple):
            # GRU/LSTM return (output, hidden); nn.Identity returns the tensor
            rnn_output, _ = rnn_output

        logits = self.classifier(rnn_output)

        if self.use_crf:
            pred = self.crf.decode(logits, crf_attn_mask)
        else:
            detached_logits = logits.detach().cpu().numpy()
            pred = [
                list(sentence_pred)
                for sentence_pred in np.argmax(detached_logits, axis=2)
            ]

        if tags is not None:
            if self.use_crf:
                loss = -self.crf(
                    logits, tags, mask=crf_attn_mask, reduction="mean")
            else:
                num_labels = logits.shape[-1]
                if class_weights is not None:
                    loss_fct = nn.CrossEntropyLoss(weight=class_weights)
                else:
                    loss_fct = nn.CrossEntropyLoss()
                active_loss = attn_mask.view(-1) == 1
                active_logits = logits.view(-1, num_labels)
                active_labels = torch.where(
                    active_loss, tags.view(-1),
                    torch.Tensor([loss_fct.ignore_index
                                  ]).type_as(tags)).long()
                loss = loss_fct(active_logits, active_labels)
            return loss, pred
        else:
            return pred
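Example #1 switches between a CRF head and a plain softmax head for both the loss and the predictions. For context, here is a minimal standalone sketch of the same two CRF code paths, using only the torchcrf package and random toy tensors (all sizes below are arbitrary assumptions):

import torch
from torchcrf import CRF

batch_size, seq_len, num_tags = 2, 7, 5                  # toy sizes
emissions = torch.randn(batch_size, seq_len, num_tags)   # e.g. classifier logits
tags = torch.randint(num_tags, (batch_size, seq_len))
mask = torch.ones(batch_size, seq_len, dtype=torch.bool)

crf = CRF(num_tags, batch_first=True)

# Training path: CRF.forward returns the log-likelihood, so the loss is its
# negation (compare `-self.crf(...)` in the example above).
loss = -crf(emissions, tags, mask=mask, reduction="mean")

# Prediction path: Viterbi decoding returns one list of tag ids per sequence.
pred = crf.decode(emissions, mask=mask)
print(loss.item(), pred)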
Example #2
    def __init__(self, config):
        super().__init__()

        self.birnn = BiRNN(config)
        # self.transitions = nn.Parameter(torch.randn(config.num_classes, config.num_classes))  # transition matrix, randomly initialized
        self.crf = CRF(config.num_classes, batch_first=True)
Example #3
                     hidden_layers,
                     dropout,
                     output_layers,
                     lemma2synsets,
                     synset2id,
                     known_pos,
                     known_entity_tags,
                     use_flair=use_flair,
                     combine_WN_FN=combine_WN_FN)
    model.to(device)
    loss_func_embed = torch.nn.MSELoss()
    if crf_layer is True:
        if "classify_wsd" in output_layers:
            loss_func_classify = torch.nn.CrossEntropyLoss(ignore_index=-100)
        if "pos_tagger" in output_layers:
            loss_func_pos = CRF(len(known_pos), batch_first=True)
        if "ner" in output_layers:
            loss_func_ner = CRF(len(known_entity_tags), batch_first=True)
    else:
        loss_func_classify = torch.nn.CrossEntropyLoss(ignore_index=-100)
        loss_func_pos = torch.nn.CrossEntropyLoss()
        loss_func_ner = torch.nn.CrossEntropyLoss()
    # loss_func_classify = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters())
    # optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Eval loop
    if args.mode == "evaluate":
        model.load_state_dict(torch.load(args.save_path))
        model.eval()
        test_accuracy_embed, test_accuracy_classify, log = eval_loop(
Example #4
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torchcrf import CRF


class CRF_Model(nn.Module):
    def __init__(self, hparams):
        super(CRF_Model, self).__init__()
        self._device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.name = hparams.model_name
        self.word_embedding = nn.Embedding(
            hparams.vocab_size, hparams.embedding_dim)
        if hparams.embeddings is not None:
            print("initializing embeddings from pretrained")
            self.word_embedding.weight.data.copy_(hparams.embeddings)

        self.lstm = nn.LSTM(hparams.embedding_dim, hparams.hidden_dim,
                            bidirectional=hparams.bidirectional,
                            num_layers=hparams.num_layers,
                            dropout=hparams.dropout if hparams.num_layers > 1 else 0,
                            batch_first=True)

        lstm_output_dim = hparams.hidden_dim if hparams.bidirectional is False else hparams.hidden_dim * 2
        self.dropout = nn.Dropout(hparams.dropout)
        self.classifier = nn.Linear(lstm_output_dim, hparams.num_classes)
        self.crf = CRF(hparams.num_classes, batch_first=True)

    def forward(self, x):
        # x: [Samples_Num, Seq_Len]
        embeddings = self.word_embedding(x)
        embeddings = self.dropout(embeddings)
        # embeddings: [Samples_Num, Seq_Len, Embedding_Dim]
        o, _ = self.lstm(embeddings)
        # o: [Samples_Num, Seq_Len, LSTM_Output_Dim]
        o = self.dropout(o)
        logits = self.classifier(o)
        # logits: [Samples_Num, Seq_Len, Tags_Num]
        return logits

    def log_probs(self, x, tags, mask=None):
        emissions = self(x)
        return self.crf(emissions, tags, mask=mask)

    def predict(self, x):
        emissions = self(x)
        return self.crf.decode(emissions)

    def predict_new(self, x, mask=None):
        emissions = self(x)
        return self.crf.decode(emissions, mask=mask)

    def save_checkpoint(self, model_path):
        """
        Saves the whole model to `model_path` (.pt) and its state_dict
        to the same path with a .pth extension.

        Args:
            model_path: destination path ending in .pt
        """
        torch.save(self, model_path)
        model_checkpoint = model_path.replace('.pt', '.pth')
        torch.save(self.state_dict(), model_checkpoint)

    def load_model(self, path):
        """
        Loads a state_dict from the given path onto the available device,
        whether that is CUDA or CPU.

        Args:
            path: path to the saved state_dict
        """
        state_dict = torch.load(path) if self._device == 'cuda' else torch.load(
            path, map_location=torch.device(self._device))
        self.load_state_dict(state_dict)

    def encode_tokens(self, tokens, word2idx):
        """
        Prediction-time helper: encodes tokenized sentences by looking each
        token up in word2idx (unknown tokens map to index 1) and pads the
        resulting sequences with index 0.

        Args:
            tokens: iterable of tokenized sentences
            word2idx: mapping from token string to vocabulary index
        """
        data = []
        for sentence in tokens:
            paragraph = []
            for i in sentence:
                paragraph.append(word2idx.get(i, 1))
            paragraph = torch.LongTensor(paragraph).to(self._device)
            data.append(paragraph)
        return pad_sequence(data, batch_first=True, padding_value=0)
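Below is a hedged usage sketch for the CRF_Model above: one training step via log_probs (negated to obtain a loss) and batched prediction via predict_new. The hyperparameter object is hypothetical and only carries the fields the constructor actually reads; it assumes the class definition and imports above.

from types import SimpleNamespace

import torch

# Hypothetical hyperparameters; only the fields read by CRF_Model.__init__.
hparams = SimpleNamespace(model_name="toy-crf-tagger", vocab_size=100,
                          embedding_dim=32, embeddings=None, hidden_dim=64,
                          bidirectional=True, num_layers=1, dropout=0.1,
                          num_classes=5)
model = CRF_Model(hparams)

x = torch.randint(1, hparams.vocab_size, (4, 10))    # [batch, seq_len], 0 = pad
tags = torch.randint(hparams.num_classes, (4, 10))
mask = (x != 0)

loss = -model.log_probs(x, tags, mask=mask)          # negative log-likelihood
loss.backward()

model.eval()
with torch.no_grad():
    predictions = model.predict_new(x, mask=mask)    # list of tag-id lists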
Example #5
def make_crf(num_tags=5):
    return CRF(num_tags)
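The fixture above returns a sequence-first CRF (torchcrf's default, batch_first=False), so emissions and tags are laid out as (seq_len, batch, num_tags). A small sketch of how such a fixture is typically exercised, with toy sizes as assumptions:

import torch
from torchcrf import CRF

def make_crf(num_tags=5):           # same fixture as above
    return CRF(num_tags)

crf = make_crf()
seq_len, batch_size = 6, 3
emissions = torch.randn(seq_len, batch_size, crf.num_tags)
tags = torch.randint(crf.num_tags, (seq_len, batch_size))

llh = crf(emissions, tags)          # summed log-likelihood over the batch
best_paths = crf.decode(emissions)  # one tag sequence per batch element
assert len(best_paths) == batch_size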
Example #6
class RobertaLSTMCRF(RobertaForTokenClassification):
    def __init__(self, config, lstm_hidden_size, lstm_layers):
        super().__init__(config)
        self.lstm = torch.nn.LSTM(
            input_size=config.hidden_size,
            hidden_size=lstm_hidden_size,
            num_layers=lstm_layers,
            dropout=0.2,
            batch_first=True,
            bidirectional=True,
        )
        self.crf = CRF(config.num_labels, batch_first=True)

        del self.classifier
        self.classifier = torch.nn.Linear(2 * lstm_hidden_size,
                                          config.num_labels)

    def forward(
        self,
        input_ids,
        attention_mask=None,
        token_type_ids=None,
        labels=None,
        prediction_mask=None,
    ):

        outputs = self.roberta(
            input_ids,
            attention_mask,
            token_type_ids,
            output_hidden_states=True,
            return_dict=False,
        )
        # seq_output, all_hidden_states, all_self_attentions, all_cross_attentions

        sequence_output = outputs[
            0]  # outputs[1] is pooled output which is none.

        sequence_output = self.dropout(sequence_output)

        lstm_out, *_ = self.lstm(sequence_output)
        sequence_output = self.dropout(lstm_out)

        logits = self.classifier(sequence_output)

        ## CRF
        mask = prediction_mask
        mask = mask[:, :logits.size(1)].contiguous()

        # print(logits)

        if labels is not None:
            labels = labels[:, :logits.size(1)].contiguous()
            loss = -self.crf(
                logits, labels, mask=mask.bool(), reduction="token_mean")

        tags = self.crf.decode(logits, mask.bool())
        # print(tags)
        if labels is not None:
            return (loss, logits, tags)
        else:
            return (logits, tags)
Example #7
 def test_full(self):
     crf = CRF(10, batch_first=True)
     assert crf.batch_first
Example #8
class ElmoLSTMCRF(BaseModel):
    def __init__(self,
                 config,
                 elmo_model,
                 embedding_path,
                 label_path,
                 pos_path,
                 emb_non_trainable=True,
                 use_crf=False,
                 use_char_cnn=False):
        super().__init__(config=config)

        self.config = config
        self.device = config['opt'].device
        self.seq_size = config['n_ctx']
        pos_emb_dim = config['pos_emb_dim']
        elmo_emb_dim = config['elmo_emb_dim']
        lstm_hidden_dim = config['lstm_hidden_dim']
        lstm_num_layers = config['lstm_num_layers']
        lstm_dropout = config['lstm_dropout']
        self.use_crf = use_crf
        self.use_char_cnn = use_char_cnn

        # elmo embedding
        self.elmo_model = elmo_model

        # glove embedding layer
        weights_matrix = super().load_embedding(embedding_path)
        vocab_dim, token_emb_dim = weights_matrix.size()
        padding_idx = config['pad_token_id']
        self.embed_token = super().create_embedding_layer(
            vocab_dim,
            token_emb_dim,
            weights_matrix=weights_matrix,
            non_trainable=emb_non_trainable,
            padding_idx=padding_idx)

        # pos embedding layer
        self.poss = super().load_dict(pos_path)
        self.pos_vocab_size = len(self.poss)
        padding_idx = config['pad_pos_id']
        self.embed_pos = super().create_embedding_layer(
            self.pos_vocab_size,
            pos_emb_dim,
            weights_matrix=None,
            non_trainable=False,
            padding_idx=padding_idx)

        emb_dim = elmo_emb_dim + token_emb_dim + pos_emb_dim
        # char embedding layer
        if self.use_char_cnn:
            self.charcnn = CharCNN(config)
            emb_dim = elmo_emb_dim + token_emb_dim + pos_emb_dim + self.charcnn.last_dim

        # BiLSTM layer
        self.lstm = nn.LSTM(input_size=emb_dim,
                            hidden_size=lstm_hidden_dim,
                            num_layers=lstm_num_layers,
                            dropout=lstm_dropout,
                            bidirectional=True,
                            batch_first=True)

        self.dropout = nn.Dropout(config['dropout'])

        # projection layer
        self.labels = super().load_dict(label_path)
        self.label_size = len(self.labels)
        self.linear = nn.Linear(lstm_hidden_dim * 2, self.label_size)

        # CRF layer
        if self.use_crf:
            self.crf = CRF(num_tags=self.label_size, batch_first=True)

    def forward(self, x):
        # x[0,1] : [batch_size, seq_size]
        # x[2]   : [batch_size, seq_size, max_characters_per_token]
        token_ids = x[0]
        pos_ids = x[1]
        char_ids = x[2]

        mask = torch.sign(torch.abs(token_ids)).to(torch.uint8).to(self.device)
        # mask : [batch_size, seq_size]
        lengths = torch.sum(mask.to(torch.long), dim=1)
        # lengths : [batch_size]

        # 1. Embedding
        elmo_embed_out = self.elmo_model(char_ids)['elmo_representations'][0]
        # elmo_embed_out  : [batch_size, seq_size, elmo_emb_dim]
        '''
        masks = mask.unsqueeze(2).to(torch.float)
        # masks : [batch_size, seq_size, 1]
        elmo_embed_out *= masks # auto-broadcasting
        '''
        token_embed_out = self.embed_token(token_ids)
        # token_embed_out : [batch_size, seq_size, token_emb_dim]
        pos_embed_out = self.embed_pos(pos_ids)
        # pos_embed_out   : [batch_size, seq_size, pos_emb_dim]
        if self.use_char_cnn:
            char_ids = x[2]
            # char_ids : [batch_size, seq_size, char_n_ctx]
            charcnn_out = self.charcnn(char_ids)
            # charcnn_out : [batch_size, seq_size, self.charcnn.last_dim]
            embed_out = torch.cat(
                [elmo_embed_out, token_embed_out, pos_embed_out, charcnn_out],
                dim=-1)
            # embed_out : [batch_size, seq_size, emb_dim]
        else:
            embed_out = torch.cat(
                [elmo_embed_out, token_embed_out, pos_embed_out], dim=-1)
            # embed_out : [batch_size, seq_size, emb_dim]
        embed_out = self.dropout(embed_out)

        # 2. LSTM
        packed_embed_out = torch.nn.utils.rnn.pack_padded_sequence(
            embed_out, lengths, batch_first=True, enforce_sorted=False)
        lstm_out, (h_n, c_n) = self.lstm(packed_embed_out)
        lstm_out, _ = torch.nn.utils.rnn.pad_packed_sequence(
            lstm_out, batch_first=True, total_length=self.seq_size)
        # lstm_out : [batch_size, seq_size, lstm_hidden_dim*2]
        lstm_out = self.dropout(lstm_out)

        # 3. Output
        logits = self.linear(lstm_out)
        # logits : [batch_size, seq_size, label_size]
        if not self.use_crf: return logits
        prediction = self.crf.decode(logits)
        prediction = torch.as_tensor(prediction, dtype=torch.long)
        # prediction : [batch_size, seq_size]
        return logits, prediction
Example #9
    def __init__(self,
                 config,
                 elmo_model,
                 embedding_path,
                 label_path,
                 pos_path,
                 emb_non_trainable=True,
                 use_crf=False,
                 use_char_cnn=False):
        super().__init__(config=config)

        self.config = config
        self.device = config['opt'].device
        self.seq_size = config['n_ctx']
        pos_emb_dim = config['pos_emb_dim']
        elmo_emb_dim = config['elmo_emb_dim']
        lstm_hidden_dim = config['lstm_hidden_dim']
        lstm_num_layers = config['lstm_num_layers']
        lstm_dropout = config['lstm_dropout']
        self.use_crf = use_crf
        self.use_char_cnn = use_char_cnn

        # elmo embedding
        self.elmo_model = elmo_model

        # glove embedding layer
        weights_matrix = super().load_embedding(embedding_path)
        vocab_dim, token_emb_dim = weights_matrix.size()
        padding_idx = config['pad_token_id']
        self.embed_token = super().create_embedding_layer(
            vocab_dim,
            token_emb_dim,
            weights_matrix=weights_matrix,
            non_trainable=emb_non_trainable,
            padding_idx=padding_idx)

        # pos embedding layer
        self.poss = super().load_dict(pos_path)
        self.pos_vocab_size = len(self.poss)
        padding_idx = config['pad_pos_id']
        self.embed_pos = super().create_embedding_layer(
            self.pos_vocab_size,
            pos_emb_dim,
            weights_matrix=None,
            non_trainable=False,
            padding_idx=padding_idx)

        emb_dim = elmo_emb_dim + token_emb_dim + pos_emb_dim
        # char embedding layer
        if self.use_char_cnn:
            self.charcnn = CharCNN(config)
            emb_dim = elmo_emb_dim + token_emb_dim + pos_emb_dim + self.charcnn.last_dim

        # BiLSTM layer
        self.lstm = nn.LSTM(input_size=emb_dim,
                            hidden_size=lstm_hidden_dim,
                            num_layers=lstm_num_layers,
                            dropout=lstm_dropout,
                            bidirectional=True,
                            batch_first=True)

        self.dropout = nn.Dropout(config['dropout'])

        # projection layer
        self.labels = super().load_dict(label_path)
        self.label_size = len(self.labels)
        self.linear = nn.Linear(lstm_hidden_dim * 2, self.label_size)

        # CRF layer
        if self.use_crf:
            self.crf = CRF(num_tags=self.label_size, batch_first=True)
Example #10
class BertLSTMCRF(BaseModel):
    def __init__(self,
                 config,
                 bert_config,
                 bert_model,
                 bert_tokenizer,
                 label_path,
                 pos_path,
                 use_crf=False,
                 use_pos=False,
                 disable_lstm=False,
                 feature_based=False):
        super().__init__(config=config)

        self.config = config
        self.device = config['opt'].device
        self.seq_size = config['n_ctx']
        pos_emb_dim = config['pos_emb_dim']
        lstm_hidden_dim = config['lstm_hidden_dim']
        lstm_num_layers = config['lstm_num_layers']
        lstm_dropout = config['lstm_dropout']
        self.use_crf = use_crf
        self.use_pos = use_pos
        self.disable_lstm = disable_lstm

        # bert embedding layer
        self.bert_config = bert_config
        self.bert_model = bert_model
        self.bert_tokenizer = bert_tokenizer
        self.bert_feature_based = feature_based
        self.bert_hidden_size = bert_config.hidden_size
        self.bert_num_layers = bert_config.num_hidden_layers

        # DSA layer for bert_feature_based
        dsa_num_attentions = config['dsa_num_attentions']
        dsa_input_dim = self.bert_hidden_size
        dsa_dim = config['dsa_dim']
        dsa_r = config['dsa_r']
        self.dsa = DSA(config,
                       dsa_num_attentions,
                       dsa_input_dim,
                       dsa_dim,
                       dsa_r=dsa_r)
        self.layernorm_dsa = nn.LayerNorm(self.dsa.last_dim)

        bert_emb_dim = self.bert_hidden_size
        if self.bert_feature_based:
            '''
            # 1) last layer, 2) mean pooling
            bert_emb_dim = self.bert_hidden_size
            '''
            # 3) DSA pooling
            bert_emb_dim = self.dsa.last_dim

        # pos embedding layer
        self.poss = super().load_dict(pos_path)
        self.pos_vocab_size = len(self.poss)
        padding_idx = config['pad_pos_id']
        self.embed_pos = super().create_embedding_layer(
            self.pos_vocab_size,
            pos_emb_dim,
            weights_matrix=None,
            non_trainable=False,
            padding_idx=padding_idx)

        # BiLSTM layer
        if self.use_pos:
            emb_dim = bert_emb_dim + pos_emb_dim
        else:
            emb_dim = bert_emb_dim
        if not self.disable_lstm:
            self.lstm = nn.LSTM(input_size=emb_dim,
                                hidden_size=lstm_hidden_dim,
                                num_layers=lstm_num_layers,
                                dropout=lstm_dropout,
                                bidirectional=True,
                                batch_first=True)

        self.dropout = nn.Dropout(config['dropout'])

        # projection layer
        self.labels = super().load_dict(label_path)
        self.label_size = len(self.labels)
        if not self.disable_lstm:
            self.linear = nn.Linear(lstm_hidden_dim * 2, self.label_size)
        else:
            self.linear = nn.Linear(emb_dim, self.label_size)

        # CRF layer
        if self.use_crf:
            self.crf = CRF(num_tags=self.label_size, batch_first=True)

    def _compute_bert_embedding(self, x):
        if self.bert_feature_based:
            # feature-based
            with torch.no_grad():
                if self.config['emb_class'] in ['bart', 'distilbert']:
                    bert_outputs = self.bert_model(input_ids=x[0],
                                                   attention_mask=x[1])
                    # bart model's output(output_hidden_states == True)
                    # [0] last decoder layer's output : [batch_size, seq_size, bert_hidden_size]
                    # [1] all hidden states of decoder layer's
                    # [2] last encoder layer's output : [seq_size, batch_size, bert_hidden_size]
                    # [3] all hidden states of encoder layer's
                    all_hidden_states = bert_outputs[1][0:]
                elif 'electra' in self.config['emb_class']:
                    bert_outputs = self.bert_model(input_ids=x[0],
                                                   attention_mask=x[1],
                                                   token_type_ids=x[2])
                    # electra model's output
                    # list of each layer's hidden states
                    all_hidden_states = bert_outputs
                else:
                    bert_outputs = self.bert_model(
                        input_ids=x[0],
                        attention_mask=x[1],
                        token_type_ids=None if self.config['emb_class'] in [
                            'roberta'
                        ] else x[2])  # RoBERTa doesn't use segment_ids
                    all_hidden_states = bert_outputs[2][0:]
                    # last hidden states, pooled output,   initial embedding layer, 1 ~ last layer's hidden states
                    # bert_outputs[0],    bert_outputs[1], bert_outputs[2][0],      bert_outputs[2][1:]
                '''
                # 1) last layer
                embedded = bert_outputs[0]
                # embedded : [batch_size, seq_size, bert_hidden_size]
                '''
                '''
                # 2) mean pooling
                stack = torch.stack(all_hidden_states, dim=-1)
                embedded = torch.mean(stack, dim=-1)
                # ([batch_size, seq_size, bert_hidden_size], ..., [batch_size, seq_size, bert_hidden_size])
                # -> stack(-1) -> [batch_size, seq_size, bert_hidden_size, *], ex) * == 25 for bert large
                # -> max/mean(-1) ->  [batch_size, seq_size, bert_hidden_size]
                '''
                # 3) DSA pooling
                stack = torch.stack(all_hidden_states, dim=-2)
                # stack : [batch_size, seq_size, *, bert_hidden_size]
                stack = stack.view(-1, self.bert_num_layers + 1,
                                   self.bert_hidden_size)
                # stack : [*, bert_num_layers, bert_hidden_size]
                dsa_mask = torch.ones(stack.shape[0],
                                      stack.shape[1]).to(self.device)
                # dsa_mask : [*, bert_num_layers]
                dsa_out = self.dsa(stack, dsa_mask)
                # dsa_out : [*, self.dsa.last_dim]
                dsa_out = self.layernorm_dsa(dsa_out)
                embedded = dsa_out.view(-1, self.seq_size, self.dsa.last_dim)
                # embedded : [batch_size, seq_size, self.dsa.last_dim]
        else:
            # fine-tuning
            # x[0], x[1], x[2] : [batch_size, seq_size]
            if self.config['emb_class'] in ['bart', 'distilbert']:
                bert_outputs = self.bert_model(input_ids=x[0],
                                               attention_mask=x[1])
                embedded = bert_outputs[0]
            else:
                bert_outputs = self.bert_model(
                    input_ids=x[0],
                    attention_mask=x[1],
                    token_type_ids=None if self.config['emb_class']
                    in ['roberta'] else x[2])  # RoBERTa doesn't use segment_ids
                embedded = bert_outputs[0]
                # embedded : [batch_size, seq_size, bert_hidden_size]
        return embedded

    def forward(self, x):
        # x[0,1,2] : [batch_size, seq_size]

        mask = x[1].to(torch.uint8).to(self.device)
        # mask == attention_mask : [batch_size, seq_size]
        lengths = torch.sum(mask.to(torch.long), dim=1)
        # lengths : [batch_size]

        # 1. Embedding
        bert_embed_out = self._compute_bert_embedding(x)
        # bert_embed_out : [batch_size, seq_size, *]
        pos_ids = x[3]
        pos_embed_out = self.embed_pos(pos_ids)
        # pos_embed_out : [batch_size, seq_size, pos_emb_dim]
        if self.use_pos:
            embed_out = torch.cat([bert_embed_out, pos_embed_out], dim=-1)
        else:
            embed_out = bert_embed_out
        # embed_out : [batch_size, seq_size, emb_dim]
        embed_out = self.dropout(embed_out)

        # 2. LSTM
        if not self.disable_lstm:
            packed_embed_out = torch.nn.utils.rnn.pack_padded_sequence(
                embed_out, lengths, batch_first=True, enforce_sorted=False)
            lstm_out, (h_n, c_n) = self.lstm(packed_embed_out)
            lstm_out, _ = torch.nn.utils.rnn.pad_packed_sequence(
                lstm_out, batch_first=True, total_length=self.seq_size)
            # lstm_out : [batch_size, seq_size, lstm_hidden_dim*2]
            lstm_out = self.dropout(lstm_out)
        else:
            lstm_out = embed_out
            # lstm_out : [batch_size, seq_size, emb_dim]

        # 3. Output
        logits = self.linear(lstm_out)
        # logits : [batch_size, seq_size, label_size]
        if not self.use_crf: return logits
        prediction = self.crf.decode(logits)
        prediction = torch.as_tensor(prediction, dtype=torch.long)
        # prediction : [batch_size, seq_size]
        return logits, prediction
Example #11
    def __init__(self,
                 config,
                 bert_config,
                 bert_model,
                 bert_tokenizer,
                 label_path,
                 pos_path,
                 use_crf=False,
                 use_pos=False,
                 disable_lstm=False,
                 feature_based=False):
        super().__init__(config=config)

        self.config = config
        self.device = config['opt'].device
        self.seq_size = config['n_ctx']
        pos_emb_dim = config['pos_emb_dim']
        lstm_hidden_dim = config['lstm_hidden_dim']
        lstm_num_layers = config['lstm_num_layers']
        lstm_dropout = config['lstm_dropout']
        self.use_crf = use_crf
        self.use_pos = use_pos
        self.disable_lstm = disable_lstm

        # bert embedding layer
        self.bert_config = bert_config
        self.bert_model = bert_model
        self.bert_tokenizer = bert_tokenizer
        self.bert_feature_based = feature_based
        self.bert_hidden_size = bert_config.hidden_size
        self.bert_num_layers = bert_config.num_hidden_layers

        # DSA layer for bert_feature_based
        dsa_num_attentions = config['dsa_num_attentions']
        dsa_input_dim = self.bert_hidden_size
        dsa_dim = config['dsa_dim']
        dsa_r = config['dsa_r']
        self.dsa = DSA(config,
                       dsa_num_attentions,
                       dsa_input_dim,
                       dsa_dim,
                       dsa_r=dsa_r)
        self.layernorm_dsa = nn.LayerNorm(self.dsa.last_dim)

        bert_emb_dim = self.bert_hidden_size
        if self.bert_feature_based:
            '''
            # 1) last layer, 2) mean pooling
            bert_emb_dim = self.bert_hidden_size
            '''
            # 3) DSA pooling
            bert_emb_dim = self.dsa.last_dim

        # pos embedding layer
        self.poss = super().load_dict(pos_path)
        self.pos_vocab_size = len(self.poss)
        padding_idx = config['pad_pos_id']
        self.embed_pos = super().create_embedding_layer(
            self.pos_vocab_size,
            pos_emb_dim,
            weights_matrix=None,
            non_trainable=False,
            padding_idx=padding_idx)

        # BiLSTM layer
        if self.use_pos:
            emb_dim = bert_emb_dim + pos_emb_dim
        else:
            emb_dim = bert_emb_dim
        if not self.disable_lstm:
            self.lstm = nn.LSTM(input_size=emb_dim,
                                hidden_size=lstm_hidden_dim,
                                num_layers=lstm_num_layers,
                                dropout=lstm_dropout,
                                bidirectional=True,
                                batch_first=True)

        self.dropout = nn.Dropout(config['dropout'])

        # projection layer
        self.labels = super().load_dict(label_path)
        self.label_size = len(self.labels)
        if not self.disable_lstm:
            self.linear = nn.Linear(lstm_hidden_dim * 2, self.label_size)
        else:
            self.linear = nn.Linear(emb_dim, self.label_size)

        # CRF layer
        if self.use_crf:
            self.crf = CRF(num_tags=self.label_size, batch_first=True)
Example #12
    def __init__(self,
                 config,
                 embedding_path,
                 label_path,
                 pos_path,
                 emb_non_trainable=True,
                 use_crf=False,
                 use_char_cnn=False):
        super().__init__(config=config)

        self.config = config
        self.device = config['opt'].device
        self.seq_size = config['n_ctx']
        pos_emb_dim = config['pos_emb_dim']
        self.use_crf = use_crf
        self.use_char_cnn = use_char_cnn

        # glove embedding layer
        weights_matrix = super().load_embedding(embedding_path)
        vocab_dim, token_emb_dim = weights_matrix.size()
        padding_idx = config['pad_token_id']
        self.embed_token = super().create_embedding_layer(
            vocab_dim,
            token_emb_dim,
            weights_matrix=weights_matrix,
            non_trainable=emb_non_trainable,
            padding_idx=padding_idx)

        # pos embedding layer
        self.poss = super().load_dict(pos_path)
        self.pos_vocab_size = len(self.poss)
        padding_idx = config['pad_pos_id']
        self.embed_pos = super().create_embedding_layer(
            self.pos_vocab_size,
            pos_emb_dim,
            weights_matrix=None,
            non_trainable=False,
            padding_idx=padding_idx)

        emb_dim = token_emb_dim + pos_emb_dim
        # char embedding layer
        if self.use_char_cnn:
            self.charcnn = CharCNN(config)
            emb_dim = token_emb_dim + pos_emb_dim + self.charcnn.last_dim

        # Densenet layer
        densenet_kernels = config['densenet_kernels']
        first_num_filters = config['densenet_first_num_filters']
        num_filters = config['densenet_num_filters']
        last_num_filters = config['densenet_last_num_filters']
        self.densenet = DenseNet(densenet_kernels,
                                 emb_dim,
                                 first_num_filters,
                                 num_filters,
                                 last_num_filters,
                                 activation=F.relu)
        self.layernorm_densenet = nn.LayerNorm(self.densenet.last_dim)

        self.dropout = nn.Dropout(config['dropout'])

        # projection layer
        self.labels = super().load_dict(label_path)
        self.label_size = len(self.labels)
        self.linear = nn.Linear(last_num_filters, self.label_size)

        # CRF layer
        if self.use_crf:
            self.crf = CRF(num_tags=self.label_size, batch_first=True)
Example #13
class GloveDensenetCRF(BaseModel):
    def __init__(self,
                 config,
                 embedding_path,
                 label_path,
                 pos_path,
                 emb_non_trainable=True,
                 use_crf=False,
                 use_char_cnn=False):
        super().__init__(config=config)

        self.config = config
        self.device = config['opt'].device
        self.seq_size = config['n_ctx']
        pos_emb_dim = config['pos_emb_dim']
        self.use_crf = use_crf
        self.use_char_cnn = use_char_cnn

        # glove embedding layer
        weights_matrix = super().load_embedding(embedding_path)
        vocab_dim, token_emb_dim = weights_matrix.size()
        padding_idx = config['pad_token_id']
        self.embed_token = super().create_embedding_layer(
            vocab_dim,
            token_emb_dim,
            weights_matrix=weights_matrix,
            non_trainable=emb_non_trainable,
            padding_idx=padding_idx)

        # pos embedding layer
        self.poss = super().load_dict(pos_path)
        self.pos_vocab_size = len(self.poss)
        padding_idx = config['pad_pos_id']
        self.embed_pos = super().create_embedding_layer(
            self.pos_vocab_size,
            pos_emb_dim,
            weights_matrix=None,
            non_trainable=False,
            padding_idx=padding_idx)

        emb_dim = token_emb_dim + pos_emb_dim
        # char embedding layer
        if self.use_char_cnn:
            self.charcnn = CharCNN(config)
            emb_dim = token_emb_dim + pos_emb_dim + self.charcnn.last_dim

        # Densenet layer
        densenet_kernels = config['densenet_kernels']
        first_num_filters = config['densenet_first_num_filters']
        num_filters = config['densenet_num_filters']
        last_num_filters = config['densenet_last_num_filters']
        self.densenet = DenseNet(densenet_kernels,
                                 emb_dim,
                                 first_num_filters,
                                 num_filters,
                                 last_num_filters,
                                 activation=F.relu)
        self.layernorm_densenet = nn.LayerNorm(self.densenet.last_dim)

        self.dropout = nn.Dropout(config['dropout'])

        # projection layer
        self.labels = super().load_dict(label_path)
        self.label_size = len(self.labels)
        self.linear = nn.Linear(last_num_filters, self.label_size)

        # CRF layer
        if self.use_crf:
            self.crf = CRF(num_tags=self.label_size, batch_first=True)

    def forward(self, x):
        # x[0, 1] : [batch_size, seq_size]
        # x[2]    : [batch_size, seq_size, char_n_ctx]
        token_ids = x[0]
        pos_ids = x[1]

        mask = torch.sign(torch.abs(token_ids)).to(torch.uint8).to(self.device)
        # mask : [batch_size, seq_size]

        # 1. Embedding
        token_embed_out = self.embed_token(token_ids)
        # token_embed_out : [batch_size, seq_size, token_emb_dim]
        pos_embed_out = self.embed_pos(pos_ids)
        # pos_embed_out   : [batch_size, seq_size, pos_emb_dim]
        if self.use_char_cnn:
            char_ids = x[2]
            # char_ids : [batch_size, seq_size, char_n_ctx]
            charcnn_out = self.charcnn(char_ids)
            # charcnn_out : [batch_size, seq_size, self.charcnn.last_dim]
            embed_out = torch.cat(
                [token_embed_out, pos_embed_out, charcnn_out], dim=-1)
            # embed_out : [batch_size, seq_size, emb_dim]
        else:
            embed_out = torch.cat([token_embed_out, pos_embed_out], dim=-1)
            # embed_out : [batch_size, seq_size, emb_dim]
        embed_out = self.dropout(embed_out)

        # 2. DenseNet
        densenet_out = self.densenet(embed_out, mask)
        # densenet_out : [batch_size, seq_size, last_num_filters]
        densenet_out = self.layernorm_densenet(densenet_out)
        densenet_out = self.dropout(densenet_out)

        # 3. Output
        logits = self.linear(densenet_out)
        # logits : [batch_size, seq_size, label_size]
        if not self.use_crf: return logits
        prediction = self.crf.decode(logits)
        prediction = torch.as_tensor(prediction, dtype=torch.long)
        # prediction : [batch_size, seq_size]
        return logits, prediction
Example #14
 def __init__(self, scorer: EmissionScorer, padding_idx: int = 0) -> None:
     super(CRFTagger, self).__init__()
     self.scorer = scorer
     self.padding_idx = padding_idx
     self.crf = CRF(scorer.num_tags)
     self.reset_parameters()
 ######## Load the validation set ###########
 with open(dev_pkl, "rb") as f:
     dev_features, word_index, char_index = pkl.load(f)
 dev_sents = read_data(dev_path)
 print('Finished loading the validation set')
 dev_count = len(dev_features)
 ######### Load the initial word-embedding matrix ###############
 with open(word_pkl, 'rb') as f:
     word_matrix = pkl.load(f)
 print('Word embeddings initialized')
 ######### Load the model ###############
 lstm = cnn_lstm_no_pad_model(word_matrix, word_dim, len(char_index),
                              char_dim, feature_maps, kernels, hidden_dim,
                              tagset_size)
 lstm.cuda(device=0)
 crf = CRF(tagset_size, batch_first=True)
 crf.cuda(device=0)
 parameters = []
 for param in lstm.parameters():
     parameters.append(param)
 for param in crf.parameters():
     parameters.append(param)
 optimizer = optim.RMSprop(parameters, lr=learn_rate)
 # optimizer=optim.Adam(parameters, lr=learn_rate)
 # optimizer=optim.Adagrad(parameters, lr=learn_rate)
 # optimizer=optim.SGD(parameters, lr=learn_rate)
 ######## Training and testing ##############
 distant_index = list(range(distant_count))
 dev_index = list(range(dev_count))
 max_f_dev = 0.0
 for epoch in range(epoch_num):
Example #16
class BertLstmCrf(BertModel):
    """On the outputs of Bert there is a LSTM layer.
    On top of the LSTM there is a  CRF layer.
    """
    def __init__(self, config, pad_idx, sep_idx, lstm_hidden_dim,
                 num_lstm_layers, bidirectional, num_labels):
        super(BertLstmCrf, self).__init__(config)
        self.dropout_prob = config.hidden_dropout_prob
        self.pad_idx = pad_idx
        self.sep_idx = sep_idx  # needed by create_mask_for_crf below
        self.lstm_hidden_dim = lstm_hidden_dim
        self.num_lstm_layers = num_lstm_layers
        self.bidirectional = bidirectional
        self.num_labels = num_labels

        self.bert = BertModel(config)

        if self.num_lstm_layers > 1:
            self.lstm = nn.LSTM(input_size=config.hidden_size,
                                hidden_size=self.lstm_hidden_dim,
                                num_layers=self.num_lstm_layers,
                                bidirectional=self.bidirectional,
                                dropout=self.dropout_prob,
                                batch_first=True)
        else:
            self.lstm = nn.LSTM(input_size=config.hidden_size,
                                hidden_size=self.lstm_hidden_dim,
                                num_layers=self.num_lstm_layers,
                                bidirectional=self.bidirectional,
                                batch_first=True)
        if self.bidirectional is True:
            self.linear = nn.Linear(self.lstm_hidden_dim * 2, self.num_labels)
        else:
            self.linear = nn.Linear(self.lstm_hidden_dim, self.num_labels)

        self.crf_layer = CRF(self.num_labels, batch_first=True)
        self.dropout_layer = nn.Dropout(self.dropout_prob)

        self.init_weights()

    def create_mask_for_crf(self, inp):
        """Creates a mask for feeding to the CRF layer,
        masking out [PAD] and [SEP] token positions.

        Args:
            inp (TYPE): input given to the bert layer
        """
        mask = (inp != self.pad_idx) & (inp != self.sep_idx)
        # mask = [batch_size, seq_len]

        return mask

    def forward(self,
                input_ids,
                attention_mask=None,
                token_type_ids=None,
                position_ids=None,
                head_mask=None,
                labels=None):
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask)
        sequence_output = outputs[0]

        lstm_out, (hidden, cell) = self.lstm(sequence_output)
        logits = self.linear(self.dropout_layer(lstm_out))

        # removing cls token
        logits = logits[:, 1:, :]
        if labels is not None:
            labels = labels[:, 1:]
        input_ids = input_ids[:, 1:]

        # creating mask for crf
        mask = self.create_mask_for_crf(input_ids)

        # crf part
        if labels is not None:
            loss = self.crf_layer(logits, labels, mask=mask) * torch.tensor(
                -1, device=self.device)
        else:
            loss = None

        out = self.crf_layer.decode(logits)
        out = torch.tensor(out, dtype=torch.long, device=self.device)
        # out = [batch_Size, seq_len]
        return out, labels, loss
Example #17
                                      w_tag_pad=w_padding,
                                      t_tag_pad=len(id2label)),
                            model,
                            SimpleLossCompute(criterion, optimizer, scheduler),
                            train=False,
                            id2label=id2label)
        print('Loss:', loss)
        testResult.append(f)
    valBest = max(valResult)
    print('ValBest epoch:',
          [i for i, j in enumerate(valResult) if j == valBest])
    testBest = max(testResult)
    print('TestBest epoch:',
          [i for i, j in enumerate(testResult) if j == testBest])


trainSents = preProcess(readData('dps/swbd/train'))
valSents = preProcess(readData('dps/swbd/val'))
testSents = preProcess(readData('dps/swbd/test'))
label2id, id2label = build_vocab(trainSents)
print(id2label)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased',
                                          do_lower_case=False)
trainData = idData(tokenizer, trainSents, label2id)
valData = idData(tokenizer, valSents, label2id)
testData = idData(tokenizer, testSents, label2id)
encoder = Encoder(len(id2label)).to(device)
criterion = CRF(len(id2label), batch_first=True).to(device)

run(EPOCH, encoder, BATCH_SIZE, trainData, valData, testData, id2label,
    tokenizer._convert_token_to_id('[PAD]'), criterion)
Example #18
class BertCrfForNER(BertModel):
    """
    This class inherits functionality from huggingface BertModel.
    It applies a crf layer on the Bert outputs.
    """
    def __init__(self, config, pad_idx, sep_idx, num_labels):
        """Inititalization

        Args:
            config (TYPE): model config flie (similar to bert_config.json)
            num_labels : total number of layers using the bio format
            pad_idx (TYPE): pad_idx of the tokenizer
            device (TYPE): torch.device()
        """
        super(BertCrfForNER, self).__init__(config)
        self.num_labels = num_labels
        self.pad_idx = pad_idx
        self.sep_idx = sep_idx

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.crf_layer = CRF(self.num_labels, batch_first=True)
        self.linear = nn.Linear(config.hidden_size, self.num_labels)
        self.init_weights()

    def create_mask_for_crf(self, inp):
        """Creates a mask for the feeding to crf layer.
           Mask <PAD> and <SEP> token positions
        Args:
            inp (TYPE): input given to bert layer
        """

        mask = (inp != self.pad_idx) & (inp != self.sep_idx)
        # mask = [seq_len, batch_size]

        return mask

    def forward(self,
                input_ids,
                attention_mask=None,
                token_type_ids=None,
                position_ids=None,
                head_mask=None,
                labels=None):
        """Forwar propagate.

        Args:
            input_ids (TYPE): bert input ids
            attention_mask (None, optional): attention mask for bert
            token_type_ids (None, optional): token type ids for bert
            position_ids (None, optional): position ids for bert
            head_mask (None, optional): head mask for bert
            labels (None, optional): labels required while training crf
        """
        # getting outputs from Bert
        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask)
        # taking tokens embeddings from the output
        sequence_output = outputs[0]
        # sequence_ouput = [batch_size, seq_len, hidden_size]

        logits = self.linear(sequence_output)
        # logits = [batch_size, seq_len, num_labels]

        # removing cls token
        logits = logits[:, 1:, :]
        if labels is not None:
            labels = labels[:,
                            1:]  # check whether labels include the cls token too or not
        input_ids = input_ids[:, 1:]

        mask = self.create_mask_for_crf(input_ids)
        if labels is not None:
            loss = self.crf_layer(logits, labels, mask=mask) * torch.tensor(
                -1, device=self.device)
        else:
            loss = None
        # this is the crf loss

        out = self.crf_layer.decode(logits)
        out = torch.tensor(out, dtype=torch.long, device=self.device)

        # out = [batch_size, seq_length]
        return out, labels, loss
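Examples #16 and #18 both drop the leading [CLS] position and then mask [PAD]/[SEP] before handing the emissions to the CRF. A standalone sketch of that masking convention on made-up token ids (101/102/0 are the usual BERT [CLS]/[SEP]/[PAD] ids, assumed here for illustration):

import torch

CLS_IDX, SEP_IDX, PAD_IDX = 101, 102, 0   # conventional BERT special-token ids (assumed)

input_ids = torch.tensor([[101, 7592, 2088,  102,    0,    0],
                          [101, 2023, 2003, 1037, 2742,  102]])

# Drop the [CLS] column, mirroring `logits = logits[:, 1:, :]` above.
trimmed = input_ids[:, 1:]

# Keep real tokens only: mask out [PAD] and [SEP], as in create_mask_for_crf.
mask = (trimmed != PAD_IDX) & (trimmed != SEP_IDX)
print(mask)
# tensor([[ True,  True, False, False, False],
#         [ True,  True,  True,  True, False]])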
Example #19
def make_crf(num_tags=5, batch_first=False):
    return CRF(num_tags, batch_first=batch_first)
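For the batch-first variant of the fixture, the only change is the tensor layout: (batch, seq_len, num_tags) instead of (seq_len, batch, num_tags). A brief sketch under the same toy-size assumptions, using the token_mean reduction seen in Example #6:

import torch
from torchcrf import CRF

def make_crf(num_tags=5, batch_first=False):   # same fixture as above
    return CRF(num_tags, batch_first=batch_first)

crf = make_crf(batch_first=True)
batch_size, seq_len = 3, 6
emissions = torch.randn(batch_size, seq_len, crf.num_tags)
tags = torch.randint(crf.num_tags, (batch_size, seq_len))

loss = -crf(emissions, tags, reduction="token_mean")  # per-token negative log-likelihood
paths = crf.decode(emissions)                         # len(paths) == batch_size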
Example #20
def transformation():  # Do an inference on a single batch of data
    data = None

    # 1) INPUT: convert Korean text input to NER code array
    if flask.request.content_type == 'text/plain':
        '''CHECK file locations'''
        model_config = Config(json_path="config.json")
        tok_path = "./tokenizer_78b3253a26.model"
        ptr_tokenizer = SentencepieceTokenizer(tok_path)

        with open("vocab.pkl", 'rb') as f:
            vocab = pickle.load(f)

        tokenizer = Tokenizer(vocab=vocab,
                              split_fn=ptr_tokenizer,
                              pad_fn=keras_pad_fn,
                              maxlen=model_config.maxlen)

        with open("ner_to_index.json", 'rb') as f:
            ner_to_index = json.load(f)
            index_to_ner = {v: k for k, v in ner_to_index.items()}

        decoder_from_res = DecoderFromNamedEntitySequence(
            tokenizer=tokenizer, index_to_ner=index_to_ner)
        '''
            Assuming request.data is a string: name of txt file
            > NER_OY_data.txt as an example
            > currently under /opt/program (product-tags)

            HERE:?
        '''
        f = flask.request.data.decode("utf-8")
        lines = f.splitlines(True)
        index = 0

        with open("NER_OY_result.txt", 'w', encoding='utf-8-sig') as w:
            for i in range(len(lines)):
                input_text = ''
                if i % 4 == 1:
                    input_text = lines[i][3:]
                    addInfo = lines[i + 1][3:]
                if input_text == '':
                    continue

                index += 1
                # print("\n## " + str(index) + "\n")

                list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids(
                    [input_text])
                x_input = torch.tensor(list_of_input_ids).long()
                # print(list_of_input_ids)
                # print(x_input)

                data = {"instances": list_of_input_ids}
                predictions = ScoringService.predict(data)

                # 2) OUTPUT: convert NER code to Korean text (FILE)
                emission = torch.tensor(predictions['predictions'])
                num_classes = len(ner_to_index)
                crf = CRF(num_tags=num_classes, batch_first=True)
                list_of_pred_ids = crf.decode(emission)

                input_token, list_of_ner_word, decoding_ner_sentence = decoder_from_res(
                    list_of_input_ids=list_of_input_ids,
                    list_of_pred_ids=list_of_pred_ids,
                    unkTokenList=False)
                unkTokenList = makeUNKTokenList(input_text, input_token)
                input_token, list_of_ner_word, decoding_ner_sentence = decoder_from_res(
                    list_of_input_ids=list_of_input_ids,
                    list_of_pred_ids=list_of_pred_ids,
                    unkTokenList=unkTokenList)

                w.write('## ' + str(index) + '\n')
                w.write(addInfo)
                w.write(str(list_of_ner_word) + '\n')
                w.write(str(decoding_ner_sentence[6:-5]) + '\n')
            '''RETURN a file: NER_OY_result.txt'''
        return flask.Response(response=open("NER_OY_result.txt", 'r'),
                              status=200,
                              mimetype='text/plain')
    else:
        return flask.Response(
            response='This predictor only supports TEXT data',
            status=415,
            mimetype='text/plain')
Example #21
    def __init__(self, args):
        super(Aspect_CS_GAT_BERT, self).__init__()
        self.args = args
        self.wembeddings = args.bert_model

        # POS-Tagging Embedding Layer
        self.pembeddings = EmbeddingLayer(embedding_size=(232, 232),
                                          dropout=args.posemb_dp,
                                          device=args.device)

        # Residual POS-Tagging Embedding
        self.res_posemb = EmbeddingLayer(
            embedding_size=(2 * args.lstm_hidden_size,
                            2 * args.lstm_hidden_size),
            dropout=None,
            device=args.device)

        # Bi-LSTM Encoder
        self.bilstm = DynamicLSTM(input_size=1000,
                                  hidden_size=args.lstm_hidden_size,
                                  num_layers=args.num_layers,
                                  dropout=args.bilstm_dp,
                                  bidirectional=True,
                                  device=args.device)

        # GCN
        self.gcns = nn.ModuleList()
        for i in range(args.gcn_num_layers):
            gcn = GraphConvolution(
                in_features=2 * args.lstm_hidden_size,
                out_features=2 * args.lstm_hidden_size,
                edge_types=args.edge_types_num,
                dropout=args.gcn_dp if i != args.gcn_num_layers - 1 else 0,
                use_bn=args.gcn_use_bn,
                device=args.device)
            self.gcns.append(gcn)

        # Highway
        if args.highway_use:
            self.hws = nn.ModuleList()
            for i in range(args.gcn_num_layers):
                hw = HighWay(size=2 * args.lstm_hidden_size,
                             dropout_ratio=args.gcn_dp)
                self.hws.append(hw)

        self.sa_output = BottledXavierLinear(
            in_features=4 * args.lstm_hidden_size,
            out_features=args.sa_classes).to(device=args.device)

        # CRF
        self.CRF_model = CRF(4, batch_first=True)

        if args.target_method == 'BIO':
            self.dt_output = nn.Linear(4 * args.lstm_hidden_size, 4)
        else:
            self.dt_output = nn.Linear(4 * args.lstm_hidden_size, 3)

        self.loss_func_sa = FocalLoss(alpha=0.5, num_classes=4)

        self.dropout_sa = nn.Dropout(0.5)  # 0.5
        self.dropout_dt = nn.Dropout(0.35)  # 0.2 0.35
class BERT_LSTM_CRF2(MODEL_TEMP):
    def __init__(self, config={}, show_param=False):
        '''
        :param - dict
            *param['embedding_dim']
            *param['hidden_dim']
            param['n_ent_tags']
            param['n_rel_tags']
            param['n_rels']
            param['n_words']
            *param['start_ent_idx']  int, <start> tag index for entity tag seq
            *param['end_ent_idx']   int, <end> tag index for entity tag seq
            *param['start_rel_idx']  int, <start> tag index for relation tag seq
            *param['end_rel_idx']   int, <end> tag index for relation tag seq
            param['use_cuda']
            param['dropout_prob']
            param['lstm_layer_num']
        '''
        super(BERT_LSTM_CRF2, self).__init__()
        self.config = config
        self.embedding_dim = self.config.get('embedding_dim', 768)
        self.hidden_dim = self.config.get('hidden_dim', 64)  ##TODO: 128*2
        assert self.hidden_dim % 2 == 0, 'hidden_dim for BLSTM must be even'

        self.n_tags = self.config['n_rel_tags'] - 2
        # self.n_words = self.config.get('n_words', 10000)

        self.dropout_prob = self.config.get('dropout_prob', 0)
        self.lstm_layer_num = self.config.get('lstm_layer_num', 1)

        self.use_cuda = self.config.get('use_cuda', 0)
        self.model_type = 'BERT_LSTM_CRF2'

        self.build_model()
        self.reset_parameters()
        if show_param:
            self.show_model_param()

    def show_model_param(self):
        log('='*80, 0)
        log(f'model_type: {self.model_type}', 1)
        log(f'use_cuda: {self.use_cuda}', 1)
        log(f'embedding_dim: {self.embedding_dim}', 1)
        log(f'hidden_dim: {self.hidden_dim}', 1)
        log(f'n_rel_tags: {self.n_tags}', 1)
        # log(f"crf_start_idx: {self.config['start_ent_idx']}", 1)
        # log(f"crf_end_idx: {self.config['end_ent_idx']}", 1)
        log(f'lstm_layer_num: {self.lstm_layer_num}', 1)
        log(f'dropout_prob: {self.dropout_prob}', 1)  
        log('='*80, 0)      

    def build_model(self):
        '''
        build the bert layer, lstm layer and CRF layer
        '''
        # self.word_embeds = nn.Embedding(self.n_words, self.embedding_dim)
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim//2, batch_first=True, num_layers=self.lstm_layer_num, dropout=self.dropout_prob, bidirectional=True)
        self.hidden2tag = nn.Linear(self.hidden_dim, self.n_tags)

        self.crf = CRF(self.n_tags, batch_first=True)
        self.bert = transformers.BertModel.from_pretrained('bert-base-chinese')

    def reset_parameters(self):        
        # I.xavier_normal_(self.word_embeds.weight.data)
        self.lstm.reset_parameters()
        # stdv = 1.0 / math.sqrt(self.hidden_dim)
        # for weight in self.lstm.parameters():
        #     I.uniform_(weight, -stdv, stdv)
        I.xavier_normal_(self.hidden2tag.weight.data)
        self.crf.reset_parameters()
        
    def _get_lstm_features(self, x, lens, use_cuda=None):
        '''
        TODO: add proper handling of sentence lengths
        :param
            @x: indexed words; each character is mapped to its dictionary index, (batch_size, T), np.array
            @lens: actual length of each sentence
        :return
            @lstm_feature: (batch_size, T, n_tags) -- emission-like scores, torch.tensor
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        batch_size, T = x.shape

        words_tensor = self._to_tensor(x, use_cuda)  #(batch_size, T)
        lens = self._to_tensor(lens, use_cuda)
        att_mask = self._generate_mask(lens, max_len=T)
        embeds = self.bert(words_tensor, attention_mask=att_mask)[0]  #(batch_size, T, n_embed)
        
        ##LSTM layer
        if use_cuda:
            h_0 = torch.randn(2*self.lstm_layer_num, batch_size, self.hidden_dim//2).cuda()  #(n_layer*n_dir, N, n_hid)
            c_0 = torch.randn(2*self.lstm_layer_num, batch_size, self.hidden_dim//2).cuda()
        else:
            h_0 = torch.randn(2*self.lstm_layer_num, batch_size, self.hidden_dim//2)
            c_0 = torch.randn(2*self.lstm_layer_num, batch_size, self.hidden_dim//2)
        # c_0 = h_0.clone()
        hidden = (h_0, c_0)
        lstm_out, _hidden = self.lstm(embeds, hidden)   #(batch_size, T, n_dir*n_hid), (h, c)

        ##FC layer
        lstm_feature = self.hidden2tag(lstm_out) #(batch_size, T, n_tags)
        lstm_feature = torch.tanh(lstm_feature)
        return lstm_feature

    def _loss(self, x, y_rel, lens, use_cuda=None):
        '''
        loss function: negative log-likelihood
        :param
            @x: indexed words; each character is mapped to its dictionary index, (batch_size, T), np.array
            @y_rel: (batch_size, T), np.array, indexed rel_with_ent tag sequence, character level
            @lens: (batch_size), list, actual length of each sentence
        :return
            @loss: scalar torch.tensor, mean-reduced negative log-likelihood
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        T = x.shape[1]

        logits = self._get_lstm_features(x, lens)   ##(batch_size, T, n_tags)
        tensor_y_rel = self._to_tensor(y_rel, use_cuda)

        lens = self._to_tensor(lens, use_cuda)
        len_mask = self._generate_mask(lens, max_len=T)  ##(batch_size, T)

        log_likelihood_ent = self.crf(emissions=logits, tags=tensor_y_rel, mask=len_mask, reduction='mean')
        return - log_likelihood_ent

    def _output(self, x, lens, use_cuda=None):
        '''
        Return the CRF-decoded tag paths
        :param
            @x: word indices, each character mapped to its index via the vocabulary, (batch_size, T), np.array
            @lens: (batch_size), list, actual length of each sentence
        :return 
            @paths: (batch_size, T), torch.tensor, best tag path for each sentence
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        T = x.shape[1]
        logits = self._get_lstm_features(x, lens, use_cuda)

        lens = self._to_tensor(lens, use_cuda)
        len_mask = self._generate_mask(lens, max_len=T)  ##(batch_size, T)
    
        # Decode without the mask so every path has length T and the results can be
        # stacked into a single (batch_size, T) tensor; a masked decode would return
        # ragged per-sentence lists.
        paths = self.crf.decode(logits)
        paths = self._to_tensor(paths, use_cuda)
        return paths
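
For reference, a minimal self-contained sketch of the length-mask plus CRF negative-log-likelihood pattern that _loss and _output use above; the tensor shapes, tag count and sentence lengths are made-up assumptions, and torchcrf's CRF is the only real dependency.

import torch
from torchcrf import CRF

batch_size, T, n_tags = 2, 5, 4
logits = torch.randn(batch_size, T, n_tags)       # emission scores, e.g. from BERT + LSTM + Linear
tags = torch.randint(0, n_tags, (batch_size, T))  # gold tag indices
lens = torch.tensor([5, 3])                       # true sentence lengths (illustrative)

# Boolean mask of shape (batch_size, T): True for real tokens, False for padding.
mask = torch.arange(T).unsqueeze(0) < lens.unsqueeze(1)

crf = CRF(n_tags, batch_first=True)
loss = -crf(emissions=logits, tags=tags, mask=mask, reduction='mean')  # mean negative log-likelihood
paths = crf.decode(logits, mask=mask)             # best tag path per sentence (ragged Python lists)
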
Example #23
class GPT2LSTMLogRegCRF(nn.Module):
    def __init__(self, freeze_bert, tokenizer, device, bidirectional,
                 class_weights):
        super(GPT2LSTMLogRegCRF, self).__init__()
        # Instantiate the GPT-2 backbone
        self.gpt2_layer = GPT2Model.from_pretrained('gpt2',
                                                    output_hidden_states=True,
                                                    output_attentions=False)

        # If freeze_bert is True, freeze the GPT-2 weights
        if freeze_bert:
            for p in self.gpt2_layer.parameters():
                p.requires_grad = False

        self.tokenizer = tokenizer
        self.device = device
        self.bidirectional = bidirectional

        self.dropout = nn.Dropout(0.5)

        # lstm layer
        self.lstm_layer = nn.LSTM(input_size=768,
                                  hidden_size=512,
                                  num_layers=1,
                                  bidirectional=bidirectional,
                                  batch_first=True)

        # log reg
        if bidirectional == True:
            self.hidden2tag = nn.Linear(1024, clf_P_fine_num_labels)
        else:
            self.hidden2tag = nn.Linear(512, clf_P_fine_num_labels)

        # crf
        self.crf_layer = CRF(clf_P_fine_num_labels, batch_first=True)

    def forward(self, input_ids=None, attention_mask=None, labels=None):

        # GPT-2 backbone
        outputs = self.gpt2_layer(input_ids, attention_mask=attention_mask)

        # outputs[0]: last hidden state, (batch, seq_len, 768)
        # outputs[2]: tuple of all hidden states (embedding layer + 12 blocks),
        #             each of shape (batch, seq_len, 768)
        hidden_states = outputs[2]

        # Average the last four hidden layers to get per-token features.
        num_layer_sum = 4
        summed_last_4_layers = torch.stack(
            hidden_states[-num_layer_sum:]).mean(0)

        summed_last_4_layers = self.dropout(
            summed_last_4_layers)  # newly added dropout

        # lstm with masks (same as attention masks)
        packed_input, perm_idx, seq_lengths = get_packed_padded_output(
            summed_last_4_layers, input_ids, attention_mask, self.tokenizer)
        packed_output, (ht, ct) = self.lstm_layer(packed_input)

        # Unpack and reorder the output
        output, input_sizes = pad_packed_sequence(packed_output,
                                                  batch_first=True)
        _, unperm_idx = perm_idx.sort(0)
        lstm_output = output[
            unperm_idx]  # lstm_output.shape = shorter than the padded torch.Size([6, 388, 512])
        seq_lengths_ordered = seq_lengths[unperm_idx]

        # shorten the labels as per the batchsize
        labels = labels[:, :lstm_output.shape[1]]

        # mask out padding (GPT-2 eos/pad id 50256) and ignored label positions before the linear layer
        mask = ((input_ids[:, :lstm_output.shape[1]] != 50256)
                & (labels != 100))

        # on the first time steps
        for eachIndex in range(mask.shape[0]):
            mask[eachIndex, 0] = True

        mask_expanded = mask.unsqueeze(-1).expand(lstm_output.size())
        lstm_output *= mask_expanded.float()
        labels *= mask.long()

        # linear layer producing per-token emission scores for the CRF
        probabilities = self.hidden2tag(lstm_output)

        # CRF negative log-likelihood (the CRF forward returns the log-likelihood, hence the minus sign)
        loss = -self.crf_layer(probabilities, labels, reduction='token_mean')

        # Viterbi-decoded tag sequences, one list per sentence
        decoded = self.crf_layer.decode(probabilities)

        return loss, torch.Tensor(decoded), labels, mask
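
The layer-averaging step above can be reproduced outside the class; a rough, standalone sketch using the Hugging Face transformers GPT-2 checkpoint (the sentence and the choice of four layers are arbitrary):

import torch
from transformers import GPT2Model, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
backbone = GPT2Model.from_pretrained('gpt2', output_hidden_states=True)

enc = tokenizer("CRF layers need per-token emission scores", return_tensors='pt')
with torch.no_grad():
    out = backbone(**enc)

# out.hidden_states is a tuple of 13 tensors (embedding layer + 12 blocks),
# each of shape (batch, seq_len, 768); average the last four of them.
token_features = torch.stack(out.hidden_states[-4:]).mean(0)
print(token_features.shape)  # (1, seq_len, 768)
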
Example #24
    def __init__(self, config: RobertaConfig, args: Any,
                 intent_label_dict: Dict[str, List[str]],
                 slot_label_dict: Dict[str, List[str]],
                 pos_label_lst: List[str], tasks: List[str]) -> None:
        super(JointRoberta, self).__init__(config)
        self.args = args

        self.tasks = tasks
        self.intent_label_dict = intent_label_dict
        self.slot_label_dict = slot_label_dict
        self.pos_label_lst = pos_label_lst

        self.num_intent_labels_dict = {
            k: len(v)
            for (k, v) in intent_label_dict.items()
        }
        self.num_slot_labels_dict = {
            k: len(v)
            for (k, v) in slot_label_dict.items()
        }

        self.intent_classifiers = {}
        self.slot_classifiers = {}
        self.crfs = {}

        self.num_pos_labels = len(pos_label_lst)
        self.num_np_labels = 1  # len(np_label_lst)
        self.num_vp_labels = 1  # len(vp_label_lst)
        self.num_entity_labels = 1  # len(entity_label_lst)
        self.num_acronym_labels = 1  # len(acronym_label_lst)

        self.roberta = RobertaModel(config=config)  # RoBERTa encoder, built from config (pretrained weights loaded separately)

        hidden_size = config.hidden_size

        # TODO pos_emb = 50 should be an input variable
        if args.use_pos:
            pos_dim = 50
            hidden_size += pos_dim
            self.pos_emb = (nn.Embedding(self.num_pos_labels, pos_dim)
                            if pos_dim > 0 else None)
        if args.use_np:
            hidden_size += self.num_np_labels
        if args.use_vp:
            hidden_size += self.num_vp_labels
        if args.use_entity:
            hidden_size += self.num_entity_labels
        if args.use_acronym:
            hidden_size += self.num_acronym_labels

        self.custom_pooler = Pooler(hidden_size=hidden_size)
        for pred_type in self.tasks:
            self.intent_classifiers[pred_type] = IntentClassifier(
                hidden_size, self.num_intent_labels_dict[pred_type],
                args.dropout_rate)

        for pred_type in self.tasks:
            self.slot_classifiers[pred_type] = SlotClassifier(
                hidden_size, self.num_slot_labels_dict[pred_type],
                args.dropout_rate)

            if args.use_crf:
                self.crfs[pred_type] = CRF(
                    num_tags=self.num_slot_labels_dict[pred_type],
                    batch_first=True)

        self.intent_classifiers = nn.ModuleDict(self.intent_classifiers)
        self.slot_classifiers = nn.ModuleDict(self.slot_classifiers)
        self.crfs = nn.ModuleDict(self.crfs)
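
A compressed sketch of the ModuleDict pattern used above: per-task CRFs are first collected in a plain dict and then wrapped in nn.ModuleDict so their parameters are registered with the module and picked up by the optimizer. The task names and tag counts here are invented.

import torch.nn as nn
from torchcrf import CRF

class PerTaskCRFs(nn.Module):
    def __init__(self, num_slot_labels_dict):
        super().__init__()
        crfs = {task: CRF(num_tags=n, batch_first=True)
                for task, n in num_slot_labels_dict.items()}
        # Without ModuleDict the CRF parameters would be invisible to .parameters()
        self.crfs = nn.ModuleDict(crfs)

module = PerTaskCRFs({"task_a": 5, "task_b": 9})
print(sum(p.numel() for p in module.parameters()))  # transition parameters from both CRFs
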
Example #25
    def test_nonpositive_num_tags(self):
        with pytest.raises(ValueError) as excinfo:
            CRF(0)
        assert 'invalid number of tags: 0' in str(excinfo.value)
Example #26
class CRFEvaluateWorkflow(object):
    def __init__(self):
        self.__MODE = "CRF_EVALUATE"
        self.__METRIC = "loss_avg"

        # Validation parameters
        self.batch_size = None
        self.num_workers = None
        self.seed = None
        self.model_config = None
        self.ckpt_dir = None

        # Data
        self.dataset = None
        self.prediction_dir = None
        self.dtype = None
        self.class_weight = None

        # Save
        self.output_dir = None
        self.experiment_name = None
        self.result_dir = None
        self.tmp_dir = None

        ######## Configs ########
        self.args = None

    def run(self):
        # Set up logger and print out configurations
        self.model_dir = init_model_dir(self.output_dir, self.experiment_name)
        self.logger = set_logger(
            self.model_dir, self.experiment_name, self.__MODE, self.dtype
        )
        display_args(self.args, self.logger)

        # Set up GPUs
        if not torch.cuda.is_available():
            raise Exception("No GPU found.")
        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)
        self.device = torch.device("cuda")
        self.logger.info(
            "Use {} GPU(s) for training".format(torch.cuda.device_count())
        )

        # Initialize model.
        self.load_model()
        self.logger.info("MODEL ARCHITECTURE:\n{}".format(self.model))

        # Create evaluation result folders.
        self.initialize_result_directories()

        # Start evaluation.
        self.logger.info("Start evaluating CRF classifier.")
        try:
            self.crf_evaluate()
            self.logger.info("CRF classifier evaluation finished.")
        except KeyboardInterrupt:
            self.logger.warning("Evaluation interrupted. Program exit.")

    ########################
    # Multiprocessing : main thread
    ########################
    def crf_evaluate(self):
        """Evaluate all checkpoints for a trained CRF classifier."""
        ckpt_tracker = CheckpointTracker(os.path.join(self.ckpt_path, "*.ckpt"))
        self.time_tracker = TimeTracker()

        # Initialize validation dataset
        params = {
            "batch_size": self.batch_size,
            "shuffle": False,
            "num_workers": self.num_workers,
            "collate_fn": CRF_collate_samples,
        }
        file_path, _ = merge_predictions(
            self.prediction_dir,
            self.logger,
            self.tmp_dir,
            target_path=self.dataset,
        )
        evaluate_dset = CRFDataset(file_path, self.class_weight)
        evaluate_iter = DataLoader(evaluate_dset, **params)
        num_batches = len(evaluate_iter)
        self.logger.info(
            "{:,} samples used for evaluation.".format(evaluate_dset.__len__())
        )

        # data preprocess worker
        preprocess_queue = mp.JoinableQueue(maxsize=128)
        preprocess_worker = mp.Process(
            name="preprocess",
            target=self.preprocess,
            args=(preprocess_queue, evaluate_iter),
        )
        preprocess_worker.start()
        self.logger.info("CRF evaluation data workder started")

        # Evaluate all checkpoints.
        while len(ckpt_tracker.remaining) > 0:
            for ckpt in ckpt_tracker.remaining:
                self.evaluate_checkpoint(ckpt, preprocess_queue, num_batches)
                ckpt_tracker.add_evaluated(ckpt)
            ckpt_tracker.reset_params()

        # Terminate data worker.
        preprocess_worker.terminate()

    def evaluate_checkpoint(self, checkpoint, preprocess_queue, num_batches):
        """Evaluate one checkpoint of a trained CRF classifier."""
        # Load checkpoint
        step = self.load_checkpoint(checkpoint)
        self.logger.info("Evaluating CRF step {}".format(step))

        # Return if the checkpoint is already evaluated.
        eval_path = os.path.join(
            self.eval_path, "{}_{}.json".format(self.experiment_name, step)
        )
        if os.path.exists(eval_path):
            self.logger.info("Step {} already evaluated".format(step))
            return

        # Initialize evaluation worker.
        evaluate_queue = mp.JoinableQueue(maxsize=64)
        evaluate_worker = mp.Process(
            name="evaluate_{}".format(step),
            target=self.evaluate,
            args=(checkpoint, evaluate_queue, eval_path, step, num_batches),
        )
        evaluate_worker.start()

        # Evaluate checkpoint.
        self.model.eval()
        with torch.no_grad():
            for b in tqdm(range(num_batches)):
                dset = preprocess_queue.get()
                feature, target = CRF_push_to_device(dset, self.device)
                loss = -self.model(feature, target, reduction="mean")
                evaluate_queue.put(loss.item())
        evaluate_queue.join()

    ########################
    # Multiprocessing : workers
    ########################
    # Preprocess worker
    def preprocess(self, queue, dataloader):
        """Set up multiprocessing data queue"""
        while True:
            for dset in dataloader:
                queue.put(dset)

    # Evaluate worker
    def evaluate(self, ckpt, queue, eval_path, step, num_batches):
        """Evaluate checkpoint and save evaluation to disk."""
        self.loss_avg = AverageMeter()
        for batch in range(num_batches):
            loss = queue.get()
            queue.task_done()
            self.loss_avg.update(loss)
        self.display_result(step)

        # Save evaluation
        result = {
            "step": step,
            "loss_avg": self.loss_avg.avg,
        }
        with open(eval_path, "w") as outfile:
            json.dump(result, outfile, indent=4)

        # Update best checkpoint
        self.update_best_ckpt(result, ckpt)

    ############################
    # Display and save evaluations
    ############################
    def display_result(self, step):
        """Display average evaluation loss."""
        self.logger.info(
            "EVALUATE CRF | step {:8d} | avg loss {:8.4f} "
            "| time elapse: {:>12} |".format(
                step, self.loss_avg.avg, self.time_tracker.elapse()
            )
        )

    def update_best_ckpt(self, result, checkpoint):
        """Update best checkpoint metrics."""
        result["ckpt_path"] = checkpoint
        result["metric"] = self.__METRIC
        path = os.path.join(
            self.best_ckpt_path, "best_{}.json".format(self.__METRIC)
        )

        if os.path.exists(path):
            with open(path, "r") as infile:
                metrics = json.load(infile)
            if metrics[self.__METRIC] <= result[self.__METRIC]:
                return
        with open(path, "w") as outfile:
            json.dump(result, outfile, indent=4)

    def initialize_result_directories(self):
        """Initialize output evaluation and checkpoint directories."""
        if self.result_dir is not None:
            self.eval_path = self.result_dir
        else:
            self.eval_path = os.path.join(
                self.model_dir,
                "{}_{}".format(self.dtype.lower(), self.__MODE.lower()),
            )
        create_dirs(self.eval_path, logger=self.logger)
        self.best_ckpt_path = os.path.join(self.eval_path, "best_checkpoint")
        create_dirs(self.best_ckpt_path, logger=self.logger)

        if self.ckpt_dir is not None:
            self.ckpt_path = self.ckpt_dir
        else:
            self.ckpt_path = os.path.join(self.model_dir, "crf_checkpoints")

    ############################
    # Loading model and checkpoint
    ############################
    def load_model(self):
        """Load args from .config file and initialize CRF classifier."""
        # Load model params
        if self.model_config is not None:
            path = self.model_config
        else:
            path = os.path.join(
                self.model_dir, "{}_crf.config".format(self.experiment_name)
            )
        params = torch.load(path, map_location="cpu")

        # Initialize classifier
        self.model = CRF(params["output_size"], batch_first=True)
        self.model.to(self.device)

    def load_checkpoint(self, ckpt_path):
        """Load CRF checkpoint state_dict."""
        ckpt_params = torch.load(ckpt_path, map_location=self.device)
        self.model.load_state_dict(ckpt_params["state_dict"])
        return ckpt_params["step"]
Example #27
        train_list.append(train)

    test = copy.copy(whole_corpus).set_filter(test_ids)
    test_list.append(test)

if args.mode != 'eval':
    training_data = ConcatDataset(train_list)

testing_data = ConcatDataset(test_list)

print('----------------------------------')
end_loading = time.time()
print("Loading done:", end_loading - start_loading)
time_record['Load'] = end_loading - start_loading

model = CRF(len(valid_types), batch_first=True).to(device)

####################
# Training
####################
if args.mode == 'train':

    classifier.load_state_dict(
        torch.load(join(pre_trained_sherlock_loc, pre_trained_sherlock_path),
                   map_location=device))

    # Set initial transition parameters
    if init_transition is not None:
        model.transitions = torch.nn.Parameter(
            torch.tensor(init_transition).float().to(device))
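
Seeding the transition matrix as above can be shown in isolation; a hedged sketch with a made-up tag count and values (torchcrf exposes transitions, start_transitions and end_transitions as learnable parameters):

import torch
import torch.nn as nn
from torchcrf import CRF

num_tags = 3
crf = CRF(num_tags, batch_first=True)

init_transition = torch.zeros(num_tags, num_tags)
init_transition[0, 2] = -10.0  # e.g. heavily penalise one disallowed tag transition
crf.transitions = nn.Parameter(init_transition)
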
Example #28
class BERT_CRF(nn.Module):
    """
    Based on the official PyTorch tutorial <https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html>.
    The official version runs on CPU; to run on GPU, every separately created tensor
    must be moved onto the GPU with .to(device).
    """

    def __init__(self, tag_to_ix, mask=False):
        super(BERT_CRF, self).__init__()
        self.hidden_dim = 768  # dimensionality of BERT's final layer = 768
        self.mask = mask
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.hidden2tag = nn.Linear(self.hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def _get_sentence_features(self, sentences):
        """
        Features are extracted with BERT beforehand; for a uniform interface the
        input is returned unchanged, [time_step, 768].
        :param sentences:
        :return:
        """
        if self.mask:
            # a token is real if any of its 768 feature dimensions is non-zero
            mask_idx = (sentences != 0).sum(dim=2) > 0
            self.mask_idx = mask_idx

        return sentences

    def _get_sentence_feats(self, features):
        feats = self.hidden2tag(features)

        return feats

    def neg_log_likelihood(self, sentences, tags):
        """
        Loss = (log-sum of scores over all tag sequences) - (score of the gold sequence),
        i.e. the CRF negative log-likelihood.
        :param sentences:
        :param tags:
        :return:
        """
        features = self._get_sentence_features(sentences)
        feats = self._get_sentence_feats(features)
        if self.mask:
            loss = -self.crf(feats, tags, self.mask_idx, reduction='mean')
        else:
            loss = -self.crf(feats, tags, reduction='mean')

        return loss

    def _viterbi_decode(self, batch_feats):
        """
        Viterbi search for the highest-scoring tag sequence; used at inference time.
        :param batch_feats:
        :return:
        """
        best_path = self.crf.decode(batch_feats)
        return best_path

    def forward(self, sentences):
        """
        Forward pass: features -> emission scores -> Viterbi-decoded tags.
        :param sentences:
        :return:
        """
        features = self._get_sentence_features(sentences)
        feats = self._get_sentence_feats(features)
        tags = self._viterbi_decode(feats)
        return tags
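
A short usage sketch for the class above, assuming the BERT features have already been extracted; the tag map, batch size and sequence length are arbitrary.

import torch

tag_to_ix = {"O": 0, "B": 1, "I": 2}
model = BERT_CRF(tag_to_ix)

features = torch.randn(2, 6, 768)                # pre-extracted BERT features
tags = torch.randint(0, len(tag_to_ix), (2, 6))  # gold tag indices

loss = model.neg_log_likelihood(features, tags)  # CRF negative log-likelihood
loss.backward()
pred_paths = model(features)                     # Viterbi-decoded tag ids per sentence
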
Example #29
class EnsembleCRFModel:
    def __init__(self,
                 model_path_list,
                 bert_dir_list,
                 num_tags,
                 device,
                 lamb=1 / 3):

        self.models = []
        self.crf_module = CRF(num_tags=num_tags, batch_first=True)
        self.lamb = lamb

        for idx, _path in enumerate(model_path_list):
            print(f'Load model from {_path}')

            print(f'Load model type: {bert_dir_list[0]}')
            model = CRFModel(bert_dir=bert_dir_list[0], num_tags=num_tags)

            model.load_state_dict(
                torch.load(_path, map_location=torch.device('cpu')))

            model.eval()
            model.to(device)

            self.models.append(model)
            if idx == 0:
                print(f'Load CRF weight from {_path}')
                self.crf_module.load_state_dict(model.crf_module.state_dict())
                self.crf_module.to(device)

    def weight(self, t):
        """
        Newton's-law-of-cooling weighting for model fusion: weight = exp(-lamb * t).
        """
        return math.exp(-self.lamb * t)

    def predict(self, model_inputs):
        weight_sum = 0.
        logits = None
        attention_masks = model_inputs['attention_masks']

        for idx, model in enumerate(self.models):
            # Newton's-cooling weighting: earlier models get larger weights
            weight = self.weight(idx)

            # Alternative: plain probability averaging
            # weight = 1 / len(self.models)

            tmp_logits = model(**model_inputs)[1] * weight
            weight_sum += weight

            if logits is None:
                logits = tmp_logits
            else:
                logits += tmp_logits

        logits = logits / weight_sum

        tokens_out = self.crf_module.decode(emissions=logits,
                                            mask=attention_masks.byte())

        return tokens_out

    def vote_entities(self, model_inputs, sent, id2ent, threshold):
        entities_ls = []
        for idx, model in enumerate(self.models):
            tmp_tokens = model(**model_inputs)[0][0]
            tmp_entities = crf_decode(tmp_tokens, sent, id2ent)
            entities_ls.append(tmp_entities)

        return vote(entities_ls, threshold)
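
For intuition, the Newton's-cooling weights produced by the default lamb = 1/3 look like this (a quick numeric illustration, not part of the original code):

import math

lamb = 1 / 3
weights = [math.exp(-lamb * t) for t in range(3)]  # ~[1.000, 0.717, 0.513]
total = sum(weights)
normalized = [round(w / total, 3) for w in weights]
print(normalized)  # earlier (presumably stronger) models get larger weights
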
Example #30
class BertTagger_with_LSTMCRF(nn.Module):
    def __init__(self, args, model):  # a pre-built backbone model is passed in
        super(BertTagger_with_LSTMCRF, self).__init__()
        self.embedding = model.embedding
        self.encoder = model.encoder
        self.target = model.target
        self.args = args
        self.need_birnn = args.need_birnn
        self.labels_num = args.labels_num
        out_dim = args.hidden_size

        # If need_birnn is False, skip the BiLSTM layer
        if self.need_birnn:
            self.birnn = nn.LSTM(args.hidden_size, args.rnn_dim, num_layers=1, bidirectional=True, batch_first=True)
            out_dim = args.rnn_dim * 2

        self.output_layer = nn.Linear(out_dim, self.labels_num)
        self.dropout = nn.Dropout(args.dropout)

        self.crf = CRF(args.labels_num, batch_first=True)



    def forward(self, src, label, mask, pos=None, vm=None):
        """
        Args:
            src: [batch_size x seq_length]
            label: [batch_size x seq_length]
            mask: [batch_size x seq_length]
        Returns:
            loss: Sequence labeling loss.
            correct: Number of labels that are predicted correctly.
            predict: Predicted label.
            label: Gold label.
        example:
            src size: torch.Size([8, 128])
            output size: torch.Size([8, 128, 768])
            output size: torch.Size([8, 128, 256])
            output size: torch.Size([8, 128, 256])
            output size: torch.Size([8, 128, 15])
            output size: torch.Size([8, 128])
            output size: torch.Size([1024, 1])
            label size: torch.Size([1024, 1])
            label size: torch.Size([1024])

        """
        # Embedding.
        emb = self.embedding(src, mask, pos)
        # Encoder.
        output = self.encoder(emb, mask, vm)
        if self.need_birnn:
            output, _ = self.birnn(output)

        # Target.
        output = self.dropout(output)

        output = self.output_layer(output)

        loss = -1 * self.crf(output, label, mask=mask.byte())
        output = torch.LongTensor(np.array(self.crf.decode(output))).to(self.args.device)

        output = output.contiguous().view(-1, 1)

        label = label.contiguous().view(-1, 1)

        label_mask = (label > 0).float().to(torch.device(label.device))



        label_mask = label_mask.contiguous().view(-1)
        label = label.contiguous().view(-1)
        predict = output.contiguous().view(-1)
        correct = torch.sum(
            label_mask * (predict.eq(label)).float()
        )

        return loss, correct, predict, label