Example #1
 def __init__(self,
              embedding_dim,
              hidden_dim,
              vocab_size,
              tagset_size,
              char_size,
              pretrained_weight_embeddings,
              USE_CRF,
              BIDIRECTIONAL=False,
              USE_BIGRAM=False,
              bigram_size=0,
              CNN=False,
              use_gpu=0):
     super(BiLSTM_CRF_CNN, self).__init__()
     # include start and end tags
     self.gpu = use_gpu
     self.bidirectional = BIDIRECTIONAL
     self.lstm_cnn = BILSTM_CNN(embedding_dim,
                                hidden_dim,
                                vocab_size,
                                tagset_size,
                                char_size,
                                pretrained_weight_embeddings,
                                USE_CRF=False,
                                BIDIRECTIONAL=self.bidirectional,
                                USE_BIGRAM=False,
                                bigram_size=0,
                                CNN=False,
                                use_gpu=0)
     self.crf = CRF(tagset_size, self.gpu)
Example #2
 def __init__(self):
     super(Model, self).__init__()
     self.bert = BertModel.from_pretrained(model_config.pretrain_model_path)
     self.dropout = nn.Dropout(self.bert.config.hidden_dropout_prob)
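     # linear tag classifier over BERT hidden states, followed by a CRF for sequence-level scoring/decoding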
     self.classifier = nn.Linear(self.bert.config.hidden_size,
                                 config.num_labels)
     self.crf = CRF(tagset_size=config.num_labels,
                    tag_dictionary=config.label2id,
                    is_bert=True)
Example #3
    def __init__(self, args, data):
        super(Token_Classification, self).__init__()

        self.pretrain_model = BertModel.from_pretrained(args.bert_file)
        self.dropout = nn.Dropout(args.hidden_dropout_prob)
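        # 768 matches BERT-base's hidden size; the +2 presumably reserves the CRF's start/stop tags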
        self.to_crf = nn.Linear(768, data.label_alphabet.size() + 2)
        self.crf = CRF(data.label_alphabet.size(), args.use_gpu,
                       args.average_batch)
        if args.use_gpu:
            self.to_crf = self.to_crf.cuda()
            self.pretrain_model = self.pretrain_model.cuda()
            self.gpu = True
        else:
            self.gpu = False
Example #4
class Token_Classification(nn.Module):
    def __init__(self, args, data):
        super(Token_Classification, self).__init__()

        self.pretrain_model = BertModel.from_pretrained(args.bert_file)
        self.dropout = nn.Dropout(args.hidden_dropout_prob)
        self.to_crf = nn.Linear(768, data.label_alphabet.size() + 2)
        self.crf = CRF(data.label_alphabet.size(), args.use_gpu,
                       args.average_batch)
        if args.use_gpu:
            self.to_crf = self.to_crf.cuda()
            self.pretrain_model = self.pretrain_model.cuda()
            self.gpu = True
        else:
            self.gpu = False

    def forward(self, input_ids, attention_mask, crf_mask, scopes):
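        # inference path: build span representations, then Viterbi-decode the most likely tag sequence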
        crf_input = self.get_crf_input(input_ids, attention_mask, scopes)
        _, best_path = self.crf._viterbi_decode(crf_input, crf_mask)
        return best_path

    def neg_log_likelihood(self, input_ids, attention_mask, batch_label,
                           crf_mask, scopes):
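        # training objective: CRF negative log-likelihood; the decoded path is returned alongside the loss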
        crf_input = self.get_crf_input(input_ids, attention_mask, scopes)
        total_loss = self.crf.neg_log_likelihood_loss(crf_input, crf_mask,
                                                      batch_label)
        _, best_path = self.crf._viterbi_decode(crf_input, crf_mask)
        return total_loss, best_path

    def get_crf_input(self, input_ids, attention_mask, scopes):
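        # mean-pool the BERT sub-token vectors inside each scope span, then zero-pad every sentence to the batch's longest span count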
        pretrain_model_output = self.pretrain_model(
            input_ids, attention_mask=attention_mask)
        hidden_repr = self.to_crf(pretrain_model_output[0])
        max_len = max(map(len, scopes))
        repr_dim = hidden_repr.size()[-1]
        crf_input = []
        for scope, repr in zip(scopes, hidden_repr):
            c_repr = []
            for i in range(len(scope)):
                c_repr.append(torch.mean(repr[scope[i][0]:scope[i][1]], dim=0))
            c_repr = torch.stack(c_repr)
            if max_len - len(scope) > 0:
                if self.gpu:
                    pad_repr = torch.zeros(max_len - len(scope),
                                           repr_dim).cuda()
                else:
                    pad_repr = torch.zeros(max_len - len(scope), repr_dim)
                crf_input.append(torch.cat((c_repr, pad_repr), dim=0))
            else:
                crf_input.append(c_repr)
        return torch.stack(crf_input)
Example #5
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.bert = BertModel.from_pretrained(model_config.pretrain_model_path)
        self.dropout = nn.Dropout(self.bert.config.hidden_dropout_prob)
        self.classifier = nn.Linear(self.bert.config.hidden_size,
                                    config.num_labels)
        self.crf = CRF(tagset_size=config.num_labels,
                       tag_dictionary=config.label2id,
                       is_bert=True)

    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                input_lens=None,
                labels=None):
        outputs = self.bert(input_ids,
                            token_type_ids=token_type_ids,
                            attention_mask=attention_mask)
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
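        # per-token label logits act as emission scores for the CRF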
        logits = self.classifier(sequence_output)
        outputs = (logits, )
        if labels is not None:
            loss = self.crf.calculate_loss(logits,
                                           tag_list=labels,
                                           lengths=input_lens)
            outputs = (loss, ) + outputs
        return outputs
Example #6
 def __init__(self, config):
     super(BertCrfForNer, self).__init__(config)
     self.bert = BertModel(config)
     self.dropout = nn.Dropout(config.hidden_dropout_prob)
     self.classifier = nn.Linear(config.hidden_size, config.num_labels)
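     # the CRF call signature (num_tags, batch_first) matches the pytorch-crf package's interface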
     self.crf = CRF(num_tags=config.num_labels, batch_first=True)
     self.init_weights()
Example #7
class Model(nn.Module):
    def __init__(self, ):
        super(Model, self).__init__()
        self.hidden_size = model_config.hidden_size
        self.embedding = nn.Embedding(config.num_vocab, config.embed_dim)
        self.bilstm = nn.LSTM(input_size=config.embed_dim,
                              hidden_size=self.hidden_size,
                              batch_first=True,
                              num_layers=2,
                              dropout=model_config.dropout,
                              bidirectional=True)
        # self.dropout = SpatialDropout(drop_p)
        self.dropout = nn.Dropout(model_config.dropout)
        self.layer_norm = LayerNorm(self.hidden_size * 2)
        self.classifier = nn.Linear(self.hidden_size * 2, config.num_labels)
        self.crf = CRF(tagset_size=config.num_labels,
                       tag_dictionary=config.label2id,
                       is_bert=True)

    def forward(self, input_ids, attention_mask, input_lens, labels=None):
        embs = self.embedding(input_ids)
        embs = self.dropout(embs)
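        # zero out embedding vectors at padded positions using the attention mask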
        embs = embs * attention_mask.float().unsqueeze(2)
        sequence_output, _ = self.bilstm(embs)
        sequence_output = self.layer_norm(sequence_output)
        logits = self.classifier(sequence_output)
        outputs = (logits, )
        if labels is not None:
            loss = self.crf.calculate_loss(logits,
                                           tag_list=labels,
                                           lengths=input_lens)
            outputs = (loss, ) + outputs
        return outputs
Example #8
 def __init__(self, ):
     super(Model, self).__init__()
     self.hidden_size = model_config.hidden_size
     self.embedding = nn.Embedding(config.num_vocab, config.embed_dim)
     self.bilstm = nn.LSTM(input_size=config.embed_dim,
                           hidden_size=self.hidden_size,
                           batch_first=True,
                           num_layers=2,
                           dropout=model_config.dropout,
                           bidirectional=True)
     # self.dropout = SpatialDropout(drop_p)
     self.dropout = nn.Dropout(model_config.dropout)
     self.layer_norm = LayerNorm(self.hidden_size * 2)
     self.classifier = nn.Linear(self.hidden_size * 2, config.num_labels)
     self.crf = CRF(tagset_size=config.num_labels,
                    tag_dictionary=config.label2id,
                    is_bert=True)
Example #9
    def __init__(self, config):
        super().__init__()

        self.embedding = torch.nn.Embedding(len(config.tokenizer.vocab),
                                            config.emb_size)
        self.in_fc = nn.Linear(config.emb_size, config.d_model)
        self.transformer = TransformerEncoder(config)
        self.fc_dropout = nn.Dropout(config.fc_dropout)
        self.out_fc = nn.Linear(config.d_model, len(config.label2id))
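        # emission scores from out_fc feed the batch-first CRF below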
        self.crf = CRF(num_tags=len(config.label2id), batch_first=True)
        self.apply(self.init_model_weights)
Example #10
 def __init__(self,
              args):
     super(BilstmCrf, self).__init__()
     self.embedding = nn.Embedding(len(args.tokenizer.vocab), args.embedding_size)
     self.bilstm = nn.LSTM(input_size=args.embedding_size, hidden_size=args.hidden_size,
                           batch_first=True, num_layers=2, dropout=0.1,
                           bidirectional=True)
     self.dropout = SpatialDropout(0.1)
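     # both LSTM directions are concatenated, so downstream layers see hidden_size * 2 features per token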
     self.layer_norm = LayerNorm(args.hidden_size * 2)
     self.classifier = nn.Linear(args.hidden_size * 2, args.num_labels)
     self.crf = CRF(num_tags=args.num_labels, batch_first=True)
     self.apply(self.init_model_weights)
Example #11
class BiLSTM_CRF_CNN(nn.Module):
    def __init__(self,
                 embedding_dim,
                 hidden_dim,
                 vocab_size,
                 tagset_size,
                 char_size,
                 pretrained_weight_embeddings,
                 USE_CRF,
                 BIDIRECTIONAL=False,
                 USE_BIGRAM=False,
                 bigram_size=0,
                 CNN=False,
                 use_gpu=0):
        super(BiLSTM_CRF_CNN, self).__init__()
        # include start and end tags
        self.gpu = use_gpu
        self.bidirectional = BIDIRECTIONAL
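        # the BiLSTM-CNN encoder supplies per-token emission features; the CRF handles transition scoring and decoding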
        self.lstm_cnn = BILSTM_CNN(embedding_dim,
                                   hidden_dim,
                                   vocab_size,
                                   tagset_size,
                                   char_size,
                                   pretrained_weight_embeddings,
                                   USE_CRF=False,
                                   BIDIRECTIONAL=self.bidirectional,
                                   USE_BIGRAM=False,
                                   bigram_size=0,
                                   CNN=False,
                                   use_gpu=0)
        self.crf = CRF(tagset_size, self.gpu)

    def neg_ll_loss(self, sentence, gold_labels, chars):
        feats = self.lstm_cnn.forward(sentence, chars)
        return self.crf.neg_ll_loss(sentence, gold_labels, feats)

    def forward(self, sentence, chars):
        feats = self.lstm_cnn.forward(sentence, chars)
        score, tag_seq = self.crf.forward(sentence, feats)
        return score, tag_seq
Example #12
    def __init__(self, ner_processor, config):
        super().__init__()

        vocab_size = len(ner_processor.vocab)
        num_labels = len(ner_processor.idx2label)
        self.embedding = torch.nn.Embedding(vocab_size, config.emb_size)
        nn.init.normal_(self.embedding.weight, 0.0, 0.02)
        self.embed_size = config.emb_size
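        # project token embeddings to the Transformer encoder's model dimension (d_model)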
        self.in_fc = nn.Linear(config.emb_size, config.d_model)
        self.transformer = TransformerEncoder(config)
        self.fc_dropout = nn.Dropout(config.fc_dropout)
        self.out_fc = nn.Linear(config.d_model, num_labels)
        self.crf = CRF(num_tags=num_labels, batch_first=True)
Example #13
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, tag_to_ix, USE_CRF=False,
                 BIDIRECTIONAL=False, USE_BIGRAM=False, bigram_size=0):
        super(LSTM_CRF, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # self.word_embeddings.weight.requires_grad = False
        print("Entered!!!!")
        # if pretrained_weight_embeddings != None:
        #  self.word_embeddings.weight.data.copy_(torch.from_numpy(pretrained_weight_embeddings))
        if BIDIRECTIONAL:
            self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, bidirectional=BIDIRECTIONAL)
        else:
            self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        self.crf = CRF(tagset_size, tag_to_ix, 0)

        self.bidirectional = BIDIRECTIONAL
        self.append_bigram = USE_BIGRAM
        self.hidden = self.init_hidden()
        if self.append_bigram:
            self.hidden2tag = nn.Linear(hidden_dim + bigram_size, tagset_size)
        else:
            self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
Example #14
class LSTM_CRF(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, tag_to_ix, USE_CRF=False,
                 BIDIRECTIONAL=False, USE_BIGRAM=False, bigram_size=0):
        super(LSTM_CRF, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # self.word_embeddings.weight.requires_grad = False
        print("Entered!!!!")
        # if pretrained_weight_embeddings != None:
        #  self.word_embeddings.weight.data.copy_(torch.from_numpy(pretrained_weight_embeddings))
        if BIDIRECTIONAL:
            self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, bidirectional=BIDIRECTIONAL)
        else:
            self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        self.crf = CRF(tagset_size, tag_to_ix, 0)

        self.bidirectional = BIDIRECTIONAL
        self.append_bigram = USE_BIGRAM
        self.hidden = self.init_hidden()
        if self.append_bigram:
            self.hidden2tag = nn.Linear(hidden_dim + bigram_size, tagset_size)
        else:
            self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def init_hidden(self):
        if not self.bidirectional:
            return (autograd.Variable(torch.randn(1, 1, self.hidden_dim)),
                    autograd.Variable(torch.randn(1, 1, self.hidden_dim)))
        else:
            return (autograd.Variable(torch.randn(2, 1, self.hidden_dim // 2)),
                    autograd.Variable(torch.randn(2, 1, self.hidden_dim // 2)))

    def forward_lstm(self, sentence, bigram_one_hot=None):
        self.hidden = self.init_hidden()
        embeds = self.word_embeddings(sentence)  # shape seq_length * emb_size
        # print(embeds.view(len(sentence), 1, -1).shape)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        # print("original shape before MLP "+str(lstm_out.view(len(sentence), -1).shape))
        # print("shape of onehot bigram "+str(bigram_one_hot))
        if self.append_bigram:
            # print("concatednated vector"+str(lstm_out.view(len(sentence), -1)))
            tag_space = self.hidden2tag(
                torch.cat([lstm_out.view(len(sentence), -1), bigram_one_hot],
                          dim=1))
        else:
            tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))

        # tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_space

    def neg_ll_loss(self, sentence, gold_labels):
        feats = self.forward_lstm(sentence)
        return self.crf.neg_ll_loss(sentence, gold_labels, feats)

    def forward(self, sentence):
        feats = self.forward_lstm(sentence)
        score, tag_seq = self.crf.forward(sentence, feats)
        return score, tag_seq

class BILSTM_CNN(nn.Module):
    def __init__(self,
                 embedding_dim,
                 hidden_dim,
                 vocab_size,
                 tagset_size,
                 char_size,
                 pretrained_weight_embeddings,
                 tag_to_ix,
                 USE_CRF=False,
                 BIDIRECTIONAL=False,
                 USE_BIGRAM=False,
                 bigram_size=0,
                 CNN=False,
                 use_gpu=0):
        super(BILSTM_CNN, self).__init__()
        self.char_dim = 25
        self.char_lstm_dim = 25
        self.CNN = CNN
        self.use_gpu = use_gpu
        self.hidden_dim = hidden_dim
        self.n_cap = 4
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # self.word_embeddings.weight.requires_grad = False
        # if pretrained_weight_embeddings != None:
        self.word_embeddings.weight.data.copy_(
            torch.from_numpy(pretrained_weight_embeddings))
        #CHAR
        self.cap_embedding_dim = 25
        self.cap_embeds = nn.Embedding(self.n_cap, self.cap_embedding_dim)
        b = np.sqrt(3.0 / self.cap_embeds.weight.size(1))
        nn.init.uniform_(self.cap_embeds.weight, -b, b)

        if self.CNN:
            print("Entered!!!!")
            self.char_embeds = nn.Embedding(char_size, self.char_dim)
            #as given in the paper, initialising
            b = np.sqrt(3.0 / self.char_embeds.weight.size(1))
            nn.init.uniform_(self.char_embeds.weight, -b, b)

            # self.init_embedding(self.char_embeds.weight)
            self.char_cnn = nn.Conv2d(in_channels=1,
                                      out_channels=self.char_lstm_dim,
                                      kernel_size=(3, self.char_dim),
                                      padding=(2, 0))

        if BIDIRECTIONAL:
            print("Bidirectional")
            self.lstm = nn.LSTM(embedding_dim + self.char_lstm_dim +
                                self.cap_embedding_dim,
                                hidden_dim,
                                bidirectional=BIDIRECTIONAL)
        else:
            self.lstm = nn.LSTM(embedding_dim + self.char_lstm_dim, hidden_dim)

        self.drop_probout = nn.Dropout(0.5)
        self.bidirectional = BIDIRECTIONAL
        self.append_bigram = USE_BIGRAM
        self.hidden = self.init_hidden()
        # if self.append_bigram:
        #     self.hidden2tag = nn.Linear(hidden_dim + bigram_size, tagset_size)
        # else:
        #     self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

        self.crf = CRF(tagset_size, tag_to_ix, self.use_gpu)
        if self.use_gpu:
            self.crf = self.crf.cuda()
        if self.bidirectional:
            self.hidden2tag = nn.Linear(2 * hidden_dim, tagset_size)
            b = np.sqrt(6.0 / (self.hidden2tag.weight.size(0) +
                               self.hidden2tag.weight.size(1)))
            nn.init.uniform_(self.hidden2tag.weight, -b, b)
            self.hidden2tag.bias.data.zero_()
        else:
            self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def init_hidden(self):
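        # randomly initialised (h0, c0) pair; the leading dimension accounts for one vs. two LSTM directions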
        if self.use_gpu:
            if not self.bidirectional:
                return (autograd.Variable(torch.randn(1, 1, self.hidden_dim).cuda()),
                        autograd.Variable(torch.randn(1, 1, self.hidden_dim).cuda()))
            else:
                return (autograd.Variable(torch.randn(2, 1, self.hidden_dim).cuda()),
                        autograd.Variable(torch.randn(2, 1, self.hidden_dim).cuda()))
        else:
            if not self.bidirectional:
                return (autograd.Variable(torch.randn(1, 1, self.hidden_dim)),
                        autograd.Variable(torch.randn(1, 1, self.hidden_dim)))
            else:
                return (autograd.Variable(torch.randn(2, 1, self.hidden_dim)),
                        autograd.Variable(torch.randn(2, 1, self.hidden_dim)))

    def forward_lstm(self, sentence, chars, caps, drop_prob, bigram_one_hot=None):
        d = nn.Dropout(p=drop_prob)
        self.hidden = self.init_hidden()
        embeds = self.word_embeddings(sentence)  # shape seq_length * emb_size
        # embeds = self.word_embeddings(sentence)  # shape seq_length * emb_size
        cap_embedding = self.cap_embeds(caps)

        if self.CNN:
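            # char CNN over character embeddings; max-pooling over the character axis yields one fixed-size char feature per token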
            chars_embeds = self.char_embeds(chars).unsqueeze(1)
            cnn_output = self.char_cnn(d(chars_embeds))

            chars_embeds = nn.functional.max_pool2d(
                cnn_output, kernel_size=(cnn_output.size(2),
                                         1)).view(cnn_output.size(0),
                                                  self.char_lstm_dim)
            if self.use_gpu:
                embeds = torch.cat((embeds, chars_embeds, cap_embedding),
                                   1).cuda()
            else:
                embeds = torch.cat((embeds, chars_embeds, cap_embedding), 1)
        # print(embeds.view(len(sentence), 1, -1).shape)
        #lstm_out, self.hidden = self.lstm(embeds.unsqueeze(1), self.hidden)
        lstm_out, _ = self.lstm(d(embeds).unsqueeze(1))
        # lstm_out, _ = self.lstm(embeds.unsqueeze(1))
        lstm_out = d(lstm_out.view(len(sentence), self.hidden_dim * 2))
        # lstm_out = lstm_out.view(len(sentence), self.hidden_dim*2)
        # print("original shape before MLP "+str(lstm_out.view(len(sentence), -1).shape))
        # print("shape of onehot bigram "+str(bigram_one_hot))
        if self.append_bigram:
            # print("concatednated vector"+str(lstm_out.view(len(sentence), -1)))
            tag_space = self.hidden2tag(
                torch.cat([lstm_out.view(len(sentence), -1), bigram_one_hot],
                          dim=1))
        else:
            tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))

        ## uncomment for crf
        # return tag_space
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

    def neg_ll_loss(self, sentence, gold_labels, chars, caps, drop_prob):
        feats = self.forward_lstm(sentence, chars, caps, drop_prob)
        return self.crf.neg_ll_loss(sentence, gold_labels, feats)

    def forward(self, sentence, chars, caps, drop_prob):
        # feats = self.forward_lstm(sentence, chars, caps, drop_prob)
        # score, tag_seq = self.crf.forward(sentence, feats)

        scores = self.forward_lstm(sentence, chars, caps, drop_prob)

        # return score, tag_seq
        return scores