class SeqLabel(nn.Module):

    def __init__(self, data):
        super(SeqLabel, self).__init__()
        label_size = data.label_alphabet_size
        data.label_alphabet_size += 2
        self.word_hidden = WordSequence(data)
        self.crf = CRF(label_size, data.gpu)

    def calculate_loss(self, word_inputs, feature_inputs, word_seq_lengths,
                       char_inputs, char_seq_lengths, char_seq_recover,
                       batch_label, mask):
        outs = self.word_hidden(word_inputs, feature_inputs, word_seq_lengths,
                                char_inputs, char_seq_lengths, char_seq_recover)
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        total_loss = self.crf.neg_log_likelihood_loss(outs, mask, batch_label)
        scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        return total_loss, tag_seq

    def forward(self, word_inputs, feature_inputs, word_seq_lengths,
                char_inputs, char_seq_lengths, char_seq_recover, mask):
        outs = self.word_hidden(word_inputs, feature_inputs, word_seq_lengths,
                                char_inputs, char_seq_lengths, char_seq_recover)
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        return tag_seq
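# Note: the bookkeeping above recurs in most of the models below. A minimal sketch of
# the "+2" convention (an illustration, not code from any of these repos): the CRF is
# built with the original label count, the label alphabet is then enlarged by two, so
# the downstream emission projection also produces scores for the extra START/STOP
# columns that the CRF uses internally.
import torch.nn as nn

label_size = 17                                           # assumed original tag-set size
hidden_dim = 200                                          # assumed encoder output size
emission_layer = nn.Linear(hidden_dim, label_size + 2)    # two extra columns for START/STOP
# crf = CRF(label_size, gpu)                              # the CRF itself still indexes the original labels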
class BiLSTM_CRF(nn.Module):

    def __init__(self, data):
        super(BiLSTM_CRF, self).__init__()
        print("build batched lstmcrf...")
        self.gpu = data.HP_gpu
        # For the CRF we add two extra labels (START and END) for the downstream LSTM layer,
        # while the CRF itself keeps the original label size.
        label_size = data.label_alphabet_size
        data.label_alphabet_size += 2
        self.lstm = BiLSTM(data)
        self.crf = CRF(label_size, self.gpu)

    def neg_log_likelihood_loss(self, gaz_list, char_inputs, bichar_inputs,
                                char_seq_lengths, batch_label, mask):
        outs = self.lstm.get_output_score(gaz_list, char_inputs, bichar_inputs,
                                          char_seq_lengths)
        total_loss = self.crf.neg_log_likelihood_loss(outs, mask, batch_label)
        scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        return total_loss, tag_seq

    def forward(self, gaz_list, char_inputs, bichar_inputs, char_seq_lengths, mask):
        outs = self.lstm.get_output_score(gaz_list, char_inputs, bichar_inputs,
                                          char_seq_lengths)
        scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        return tag_seq

    def get_lstm_features(self, gaz_list, char_inputs, bichar_inputs, char_seq_lengths):
        return self.lstm.get_lstm_features(gaz_list, char_inputs, bichar_inputs,
                                           char_seq_lengths)
class BiLstmCrf(nn.Module):

    def __init__(self, data, configs):
        super(BiLstmCrf, self).__init__()
        if configs['random_embedding']:
            self.word_embeddings = nn.Embedding(data.word_alphabet_size,
                                                configs['word_emb_dim'])
            self.word_embeddings.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(data.word_alphabet_size,
                                          configs['word_emb_dim'])))
            self.word_drop = nn.Dropout(configs['dropout'])
        else:
            pass
        self.lstm = nn.LSTM(configs['word_emb_dim'],
                            configs['hidden_dim'] // 2,
                            num_layers=configs['num_layers'],
                            batch_first=configs['batch_first'],
                            bidirectional=configs['bidirectional'])
        self.drop_lstm = nn.Dropout(configs['dropout'])
        # data.label_alphabet_size is one larger than the number of labels; this is
        # intentional and follows from how label_alphabet is initialized.
        # No label id in data.train_ids is ever 0, so softmax_logits[0] always stays
        # very small and is never selected.
        self.hidden2tag = nn.Linear(configs['hidden_dim'],
                                    data.label_alphabet_size + 2)
        self.crf = CRF(data.label_alphabet_size, configs['gpu'])

    def forward(self, batch_input, batch_len, batch_recover, mask, batch_label=None):
        word_embeds = self.word_drop(self.word_embeddings(batch_input))
        packed_words = pack_padded_sequence(word_embeds, batch_len.cpu().numpy(),
                                            batch_first=True)
        hidden = None
        lstm_out, hidden = self.lstm(packed_words, hidden)
        lstm_out, _ = pad_packed_sequence(lstm_out)
        lstm_out = self.drop_lstm(lstm_out.transpose(1, 0))
        outputs = self.hidden2tag(lstm_out)
        if batch_label is not None:
            total_loss = self.crf.neg_log_likelihood_loss(outputs, mask, batch_label)
            scores, tag_seq = self.crf._viterbi_decode(outputs, mask)
            return total_loss, tag_seq
        else:
            scores, tag_seq = self.crf._viterbi_decode(outputs, mask)
            return tag_seq

    @staticmethod
    def random_embedding(vocab_size, embedding_dim):
        pretrain_emb = np.empty([vocab_size, embedding_dim])
        scale = np.sqrt(3.0 / embedding_dim)
        for index in range(vocab_size):
            pretrain_emb[index, :] = np.random.uniform(-scale, scale,
                                                       [1, embedding_dim])
        return pretrain_emb
class BiLSTM_CRF(nn.Module):

    def __init__(self, data):
        super(BiLSTM_CRF, self).__init__()
        print("build batched lstmcrf...")
        self.gpu = data.HP_gpu
        ## add two more labels for the downstream LSTM layer; the CRF keeps the original label size
        label_size = data.label_alphabet_size
        data.label_alphabet_size += 2
        self.lstm = BiLSTM(data)
        self.crf = CRF(label_size, self.gpu)

    def neg_log_likelihood_loss(self, gaz_list, word_inputs, biword_inputs,
                                word_seq_lengths, char_inputs, char_seq_lengths,
                                char_seq_recover, batch_label, mask):
        outs = self.lstm.get_output_score(gaz_list, word_inputs, biword_inputs,
                                          word_seq_lengths, char_inputs,
                                          char_seq_lengths, char_seq_recover)
        total_loss = self.crf.neg_log_likelihood_loss(outs, mask, batch_label)
        scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        return total_loss, tag_seq

    def forward(self, gaz_list, word_inputs, biword_inputs, word_seq_lengths,
                char_inputs, char_seq_lengths, char_seq_recover, mask):
        outs = self.lstm.get_output_score(gaz_list, word_inputs, biword_inputs,
                                          word_seq_lengths, char_inputs,
                                          char_seq_lengths, char_seq_recover)
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        return tag_seq

    def get_lstm_features(self, gaz_list, word_inputs, biword_inputs, word_seq_lengths,
                          char_inputs, char_seq_lengths, char_seq_recover):
        return self.lstm.get_lstm_features(gaz_list, word_inputs, biword_inputs,
                                           word_seq_lengths, char_inputs,
                                           char_seq_lengths, char_seq_recover)
class BERT_LSTM_CRF(nn.Module):

    def __init__(self, bert_config, tagset_size, embedding_dim, hidden_dim,
                 rnn_layers, dropout_ratio, dropout1, use_cuda):
        super(BERT_LSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.word_embeds = BertModel.from_pretrained(bert_config)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            num_layers=rnn_layers, bidirectional=True,
                            dropout=dropout_ratio, batch_first=True)
        self.rnn_layers = rnn_layers
        self.dropout1 = nn.Dropout(p=dropout1)
        self.crf = CRF(target_size=tagset_size, average_batch=True,
                       use_cuda=use_cuda)
        self.liner = nn.Linear(hidden_dim * 2, tagset_size + 2)
        self.tagset_size = tagset_size
        self.use_cuda = use_cuda

    def rand_init_hidden(self, batch_size):
        if self.use_cuda:
            return Variable(
                torch.randn(2 * self.rnn_layers, batch_size, self.hidden_dim)).cuda(), Variable(
                torch.randn(2 * self.rnn_layers, batch_size, self.hidden_dim)).cuda()
        else:
            return Variable(
                torch.randn(2 * self.rnn_layers, batch_size, self.hidden_dim)), Variable(
                torch.randn(2 * self.rnn_layers, batch_size, self.hidden_dim))

    def get_output_score(self, sentence, attention_mask=None):
        batch_size = sentence.size(0)
        seq_length = sentence.size(1)
        embeds, _ = self.word_embeds(sentence, attention_mask=attention_mask,
                                     output_all_encoded_layers=False)
        hidden = self.rand_init_hidden(batch_size)
        # if embeds.is_cuda:
        #     hidden = (i.cuda() for i in hidden)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim * 2)
        d_lstm_out = self.dropout1(lstm_out)
        l_out = self.liner(d_lstm_out)
        lstm_feats = l_out.contiguous().view(batch_size, seq_length, -1)
        return lstm_feats

    def forward(self, sentence, masks):
        lstm_feats = self.get_output_score(sentence)
        scores, tag_seq = self.crf._viterbi_decode(lstm_feats, masks.byte())
        return tag_seq

    def neg_log_likelihood_loss(self, sentence, mask, tags):
        lstm_feats = self.get_output_score(sentence)
        loss_value = self.crf.neg_log_likelihood_loss(lstm_feats, mask, tags)
        batch_size = lstm_feats.size(0)
        loss_value /= float(batch_size)
        return loss_value
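# rand_init_hidden above wraps the initial LSTM states in the long-deprecated Variable
# API. A minimal equivalent on current PyTorch (illustrative names; same shapes, no
# autograd wrapper needed):
import torch

def rand_init_hidden(rnn_layers, batch_size, hidden_dim, device="cpu"):
    # (h0, c0) for a bidirectional LSTM: 2 * num_layers stacked state tensors
    h0 = torch.randn(2 * rnn_layers, batch_size, hidden_dim, device=device)
    c0 = torch.randn(2 * rnn_layers, batch_size, hidden_dim, device=device)
    return h0, c0

h0, c0 = rand_init_hidden(rnn_layers=1, batch_size=8, hidden_dim=500)
print(h0.shape)  # torch.Size([2, 8, 500])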
class BertCRF(BertPreTrainedModel):

    def __init__(self, config, model_configs):
        super(BertCRF, self).__init__(config)
        self.num_labels = config.num_labels
        self.max_seq_length = model_configs['max_seq_length']
        self.bert = BertModel(config)
        self.use_cuda = model_configs['use_cuda'] and torch.cuda.is_available()
        self.crf = CRF(target_size=self.num_labels, use_cuda=self.use_cuda,
                       average_batch=False)
        bert_embedding = config.hidden_size
        # hidden_dim is the output dimension here.
        # The LSTM's hidden_dim matches the hidden_dim used in init_hidden,
        # i.e. half of the output-layer hidden_dim.
        self.hidden_dim = config.hidden_size
        self.dropout = nn.Dropout(model_configs['dropout_rate'])
        self.hidden2label = nn.Linear(self.hidden_dim, self.num_labels + 2)
        self.apply(self.init_weights)

    def forward(self, input_ids, segment_ids, input_mask):
        # outputs = sequence_output, pooled_output, (hidden_states), (attentions)
        # sequence_output = encoder_outputs[0]
        # pooled_output = pooler(sequence_output)
        outputs = self.bert(input_ids=input_ids, position_ids=None,
                            token_type_ids=segment_ids, attention_mask=input_mask,
                            head_mask=None)
        # bert_embeds: shape = [batch_size, max_seq_length, bert_embedding]
        bert_embeds = outputs[0].contiguous().view(-1, self.hidden_dim)
        bert_embeds = self.dropout(bert_embeds)
        logits = self.hidden2label(bert_embeds)
        return logits.view(-1, self.max_seq_length, self.num_labels + 2)

    def loss_fn(self, feats, mask, labels):
        batch_size = feats.size(0)
        loss_value = self.crf.neg_log_likelihood_loss(feats, mask, labels) / float(batch_size)
        return loss_value

    def predict(self, feats, mask):
        path_score, best_path = self.crf(feats, mask.byte())
        return best_path
class BiLSTM_CRF(nn.Module):

    def __init__(self, data):
        super(BiLSTM_CRF, self).__init__()
        label_size = data.label_alphabet.size()
        data.label_size = label_size + 2
        self.lstm = BiLSTM(data).to(device)
        self.crf = CRF(target_size=label_size, use_cuda=use_cuda,
                       average_batch=True).to(device)

    def neg_log_likelihood_loss(self, batch_label, mask, *args):
        outs = self.lstm.get_output_score(*args)
        total_loss = self.crf.neg_log_likelihood_loss(outs, mask, batch_label)
        return total_loss

    def forward(self, mask, *args):
        outs = self.lstm.get_output_score(*args)
        scores, tag_seq = self.crf(outs, mask)
        return tag_seq
class CrfTagger2(nn.Module):
    # based on SeqLabel in NCRFpp

    def __init__(self, kwargs):
        super(CrfTagger2, self).__init__()
        self.gpu = kwargs.pop("use_gpu", False)
        self.average_batch = kwargs.pop("average_batch", True)
        self.crf = NCRFpp_CRF(kwargs["tagset_size"], self.gpu)
        if kwargs.pop("use_lstm", False):
            kwargs["tagset_size"] += 2
            self.lstm = LstmTagger(**kwargs)

    @staticmethod
    def _get_mask(X_lens, batch_size, seq_len):
        mask = Variable(torch.zeros((batch_size, seq_len))).byte()
        for idx, X_len in enumerate(X_lens):
            mask[idx, :X_len] = torch.ones(X_len)
        return mask

    def forward(self, input, input_lens):
        logits = self.lstm.forward(input, input_lens, apply_softmax=False)
        batch_size, seq_len, _ = logits.size()
        mask = __class__._get_mask(input_lens, batch_size, seq_len)
        return logits, mask

    def loss(self, logits, mask, target):
        total_loss = self.crf.neg_log_likelihood_loss(logits, mask, target)
        batch_size, seq_len, _ = logits.size()
        if self.average_batch:
            total_loss = total_loss / batch_size
        return total_loss

    def decode(self, logits, mask, return_scores=False):
        scores, tag_seq = self.crf.viterbi_decode(logits, mask)
        if return_scores:
            return scores, tag_seq
        return tag_seq

    def decode_nbest(self, logits, mask, nbest, return_scores=False):
        scores, tag_seq = self.crf.viterbi_decode_nbest(logits, mask, nbest)
        if return_scores:
            return scores, tag_seq
        return tag_seq
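# _get_mask above materializes a padding mask from sequence lengths. A minimal
# standalone sketch of the same convention on current PyTorch (bool mask instead of
# the deprecated Variable/byte tensor); names here are illustrative only.
import torch

def lengths_to_mask(lengths, max_len=None):
    # lengths: 1-D tensor of true sequence lengths, shape (batch_size,)
    max_len = max_len or int(lengths.max())
    positions = torch.arange(max_len).unsqueeze(0)   # (1, max_len)
    return positions < lengths.unsqueeze(1)          # (batch_size, max_len), True on real tokens

# Example: a batch with lengths 3 and 1, padded to length 4.
print(lengths_to_mask(torch.tensor([3, 1]), max_len=4))
# tensor([[ True,  True,  True, False],
#         [ True, False, False, False]])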
class CWS(nn.Module):

    def __init__(self, data):
        super(CWS, self).__init__()
        print("build batched vanilla lstmcrf...")
        self.gpu = data.HP_gpu
        # add two more labels for the downstream LSTM layer; the CRF keeps the original label size
        label_size = data.label_alphabet_size
        data.label_alphabet_size += 2
        self.lstm = Seq(data)
        self.crf = CRF(label_size, self.gpu)
        print("finished building model: ", self)

    def neg_log_likelihood_loss(self, word_text, word_inputs, biword_inputs,
                                batch_label, mask, word_seq_lens, batch_pos):
        outs = self.lstm.get_output_score(mask, word_text, word_inputs, biword_inputs,
                                          word_seq_lens, batch_pos, external_pos={})
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        total_loss = self.crf.neg_log_likelihood_loss(outs, mask, batch_label)
        _, tag_seq = self.crf._viterbi_decode(outs, mask)
        return total_loss, tag_seq

    def forward(self, word_text, word_inputs, biword_inputs, word_seq_lens, mask,
                batch_pos, external_pos):
        outs = self.lstm.get_output_score(mask, word_text, word_inputs, biword_inputs,
                                          word_seq_lens, batch_pos, external_pos)
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        return tag_seq
class BiLSTMCRF(BaseModel):

    def __init__(self, configs, pretrained_word_embed=None):
        super(BiLSTMCRF, self).__init__()
        self.configs = configs
        self.num_labels = configs['num_labels']
        self.max_seq_length = configs['max_seq_length']
        self.use_cuda = configs['use_cuda'] and torch.cuda.is_available()
        self.bilstm = BiLSTM(configs, pretrained_word_embed)
        self.crf = CRF(target_size=self.num_labels, use_cuda=self.use_cuda,
                       average_batch=False)
        self.hidden2label = nn.Linear(self.bilstm.hidden_dim * 2, self.num_labels + 2)

    def forward(self, input_ids, segment_ids, input_mask):
        lstm_outputs = self.bilstm.get_lstm_outputs(input_ids)
        logits = self.hidden2label(lstm_outputs)
        return logits.view(-1, self.max_seq_length, self.num_labels + 2)

    def loss_fn(self, feats, mask, labels):
        """
        Args:
            feats: size=(batch_size, seq_len, tag_size)
            mask: size=(batch_size, seq_len)
            labels: size=(batch_size, seq_len)
        """
        batch_size = feats.size(0)
        loss_value = self.crf.neg_log_likelihood_loss(feats, mask, labels) / float(batch_size)
        return loss_value

    def predict(self, feats, mask):
        path_score, best_path = self.crf(feats, mask.byte())
        return best_path
class RoleFiller(nn.Module):
    r"""
    Integrated model for document-level event role filling.

    Model components:
        1. Word embedding, combining pre-trained GloVe and BERT.
        2. Two LSTMs, one at sentence level and one at paragraph level.
        3. Gated concatenation of both representations.
        4. CRF for scoring tag sequences.

    Args:
        reader: Reader instance from the main script.
    """

    def __init__(self, reader):
        super(RoleFiller, self).__init__()
        # reader = Reader('')
        self.embedding = Glove_Bert_Embedding(
            reader.word_dict.word_size,
            reader.config.parser['word_embed_dim'],
            reader.config.parser['HP_dropout'],
            reader.build_pre_embedding(use_saved_embed=True),
            reader.word_dict.idx2word,
            reader.config.parser['bert_dir'])
        self.drop_lstm_sent = nn.Dropout(reader.config.parser['HP_dropout'] - 0.1)
        self.drop_lstm_para = nn.Dropout(reader.config.parser['HP_dropout'])
        self.batch_average = reader.config.parser['batch_average']
        # 768 is the fixed BERT hidden dimension
        self.embedding_dim = reader.config.parser['word_embed_dim'] + 768

        # LSTM
        self.hidden_dim = reader.config.parser['HP_hidden_dim']
        if reader.config.parser['HP_bilstm']:
            self.hidden_dim //= 2
        # LSTM for paragraph level
        self.lstm_para = nn.LSTM(
            self.embedding_dim,
            self.hidden_dim,
            reader.config.parser['HP_lstm_layers_num'],
            batch_first=True,
            bidirectional=reader.config.parser['HP_bilstm'])
        # LSTM for sentence level
        self.lstm_sent = nn.LSTM(
            self.embedding_dim,
            self.hidden_dim,
            reader.config.parser['HP_lstm_layers_num'],
            batch_first=True,
            bidirectional=reader.config.parser['HP_bilstm'])
        # gated sigmoid fusion
        self.gate = nn.Linear(2 * reader.config.parser['HP_hidden_dim'],
                              reader.config.parser['HP_hidden_dim'])
        self.sigmoid = nn.Sigmoid()
        self.hidden2tag = nn.Linear(reader.config.parser['HP_hidden_dim'],
                                    reader.tag_dict.word_size + 2)
        self.softmax = nn.Softmax(dim=-1)
        self.crf = CRF(reader.tag_dict.word_size)

    def sentence_level_process(self, sentence):
        r"""
        Sentence-level LSTM processing.

        Args:
            sentence: tensor holding one sentence.

        Returns:
            Tensor of shape [1, sentence len, hidden dim].
        """
        sentence_tensor = self.embedding(sentence.view(1, -1))
        # sentence_tensor: [1, sentence len, embedding dim]
        sentence_tensor = pack_padded_sequence(
            sentence_tensor, lengths=[sentence_tensor.shape[1]], batch_first=True)
        hidden = None
        sentence_tensor, hidden = self.lstm_sent(sentence_tensor, hidden)
        sentence_tensor, _ = pad_packed_sequence(sentence_tensor, batch_first=True)
        # sentence_tensor: [1, sentence len, hidden dim]
        return self.drop_lstm_sent(sentence_tensor)

    def forward(self, inputs_sent, inputs_para, lengths, batch_labels, mask):
        r"""
        Args:
            inputs_sent: [batch size, k (which equals 3), sentence len]
            inputs_para: [batch size, k x max_len]
            lengths: list of sequence lengths
            batch_labels: labels aligned with inputs_para
            mask: mask tensor

        Returns:
            loss: loss value.
            tag_seq: best tag sequence.
        """
        # paragraph-level embedding and LSTM
        para_tensor = self.embedding(inputs_para)
        # para_tensor: [batch size, sentence len, embedding dim]
        para_tensor = pack_padded_sequence(para_tensor, lengths=lengths, batch_first=True)
        hidden = None
        para_tensor, hidden = self.lstm_para(para_tensor, hidden)
        para_tensor, _ = pad_packed_sequence(para_tensor, batch_first=True)
        # para_tensor: [batch_size, sentence len, hidden dim]
        para_tensor = self.drop_lstm_para(para_tensor)
        max_len = para_tensor.shape[1]
        sent_tensor = torch.zeros(para_tensor.shape)
        for idx, sentences in enumerate(inputs_sent):
            sample_sent = [
                self.sentence_level_process(sentence).squeeze(0)
                for sentence in sentences
            ]
            sent_tensor[idx][:lengths[idx]][:] = torch.cat(sample_sent, dim=0)
        # sent_tensor: [batch size, max len, hidden dim]
        gamma = self.sigmoid(self.gate(torch.cat([sent_tensor, para_tensor], dim=2)))
        # gamma: [batch size, max len, hidden dim]
        gamma = self.hidden2tag(gamma * sent_tensor + (1 - gamma) * para_tensor)
        # gamma: [batch size, max len, tag num]
        # CRF
        loss = self.crf.neg_log_likelihood_loss(gamma, mask, batch_labels)
        scores, tag_seq = self.crf._viterbi_decode(gamma, mask)
        if self.batch_average:
            loss /= gamma.shape[0]
        return loss, tag_seq
class CnnLstmAttnCrf(nn.Module):

    def __init__(self, data):
        super(CnnLstmAttnCrf, self).__init__()
        self.char_embeddings = nn.Embedding(data.char_alphabet_size, config.char_emb_dim)
        self.char_embeddings.weight.data.copy_(
            torch.from_numpy(self.random_embedding(data.char_alphabet_size,
                                                   config.char_emb_dim)))
        self.char_drop = nn.Dropout(config.dropout)
        self.char_cnn = nn.Conv1d(
            in_channels=config.char_emb_dim,
            out_channels=config.char_hidden_dim,
            kernel_size=3,
            padding=1)
        self.word_embeddings = nn.Embedding(data.word_alphabet_size, config.word_emb_dim)
        self.word_embeddings.weight.data.copy_(
            torch.from_numpy(self.random_embedding(data.word_alphabet_size,
                                                   config.word_emb_dim)))
        self.word_drop = nn.Dropout(config.dropout)
        self.feature_embeddings = nn.Embedding(data.feat_alphabet_size,
                                               config.feature_emb_dim)
        # load pre-trained feature embeddings:
        if len(data.pretrain_feature_embeddings) > 1:
            self.feature_embeddings.weight.data.copy_(
                torch.from_numpy(data.pretrain_feature_embeddings))
        self.lstm = nn.LSTM(
            config.word_emb_dim + config.feature_emb_dim,
            config.hidden_dim // 2,
            num_layers=1,
            batch_first=True,
            bidirectional=True)
        self.droplstm = nn.Dropout(config.dropout)
        # attention layer
        self.attention = ScaledDotProductAttention(temperature=np.power(config.d_k, 0.5))
        self.hidden2tag = nn.Linear(config.hidden_dim, data.label_alphabet_size + 2)
        self.crf = CRF(data.label_alphabet_size, config.gpu)

    def forward(self, batch_word, batch_features, batch_wordlen, batch_char,
                batch_charlen, batch_charrecover, mask, batch_label=None):
        char_batch_size = batch_char.size(0)
        char_embeds = self.char_embeddings(batch_char)
        char_embeds = self.char_drop(char_embeds)
        char_embeds = char_embeds.transpose(1, 2)  # swap the max_length and embedding_dim axes
        char_cnn_out = self.char_cnn(char_embeds)
        # char_cnn_out = torch.max_pool1d(char_cnn_out, kernel_size=char_cnn_out.size(2)).view(char_batch_size, -1)
        char_cnn_out = char_cnn_out[batch_charrecover]  # restore the batch order from before length sorting
        char_features = char_cnn_out.view(batch_word.size(0), batch_word.size(1), -1)
        feat_embs = self.feature_embeddings(batch_features)
        word_embs = self.word_embeddings(batch_word)
        word_embs = torch.cat([word_embs, feat_embs], 2)
        word_represent = self.word_drop(word_embs)
        packed_words = pack_padded_sequence(word_represent, batch_wordlen.cpu().numpy(),
                                            batch_first=True)
        hidden = None
        lstm_out, hidden = self.lstm(packed_words, hidden)
        lstm_out, _ = pad_packed_sequence(lstm_out)
        lstm_out = self.droplstm(lstm_out.transpose(1, 0))
        q, k, v = char_features, lstm_out, lstm_out
        attn_output, attn = self.attention(q, k, v)
        outputs = self.hidden2tag(attn_output)
        if batch_label is not None:
            total_loss = self.crf.neg_log_likelihood_loss(outputs, mask, batch_label)
            scores, tag_seq = self.crf._viterbi_decode(outputs, mask)
            return total_loss, tag_seq
        else:
            scores, tag_seq = self.crf._viterbi_decode(outputs, mask)
            return tag_seq

    def random_embedding(self, vocab_size, embedding_dim):
        pretrain_emb = np.empty([vocab_size, embedding_dim])
        scale = np.sqrt(3.0 / embedding_dim)
        for index in range(vocab_size):
            pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embedding_dim])
        return pretrain_emb
class CNNmodel(nn.Module): def __init__(self, data): super(CNNmodel, self).__init__() self.gpu = data.HP_gpu self.use_biword = data.use_bigram self.use_posi = data.HP_use_posi self.hidden_dim = data.HP_hidden_dim self.gaz_alphabet = data.gaz_alphabet self.gaz_emb_dim = data.gaz_emb_dim self.word_emb_dim = data.word_emb_dim self.posi_emb_dim = data.posi_emb_dim self.biword_emb_dim = data.biword_emb_dim self.rethink_iter = data.HP_rethink_iter scale = np.sqrt(3.0 / self.gaz_emb_dim) data.pretrain_gaz_embedding[0, :] = np.random.uniform( -scale, scale, [1, self.gaz_emb_dim]) self.gaz_embedding = nn.Embedding(data.gaz_alphabet.size(), self.gaz_emb_dim) self.gaz_embedding.weight.data.copy_( torch.from_numpy(data.pretrain_gaz_embedding)) self.word_embedding = nn.Embedding(data.word_alphabet.size(), self.word_emb_dim) self.word_embedding.weight.data.copy_( torch.from_numpy(data.pretrain_word_embedding)) if data.HP_use_posi: data.posi_alphabet_size += 1 self.position_embedding = nn.Embedding.from_pretrained( get_sinusoid_encoding_table(data.posi_alphabet_size, self.posi_emb_dim), freeze=True) if self.use_biword: self.biword_embedding = nn.Embedding(data.biword_alphabet.size(), self.biword_emb_dim) self.biword_embedding.weight.data.copy_( torch.from_numpy(data.pretrain_biword_embedding)) self.drop = nn.Dropout(p=data.HP_dropout) self.num_layer = data.HP_num_layer input_dim = self.word_emb_dim if self.use_biword: input_dim += self.biword_emb_dim if self.use_posi: input_dim += self.posi_emb_dim self.cnn_layer0 = nn.Conv1d(input_dim, self.hidden_dim, kernel_size=1, padding=0) self.cnn_layers = [ nn.Conv1d(self.hidden_dim, self.hidden_dim, kernel_size=2, padding=0) for i in range(self.num_layer - 1) ] self.cnn_layers_back = [ nn.Conv1d(self.hidden_dim, self.hidden_dim, kernel_size=2, padding=0) for i in range(self.num_layer - 1) ] self.res_cnn_layers = [ nn.Conv1d(self.hidden_dim, self.hidden_dim, kernel_size=i + 2, padding=0) for i in range(1, self.num_layer - 1) ] self.res_cnn_layers_back = [ nn.Conv1d(self.hidden_dim, self.hidden_dim, kernel_size=i + 2, padding=0) for i in range(1, self.num_layer - 1) ] self.layer_gate = LayerGate(self.hidden_dim, self.gaz_emb_dim) self.global_gate = GlobalGate(self.hidden_dim) self.exper2gate = nn.Linear(self.hidden_dim, self.hidden_dim * 4) self.multiscale_layer = MultiscaleAttention(self.num_layer, data.HP_dropout) self.hidden2tag = nn.Linear(self.hidden_dim, data.label_alphabet_size + 2) self.crf = CRF(data.label_alphabet_size, self.gpu) if self.gpu: self.gaz_embedding = self.gaz_embedding.cuda() self.word_embedding = self.word_embedding.cuda() if self.use_posi: self.position_embedding = self.position_embedding.cuda() if self.use_biword: self.biword_embedding = self.biword_embedding.cuda() self.cnn_layer0 = self.cnn_layer0.cuda() self.multiscale_layer = self.multiscale_layer.cuda() self.hidden2tag = self.hidden2tag.cuda() self.layer_gate = self.layer_gate.cuda() self.global_gate = self.global_gate.cuda() self.crf = self.crf.cuda() for i in range(self.num_layer - 1): self.cnn_layers[i] = self.cnn_layers[i].cuda() self.cnn_layers_back[i] = self.cnn_layers_back[i].cuda() if i >= 1: self.res_cnn_layers[i - 1] = self.res_cnn_layers[i - 1].cuda() self.res_cnn_layers_back[i - 1] = self.res_cnn_layers_back[ i - 1].cuda() def get_tags(self, gaz_list, word_inputs, biword_inputs, layer_gaz, gaz_mask_input, mask): batch_size = word_inputs.size(0) seq_len = word_inputs.size(1) word_embs = self.word_embedding(word_inputs) if self.use_biword: biword_embs = 
self.biword_embedding(biword_inputs) word_embs = torch.cat([word_embs, biword_embs], dim=2) if self.use_posi: posi_inputs = torch.zeros(batch_size, seq_len).long() posi_inputs[:, :] = torch.LongTensor( [i + 1 for i in range(seq_len)]) if self.gpu: posi_inputs = posi_inputs.cuda() position_embs = self.position_embedding(posi_inputs) word_embs = torch.cat([word_embs, position_embs], dim=2) word_inputs_d = self.drop(word_embs) word_inputs_d = word_inputs_d.transpose(2, 1).contiguous() X_pre = self.cnn_layer0( word_inputs_d) #(batch_size,hidden_size,seq_len) X_pre = torch.tanh(X_pre) X_trans = X_pre.transpose(2, 1).contiguous() global_matrix0 = self.global_gate(X_trans) # G0 X_list = X_trans.unsqueeze( 2) #(batch_size,seq_len,num_layer,hidden_size) padding = torch.zeros(batch_size, self.hidden_dim, 1) if self.gpu: padding = padding.cuda() feed_back = None for iteration in range(self.rethink_iter): global_matrix = global_matrix0 X_pre = self.drop(X_pre) X_pre_padding = torch.cat( [X_pre, padding], dim=2) #(batch_size,hidden_size,seq_len+1) X_pre_padding_back = torch.cat([padding, X_pre], dim=2) for layer in range(self.num_layer - 1): X = self.cnn_layers[layer]( X_pre_padding) # X: (batch_size,hidden_size,seq_len) X = torch.tanh(X) X_back = self.cnn_layers_back[layer]( X_pre_padding_back) # X: (batch_size,hidden_size,seq_len) X_back = torch.tanh(X_back) if layer > 0: windowpad = torch.cat([padding for i in range(layer)], dim=2) X_pre_padding_w = torch.cat([X_pre, windowpad, padding], dim=2) X_res = self.res_cnn_layers[layer - 1](X_pre_padding_w) X_res = torch.tanh(X_res) X_pre_padding_w_back = torch.cat( [padding, windowpad, X_pre], dim=2) X_res_back = self.res_cnn_layers_back[layer - 1]( X_pre_padding_w_back) X_res_back = torch.tanh(X_res_back) layer_gaz_back = torch.zeros(batch_size, seq_len).long() if seq_len > layer + 1: layer_gaz_back[:, layer + 1:] = layer_gaz[:, :seq_len - layer - 1, layer] if self.gpu: layer_gaz_back = layer_gaz_back.cuda() gazs_embeds = self.gaz_embedding(layer_gaz[:, :, layer]) gazs_embeds_back = self.gaz_embedding(layer_gaz_back) mask_gaz = (mask == 0).unsqueeze(-1).repeat( 1, 1, self.gaz_emb_dim) gazs_embeds = gazs_embeds.masked_fill(mask_gaz, 0) gazs_embeds_back = gazs_embeds_back.masked_fill(mask_gaz, 0) gazs_embeds = self.drop(gazs_embeds) gazs_embeds_back = self.drop(gazs_embeds_back) if layer > 0: #res X_input = torch.cat([X, X_back, X_res, X_res_back], dim=-1).transpose( 2, 1).contiguous() #(b,4l,h) X, X_back, X_res, X_res_back = self.layer_gate( X_input, gazs_embeds, gazs_embeds_back, global_matrix, exper_input=feed_back, gaz_mask=None) X = X + X_back + X_res + X_res_back else: X_input = torch.cat([X, X_back, X, X_back], dim=-1).transpose( 2, 1).contiguous() #(b,4l,h) X, X_back, _, _ = self.layer_gate(X_input, gazs_embeds, gazs_embeds_back, global_matrix, exper_input=feed_back, gaz_mask=None) X = X + X_back global_matrix = self.global_gate(X, global_matrix) if iteration == self.rethink_iter - 1: X_list = torch.cat([X_list, X.unsqueeze(2)], dim=2) if layer == self.num_layer - 2: feed_back = X X = X.transpose(2, 1).contiguous() X_d = self.drop(X) X_pre_padding = torch.cat([X_d, padding], dim=2) #padding padding_back = torch.cat( [padding for _ in range(min(layer + 2, seq_len + 1))], dim=2) if seq_len > layer + 1: X_pre_padding_back = torch.cat( [padding_back, X_d[:, :, :seq_len - layer - 1]], dim=2) #(b,h,seqlen+1) else: X_pre_padding_back = padding_back X_attention = self.multiscale_layer(X_list) tags = self.hidden2tag(X_attention) #(b,l,t) return tags def 
neg_log_likelihood_loss(self, gaz_list, word_inputs, biword_inputs, word_seq_lengths, layer_gaz, gaz_mask, mask, batch_label): tags = self.get_tags(gaz_list, word_inputs, biword_inputs, layer_gaz, gaz_mask, mask) total_loss = self.crf.neg_log_likelihood_loss(tags, mask, batch_label) scores, tag_seq = self.crf._viterbi_decode(tags, mask) return total_loss, tag_seq def forward(self, gaz_list, word_inputs, biword_inputs, word_seq_lengths, layer_gaz, gaz_mask, mask): tags = self.get_tags(gaz_list, word_inputs, biword_inputs, layer_gaz, gaz_mask, mask) scores, tag_seq = self.crf._viterbi_decode(tags, mask) return tag_seq
class BiLSTM_CRF(nn.Module):

    def __init__(self, data):
        super(BiLSTM_CRF, self).__init__()
        print("build batched lstmcrf...")
        self.gpu = data.HP_gpu
        ## add two more labels for the downstream LSTM layer; each CRF keeps its original label size
        label_size = data.label_alphabet_size
        data.label_alphabet_size += 2
        self.crf = CRF(label_size, self.gpu)
        label_size_ner = data.label_alphabet_size_ner
        data.label_alphabet_size_ner += 2
        self.crf_ner = CRF(label_size_ner, self.gpu)
        label_size_general = data.label_alphabet_size_general
        data.label_alphabet_size_general += 2
        self.crf_general = CRF(label_size_general, self.gpu)
        self.lstm = BiLSTM(data)

    def neg_log_likelihood_loss(self, gaz_list, word_inputs, biword_inputs,
                                word_seq_lengths, char_inputs, char_seq_lengths,
                                char_seq_recover, batch_label, mask):
        outs = self.lstm.get_output_score(gaz_list, word_inputs, biword_inputs,
                                          word_seq_lengths, char_inputs,
                                          char_seq_lengths, char_seq_recover)
        total_loss = self.crf.neg_log_likelihood_loss(outs, mask, batch_label)
        scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        return total_loss, tag_seq

    def neg_log_likelihood_loss_ner(self, gaz_list, word_inputs, biword_inputs,
                                    word_seq_lengths, char_inputs, char_seq_lengths,
                                    char_seq_recover, batch_label, mask):
        outs = self.lstm.get_output_score_ner(gaz_list, word_inputs, biword_inputs,
                                              word_seq_lengths, char_inputs,
                                              char_seq_lengths, char_seq_recover)
        total_loss = self.crf_ner.neg_log_likelihood_loss(outs, mask, batch_label)
        scores, tag_seq = self.crf_ner._viterbi_decode(outs, mask)
        return total_loss, tag_seq

    def neg_log_likelihood_loss_general(self, gaz_list, word_inputs, biword_inputs,
                                        word_seq_lengths, char_inputs, char_seq_lengths,
                                        char_seq_recover, batch_label, mask):
        outs = self.lstm.get_output_score_general(gaz_list, word_inputs, biword_inputs,
                                                  word_seq_lengths, char_inputs,
                                                  char_seq_lengths, char_seq_recover)
        total_loss = self.crf_general.neg_log_likelihood_loss(outs, mask, batch_label)
        scores, tag_seq = self.crf_general._viterbi_decode(outs, mask)
        return total_loss, tag_seq

    def forward(self, is_ner, gaz_list, word_inputs, biword_inputs, word_seq_lengths,
                char_inputs, char_seq_lengths, char_seq_recover, mask):
        if not is_ner:
            outs = self.lstm.get_output_score(gaz_list, word_inputs, biword_inputs,
                                              word_seq_lengths, char_inputs,
                                              char_seq_lengths, char_seq_recover)
            scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        else:
            outs = self.lstm.get_output_score_ner(gaz_list, word_inputs, biword_inputs,
                                                  word_seq_lengths, char_inputs,
                                                  char_seq_lengths, char_seq_recover)
            scores, tag_seq = self.crf_ner._viterbi_decode(outs, mask)
        return tag_seq
class Graph(nn.Module): def __init__(self, data, args): super(Graph, self).__init__() self.gpu = args.use_gpu self.char_emb_dim = args.char_dim self.word_emb_dim = args.word_dim self.hidden_dim = args.hidden_dim self.num_head = args.num_head # 5 10 20 self.head_dim = args.head_dim # 10 20 self.tf_dropout_rate = args.tf_drop_rate self.iters = args.iters self.bmes_dim = 10 self.length_dim = 10 self.max_word_length = 5 self.emb_dropout_rate = args.emb_drop_rate self.cell_dropout_rate = args.cell_drop_rate self.use_crf = args.use_crf self.use_global = args.use_global self.use_edge = args.use_edge self.bidirectional = args.bidirectional self.label_size = args.label_alphabet_size # char embedding self.char_embedding = nn.Embedding(args.char_alphabet_size, self.char_emb_dim) if data.pretrain_char_embedding is not None: self.char_embedding.weight.data.copy_( torch.from_numpy(data.pretrain_char_embedding)) if self.use_edge: # word embedding self.word_embedding = nn.Embedding(args.word_alphabet_size, self.word_emb_dim) if data.pretrain_word_embedding is not None: scale = np.sqrt(3.0 / self.word_emb_dim) data.pretrain_word_embedding[0, :] = np.random.uniform( -scale, scale, [1, self.word_emb_dim]) self.word_embedding.weight.data.copy_( torch.from_numpy(data.pretrain_word_embedding)) # bmes embedding self.bmes_embedding = nn.Embedding(4, self.bmes_dim) """ self.edge_emb_linear = nn.Sequential( nn.Linear(self.word_emb_dim, self.hidden_dim), nn.ELU() ) """ # lstm self.emb_rnn_f = nn.LSTM(self.char_emb_dim, self.hidden_dim, batch_first=True) self.emb_rnn_b = nn.LSTM(self.char_emb_dim, self.hidden_dim, batch_first=True) # length embedding self.length_embedding = nn.Embedding(self.max_word_length, self.length_dim) self.dropout = nn.Dropout(self.emb_dropout_rate) self.norm = nn.LayerNorm(self.hidden_dim) if self.use_edge: # Node aggregation module self.edge2node_f = nn.ModuleList([ MultiHeadAtt(self.hidden_dim, self.hidden_dim * 2 + self.length_dim, nhead=self.num_head, head_dim=self.head_dim, dropout=self.tf_dropout_rate) for _ in range(self.iters) ]) # Edge aggregation module self.node2edge_f = nn.ModuleList([ MultiHeadAtt(self.hidden_dim, self.hidden_dim + self.bmes_dim, nhead=self.num_head, head_dim=self.head_dim, dropout=self.tf_dropout_rate) for _ in range(self.iters) ]) else: # Node aggregation module self.edge2node_f = nn.ModuleList([ MultiHeadAtt(self.hidden_dim, self.hidden_dim + self.length_dim, nhead=self.num_head, head_dim=self.head_dim, dropout=self.tf_dropout_rate) for _ in range(self.iters) ]) if self.use_global: # Global Node aggregation module self.glo_att_f_node = nn.ModuleList([ GloAtt(self.hidden_dim, nhead=self.num_head, head_dim=self.head_dim, dropout=self.tf_dropout_rate) for _ in range(self.iters) ]) if self.use_edge: self.glo_att_f_edge = nn.ModuleList([ GloAtt(self.hidden_dim, nhead=self.num_head, head_dim=self.head_dim, dropout=self.tf_dropout_rate) for _ in range(self.iters) ]) # Updating modules if self.use_edge: self.glo_rnn_f = Global_Cell(self.hidden_dim * 3, self.hidden_dim, dropout=self.cell_dropout_rate) self.node_rnn_f = Nodes_Cell(self.hidden_dim * 5, self.hidden_dim, dropout=self.cell_dropout_rate) self.edge_rnn_f = Edges_Cell(self.hidden_dim * 4, self.hidden_dim, dropout=self.cell_dropout_rate) else: self.glo_rnn_f = Global_Cell(self.hidden_dim * 2, self.hidden_dim, dropout=self.cell_dropout_rate) self.node_rnn_f = Nodes_Cell(self.hidden_dim * 4, self.hidden_dim, dropout=self.cell_dropout_rate) else: # Updating modules self.node_rnn_f = Nodes_Cell(self.hidden_dim * 3, 
self.hidden_dim, use_global=False, dropout=self.cell_dropout_rate) if self.use_edge: self.edge_rnn_f = Edges_Cell(self.hidden_dim * 2, self.hidden_dim, use_global=False, dropout=self.cell_dropout_rate) if self.bidirectional: if self.use_edge: # Node aggregation module self.edge2node_b = nn.ModuleList([ MultiHeadAtt(self.hidden_dim, self.hidden_dim * 2 + self.length_dim, nhead=self.num_head, head_dim=self.head_dim, dropout=self.tf_dropout_rate) for _ in range(self.iters) ]) # Edge aggregation module self.node2edge_b = nn.ModuleList([ MultiHeadAtt(self.hidden_dim, self.hidden_dim + self.bmes_dim, nhead=self.num_head, head_dim=self.head_dim, dropout=self.tf_dropout_rate) for _ in range(self.iters) ]) else: # Node aggregation module self.edge2node_b = nn.ModuleList([ MultiHeadAtt(self.hidden_dim, self.hidden_dim + self.length_dim, nhead=self.num_head, head_dim=self.head_dim, dropout=self.tf_dropout_rate) for _ in range(self.iters) ]) if self.use_global: # Global Node aggregation module self.glo_att_b_node = nn.ModuleList([ GloAtt(self.hidden_dim, nhead=self.num_head, head_dim=self.head_dim, dropout=self.tf_dropout_rate) for _ in range(self.iters) ]) if self.use_edge: self.glo_att_b_edge = nn.ModuleList([ GloAtt(self.hidden_dim, nhead=self.num_head, head_dim=self.head_dim, dropout=self.tf_dropout_rate) for _ in range(self.iters) ]) # Updating modules if self.use_edge: self.glo_rnn_b = Global_Cell(self.hidden_dim * 3, self.hidden_dim, self.cell_dropout_rate) self.node_rnn_b = Nodes_Cell(self.hidden_dim * 5, self.hidden_dim, self.cell_dropout_rate) self.edge_rnn_b = Edges_Cell(self.hidden_dim * 4, self.hidden_dim, self.cell_dropout_rate) else: self.glo_rnn_b = Global_Cell(self.hidden_dim * 2, self.hidden_dim, self.cell_dropout_rate) self.node_rnn_b = Nodes_Cell(self.hidden_dim * 4, self.hidden_dim, self.cell_dropout_rate) else: # Updating modules self.node_rnn_b = Nodes_Cell(self.hidden_dim * 3, self.hidden_dim, use_global=False, dropout=self.cell_dropout_rate) if self.use_edge: self.edge_rnn_b = Edges_Cell( self.hidden_dim * 2, self.hidden_dim, use_global=False, dropout=self.cell_dropout_rate) if self.bidirectional: output_dim = self.hidden_dim * 2 else: output_dim = self.hidden_dim self.layer_att_W = nn.Linear(output_dim, 1) if self.use_crf: self.hidden2tag = nn.Linear(output_dim, self.label_size + 2) self.crf = CRF(self.label_size, self.gpu) else: self.hidden2tag = nn.Linear(output_dim, self.label_size) self.criterion = nn.CrossEntropyLoss() def construct_graph(self, batch_size, seq_len, word_list): if self.cuda: device = 'cuda' else: device = 'cpu' if self.use_edge: unk_index = torch.tensor(0, device=device) unk_emb = self.word_embedding(unk_index) bmes_emb_b = self.bmes_embedding(torch.tensor(0, device=device)) bmes_emb_m = self.bmes_embedding(torch.tensor(1, device=device)) bmes_emb_e = self.bmes_embedding(torch.tensor(2, device=device)) bmes_emb_s = self.bmes_embedding(torch.tensor(3, device=device)) sen_nodes_mask_list = [] sen_words_length_list = [] sen_words_mask_f_list = [] sen_words_mask_b_list = [] sen_word_embed_list = [] sen_bmes_embed_list = [] max_edge_num = -1 for sen in range(batch_size): sen_nodes_mask = torch.zeros([1, seq_len], device=device).byte() sen_words_length = torch.zeros([1, self.length_dim], device=device) sen_words_mask_f = torch.zeros([1, seq_len], device=device).byte() sen_words_mask_b = torch.zeros([1, seq_len], device=device).byte() if self.use_edge: sen_word_embed = unk_emb[None, :] sen_bmes_embed = torch.zeros([1, seq_len, self.bmes_dim], device=device) for 
w in range(seq_len): if w < len(word_list[sen]) and word_list[sen][w]: for word, word_len in zip(word_list[sen][w][0], word_list[sen][w][1]): if word_len <= self.max_word_length: word_length_index = torch.tensor(word_len - 1, device=device) else: word_length_index = torch.tensor( self.max_word_length - 1, device=device) word_length = self.length_embedding(word_length_index) sen_words_length = torch.cat( [sen_words_length, word_length[None, :]], 0) # mask: Masked elements are marked by 1, batch_size * word_num * seq_len nodes_mask = torch.ones([1, seq_len], device=device).byte() words_mask_f = torch.ones([1, seq_len], device=device).byte() words_mask_b = torch.ones([1, seq_len], device=device).byte() words_mask_f[0, w + word_len - 1] = 0 sen_words_mask_f = torch.cat( [sen_words_mask_f, words_mask_f], 0) words_mask_b[0, w] = 0 sen_words_mask_b = torch.cat( [sen_words_mask_b, words_mask_b], 0) if self.use_edge: word_index = torch.tensor(word, device=device) word_embedding = self.word_embedding(word_index) sen_word_embed = torch.cat( [sen_word_embed, word_embedding[None, :]], 0) bmes_embed = torch.zeros( [1, seq_len, self.bmes_dim], device=device) for index in range(word_len): nodes_mask[0, w + index] = 0 if word_len == 1: bmes_embed[0, w + index, :] = bmes_emb_s elif index == 0: bmes_embed[0, w + index, :] = bmes_emb_b elif index == word_len - 1: bmes_embed[0, w + index, :] = bmes_emb_e else: bmes_embed[0, w + index, :] = bmes_emb_m sen_bmes_embed = torch.cat( [sen_bmes_embed, bmes_embed], 0) sen_nodes_mask = torch.cat( [sen_nodes_mask, nodes_mask], 0) if sen_words_mask_f.size(0) > max_edge_num: max_edge_num = sen_words_mask_f.size(0) sen_words_mask_f_list.append(sen_words_mask_f.unsqueeze_(0)) sen_words_mask_b_list.append(sen_words_mask_b.unsqueeze_(0)) sen_words_length_list.append(sen_words_length.unsqueeze_(0)) if self.use_edge: sen_nodes_mask_list.append(sen_nodes_mask.unsqueeze_(0)) sen_word_embed_list.append(sen_word_embed.unsqueeze_(0)) sen_bmes_embed_list.append(sen_bmes_embed.unsqueeze_(0)) edges_mask = torch.zeros([batch_size, max_edge_num], device=device) batch_words_mask_f = torch.ones([batch_size, max_edge_num, seq_len], device=device).byte() batch_words_mask_b = torch.ones([batch_size, max_edge_num, seq_len], device=device).byte() batch_words_length = torch.zeros( [batch_size, max_edge_num, self.length_dim], device=device) if self.use_edge: batch_nodes_mask = torch.zeros([batch_size, max_edge_num, seq_len], device=device).byte() batch_word_embed = torch.zeros( [batch_size, max_edge_num, self.word_emb_dim], device=device) batch_bmes_embed = torch.zeros( [batch_size, max_edge_num, seq_len, self.bmes_dim], device=device) else: batch_word_embed = None batch_bmes_embed = None batch_nodes_mask = None for index in range(batch_size): curr_edge_num = sen_words_mask_f_list[index].size(1) edges_mask[index, 0:curr_edge_num] = 1. 
batch_words_mask_f[ index, 0:curr_edge_num, :] = sen_words_mask_f_list[index] batch_words_mask_b[ index, 0:curr_edge_num, :] = sen_words_mask_b_list[index] batch_words_length[ index, 0:curr_edge_num, :] = sen_words_length_list[index] if self.use_edge: batch_nodes_mask[ index, 0:curr_edge_num, :] = sen_nodes_mask_list[index] batch_word_embed[ index, 0:curr_edge_num, :] = sen_word_embed_list[index] batch_bmes_embed[ index, 0:curr_edge_num, :, :] = sen_bmes_embed_list[index] return batch_word_embed, batch_bmes_embed, batch_nodes_mask, batch_words_mask_f, \ batch_words_mask_b, batch_words_length, edges_mask def update_graph(self, word_list, word_inputs, mask): mask = mask.float() node_embeds = self.char_embedding( word_inputs) # batch_size, max_seq_len, embedding B, L, _ = node_embeds.size() edge_embs, bmes_embs, nodes_mask, words_mask_f, words_mask_b, words_length, edges_mask = \ self.construct_graph(B, L, word_list) node_embeds = self.dropout(node_embeds) _, N, _ = words_mask_f.size() if self.use_edge: edge_embs = self.dropout(edge_embs) # forward direction digraph nodes_f, _ = self.emb_rnn_f(node_embeds) nodes_f = nodes_f * mask.unsqueeze(2) nodes_f_cat = nodes_f[:, None, :, :] _, _, H = nodes_f.size() if self.use_edge: edges_f = edge_embs * edges_mask.unsqueeze(2) edges_f_cat = edges_f[:, None, :, :] if self.use_global: glo_f = edges_f.sum(1, keepdim=True) / edges_mask.sum(1, keepdim=True).unsqueeze_(2) + \ nodes_f.sum(1, keepdim=True) / mask.sum(1, keepdim=True).unsqueeze_(2) glo_f_cat = glo_f[:, None, :, :] else: if self.use_global: glo_f = (nodes_f * mask.unsqueeze(2)).sum( 1, keepdim=True) / mask.sum(1, keepdim=True).unsqueeze_(2) glo_f_cat = glo_f[:, None, :, :] for i in range(self.iters): # Attention-based aggregation if self.use_edge and N > 1: bmes_nodes_f = torch.cat([ nodes_f.unsqueeze(2).expand(B, L, N, H), bmes_embs.transpose(1, 2) ], -1) edges_att_f = self.node2edge_f[i](edges_f, bmes_nodes_f, nodes_mask.transpose(1, 2)) nodes_begin_f = torch.sum( nodes_f[:, None, :, :] * (1 - words_mask_b)[:, :, :, None].float(), 2) nodes_begin_f = torch.cat([ torch.zeros([B, 1, H], device=nodes_f.device), nodes_begin_f[:, 1:N, :] ], 1) if self.use_edge: nodes_att_f = self.edge2node_f[i]( nodes_f, torch.cat([edges_f, nodes_begin_f, words_length], -1).unsqueeze(2), words_mask_f) if self.use_global: glo_att_f = torch.cat([ self.glo_att_f_node[i](glo_f, nodes_f, (1 - mask).byte()), self.glo_att_f_edge[i](glo_f, edges_f, (1 - edges_mask).byte()) ], -1) else: nodes_att_f = self.edge2node_f[i]( nodes_f, torch.cat([nodes_begin_f, words_length], -1).unsqueeze(2), words_mask_f) if self.use_global: glo_att_f = self.glo_att_f_node[i](glo_f, nodes_f, (1 - mask).byte()) # RNN-based update if self.use_edge and N > 1: if self.use_global: edges_f = torch.cat([ edges_f[:, 0:1, :], self.edge_rnn_f(edges_f[:, 1:N, :], edges_att_f[:, 1:N, :], glo_att_f.expand(B, N - 1, H * 2)) ], 1) else: edges_f = torch.cat([ edges_f[:, 0:1, :], self.edge_rnn_f(edges_f[:, 1:N, :], edges_att_f[:, 1:N, :]) ], 1) edges_f_cat = torch.cat([edges_f_cat, edges_f[:, None, :, :]], 1) edges_f = torch.cat([ edges_f[:, 0:1, :], self.norm(torch.sum(edges_f_cat[:, :, 1:N, :], 1)) ], 1) nodes_f_r = torch.cat([ torch.zeros([B, 1, self.hidden_dim], device=nodes_f.device), nodes_f[:, 0:(L - 1), :] ], 1) if self.use_global: nodes_f = self.node_rnn_f(nodes_f, nodes_f_r, nodes_att_f, glo_att_f.expand(B, L, -1)) else: nodes_f = self.node_rnn_f(nodes_f, nodes_f_r, nodes_att_f) nodes_f_cat = torch.cat([nodes_f_cat, nodes_f[:, None, :, :]], 1) nodes_f 
= self.norm(torch.sum(nodes_f_cat, 1)) if self.use_global: glo_f = self.glo_rnn_f(glo_f, glo_att_f) glo_f_cat = torch.cat([glo_f_cat, glo_f[:, None, :, :]], 1) glo_f = self.norm(torch.sum(glo_f_cat, 1)) nodes_cat = nodes_f_cat # backward direction digraph if self.bidirectional: nodes_b, _ = self.emb_rnn_b(torch.flip(node_embeds, [1])) nodes_b = torch.flip(nodes_b, [1]) nodes_b = nodes_b * mask.unsqueeze(2) nodes_b_cat = nodes_b[:, None, :, :] if self.use_edge: edges_b = edge_embs * edges_mask.unsqueeze(2) edges_b_cat = edges_b[:, None, :, :] if self.use_global: glo_b = edges_b.sum(1, keepdim=True) / edges_mask.sum(1, keepdim=True).unsqueeze_(2) + \ nodes_b.sum(1, keepdim=True) / mask.sum(1, keepdim=True).unsqueeze_(2) glo_b_cat = glo_b[:, None, :, :] else: if self.use_global: glo_b = nodes_b.sum(1, keepdim=True) / mask.sum( 1, keepdim=True).unsqueeze_(2) glo_b_cat = glo_b[:, None, :, :] for i in range(self.iters): # Attention-based aggregation if self.use_edge and N > 1: bmes_nodes_b = torch.cat([ nodes_b.unsqueeze(2).expand(B, L, N, H), bmes_embs.transpose(1, 2) ], -1) edges_att_b = self.node2edge_b[i](edges_b, bmes_nodes_b, nodes_mask.transpose( 1, 2)) nodes_begin_b = torch.sum( nodes_b[:, None, :, :] * (1 - words_mask_f)[:, :, :, None].float(), 2) nodes_begin_b = torch.cat([ torch.zeros([B, 1, H], device=nodes_b.device), nodes_begin_b[:, 1:N, :] ], 1) if self.use_edge: nodes_att_b = self.edge2node_b[i]( nodes_b, torch.cat([edges_b, nodes_begin_b, words_length], -1).unsqueeze(2), words_mask_b) if self.use_global: glo_att_b = torch.cat([ self.glo_att_b_node[i](glo_b, nodes_b, (1 - mask).byte()), self.glo_att_b_edge[i](glo_b, edges_b, (1 - edges_mask).byte()) ], -1) else: nodes_att_b = self.edge2node_b[i]( nodes_b, torch.cat([nodes_begin_b, words_length], -1).unsqueeze(2), words_mask_b) if self.use_global: glo_att_b = self.glo_att_b_node[i](glo_b, nodes_b, (1 - mask).byte()) # RNN-based update if self.use_edge and N > 1: if self.use_global: edges_b = torch.cat([ edges_b[:, 0:1, :], self.edge_rnn_b(edges_b[:, 1:N, :], edges_att_b[:, 1:N, :], glo_att_b.expand(B, N - 1, H * 2)) ], 1) else: edges_b = torch.cat([ edges_b[:, 0:1, :], self.edge_rnn_b(edges_b[:, 1:N, :], edges_att_b[:, 1:N, :]) ], 1) edges_b_cat = torch.cat( [edges_b_cat, edges_b[:, None, :, :]], 1) edges_b = torch.cat([ edges_b[:, 0:1, :], self.norm(torch.sum(edges_b_cat[:, :, 1:N, :], 1)) ], 1) nodes_b_r = torch.cat([ nodes_b[:, 1:L, :], torch.zeros([B, 1, self.hidden_dim], device=nodes_b.device) ], 1) if self.use_global: nodes_b = self.node_rnn_b(nodes_b, nodes_b_r, nodes_att_b, glo_att_b.expand(B, L, -1)) else: nodes_b = self.node_rnn_b(nodes_b, nodes_b_r, nodes_att_b) nodes_b_cat = torch.cat([nodes_b_cat, nodes_b[:, None, :, :]], 1) nodes_b = self.norm(torch.sum(nodes_b_cat, 1)) if self.use_global: glo_b = self.glo_rnn_b(glo_b, glo_att_b) glo_b_cat = torch.cat([glo_b_cat, glo_b[:, None, :, :]], 1) glo_b = self.norm(torch.sum(glo_b_cat, 1)) nodes_cat = torch.cat([nodes_f_cat, nodes_b_cat], -1) layer_att = torch.sigmoid(self.layer_att_W(nodes_cat)) layer_alpha = F.softmax(layer_att, 1) nodes = torch.sum(layer_alpha * nodes_cat, 1) tags = self.hidden2tag(nodes) return tags def forward(self, word_list, batch_inputs, mask, batch_label=None): tags = self.update_graph(word_list, batch_inputs, mask) if batch_label is not None: if self.use_crf: total_loss = self.crf.neg_log_likelihood_loss( tags, mask, batch_label) else: total_loss = self.criterion(tags.view(-1, self.label_size), batch_label.view(-1)) else: total_loss = None if 
self.use_crf: _, tag_seq = self.crf._viterbi_decode(tags, mask) else: tag_seq = tags.argmax(-1) return total_loss, tag_seq
class SeqModel(nn.Module):

    def __init__(
        self,
        config_dic: dict,
        word_vocab_dim: int,
        char_vocab_dim: int,
        sw_vocab_dim_list: List[int],
        label_vocab_dim: int,
        pretrain_word_embedding: np.ndarray,
    ):
        super().__init__()
        self.gpu = config_dic.get("gpu")
        self.label_vocab_dim = label_vocab_dim
        self.word_lstm = WordLSTM(config_dic, word_vocab_dim, char_vocab_dim,
                                  sw_vocab_dim_list, pretrain_word_embedding,
                                  config_dic.get("use_modality_attention"),
                                  config_dic.get("ner_dropout"))
        self.hidden2tag = nn.Linear(config_dic.get("word_hidden_dim"),
                                    self.label_vocab_dim + 2)  # for START and END tags
        self.crf = CRF(self.label_vocab_dim, self.gpu)
        if self.gpu:
            self.word_lstm.cuda()
            self.hidden2tag.cuda()

    def neg_log_likelihood_loss(self, word_features, char_features, sw_features_list,
                                label_features, size_average=True):
        self.zero_grad()
        mask = word_features.get("masks")
        lstm_out = self.word_lstm(word_features, char_features, sw_features_list)
        out = self.hidden2tag(lstm_out)
        total_loss = self.crf.neg_log_likelihood_loss(
            out, mask, label_features.get("label_ids"))
        # the ordering of out and mask appears to be consistent here
        _, tag_seq = self.crf._viterbi_decode(out, mask)
        # if size_average:
        #     total_loss = total_loss / (torch.sum(mask) / out.shape[0])
        return total_loss, tag_seq

    def forward(self, word_features, char_features, sw_features_list):
        self.zero_grad()
        mask = word_features.get("masks")
        lstm_out = self.word_lstm(word_features, char_features, sw_features_list)
        out = self.hidden2tag(lstm_out)
        _, tag_seq = self.crf._viterbi_decode(out, mask)
        return tag_seq

    def load_expanded_state_dict(self, lm_state_dict):
        """The vocab sizes differ between the LM and NER stages, so weights are
        expanded with random values before loading."""
        expanded_state_dict = self.state_dict()
        for lm_key, lm_value in lm_state_dict.items():
            if lm_key in expanded_state_dict.keys():
                if expanded_state_dict.get(lm_key).shape == lm_value.shape:
                    expanded_state_dict[lm_key] = lm_value
                else:
                    expanded_state_dict[lm_key] = expand_weight(
                        lm_value, expanded_state_dict.get(lm_key).shape, self.gpu)
        self.load_state_dict(expanded_state_dict)
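# expand_weight is not shown above. A minimal, self-contained sketch of what such an
# expansion might look like (an assumption, not the original helper): keep the rows
# that already exist and fill the newly added vocabulary rows with small random values.
import torch

def expand_weight_sketch(old_weight, new_shape):
    # old_weight: tensor of shape (old_vocab, dim); new_shape: (new_vocab, dim), new_vocab >= old_vocab
    new_weight = torch.randn(new_shape) * 0.01        # random init for the extra rows
    new_weight[:old_weight.shape[0], :] = old_weight  # copy over the pretrained rows
    return new_weight

# Example: grow an embedding matrix from 5 to 8 vocabulary entries.
grown = expand_weight_sketch(torch.ones(5, 4), (8, 4))
print(grown.shape)  # torch.Size([8, 4])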
class CnnAttnLstmCRF(nn.Module):

    def __init__(self, data, model_config):
        super(CnnAttnLstmCRF, self).__init__()
        if model_config['random_embedding']:
            self.char_embeddings = nn.Embedding(data.char_alphabet_size,
                                                model_config['char_emb_dim'])
            self.char_embeddings.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(data.char_alphabet_size,
                                          model_config['char_emb_dim'])))
            self.char_drop = nn.Dropout(model_config['dropout'])
        else:
            char_emb_path = model_config['char_emb_file']
            self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(
                char_emb_path, data.char_alphabet)
            self.char_embeddings = nn.Embedding(data.char_alphabet_size,
                                                model_config['char_emb_dim'])
            self.char_embeddings.weight.data.copy_(
                torch.from_numpy(self.pretrain_char_embedding))
            self.char_drop = nn.Dropout(model_config['dropout'])
        self.word_embeddings = nn.Embedding(data.word_alphabet_size,
                                            model_config['word_emb_dim'])
        self.word_embeddings.weight.data.copy_(
            torch.from_numpy(
                self.random_embedding(data.word_alphabet_size,
                                      model_config['word_emb_dim'])))
        self.intent_embeddings = nn.Embedding(data.intent_alphabet_size,
                                              model_config['intent_emb_dim'])
        self.intent_embeddings.weight.data.copy_(
            torch.from_numpy(
                self.random_embedding(data.intent_alphabet_size,
                                      model_config['intent_emb_dim'])))
        self.lexi_embeddings = nn.Embedding(data.lexicon_alphabet_size,
                                            model_config['lexi_emb_dim'])
        self.lexi_embeddings.weight.data.copy_(
            torch.from_numpy(
                self.random_embedding(data.lexicon_alphabet_size,
                                      model_config['lexi_emb_dim'])))
        self.word_drop = nn.Dropout(model_config['dropout'])
        self.char_cnn = nn.Conv1d(in_channels=model_config['char_emb_dim'],
                                  out_channels=model_config['cnn_hidden_dim'],
                                  kernel_size=3,
                                  padding=1)
        self.lstm = nn.LSTM(model_config['cnn_hidden_dim'] + model_config['intent_emb_dim'],
                            model_config['lstm_hidden_dim'] // 2,
                            num_layers=model_config['num_layers'],
                            batch_first=model_config['batch_first'],
                            bidirectional=model_config['bidirectional'])
        self.num_layers = model_config['num_layers']
        self.hidden_size = model_config['lstm_hidden_dim'] // 2
        self.drop_lstm = nn.Dropout(model_config['dropout'])
        self.hidden2tag = nn.Linear(model_config['lstm_hidden_dim'],
                                    data.label_alphabet_size + 2)
        self.crf = CRF(data.label_alphabet_size, model_config['gpu'])
        # scaling denominator for multi-head attention, set to sqrt(d_k)
        temperature = np.power(model_config['char_emb_dim'], 0.5)
        self.attention = ScaledDotProductAttention(temperature)
        self.device = model_config['device']

    def forward(self, batch_char, batch_word, batch_intent, batch_lexicon,
                batch_char_len, mask, batch_lexicon_indices, batch_word_indices,
                batch_label=None):
        # char
        char_embeds = self.char_drop(self.char_embeddings(batch_char)).transpose(1, 2)
        char_cnn_out = self.char_cnn(char_embeds).transpose(1, 2)
        intent_embeds = self.intent_embeddings(batch_intent)
        char_intent_embeds = torch.repeat_interleave(intent_embeds, batch_char.size(1), dim=1)
        char_features = torch.cat([char_cnn_out, char_intent_embeds], 2)
        # word
        word_embeds = self.word_embeddings(batch_word)
        lexi_embeds = self.lexi_embeddings(batch_lexicon)
        batch_lexicon_indices, batch_word_indices = batch_lexicon_indices.unsqueeze(-1), batch_word_indices.unsqueeze(-1)
        # replace embedding
        word_features = word_embeds * batch_word_indices + lexi_embeds * batch_lexicon_indices
        # first argument: source sequence; second argument: target sequence
        attn_mask = get_attn_key_pad_mask(batch_word, batch_char)
        q = char_features  # (b, 32, 400)
        k = word_features  # (b, 32, 400)
        v = word_features  # (b, 32, 400)
        attn_output, _ = self.attention(q, k, v, attn_mask)
        # since the padding length is fixed, dynamic RNN (packing) is not used here
        h0 = torch.zeros(self.num_layers * 2, batch_char.size(0), self.hidden_size).to(self.device)
        c0 = torch.zeros(self.num_layers * 2, batch_char.size(0), self.hidden_size).to(self.device)
        lstm_out, _ = self.lstm(attn_output, (h0, c0))
        # fc
        outputs = self.hidden2tag(lstm_out)
        # crf
        if batch_label is not None:
            total_loss = self.crf.neg_log_likelihood_loss(outputs, mask, batch_label)
            _, tag_seq = self.crf._viterbi_decode(outputs, mask)
            return total_loss, tag_seq
        else:
            _, tag_seq = self.crf._viterbi_decode(outputs, mask)
            return tag_seq

    @staticmethod
    def random_embedding(vocab_size, embedding_dim):
        pretrain_emb = np.empty([vocab_size, embedding_dim])
        scale = np.sqrt(3.0 / embedding_dim)
        for index in range(vocab_size):
            pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embedding_dim])
        return pretrain_emb
class Elmo_SeqLabel(nn.Module):

    def __init__(self, data):
        super(Elmo_SeqLabel, self).__init__()
        self.use_crf = data.use_crf
        print("build elmo sequence labeling network...")
        print("use crf: ", self.use_crf)
        self.gpu = data.HP_gpu
        self.average_batch = data.average_batch_loss
        ## add two more labels for the downstream LSTM layer; the CRF keeps the original label size
        label_size = data.label_alphabet_size
        data.label_alphabet_size += 2
        self.word_hidden = Elmo(data.elmo_options_file, data.elmo_weight_file, 1,
                                requires_grad=data.elmo_tune,
                                dropout=data.elmo_dropout)
        with open(data.elmo_options_file, 'r') as fin:
            self._options = json.load(fin)
        self.hidden2tag = nn.Linear(self._options['lstm']['projection_dim'] * 2,
                                    data.label_alphabet_size)
        if self.use_crf:
            self.crf = CRF(label_size, self.gpu)
        if self.gpu >= 0 and torch.cuda.is_available():
            self.word_hidden = self.word_hidden.cuda(self.gpu)
            self.hidden2tag = self.hidden2tag.cuda(self.gpu)

    def calculate_loss(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs,
                       char_seq_lengths, char_seq_recover, batch_label, mask):
        elmo_outputs = self.word_hidden(char_inputs)
        outs = elmo_outputs['elmo_representations'][0]
        # mask = elmo_outputs['mask']
        batch_size = char_inputs.size(0)
        seq_len = char_inputs.size(1)
        outs = self.hidden2tag(outs)
        if self.use_crf:
            total_loss = self.crf.neg_log_likelihood_loss(outs, mask, batch_label)
            scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        else:
            loss_function = nn.NLLLoss(ignore_index=0, size_average=False)
            outs = outs.view(batch_size * seq_len, -1)
            score = F.log_softmax(outs, 1)
            total_loss = loss_function(score, batch_label.view(batch_size * seq_len))
            _, tag_seq = torch.max(score, 1)
            tag_seq = tag_seq.view(batch_size, seq_len)
        if self.average_batch:
            total_loss = total_loss / batch_size
        return total_loss, tag_seq

    def forward(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs,
                char_seq_lengths, char_seq_recover, mask):
        elmo_outputs = self.word_hidden(char_inputs)
        outs = elmo_outputs['elmo_representations'][0]
        # mask = elmo_outputs['mask']
        batch_size = char_inputs.size(0)
        seq_len = char_inputs.size(1)
        outs = self.hidden2tag(outs)
        if self.use_crf:
            scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        else:
            outs = outs.view(batch_size * seq_len, -1)
            _, tag_seq = torch.max(outs, 1)
            tag_seq = tag_seq.view(batch_size, seq_len)
            ## zero out padded positions
            tag_seq = mask.long() * tag_seq
        return tag_seq

    def decode_nbest(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs,
                     char_seq_lengths, char_seq_recover, mask, nbest):
        if not self.use_crf:
            print("Nbest output is currently supported only for CRF! Exit...")
            exit(0)
        elmo_outputs = self.word_hidden(char_inputs)
        outs = elmo_outputs['elmo_representations'][0]
        # mask = elmo_outputs['mask']
        outs = self.hidden2tag(outs)
        scores, tag_seq = self.crf._viterbi_decode_nbest(outs, mask, nbest)
        return scores, tag_seq
class SeqModel(nn.Module): def __init__(self, data): super(SeqModel, self).__init__() self.use_crf = data.use_crf self.gpu = data.HP_gpu self.average_batch = data.average_batch_loss ## add two more labels for downlayer lstm, use original label size for CRF label_size = data.label_alphabet_size # data.label_alphabet_size += 2 # self.word_hidden = WordSequence(data, False, True, data.use_char) # The linear layer that maps from hidden state space to tag space self.hidden2tag = nn.Linear(data.HP_hidden_dim, label_size + 2) if self.use_crf: self.crf = CRF(label_size, self.gpu) if torch.cuda.is_available(): self.hidden2tag = self.hidden2tag.cuda(self.gpu) self.frozen = False def neg_log_likelihood_loss(self, hidden, batch_label, mask): outs = self.hidden2tag(hidden) batch_size = hidden.size(0) seq_len = hidden.size(1) if self.use_crf: total_loss = self.crf.neg_log_likelihood_loss( outs, mask, batch_label) scores, tag_seq = self.crf._viterbi_decode(outs, mask) else: loss_function = nn.NLLLoss(ignore_index=0, size_average=False) outs = outs.view(batch_size * seq_len, -1) score = F.log_softmax(outs, 1) total_loss = loss_function(score, batch_label.view(batch_size * seq_len)) _, tag_seq = torch.max(score, 1) tag_seq = tag_seq.view(batch_size, seq_len) if self.average_batch: total_loss = total_loss / batch_size return total_loss, tag_seq def forward(self, hidden, mask): outs = self.hidden2tag(hidden) batch_size = hidden.size(0) seq_len = hidden.size(1) if self.use_crf: scores, tag_seq = self.crf._viterbi_decode(outs, mask) else: outs = outs.view(batch_size * seq_len, -1) _, tag_seq = torch.max(outs, 1) tag_seq = tag_seq.view(batch_size, seq_len) ## filter padded positions with zero tag_seq = mask.long() * tag_seq return tag_seq def decode_nbest(self, hidden, mask, nbest): if not self.use_crf: print("Nbest output is currently supported only for CRF! Exit...") exit(0) outs = self.hidden2tag(hidden) batch_size = hidden.size(0) seq_len = hidden.size(1) scores, tag_seq = self.crf._viterbi_decode_nbest(outs, mask, nbest) return scores, tag_seq
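# A minimal standalone sketch (toy shapes, not from the original classes) of the non-CRF branch
# used above when use_crf is False: token-level log-softmax plus NLLLoss with ignore_index=0,
# followed by an argmax decode that is zeroed at padded positions. reduction='sum' stands in for
# the deprecated size_average=False used in the source.
import torch
import torch.nn as nn
import torch.nn.functional as F

batch_size, seq_len, num_tags = 2, 5, 7
outs = torch.randn(batch_size, seq_len, num_tags)            # emission scores from hidden2tag
batch_label = torch.randint(1, num_tags, (batch_size, seq_len))
mask = torch.ones(batch_size, seq_len, dtype=torch.bool)
mask[1, 3:] = False                                          # pretend the second sentence is shorter
batch_label = batch_label * mask.long()                      # padded positions get label 0

loss_function = nn.NLLLoss(ignore_index=0, reduction='sum')
score = F.log_softmax(outs.view(batch_size * seq_len, -1), dim=1)
total_loss = loss_function(score, batch_label.view(-1))
_, tag_seq = torch.max(score, dim=1)
tag_seq = tag_seq.view(batch_size, seq_len) * mask.long()    # filter padded positions with zero
print(total_loss.item(), tag_seq.shape)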
class Joint_Bert_BiLSTM_CRF(nn.Module): def __init__(self, labelIndex, GPU): super(Joint_Bert_BiLSTM_CRF, self).__init__() print("build batched Joint_Bert_BiLSTM_CRF...") dropout=0.3 self.alpha=0.2 self.embedding_dim = 768 self.hidden_dim = 300 self.attention=attention.BertAttention() self.drop = nn.Dropout(dropout) self.droplstm = nn.Dropout(dropout) # declare the LSTM self.bilstm_flag = True self.lstm_layer = 1 if self.bilstm_flag: lstm_hidden = self.hidden_dim // 2 # integer division else: lstm_hidden = self.hidden_dim self.lstm = nn.LSTM(self.embedding_dim, lstm_hidden, num_layers=self.lstm_layer, batch_first=True, bidirectional=self.bilstm_flag) # declare the CRF self.index2label = {} # invert the labelIndex mapping (index -> label) for ele in labelIndex: self.index2label[labelIndex[ele]] = ele self.hidden2tag = nn.Linear(self.hidden_dim, len(self.index2label)+2) self.crf = CRF(len(self.index2label), GPU) # move the model onto the GPU self.gpu = GPU if self.gpu: self.attention=self.attention.cuda() self.drop = self.drop.cuda() self.droplstm = self.droplstm.cuda() self.hidden2tag = self.hidden2tag.cuda() self.lstm = self.lstm.cuda() def _get_lstm_features(self, batch_word, batch_knowledge, knowledge_mask, batch_wordlen, dynanmic_meta_embedding): batch_size,max_seq_length,max_entity_num= knowledge_mask.size() batch_wordlen=batch_wordlen.cpu() if dynanmic_meta_embedding: batch_knowledge=batch_knowledge.view(batch_size*max_seq_length, max_entity_num, 768) knowledge_mask=knowledge_mask.view(batch_size*max_seq_length, max_entity_num) batch_knowledge=self.attention(batch_knowledge,knowledge_mask) batch_knowledge=batch_knowledge.view(batch_size,max_seq_length,max_entity_num,768)[:,:,-1,:] else: batch_knowledge=batch_knowledge.sum(axis=2) merged_batch_word=self.alpha*batch_word+(1-self.alpha)*batch_knowledge embeds_pack = pack_padded_sequence(merged_batch_word, batch_wordlen, batch_first=True) # LSTM output out_packed, (h, c) = self.lstm(embeds_pack) lstm_feature, _ = pad_packed_sequence(out_packed, batch_first=True) # lstm_feature: ([batch_size, max_word_length, HP_hidden_dim]) lstm_feature = self.droplstm(lstm_feature) lstm_feature = self.hidden2tag(lstm_feature) # lstm_feature: ([batch_size, max_word_length, len(self.index2label)+2]) return lstm_feature def neg_log_likelihood(self, batch_word, batch_knowledge, mask, knowledge_mask, batch_label, batch_wordlen, dynanmic_meta_embedding): lstm_feature = self._get_lstm_features(batch_word, batch_knowledge, knowledge_mask, batch_wordlen, dynanmic_meta_embedding) total_loss = self.crf.neg_log_likelihood_loss(lstm_feature, mask, batch_label) scores, tag_seq = self.crf._viterbi_decode(lstm_feature, mask) return total_loss, tag_seq def forward(self, batch_word, batch_knowledge, mask, knowledge_mask, batch_label, batch_wordlen, dynanmic_meta_embedding): lstm_feature = self._get_lstm_features(batch_word, batch_knowledge, knowledge_mask, batch_wordlen, dynanmic_meta_embedding) scores, best_path = self.crf._viterbi_decode(lstm_feature, mask) return best_path
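# A toy sketch (invented shapes, not from the repo) of the embedding fusion in
# _get_lstm_features above: per-token knowledge embeddings are reduced to one vector (here by
# summing, i.e. the non-attention branch) and blended with the BERT token embedding using a
# fixed alpha, before the merged sequence is fed to the BiLSTM.
import torch

batch, seq_len, max_entity_num, dim = 2, 4, 3, 768
alpha = 0.2
batch_word = torch.randn(batch, seq_len, dim)                      # BERT token embeddings
batch_knowledge = torch.randn(batch, seq_len, max_entity_num, dim) # knowledge vectors per token

knowledge = batch_knowledge.sum(dim=2)                             # (batch, seq_len, dim)
merged = alpha * batch_word + (1 - alpha) * knowledge              # (batch, seq_len, dim)
print(merged.shape)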
class BERT_LSTM_CRF(nn.Module): def __init__(self, bert_config, tagset_size, embedding_dim, hidden_dim, rnn_layers, dropout_ratio, dropout1, use_cuda): super(BERT_LSTM_CRF, self).__init__() self.embedding_dim = embedding_dim self.hidden_dim = hidden_dim self.word_embeds = BertModel.from_pretrained(bert_config) self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=rnn_layers, bidirectional=True, dropout=dropout_ratio, batch_first=True) self.rnn_layers = rnn_layers self.dropout1 = nn.Dropout(p=dropout1) self.crf = CRF(target_size=tagset_size, average_batch=True, use_cuda=use_cuda) self.embed_drop = nn.Dropout(0.5) self.lin = nn.Linear(2 * hidden_dim, hidden_dim) self.liner = nn.Linear(hidden_dim, tagset_size+2) self.tagset_size = tagset_size self.use_cuda = use_cuda def rand_init_hidden(self, batch_size): if self.use_cuda: return Variable( torch.randn(2 * self.rnn_layers, batch_size, self.hidden_dim)).cuda(), Variable( torch.randn(2 * self.rnn_layers, batch_size, self.hidden_dim)).cuda() else: return Variable( torch.randn(2 * self.rnn_layers, batch_size, self.hidden_dim)), Variable( torch.randn(2 * self.rnn_layers, batch_size, self.hidden_dim)) def get_output_score(self, sentence, attention_mask=None): batch_size = sentence.size(0) seq_length = sentence.size(1) embeds, _ = self.word_embeds(sentence, attention_mask=attention_mask, output_all_encoded_layers=False) embed = self.embed_drop(embeds) hidden = self.rand_init_hidden(batch_size) # if embeds.is_cuda: # hidden = (i.cuda() for i in hidden) lstm_out, hidden = self.lstm(embed, hidden) lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim * 2) d_lstm_out = self.dropout1(lstm_out) lin_out = self.lin(d_lstm_out) l_lstm_out = self.dropout1(lin_out) l_out = self.liner(l_lstm_out) lstm_feats = l_out.contiguous().view(batch_size, seq_length, -1) return lstm_feats def forward(self, sentence, masks): lstm_feats = self.get_output_score(sentence) scores, tag_seq = self.crf._viterbi_decode(lstm_feats, masks.byte()) return tag_seq def neg_log_likelihood_loss(self, sentence, mask, tags): lstm_feats = self.get_output_score(sentence) loss_value = self.crf.neg_log_likelihood_loss(lstm_feats, mask, tags) batch_size = lstm_feats.size(0) loss_value /= float(batch_size) return loss_value def test(self, crf_scores, lengths, tag2id): """Decode with the Viterbi algorithm.""" start_id = tag2id['<start>'] end_id = tag2id['<eos>'] pad = tag2id['<pad>'] tagset_size = len(tag2id) # crf_scores,_ = self.forward(test_sents_tensor, lengths) # B:batch_size, L:max_len, T:target set size B, L, T = 16, 450, 16 # viterbi[i, j, k] is the max score of sentence i when character j takes tag k viterbi = torch.zeros(B, L, T) # backpointer[i, j, k] is the id of the previous tag when character j of sentence i takes tag k, used for backtracking backpointer = (torch.zeros(B, L, T).long() * end_id) lengths = torch.LongTensor(lengths) # forward recursion for step in range(L): batch_size_t = (lengths > step).sum().item() if step == 0: # the tag before the first character can only be start_id viterbi[:batch_size_t, step, :] = crf_scores[: batch_size_t, step, start_id, :] backpointer[: batch_size_t, step, :] = start_id else: max_scores, prev_tags = torch.max( viterbi[:batch_size_t, step-1, :].unsqueeze(2) + crf_scores[:batch_size_t, step, :, :], # [B, T, T] dim=1 ) viterbi[:batch_size_t, step, :] = max_scores backpointer[:batch_size_t, step, :] = prev_tags # only the backpointer matrix is needed during backtracking backpointer = backpointer.view(B, -1) # [B, L * T] tagids = [] # holds the results tags_t = None for step in range(L-1, 0, -1): batch_size_t = (lengths > step).sum().item() if step == L-1: index = torch.ones(batch_size_t).long() * (step * tagset_size) index += end_id 
else: prev_batch_size_t = len(tags_t) new_in_batch = torch.LongTensor([end_id] * (batch_size_t - prev_batch_size_t)) offset = torch.cat( [tags_t, new_in_batch], dim=0 ) # this offset is actually the tags from the previous step index = torch.ones(batch_size_t).long() * (step * tagset_size) index += offset.long() try: tags_t = backpointer[:batch_size_t].gather( dim=1, index=index.unsqueeze(1).long()) except RuntimeError: import pdb pdb.set_trace() tags_t = tags_t.squeeze(1) tagids.append(tags_t.tolist()) # tagids: a list of length L-1 (L-1 because the end_token is dropped); # each element holds the tags of the batch at that step # fix the order below and reshape to [B, L] tagids = list(zip_longest(*reversed(tagids), fillvalue=pad)) tagids = torch.Tensor(tagids).long() # return the decoded result return tagids
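# A small self-contained Viterbi decode for a single sequence (an illustration only, not the
# batched routine in test() above): dp[k] is the best score of any tag path ending in tag k at
# the current step, and backpointers record the argmax predecessors used for the final backtrace.
import torch

def viterbi_decode(emissions, transitions):
    # emissions: (seq_len, num_tags); transitions[i, j]: score of moving from tag i to tag j
    seq_len, num_tags = emissions.size()
    dp = emissions[0].clone()
    backpointers = []
    for t in range(1, seq_len):
        scores = dp.unsqueeze(1) + transitions + emissions[t].unsqueeze(0)  # (num_tags, num_tags)
        dp, prev = scores.max(dim=0)          # best previous tag for each current tag
        backpointers.append(prev)
    best_score, best_tag = dp.max(dim=0)
    path = [best_tag.item()]
    for prev in reversed(backpointers):       # backtrace from the last step to the first
        path.append(prev[path[-1]].item())
    return best_score.item(), list(reversed(path))

score, path = viterbi_decode(torch.randn(6, 4), torch.randn(4, 4))
print(score, path)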
class BertBiLSTMCRF(BertPreTrainedModel): def __init__(self, config, model_configs): super(BertBiLSTMCRF, self).__init__(config) self.num_labels = config.num_labels self.max_seq_length = model_configs['max_seq_length'] self.bert = BertModel(config) self.use_cuda = model_configs['use_cuda'] and torch.cuda.is_available() self.crf = CRF(target_size=self.num_labels, use_cuda=self.use_cuda, average_batch=False) bert_embedding = config.hidden_size # hidden_dim is the output dimension # the LSTM's hidden_dim matches the hidden_dim used in rand_init_hidden # and is 1/2 of the output-layer hidden_dim self.hidden_dim = config.hidden_size self.rnn_layers = model_configs['rnn_layers'] self.lstm = nn.LSTM( input_size=bert_embedding, # bert embedding hidden_size=self.hidden_dim, num_layers=self.rnn_layers, batch_first=True, # dropout = model_configs['train']['dropout_rate'], bidirectional=True) self.dropout = nn.Dropout(model_configs['dropout_rate']) self.hidden2label = nn.Linear(self.hidden_dim * 2, self.num_labels + 2) self.apply(self.init_weights) def rand_init_hidden(self, batch_size): """ initialize the hidden state; the leading factor is 2 for a bidirectional LSTM, 1 for a unidirectional one """ if self.use_cuda: return (torch.zeros(2 * self.rnn_layers, batch_size, self.hidden_dim).cuda(), torch.zeros(2 * self.rnn_layers, batch_size, self.hidden_dim).cuda()) else: return (torch.zeros(2 * self.rnn_layers, batch_size, self.hidden_dim), torch.zeros(2 * self.rnn_layers, batch_size, self.hidden_dim)) def forward(self, input_ids, segment_ids, input_mask): # outputs = sequence_output, pooled_output, (hidden_states), (attentions) # sequence_output = encoder_outputs[0] # pooled_output = pooler(sequence_output) outputs = self.bert(input_ids=input_ids, position_ids=None, token_type_ids=segment_ids, attention_mask=input_mask, head_mask=None) # bert_embeds: shape = [batch_size, max_seq_length, bert_embedding] bert_embeds = outputs[0] batch_size = input_ids.size(0) hidden = self.rand_init_hidden(batch_size) lstm_output, hidden = self.lstm(bert_embeds, hidden) lstm_output = lstm_output.contiguous().view(-1, self.hidden_dim * 2) # lstm_output = self.dropout(lstm_output) logits = self.hidden2label(lstm_output) return logits.view(-1, self.max_seq_length, self.num_labels + 2) def loss_fn(self, feats, mask, labels): batch_size = feats.size(0) loss_value = self.crf.neg_log_likelihood_loss( feats, mask, labels) / float(batch_size) return loss_value def predict(self, feats, mask): path_score, best_path = self.crf(feats, mask.byte()) return best_path
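# A quick shape check (toy sizes, illustration only) for why rand_init_hidden above builds
# tensors of shape (2 * rnn_layers, batch, hidden_dim) and why hidden2label takes hidden_dim * 2:
# a bidirectional LSTM keeps one hidden state per layer and direction, and concatenates the two
# directions in its output.
import torch
import torch.nn as nn

rnn_layers, batch, seq_len, emb, hidden = 2, 3, 7, 16, 8
lstm = nn.LSTM(emb, hidden, num_layers=rnn_layers, batch_first=True, bidirectional=True)
h0 = torch.zeros(2 * rnn_layers, batch, hidden)
c0 = torch.zeros(2 * rnn_layers, batch, hidden)
out, (hn, cn) = lstm(torch.randn(batch, seq_len, emb), (h0, c0))
print(out.shape)   # torch.Size([3, 7, 16]) -> last dim is hidden * 2
print(hn.shape)    # torch.Size([4, 3, 8])  -> first dim is 2 * rnn_layers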
class GazLSTM(nn.Module): def __init__(self, data): super(GazLSTM, self).__init__() self.gpu = data.HP_gpu self.use_biword = data.use_bigram self.hidden_dim = data.HP_hidden_dim self.gaz_alphabet = data.gaz_alphabet self.gaz_emb_dim = data.gaz_emb_dim self.word_emb_dim = data.word_emb_dim self.biword_emb_dim = data.biword_emb_dim self.use_char = data.HP_use_char self.bilstm_flag = data.HP_bilstm self.lstm_layer = data.HP_lstm_layer self.use_count = data.HP_use_count self.num_layer = data.HP_num_layer self.model_type = data.model_type self.use_bert = data.use_bert # self.use_gazcount = data.use_gazcount # whether to use the dictionary self.use_dictionary = data.use_dictionary self.simi_dic_emb = data.simi_dic_emb self.simi_dic_dim = data.simi_dic_dim scale = np.sqrt(3.0 / self.gaz_emb_dim) data.pretrain_gaz_embedding[0, :] = np.random.uniform( -scale, scale, [1, self.gaz_emb_dim]) if self.use_char: scale = np.sqrt(3.0 / self.word_emb_dim) data.pretrain_word_embedding[0, :] = np.random.uniform( -scale, scale, [1, self.word_emb_dim]) self.gaz_embedding = nn.Embedding(data.gaz_alphabet.size(), self.gaz_emb_dim) # initialize the gaz embedding matrix randomly self.word_embedding = nn.Embedding(data.word_alphabet.size(), self.word_emb_dim) # initialize the word embedding matrix randomly if self.use_biword: self.biword_embedding = nn.Embedding(data.biword_alphabet.size(), self.biword_emb_dim) if data.pretrain_gaz_embedding is not None: self.gaz_embedding.weight.data.copy_( torch.from_numpy(data.pretrain_gaz_embedding) ) # copy data.pretrain_gaz_embedding into gaz_embedding else: self.gaz_embedding.weight.data.copy_( torch.from_numpy( self.random_embedding(data.gaz_alphabet.size(), self.gaz_emb_dim))) if data.pretrain_word_embedding is not None: self.word_embedding.weight.data.copy_( torch.from_numpy(data.pretrain_word_embedding)) else: self.word_embedding.weight.data.copy_( torch.from_numpy( self.random_embedding(data.word_alphabet.size(), self.word_emb_dim))) if self.use_biword: if data.pretrain_biword_embedding is not None: self.biword_embedding.weight.data.copy_( torch.from_numpy(data.pretrain_biword_embedding)) else: self.biword_embedding.weight.data.copy_( torch.from_numpy( self.random_embedding(data.biword_alphabet.size(), self.word_emb_dim))) use_gazcount = True # character feature dimension char_feature_dim = self.word_emb_dim + 4 * self.gaz_emb_dim if self.use_dictionary: if use_gazcount: char_feature_dim += self.simi_dic_dim else: char_feature_dim = self.word_emb_dim #+ self.simi_dic_dim if self.use_biword: char_feature_dim += self.biword_emb_dim if self.use_bert: char_feature_dim = char_feature_dim + 768 ## lstm model if self.model_type == 'lstm': lstm_hidden = self.hidden_dim if self.bilstm_flag: self.hidden_dim *= 2 self.NERmodel = NERmodel(model_type='lstm', input_dim=char_feature_dim, hidden_dim=lstm_hidden, num_layer=self.lstm_layer, biflag=self.bilstm_flag) ## cnn model if self.model_type == 'cnn': self.NERmodel = NERmodel(model_type='cnn', input_dim=char_feature_dim, hidden_dim=self.hidden_dim, num_layer=self.num_layer, dropout=data.HP_dropout, gpu=self.gpu) ## attention model if self.model_type == 'transformer': self.NERmodel = NERmodel(model_type='transformer', input_dim=char_feature_dim, hidden_dim=self.hidden_dim, num_layer=self.num_layer, dropout=data.HP_dropout) self.drop = nn.Dropout(p=data.HP_dropout) # randomly zero elements with probability data.HP_dropout self.hidden2tag = nn.Linear(self.hidden_dim, data.label_alphabet_size + 2) self.crf = CRF(data.label_alphabet_size, self.gpu) if self.use_bert: self.bert_encoder = BertModel.from_pretrained('bert-base-chinese') for p in self.bert_encoder.parameters(): p.requires_grad = False if 
self.gpu: self.gaz_embedding = self.gaz_embedding.cuda() self.word_embedding = self.word_embedding.cuda() if self.use_biword: self.biword_embedding = self.biword_embedding.cuda() self.NERmodel = self.NERmodel.cuda() self.hidden2tag = self.hidden2tag.cuda() self.crf = self.crf.cuda() if self.use_bert: self.bert_encoder = self.bert_encoder.cuda() # function that builds the input embeddings def get_tags(self, gaz_list, word_inputs, biword_inputs, layer_gaz, gaz_count, gaz_chars, gaz_mask_input, gazchar_mask_input, mask, word_seq_lengths, batch_bert, bert_mask, simi_value): use_gazcount = True batch_size = word_inputs.size()[0] seq_len = word_inputs.size()[1] max_gaz_num = layer_gaz.size(-1) gaz_match = [] word_embs = self.word_embedding(word_inputs) if self.use_biword: biword_embs = self.biword_embedding(biword_inputs) word_embs = torch.cat([word_embs, biword_embs], dim=-1) if self.model_type != 'transformer': word_inputs_d = self.drop(word_embs) #(b,l,we) print(type(word_inputs_d)) else: word_inputs_d = word_embs if self.use_char: gazchar_embeds = self.word_embedding(gaz_chars) gazchar_mask = gazchar_mask_input.unsqueeze(-1).repeat( 1, 1, 1, 1, 1, self.word_emb_dim) gazchar_embeds = gazchar_embeds.data.masked_fill_( gazchar_mask.data, 0) #(b,l,4,gl,cl,ce) # gazchar_mask_input:(b,l,4,gl,cl) gaz_charnum = (gazchar_mask_input == 0).sum( dim=-1, keepdim=True).float() #(b,l,4,gl,1) gaz_charnum = gaz_charnum + (gaz_charnum == 0).float() gaz_embeds = gazchar_embeds.sum(-2) / gaz_charnum #(b,l,4,gl,ce) if self.model_type != 'transformer': gaz_embeds = self.drop(gaz_embeds) else: gaz_embeds = gaz_embeds else: #use gaz embedding gaz_embeds = self.gaz_embedding(layer_gaz) if self.model_type != 'transformer': gaz_embeds_d = self.drop(gaz_embeds) else: gaz_embeds_d = gaz_embeds gaz_mask = gaz_mask_input.unsqueeze(-1).repeat( 1, 1, 1, 1, self.gaz_emb_dim) gaz_embeds = gaz_embeds_d.data.masked_fill_( gaz_mask.data, 0) #(b,l,4,g,ge) ge:gaz_embed_dim if self.use_count: count_sum = torch.sum(gaz_count, dim=3, keepdim=True) #(b,l,4,gn) count_sum = torch.sum(count_sum, dim=2, keepdim=True) #(b,l,1,1) weights = gaz_count.div(count_sum) #(b,l,4,g) weights = weights * 4 weights = weights.unsqueeze(-1) gaz_embeds = weights * gaz_embeds #(b,l,4,g,e) gaz_embeds = torch.sum(gaz_embeds, dim=3) #(b,l,4,e) else: gaz_num = (gaz_mask_input == 0).sum( dim=-1, keepdim=True).float() #(b,l,4,1) gaz_embeds = gaz_embeds.sum(-2) / gaz_num #(b,l,4,ge)/(b,l,4,1) gaz_embeds_cat = gaz_embeds.view(batch_size, seq_len, -1) #(b,l,4*ge) print(type(gaz_embeds_cat)) if self.use_dictionary: # concatenate the dictionary similarity vectors simi_embeds = [] for key in simi_value: for value in key: simi = [value for i in range(self.simi_dic_dim)] simi_embeds.append(simi) print(simi_embeds) simi_embeds = torch.Tensor(simi_embeds) simi_embeds = simi_embeds.cuda() print(simi_embeds) simi_embeds_cat = simi_embeds.view(batch_size, seq_len, -1) self.simi_dic_emb = simi_embeds if use_gazcount: word_input_cat = torch.cat( [word_inputs_d, gaz_embeds_cat, simi_embeds_cat], dim=-1) else: #word_input_cat = torch.cat([word_inputs_d, simi_embeds_cat],dim = -1) word_input_cat = torch.cat([word_inputs_d], dim=-1) else: word_input_cat = torch.cat([word_inputs_d, gaz_embeds_cat], dim=-1) #(b,l,we+4*ge) #print(len(word_input_cat)) # if only_char: # word_input_cat= torch.cat() ### cat bert feature if self.use_bert: seg_id = torch.zeros(bert_mask.size()).long().cuda() outputs = self.bert_encoder(batch_bert, bert_mask, seg_id) outputs = outputs[0][:, 1:-1, :] word_input_cat = torch.cat([word_input_cat, outputs], dim=-1) feature_out_d = 
self.NERmodel(word_input_cat) tags = self.hidden2tag(feature_out_d) return tags, gaz_match def neg_log_likelihood_loss(self, gaz_list, word_inputs, biword_inputs, word_seq_lengths, layer_gaz, gaz_count, gaz_chars, gaz_mask, gazchar_mask, mask, batch_label, batch_bert, bert_mask, simi_value): tags, _ = self.get_tags(gaz_list, word_inputs, biword_inputs, layer_gaz, gaz_count, gaz_chars, gaz_mask, gazchar_mask, mask, word_seq_lengths, batch_bert, bert_mask, simi_value) total_loss = self.crf.neg_log_likelihood_loss(tags, mask, batch_label) scores, tag_seq = self.crf._viterbi_decode(tags, mask) return total_loss, tag_seq def forward(self, gaz_list, word_inputs, biword_inputs, word_seq_lengths, layer_gaz, gaz_count, gaz_chars, gaz_mask, gazchar_mask, mask, batch_bert, bert_mask, simi_value): tags, gaz_match = self.get_tags(gaz_list, word_inputs, biword_inputs, layer_gaz, gaz_count, gaz_chars, gaz_mask, gazchar_mask, mask, word_seq_lengths, batch_bert, bert_mask, simi_value) scores, tag_seq = self.crf._viterbi_decode(tags, mask) return tag_seq, gaz_match
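# A toy sketch (invented values, not from the repo) of how the dictionary-similarity feature in
# get_tags above is built: each character's scalar similarity value is repeated simi_dic_dim
# times and reshaped to (batch, seq_len, simi_dic_dim) before being concatenated with the word
# and gazetteer features. The tensor version below mirrors the original Python loop.
import torch

batch, seq_len, simi_dic_dim = 2, 3, 5
simi_value = [[0.1, 0.5, 0.9], [0.0, 0.3, 0.7]]   # one similarity score per character
simi_embeds = torch.tensor(simi_value).unsqueeze(-1).repeat(1, 1, simi_dic_dim)
print(simi_embeds.shape)                           # torch.Size([2, 3, 5])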
class BiLSTM_CRF(nn.Module): def __init__(self, data): super(BiLSTM_CRF, self).__init__() print("build batched BiLSTM CRF...") data.show_data_summary() self.embedding_dim = data.word_emb_dim self.hidden_dim = data.HP_hidden_dim self.drop = nn.Dropout(data.HP_dropout) self.droplstm = nn.Dropout(data.HP_dropout) # declare the embedding layer self.word_embeddings = nn.Embedding(data.word_alphabet.size(), self.embedding_dim) # load the pretrained word vectors into self.word_embeddings if data.pretrain_word_embedding is not None: self.word_embeddings.weight.data.copy_( torch.from_numpy(data.pretrain_word_embedding)) else: self.word_embeddings.weight.data.copy_( torch.from_numpy( self.random_embedding(data.word_alphabet.size(), self.embedding_dim))) # declare the LSTM self.bilstm_flag = data.HP_bilstm self.lstm_layer = data.HP_lstm_layer if self.bilstm_flag: lstm_hidden = data.HP_hidden_dim // 2 else: lstm_hidden = data.HP_hidden_dim self.lstm = nn.LSTM(self.embedding_dim, lstm_hidden, num_layers=self.lstm_layer, batch_first=True, bidirectional=self.bilstm_flag) # declare the CRF self.index2label = {} for ele in data.label_alphabet.instance2index: self.index2label[data.label_alphabet.instance2index[ele]] = ele self.hidden2tag = nn.Linear(data.HP_hidden_dim, len(self.index2label) + 2) self.crf = CRF(len(self.index2label), data.HP_gpu) # move the model onto the GPU self.gpu = data.HP_gpu if self.gpu: self.drop = self.drop.cuda() self.droplstm = self.droplstm.cuda() self.word_embeddings = self.word_embeddings.cuda() self.hidden2tag = self.hidden2tag.cuda() self.lstm = self.lstm.cuda() def random_embedding(self, vocab_size, embedding_dim): """ can be used to randomly initialize the word embedding """ pretrain_emb = np.empty([vocab_size, embedding_dim]) scale = np.sqrt(3.0 / embedding_dim) for index in range(vocab_size): pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embedding_dim]) return pretrain_emb def _get_lstm_features(self, batch_word, batch_wordlen): # batch_word: ([batch_size, max_sentence_length]) # batch_wordlen: ([batch_size]) embeds = self.word_embeddings(batch_word) # embeds: ([batch_size, max_word_length, embedding_dim]) embeds = self.drop(embeds) # built-in LSTM helper that keeps every batch running correctly; samples in a batch are sorted from longest to shortest # with batch_first=True the LSTM input is (batch_size, sentence_length, embedding_dim); if False it is (sentence_length, batch_size, embedding_dim) embeds_pack = pack_padded_sequence(embeds, batch_wordlen, batch_first=True) # LSTM output out_packed, (h, c) = self.lstm(embeds_pack) lstm_feature, _ = pad_packed_sequence(out_packed, batch_first=True) # lstm_feature: ([batch_size, max_word_length, HP_hidden_dim]) lstm_feature = self.droplstm(lstm_feature) lstm_feature = self.hidden2tag(lstm_feature) # lstm_feature: ([batch_size, max_word_length, len(self.index2label)+2]) return lstm_feature def neg_log_likelihood(self, batch_word, mask, batch_label, batch_wordlen): """ :param batch_word: ([batch_size, max_sentence_length]) :param mask: ([batch_size, max_sentence_length]) :param batch_label: ([batch_size, max_sentence_length]) :param batch_wordlen: ([batch_size]) :return: loss: e.g. tensor(3052.6426, device='cuda:0', grad_fn=<SubBackward0>) tag_seq: ([batch_size, max_sentence_length]) """ lstm_feature = self._get_lstm_features(batch_word, batch_wordlen) total_loss = self.crf.neg_log_likelihood_loss(lstm_feature, mask, batch_label) scores, tag_seq = self.crf._viterbi_decode(lstm_feature, mask) return total_loss, tag_seq def forward(self, batch_word, mask, batch_label, batch_wordlen): """ :param batch_word: ([batch_size, max_sentence_length]) :param mask: ([batch_size, max_sentence_length]) :param batch_label: ([batch_size, max_sentence_length]) :param batch_wordlen: 
([batch_size]) :return: tag_seq: ([batch_size, max_sentence_length]) """ lstm_feature = self._get_lstm_features(batch_word, batch_wordlen) scores, best_path = self.crf._viterbi_decode(lstm_feature, mask) return best_path
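# A minimal sketch (toy tensors, not from the repo) of the pack/pad round trip used in
# _get_lstm_features above: sequences sorted by length are packed so the LSTM skips the padding,
# then padded back to a (batch, max_len, hidden) tensor.
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

batch, max_len, emb, hidden = 3, 6, 10, 8
lengths = torch.tensor([6, 4, 2])                 # must be descending (or pass enforce_sorted=False)
embeds = torch.randn(batch, max_len, emb)
lstm = nn.LSTM(emb, hidden // 2, batch_first=True, bidirectional=True)

packed = pack_padded_sequence(embeds, lengths, batch_first=True)
out_packed, _ = lstm(packed)
out, out_lengths = pad_packed_sequence(out_packed, batch_first=True)
print(out.shape, out_lengths)                     # (3, 6, 8), tensor([6, 4, 2])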
class BilstmCrf(nn.Module): def __init__(self, data, model_config): super(BilstmCrf, self).__init__() if model_config['random_embedding'] == 'True': self.char_embeddings = nn.Embedding(data.char_alphabet_size, model_config['char_emb_dim']) self.char_embeddings.weight.data.copy_( torch.from_numpy( self.random_embedding(data.char_alphabet_size, model_config['char_emb_dim']))) self.char_drop = nn.Dropout(model_config['dropout']) else: char_emb_path = model_config['char_emb_file'] self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding( char_emb_path, data.char_alphabet) self.char_embeddings = nn.Embedding(data.char_alphabet_size, model_config['char_emb_dim']) self.char_embeddings.weight.data.copy_( torch.from_numpy(self.pretrain_char_embedding)) # set 'inf' to 0: self.char_embeddings.weight.data[0] = torch.zeros(200) self.char_drop = nn.Dropout(model_config['dropout']) self.intent_embeddings = nn.Embedding(data.intent_alphabet_size, model_config['intent_emb_dim']) self.intent_embeddings.weight.data.copy_( torch.from_numpy( self.random_embedding(data.intent_alphabet_size, model_config['intent_emb_dim']))) self.input_drop = nn.Dropout(model_config['dropout']) self.lstm = nn.LSTM(model_config['char_emb_dim'] + model_config['intent_emb_dim'], model_config['lstm_hidden_dim'] // 2, num_layers=model_config['num_layers'], batch_first=model_config['batch_first'], bidirectional=model_config['bidirectional']) self.drop_lstm = nn.Dropout(model_config['dropout']) self.hidden2tag = nn.Linear(model_config['lstm_hidden_dim'], data.label_alphabet_size + 2) self.crf = CRF(data.label_alphabet_size, model_config['gpu']) self.num_layers = model_config['num_layers'] self.hidden_size = model_config['lstm_hidden_dim'] // 2 self.device = model_config['device'] def forward(self, batch_char, batch_intent, batch_char_len, mask, batch_label=None): char_embeds = self.char_embeddings(batch_char) intent_embeds = self.intent_embeddings(batch_intent) intent_embeds = torch.repeat_interleave(intent_embeds, batch_char.size(1), dim=1) input_embeds = torch.cat([char_embeds, intent_embeds], 2) input_represent = self.input_drop(input_embeds) # no dynamic RNN: padded sequences are fed directly, without packing h0 = torch.zeros(self.num_layers * 2, batch_char.size(0), self.hidden_size).to(self.device) c0 = torch.zeros(self.num_layers * 2, batch_char.size(0), self.hidden_size).to(self.device) lstm_out, _ = self.lstm(input_represent, (h0, c0)) outputs = self.hidden2tag(lstm_out) if batch_label is not None: total_loss = self.crf.neg_log_likelihood_loss( outputs, mask, batch_label) scores, tag_seq = self.crf._viterbi_decode(outputs, mask) return total_loss, tag_seq else: scores, tag_seq = self.crf._viterbi_decode(outputs, mask) return tag_seq @staticmethod def random_embedding(vocab_size, embedding_dim): pretrain_emb = np.empty([vocab_size, embedding_dim]) scale = np.sqrt(3.0 / embedding_dim) for index in range(vocab_size): pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embedding_dim]) return pretrain_emb
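# A small sketch (toy shapes) of how the forward above tiles one intent embedding per sentence
# across every character position with repeat_interleave before concatenating it with the
# character embeddings.
import torch

batch, seq_len, char_dim, intent_dim = 2, 5, 6, 3
char_embeds = torch.randn(batch, seq_len, char_dim)
intent_embeds = torch.randn(batch, 1, intent_dim)                        # one intent vector per sentence
intent_embeds = torch.repeat_interleave(intent_embeds, seq_len, dim=1)   # (batch, seq_len, intent_dim)
inputs = torch.cat([char_embeds, intent_embeds], dim=2)                  # (batch, seq_len, char_dim + intent_dim)
print(inputs.shape)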
class CnnLstmCrf(nn.Module): def __init__(self, data): super(CnnLstmCrf, self).__init__() self.char_embeddings = nn.Embedding(data.char_alphabet_size, config.char_emb_dim) self.char_embeddings.weight.data.copy_( torch.from_numpy(self.random_embedding(data.char_alphabet_size, config.char_emb_dim))) self.char_drop = nn.Dropout(config.dropout) self.char_cnn = nn.Conv1d( in_channels=config.char_emb_dim, out_channels=config.char_hidden_dim, kernel_size=3, padding=1) self.word_embeddings = nn.Embedding(data.word_alphabet_size, config.word_emb_dim) self.word_embeddings.weight.data.copy_( torch.from_numpy(self.random_embedding(data.word_alphabet_size, config.word_emb_dim))) self.word_drop = nn.Dropout(config.dropout) self.feature_embeddings = nn.Embedding(data.feat_alphabet_size, config.feature_emb_dim) # load the pretrained feature embeddings: if len(data.pretrain_feature_embeddings) > 1: self.feature_embeddings.weight.data.copy_(torch.from_numpy(data.pretrain_feature_embeddings)) self.lstm = nn.LSTM( config.char_hidden_dim + config.word_emb_dim + config.feature_emb_dim, config.hidden_dim // 2, num_layers=1, batch_first=True, bidirectional=True) self.droplstm = nn.Dropout(config.dropout) self.hidden2tag = nn.Linear(config.hidden_dim, data.label_alphabet_size + 2) # label_size + 2 (CRF start and end) self.crf = CRF(data.label_alphabet_size, config.gpu) # char_inputs:(batch_size * max_seq_len, max_char_len) def calculate_loss(self, batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask): char_batch_size = batch_char.size(0) char_embeds = self.char_embeddings(batch_char) # (530, 10, 300); 10 is the max character length char_embeds = self.char_drop(char_embeds) # (530, 10, 300) char_embeds = char_embeds.transpose(1, 2) # swap max_length and embedding_dim: (batch*max_char_len, dim, max_length) char_cnn_out = self.char_cnn(char_embeds) # (530,50,10) char_cnn_out = torch.max_pool1d(char_cnn_out, kernel_size=char_cnn_out.size(2)).view(char_batch_size, -1) # (530, 50); pool over the character positions char_cnn_out = char_cnn_out[batch_charrecover] # restore the word-length descending order char_features = char_cnn_out.view(batch_word.size(0), batch_word.size(1), -1) # (10,53,50) # reshape back to the word level feat_embs = self.feature_embeddings(batch_features) # (10,53,5) word_embs = self.word_embeddings(batch_word) # (10,53,300) word_embs = torch.cat([word_embs, char_features, feat_embs], 2) # (10,53,355) word_represent = self.word_drop(word_embs) # lstm packed_words = pack_padded_sequence(word_represent, batch_wordlen.cpu().numpy(), batch_first=True) hidden = None lstm_out, hidden = self.lstm(packed_words, hidden) lstm_out, _ = pad_packed_sequence(lstm_out) lstm_out = self.droplstm(lstm_out.transpose(1, 0)) outputs = self.hidden2tag(lstm_out) total_loss = self.crf.neg_log_likelihood_loss(outputs, mask, batch_label) scores, tag_seq = self.crf._viterbi_decode(outputs, mask) return total_loss, tag_seq def forward(self, batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask): char_batch_size = batch_char.size(0) char_embeds = self.char_embeddings(batch_char) char_embeds = self.char_drop(char_embeds) char_embeds = char_embeds.transpose(1, 2) # swap max_length and embedding_dim char_cnn_out = self.char_cnn(char_embeds) # char_cnn_out = torch.max_pool1d(char_cnn_out, kernel_size=char_cnn_out.size(2)).view(char_batch_size, -1) char_cnn_out = char_cnn_out[batch_charrecover] # restore the batch order before sorting char_features = char_cnn_out.view(batch_word.size(0), batch_word.size(1), -1) feat_embs = self.feature_embeddings(batch_features) word_embs = 
self.word_embeddings(batch_word) word_embs = torch.cat([word_embs, char_features, feat_embs], 2) word_represent = self.word_drop(word_embs) # lstm packed_words = pack_padded_sequence(word_represent, batch_wordlen.cpu().numpy(), batch_first=True) hidden = None lstm_out, hidden = self.lstm(packed_words, hidden) lstm_out, _ = pad_packed_sequence(lstm_out) lstm_out = self.droplstm(lstm_out.transpose(1, 0)) outputs = self.hidden2tag(lstm_out) scores, tag_seq = self.crf._viterbi_decode(outputs, mask) return tag_seq @staticmethod def random_embedding(vocab_size, embedding_dim): pretrain_emb = np.empty([vocab_size, embedding_dim]) scale = np.sqrt(3.0 / embedding_dim) for index in range(vocab_size): pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embedding_dim]) return pretrain_emb
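# A toy walk-through (invented sizes, not from the repo) of the char-CNN feature extraction in
# CnnLstmCrf above: Conv1d over the character dimension followed by max-pooling over the
# character positions gives one fixed-size character feature vector per word.
import torch
import torch.nn as nn

num_words, max_char_len, char_emb_dim, char_hidden_dim = 530, 10, 300, 50
char_embeds = torch.randn(num_words, max_char_len, char_emb_dim)
char_cnn = nn.Conv1d(in_channels=char_emb_dim, out_channels=char_hidden_dim, kernel_size=3, padding=1)

x = char_embeds.transpose(1, 2)                     # (num_words, char_emb_dim, max_char_len)
x = char_cnn(x)                                     # (num_words, char_hidden_dim, max_char_len)
char_features = torch.max_pool1d(x, kernel_size=x.size(2)).view(num_words, -1)  # (num_words, char_hidden_dim)
print(char_features.shape)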
class GazLSTM(nn.Module): def __init__(self, data): super(GazLSTM, self).__init__() self.gpu = data.HP_gpu self.use_biword = data.use_bigram self.hidden_dim = data.HP_hidden_dim self.gaz_alphabet = data.gaz_alphabet self.gaz_emb_dim = data.gaz_emb_dim self.word_emb_dim = data.word_emb_dim self.biword_emb_dim = data.biword_emb_dim self.use_char = data.HP_use_char self.bilstm_flag = data.HP_bilstm self.lstm_layer = data.HP_lstm_layer self.use_count = data.HP_use_count self.num_layer = data.HP_num_layer self.model_type = data.model_type scale = np.sqrt(3.0 / self.gaz_emb_dim) data.pretrain_gaz_embedding[0, :] = np.random.uniform( -scale, scale, [1, self.gaz_emb_dim]) if self.use_char: scale = np.sqrt(3.0 / self.word_emb_dim) data.pretrain_word_embedding[0, :] = np.random.uniform( -scale, scale, [1, self.word_emb_dim]) self.gaz_embedding = nn.Embedding(data.gaz_alphabet.size(), self.gaz_emb_dim) self.word_embedding = nn.Embedding(data.word_alphabet.size(), self.word_emb_dim) if self.use_biword: self.biword_embedding = nn.Embedding(data.biword_alphabet.size(), self.biword_emb_dim) if data.pretrain_gaz_embedding is not None: self.gaz_embedding.weight.data.copy_( torch.from_numpy(data.pretrain_gaz_embedding)) else: self.gaz_embedding.weight.data.copy_( torch.from_numpy( self.random_embedding(data.gaz_alphabet.size(), self.gaz_emb_dim))) if data.pretrain_word_embedding is not None: self.word_embedding.weight.data.copy_( torch.from_numpy(data.pretrain_word_embedding)) else: self.word_embedding.weight.data.copy_( torch.from_numpy( self.random_embedding(data.word_alphabet.size(), self.word_emb_dim))) if self.use_biword: if data.pretrain_biword_embedding is not None: self.biword_embedding.weight.data.copy_( torch.from_numpy(data.pretrain_biword_embedding)) else: self.biword_embedding.weight.data.copy_( torch.from_numpy( self.random_embedding(data.biword_alphabet.size(), self.word_emb_dim))) char_feature_dim = self.word_emb_dim + 4 * self.gaz_emb_dim if self.use_biword: char_feature_dim += self.biword_emb_dim ## lstm model if self.model_type == 'lstm': lstm_hidden = self.hidden_dim if self.bilstm_flag: self.hidden_dim *= 2 self.NERmodel = NERmodel(model_type='lstm', input_dim=char_feature_dim, hidden_dim=lstm_hidden, num_layer=self.lstm_layer, biflag=self.bilstm_flag) ## cnn model if self.model_type == 'cnn': self.NERmodel = NERmodel(model_type='cnn', input_dim=char_feature_dim, hidden_dim=self.hidden_dim, num_layer=self.num_layer, dropout=data.HP_dropout, gpu=self.gpu) ## attention model if self.model_type == 'transformer': self.NERmodel = NERmodel(model_type='transformer', input_dim=char_feature_dim, hidden_dim=self.hidden_dim, num_layer=self.num_layer, dropout=data.HP_dropout) self.drop = nn.Dropout(p=data.HP_dropout) self.hidden2tag = nn.Linear(self.hidden_dim, data.label_alphabet_size + 2) self.crf = CRF(data.label_alphabet_size, self.gpu) if self.gpu: #self.drop = self.drop.cuda() self.gaz_embedding = self.gaz_embedding.cuda() self.word_embedding = self.word_embedding.cuda() if self.use_biword: self.biword_embedding = self.biword_embedding.cuda() self.NERmodel = self.NERmodel.cuda() self.hidden2tag = self.hidden2tag.cuda() self.crf = self.crf.cuda() def get_tags(self, gaz_list, word_inputs, biword_inputs, layer_gaz, gaz_count, gaz_chars, gaz_mask_input, gazchar_mask_input, mask): batch_size = word_inputs.size()[0] seq_len = word_inputs.size()[1] max_gaz_num = layer_gaz.size(-1) gaz_match = [] word_embs = self.word_embedding(word_inputs) if self.use_biword: biword_embs = 
self.biword_embedding(biword_inputs) word_embs = torch.cat([word_embs, biword_embs], dim=-1) if self.model_type != 'transformer': word_inputs_d = self.drop(word_embs) #(b,l,we) else: word_inputs_d = word_embs if self.use_char: gazchar_embeds = self.word_embedding(gaz_chars) gazchar_mask = gazchar_mask_input.unsqueeze(-1).repeat( 1, 1, 1, 1, 1, self.word_emb_dim) gazchar_embeds = gazchar_embeds.data.masked_fill_( gazchar_mask.data, 0) #(b,l,4,gl,cl,ce) # gazchar_mask_input:(b,l,4,gl,cl) gaz_charnum = (gazchar_mask_input == 0).sum( dim=-1, keepdim=True).float() #(b,l,4,gl,1) gaz_charnum = gaz_charnum + (gaz_charnum == 0).float() gaz_embeds = gazchar_embeds.sum(-2) / gaz_charnum #(b,l,4,gl,ce) if self.model_type != 'transformer': gaz_embeds = self.drop(gaz_embeds) else: gaz_embeds = gaz_embeds else: #use gaz embedding gaz_embeds = self.gaz_embedding(layer_gaz) if self.model_type != 'transformer': gaz_embeds_d = self.drop(gaz_embeds) else: gaz_embeds_d = gaz_embeds gaz_mask = gaz_mask_input.unsqueeze(-1).repeat( 1, 1, 1, 1, self.gaz_emb_dim) gaz_embeds = gaz_embeds_d.data.masked_fill_( gaz_mask.data, 0) #(b,l,4,g,ge) ge:gaz_embed_dim if self.use_count: count_sum = torch.sum(gaz_count, dim=3, keepdim=True) #(b,l,4,gn) count_sum = torch.sum(count_sum, dim=2, keepdim=True) #(b,l,1,1) weights = gaz_count.div(count_sum) #(b,l,4,g) weights = weights * 4 weights = weights.unsqueeze(-1) gaz_embeds = weights * gaz_embeds #(b,l,4,g,e) gaz_embeds = torch.sum(gaz_embeds, dim=3) #(b,l,4,e) else: gaz_num = (gaz_mask_input == 0).sum( dim=-1, keepdim=True).float() #(b,l,4,1) gaz_embeds = gaz_embeds.sum(-2) / gaz_num #(b,l,4,ge)/(b,l,4,1) gaz_embeds_cat = gaz_embeds.view(batch_size, seq_len, -1) #(b,l,4*ge) word_input_cat = torch.cat([word_inputs_d, gaz_embeds_cat], dim=-1) #(b,l,we+4*ge) feature_out_d = self.NERmodel(word_input_cat) tags = self.hidden2tag(feature_out_d) return tags, gaz_match def neg_log_likelihood_loss(self, gaz_list, word_inputs, biword_inputs, word_seq_lengths, layer_gaz, gaz_count, gaz_chars, gaz_mask, gazchar_mask, mask, batch_label): tags, _ = self.get_tags(gaz_list, word_inputs, biword_inputs, layer_gaz, gaz_count, gaz_chars, gaz_mask, gazchar_mask, mask) total_loss = self.crf.neg_log_likelihood_loss(tags, mask, batch_label) scores, tag_seq = self.crf._viterbi_decode(tags, mask) return total_loss, tag_seq def forward(self, gaz_list, word_inputs, biword_inputs, word_seq_lengths, layer_gaz, gaz_count, gaz_chars, gaz_mask, gazchar_mask, mask): tags, gaz_match = self.get_tags(gaz_list, word_inputs, biword_inputs, layer_gaz, gaz_count, gaz_chars, gaz_mask, gazchar_mask, mask) scores, tag_seq = self.crf._viterbi_decode(tags, mask) return tag_seq, gaz_match
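# A toy illustration (random numbers, not real data) of the count weighting in get_tags above:
# gazetteer embeddings in the four BMES slots are weighted by their corpus counts, normalized
# over all matched words of a character, then summed within each slot. The clamp below is an
# added guard against division by zero in this toy setting.
import torch

b, l, g, ge = 2, 3, 5, 4                                   # batch, seq_len, gaz per slot, gaz emb dim
gaz_count = torch.randint(0, 4, (b, l, 4, g)).float()
gaz_embeds = torch.randn(b, l, 4, g, ge)

count_sum = torch.sum(gaz_count, dim=3, keepdim=True)      # (b, l, 4, 1)
count_sum = torch.sum(count_sum, dim=2, keepdim=True)      # (b, l, 1, 1)
weights = gaz_count.div(count_sum.clamp(min=1)) * 4        # normalize counts, rescale by the 4 slots
weights = weights.unsqueeze(-1)                            # (b, l, 4, g, 1)
weighted = (weights * gaz_embeds).sum(dim=3)               # (b, l, 4, ge)
print(weighted.shape)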