def __init__(self, config, data_bundle, embed, num_layers, d_model, n_head, feedforward_dim,
             dropout, after_norm=True, attn_type='adatrans', bi_embed=None, fc_dropout=0.3,
             pos_embed=None, scale=False, dropout_attn=None):
    """
    :param config: experiment configuration dict
    :param data_bundle: fastNLP DataBundle carrying the 'target' vocabulary
    :param embed: fastNLP TokenEmbedding
    :param num_layers: number of self-attention layers
    :param d_model: input size
    :param n_head: number of attention heads
    :param feedforward_dim: the dimension of the FFN
    :param dropout: dropout in self-attention
    :param after_norm: normalization place
    :param attn_type: 'adatrans' or 'naive'
    :param pos_embed: type of position embedding; supports 'sin', 'fix', or None
        (may be None for relative attention)
    :param bi_embed: used in the Chinese scenario
    :param fc_dropout: dropout rate before the fc layer
    """
    super().__init__()
    self.config = config
    self.data_bundle = data_bundle
    tag_vocab = data_bundle.get_vocab('target')

    self.embed = embed
    embed_size = self.embed.embed_size
    self.bi_embed = None
    if bi_embed is not None:
        self.bi_embed = bi_embed
        embed_size += self.bi_embed.embed_size

    self.in_fc = nn.Linear(embed_size, d_model)
    self.transformer = TransformerEncoder(num_layers, d_model, n_head, feedforward_dim, dropout,
                                          after_norm=after_norm, attn_type=attn_type,
                                          scale=scale, dropout_attn=dropout_attn,
                                          pos_embed=pos_embed)
    self.fc_dropout = nn.Dropout(fc_dropout)
    self.out_fc = nn.Linear(d_model, len(tag_vocab))

    trans = allowed_transitions(tag_vocab, include_start_end=True)
    self.crf = ConditionalRandomField(len(tag_vocab), include_start_end_trans=True,
                                      allowed_transitions=trans)
def __init__(self, char_embed, hidden_size, num_layers, target_vocab=None,
             bigram_embed=None, trigram_embed=None, dropout=0.5):
    super().__init__()

    embed_size = char_embed.embed_size
    self.char_embed = char_embed
    if bigram_embed:
        embed_size += bigram_embed.embed_size
        self.bigram_embed = bigram_embed
    if trigram_embed:
        embed_size += trigram_embed.embed_size
        self.trigram_embed = trigram_embed

    self.lstm = LSTM(embed_size, hidden_size=hidden_size // 2, bidirectional=True,
                     batch_first=True, num_layers=num_layers)
    self.dropout = nn.Dropout(p=dropout)
    self.fc = nn.Linear(hidden_size, len(target_vocab))

    transitions = None
    if target_vocab:
        transitions = allowed_transitions(target_vocab, include_start_end=True,
                                          encoding_type='bmes')
    self.crf = ConditionalRandomField(num_tags=len(target_vocab),
                                      allowed_transitions=transitions)
def __init__(self, embed, hidden_size, num_layers, tag_vocab, dropout=0.5, encoding_type='bioes'):
    super().__init__()
    self.embedding = embed
    self.lstm = LSTM(input_size=self.embedding.embedding_dim, hidden_size=hidden_size // 2,
                     num_layers=num_layers, bidirectional=True, batch_first=True)
    self.fc = nn.Linear(hidden_size, len(tag_vocab))

    transitions = allowed_transitions(tag_vocab.idx2word, encoding_type=encoding_type,
                                      include_start_end=True)
    self.crf = ConditionalRandomField(len(tag_vocab), include_start_end_trans=True,
                                      allowed_transitions=transitions)
    self.dropout = nn.Dropout(dropout, inplace=True)

    for name, param in self.named_parameters():
        if 'fc' in name:
            if param.data.dim() > 1:
                nn.init.xavier_uniform_(param)
            else:
                nn.init.constant_(param, 0)
        if 'crf' in name:
            nn.init.zeros_(param)
def get_crf_zero_init(label_size, include_start_end_trans=False, allowed_transitions=None,
                      initial_method=None):
    crf = ConditionalRandomField(label_size, include_start_end_trans)
    crf.trans_m = nn.Parameter(torch.zeros(size=[label_size, label_size], requires_grad=True))
    if crf.include_start_end_trans:
        crf.start_scores = nn.Parameter(torch.zeros(size=[label_size], requires_grad=True))
        crf.end_scores = nn.Parameter(torch.zeros(size=[label_size], requires_grad=True))
    return crf
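# A minimal sanity check (not from the source) that the helper above leaves every
# CRF parameter at zero, including the start/end transition scores.
import torch

crf = get_crf_zero_init(label_size=5, include_start_end_trans=True)
assert torch.all(crf.trans_m == 0)
assert torch.all(crf.start_scores == 0) and torch.all(crf.end_scores == 0)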
def __init__(self, embed, tag_vocab, encoding_type='bio'):
    super().__init__()
    self.embed = embed
    self.fc = nn.Linear(self.embed.embed_size, len(tag_vocab))
    trans = allowed_transitions(tag_vocab, encoding_type=encoding_type, include_start_end=True)
    self.crf = ConditionalRandomField(len(tag_vocab), include_start_end_trans=True,
                                      allowed_transitions=trans)
def __init__(self, tag_vocab, embed, d_model, n_heads, d_k, d_v, n_layers, d_label=10,
             fc_dropout=0.3, dropout=0.15, gpu=0, pos_embed=None, scale=False):
    """
    :param tag_vocab: fastNLP Vocabulary
    :param embed: fastNLP TokenEmbedding
    :param d_model: input size
    :param n_heads: number of attention heads
    :param d_k: per-head dimension of the attention keys
    :param d_v: per-head dimension of the attention values
    :param n_layers: number of self-attention layers
    :param dropout: dropout in self-attention
    :param fc_dropout: dropout rate before the fc layer
    """
    super().__init__()
    self.embed = embed
    embed_size = self.embed.embed_size

    self.in_fc = nn.Linear(embed_size, d_model)
    self.encoder = Encoder(d_model, n_heads, d_k, d_v, n_layers, d_label, dropout,
                           feedforward_dim=int(2 * d_model))
    self.fc_dropout = nn.Dropout(fc_dropout)
    self.out_fc = nn.Linear(d_model, len(tag_vocab))

    trans = allowed_transitions(tag_vocab, include_start_end=True)
    self.crf = ConditionalRandomField(len(tag_vocab), include_start_end_trans=True,
                                      allowed_transitions=trans)
def __init__(self, char_embed, bigram_embed, word_embed, hidden_size, label_size,
             bias=True, bidirectional=False, device=None,
             embed_dropout=0, output_dropout=0, use_bigram=True):
    super().__init__()
    if device is None:
        self.device = torch.device('cpu')
    else:
        self.device = torch.device(device)

    self.char_embed_size = char_embed.embedding.weight.size(1)
    self.bigram_embed_size = bigram_embed.embedding.weight.size(1)
    self.word_embed_size = word_embed.embedding.weight.size(1)
    self.hidden_size = hidden_size
    self.label_size = label_size
    self.bidirectional = bidirectional
    self.use_bigram = use_bigram

    self.char_embed = char_embed
    self.bigram_embed = bigram_embed
    self.word_embed = word_embed

    if self.use_bigram:
        self.input_size = self.char_embed_size + self.bigram_embed_size
    else:
        self.input_size = self.char_embed_size

    self.encoder = LSTM(self.input_size, self.hidden_size, bidirectional=self.bidirectional)
    better_init_rnn(self.encoder.lstm)
    self.output = nn.Linear(self.hidden_size * (2 if self.bidirectional else 1), self.label_size)

    self.debug = False
    self.loss_func = nn.CrossEntropyLoss()
    self.embed_dropout = nn.Dropout(embed_dropout)
    self.output_dropout = nn.Dropout(output_dropout)
    self.crf = ConditionalRandomField(label_size, True)
def __init__(self, char_init_embed, word_init_embed, pos_init_embed, spo_embed_dim,
             num_classes, num_layers, inner_size, key_size, value_size, num_head,
             dropout=0.1, id2words=None, encoding_type='bieso'):
    super().__init__()
    self.char_embed = nn.Embedding(char_init_embed[0], char_init_embed[1])
    self.word_embed = nn.Embedding(word_init_embed[0], word_init_embed[1])
    self.pos_embed = nn.Embedding(pos_init_embed[0], pos_init_embed[1])
    # spo embed size: 50
    self.embed_dim = (self.char_embed.embedding_dim + self.word_embed.embedding_dim
                      + self.pos_embed.embedding_dim + spo_embed_dim)

    self.norm1 = torch.nn.LayerNorm(self.embed_dim)
    self.transformer = encoder.TransformerEncoder(num_layers=num_layers, model_size=self.embed_dim,
                                                  inner_size=inner_size, key_size=key_size,
                                                  value_size=value_size, num_head=num_head,
                                                  dropout=dropout)
    self.Linear1 = nn.Linear(self.embed_dim, self.embed_dim // 3)
    self.norm2 = torch.nn.LayerNorm(self.embed_dim // 3)
    self.relu = torch.nn.LeakyReLU()
    self.drop = torch.nn.Dropout(dropout)
    self.Linear2 = nn.Linear(self.embed_dim // 3, num_classes)
    self.Linear = nn.Linear(self.embed_dim, num_classes)

    if id2words is None:
        self.Crf = CRF(num_classes, include_start_end_trans=False)
    else:
        self.Crf = CRF(num_classes, include_start_end_trans=False,
                       allowed_transitions=allowed_transitions(id2words,
                                                               encoding_type=encoding_type))
class BertCRF(nn.Module):
    def __init__(self, embed, tag_vocab, encoding_type='bio'):
        super().__init__()
        self.embed = embed
        self.fc = nn.Linear(self.embed.embed_size, len(tag_vocab))
        trans = allowed_transitions(tag_vocab, encoding_type=encoding_type, include_start_end=True)
        self.crf = ConditionalRandomField(len(tag_vocab), include_start_end_trans=True,
                                          allowed_transitions=trans)

    def _forward(self, words, target):
        mask = words.ne(0)
        words = self.embed(words)
        words = self.fc(words)
        logits = F.log_softmax(words, dim=-1)
        if target is not None:
            loss = self.crf(logits, target, mask)
            return {'loss': loss}
        else:
            paths, _ = self.crf.viterbi_decode(logits, mask)
            return {'pred': paths}

    def forward(self, words, target):
        return self._forward(words, target)

    def predict(self, words):
        return self._forward(words, None)
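# Hedged usage sketch for BertCRF. DummyEmbed is a stand-in for a fastNLP
# BertEmbedding: any module callable on word ids that exposes `embed_size`
# works. The BIO tag dict is illustrative, not from the source.
import torch
import torch.nn as nn

class DummyEmbed(nn.Module):
    def __init__(self, vocab_size=100, embed_size=32):
        super().__init__()
        self.embed_size = embed_size
        self.embedding = nn.Embedding(vocab_size, embed_size)

    def forward(self, words):
        return self.embedding(words)

tag_vocab = {0: 'O', 1: 'B-LOC', 2: 'I-LOC'}
model = BertCRF(DummyEmbed(), tag_vocab, encoding_type='bio')
words = torch.randint(1, 100, (2, 7))     # id 0 is treated as padding
print(model.predict(words)['pred'])       # Viterbi-decoded tag paths, shape [2, 7]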
def __init__(self, tag_vocabs, embed, num_layers, d_model, n_head, feedforward_dim, dropout,
             after_norm=True, attn_type='adatrans', bi_embed=None, fc_dropout=0.3,
             pos_embed=None, scale=False, dropout_attn=None):
    super().__init__()
    self.embed = embed
    embed_size = self.embed.embed_size
    self.bi_embed = None
    if bi_embed is not None:
        self.bi_embed = bi_embed
        embed_size += self.bi_embed.embed_size

    self.tag_vocabs = []
    self.out_fcs = nn.ModuleList()
    self.crfs = nn.ModuleList()
    for i in range(len(tag_vocabs)):
        self.tag_vocabs.append(tag_vocabs[i])
        out_fc = nn.Linear(1536, len(tag_vocabs[i]))  # NOTE: hard-coded encoder output width
        self.out_fcs.append(out_fc)
        trans = allowed_transitions(tag_vocabs[i], encoding_type='bioes', include_start_end=True)
        crf = ConditionalRandomField(len(tag_vocabs[i]), include_start_end_trans=True,
                                     allowed_transitions=trans)
        self.crfs.append(crf)

    self.in_fc = nn.Linear(embed_size, d_model)
    self.transformer = TransformerEncoder(num_layers, d_model, n_head, feedforward_dim, dropout,
                                          after_norm=after_norm, attn_type=attn_type,
                                          scale=scale, dropout_attn=dropout_attn,
                                          pos_embed=pos_embed)
    self.fc_dropout = nn.Dropout(fc_dropout)
def __init__(self, char_embed, num_classes, bigram_embed=None, trigram_embed=None,
             num_layers=1, hidden_size=100, dropout=0.5, target_vocab=None, encoding_type=None):
    super().__init__()
    self.char_embed = get_embeddings(char_embed)
    embed_size = self.char_embed.embedding_dim
    if bigram_embed:
        self.bigram_embed = get_embeddings(bigram_embed)
        embed_size += self.bigram_embed.embedding_dim
    if trigram_embed:
        self.trigram_embed = get_embeddings(trigram_embed)
        embed_size += self.trigram_embed.embedding_dim

    if num_layers > 1:
        self.lstm = LSTM(embed_size, num_layers=num_layers, hidden_size=hidden_size // 2,
                         bidirectional=True, batch_first=True, dropout=dropout)
    else:
        self.lstm = LSTM(embed_size, num_layers=num_layers, hidden_size=hidden_size // 2,
                         bidirectional=True, batch_first=True)

    self.dropout = nn.Dropout(dropout)
    self.fc = nn.Linear(hidden_size, num_classes)

    trans = None
    if target_vocab is not None and encoding_type is not None:
        trans = allowed_transitions(target_vocab.idx2word, encoding_type=encoding_type,
                                    include_start_end=True)
    self.crf = ConditionalRandomField(num_classes, include_start_end_trans=True,
                                      allowed_transitions=trans)
class TransformerCWS(nn.Module):
    def __init__(self, vocab_num, max_len, embed_dim=100, bigram_vocab_num=None,
                 bigram_embed_dim=100, num_bigram_per_char=None, hidden_size=200,
                 embed_drop_p=0.3, num_layers=2, num_heads=6, tag_size=4):
        super().__init__()

        input_size = embed_dim
        if bigram_vocab_num:
            self.bigram_embedding = nn.Embedding(bigram_vocab_num, bigram_embed_dim)
            input_size += num_bigram_per_char * bigram_embed_dim

        self.drop = nn.Dropout(embed_drop_p, inplace=True)
        self.fc1 = nn.Linear(input_size, hidden_size)

        self.transformer = StarTransEnc(nn.Embedding(vocab_num, embed_dim),
                                        num_layers=num_layers, hidden_size=hidden_size,
                                        num_head=num_heads, head_dim=32, emb_dropout=0.3,
                                        dropout=0.1, max_len=max_len)
        self.fc2 = nn.Linear(hidden_size, tag_size)

        # allowed_trans = allowed_transitions({0: 'b', 1: 'm', 2: 'e', 3: 's'}, encoding_type='bmes')
        allowed_trans = None
        self.crf = ConditionalRandomField(num_tags=tag_size, include_start_end_trans=False,
                                          allowed_transitions=allowed_trans)

    def forward(self, chars, target, seq_lens, bigrams=None):
        masks = seq_len_to_mask(seq_lens)
        # StarTransEnc embeds `chars` internally, so the bigram features and the
        # fc1 projection are not consumed on this path.
        feats, _ = self.transformer(chars, masks)
        feats = self.fc2(feats)
        losses = self.crf(feats, target, masks.float())
        return {'seq_lens': seq_lens, 'loss': torch.mean(losses)}

    def predict(self, chars, seq_lens, bigrams=None):
        masks = seq_len_to_mask(seq_lens)
        feats, _ = self.transformer(chars, masks)
        feats = self.fc2(feats)
        paths = self.crf.viterbi_decode(feats, masks, get_score=False)
        return {'pred': paths, 'seq_lens': seq_lens}
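# Hedged helper (not part of the source): convert a BMES tag path produced by
# TransformerCWS into word spans, using the tag ids from the commented-out
# mapping above (0: 'b', 1: 'm', 2: 'e', 3: 's').
def bmes_to_spans(path):
    """Turn a list of BMES tag ids into (start, end) character spans."""
    spans, start = [], None
    for i, t in enumerate(path):
        if t == 3:                          # 's': single-character word
            spans.append((i, i + 1))
        elif t == 0:                        # 'b': a word begins
            start = i
        elif t == 2 and start is not None:  # 'e': the word ends
            spans.append((start, i + 1))
            start = None
    return spans

print(bmes_to_spans([0, 1, 2, 3, 0, 2]))  # [(0, 3), (3, 4), (4, 6)]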
def __init__(self, char_init_embed, word_init_embed, pos_init_embed, spo_embed_dim,
             sentence_length, hidden_size, num_classes, dropout=0.3,
             id2words=None, encoding_type='bmes'):
    super().__init__()
    self.char_embed = nn.Embedding(char_init_embed[0], char_init_embed[1])
    self.word_embed = nn.Embedding(word_init_embed[0], word_init_embed[1])
    self.pos_embed = nn.Embedding(pos_init_embed[0], pos_init_embed[1])
    # spo embed size: 50
    self.embed_dim = (self.char_embed.embedding_dim + self.word_embed.embedding_dim
                      + self.pos_embed.embedding_dim + spo_embed_dim)

    self.norm1 = torch.nn.LayerNorm(self.embed_dim)
    self.Rnn = encoder.LSTM(input_size=self.embed_dim, hidden_size=hidden_size, num_layers=2,
                            dropout=dropout, bidirectional=True, batch_first=True)
    self.Linear1 = nn.Linear(hidden_size * 2, hidden_size * 2 // 3)
    self.norm2 = torch.nn.LayerNorm(hidden_size * 2 // 3)
    self.relu = torch.nn.LeakyReLU()
    self.drop = torch.nn.Dropout(dropout)
    self.Linear2 = nn.Linear(hidden_size * 2 // 3, num_classes)

    if id2words is None:
        self.Crf = CRF(num_classes, include_start_end_trans=False)
    else:
        self.Crf = CRF(num_classes, include_start_end_trans=False,
                       allowed_transitions=allowed_transitions(id2words,
                                                               encoding_type=encoding_type))
class CNNBiLSTMCRF(nn.Module):
    def __init__(self, embed, hidden_size, num_layers, tag_vocab, dropout=0.5, encoding_type='bioes'):
        super().__init__()
        self.embedding = embed
        self.lstm = LSTM(input_size=self.embedding.embedding_dim, hidden_size=hidden_size // 2,
                         num_layers=num_layers, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_size, len(tag_vocab))

        transitions = allowed_transitions(tag_vocab.idx2word, encoding_type=encoding_type,
                                          include_start_end=True)
        self.crf = ConditionalRandomField(len(tag_vocab), include_start_end_trans=True,
                                          allowed_transitions=transitions)
        self.dropout = nn.Dropout(dropout, inplace=True)

        for name, param in self.named_parameters():
            if 'fc' in name:
                if param.data.dim() > 1:
                    nn.init.xavier_uniform_(param)
                else:
                    nn.init.constant_(param, 0)
            if 'crf' in name:
                nn.init.zeros_(param)

    def _forward(self, words, seq_len, target=None):
        words = self.embedding(words)
        outputs, _ = self.lstm(words, seq_len)
        self.dropout(outputs)  # in-place dropout
        logits = F.log_softmax(self.fc(outputs), dim=-1)
        if target is not None:
            loss = self.crf(logits, target,
                            seq_len_to_mask(seq_len, max_len=logits.size(1))).mean()
            return {Const.LOSS: loss}
        else:
            pred, _ = self.crf.viterbi_decode(logits,
                                              seq_len_to_mask(seq_len, max_len=logits.size(1)))
            return {Const.OUTPUT: pred}

    def forward(self, words, seq_len, target):
        return self._forward(words, seq_len, target)

    def predict(self, words, seq_len):
        return self._forward(words, seq_len, None)
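# Hedged usage sketch for CNNBiLSTMCRF above. A plain nn.Embedding stands in for
# the char embedding (anything exposing `embedding_dim` works), and the BIOES tag
# vocabulary is illustrative; `LSTM` is assumed to be fastNLP's, as in the class.
import torch
import torch.nn as nn
from fastNLP import Const, Vocabulary

tag_vocab = Vocabulary(padding=None, unknown=None)
tag_vocab.add_word_lst(['O', 'B-PER', 'I-PER', 'E-PER', 'S-PER'])

model = CNNBiLSTMCRF(embed=nn.Embedding(50, 16), hidden_size=32, num_layers=1,
                     tag_vocab=tag_vocab, encoding_type='bioes')
words = torch.randint(1, 50, (2, 6))   # id 0 is reserved for padding
seq_len = torch.tensor([6, 4])
print(model.predict(words, seq_len)[Const.OUTPUT].shape)  # torch.Size([2, 6])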
class KnowledgePointExtractionModel(BertPreTrainedModel):
    """Knowledge-point extraction, modelled after sequence labelling:
    1. Embedding: a BERT model with at most 8 layers;
    2. a multi-layer MLP for the linear transform;
    3. a CRF layer to correct the tag sequence."""

    def __init__(self, config: BertConfig):
        super(KnowledgePointExtractionModel, self).__init__(config=config)
        self.bert = BertModel(config=config, add_pooling_layer=False)  # word to vector (embeddings)
        # MLP input/output sizes; mlp_layer_sizes: [hidden_size, middle_size1, middle_size2, len(config.crf_labels)]
        self.kpe_mlp = MLP(size_layer=config.mlp_layer_sizes, activation='relu',
                           output_activation=None)
        # crf_labels = {0: "<pad>", 1: "S", 2: "B", 3: "M", 4: "E"} (id2label)
        tag_labels = {}
        for key, value in config.crf_labels.items():
            if not isinstance(key, int):
                tag_labels[int(key)] = value
        if tag_labels:
            config.crf_labels = tag_labels
        trans = allowed_transitions(tag_vocab=config.crf_labels, include_start_end=True)
        self.kpe_crf = ConditionalRandomField(num_tags=len(config.crf_labels),
                                              include_start_end_trans=True,
                                              allowed_transitions=trans)

    def forward(self, input_ids, labels=None, attention_mask=None):
        """Forward pass."""
        bert_outputs = self.bert(input_ids, attention_mask=attention_mask, return_dict=True)
        embedding_output = bert_outputs.last_hidden_state
        mlp_outputs = self.kpe_mlp(embedding_output)
        logits = F.log_softmax(mlp_outputs, dim=-1)
        if attention_mask is None:
            attention_mask = input_ids.ne(0)
        if labels is not None:  # train
            crf_outputs = self.kpe_crf(logits, labels, mask=attention_mask)
            loss = crf_outputs.sum() / attention_mask.type_as(input_ids).sum()  # token-level loss
            return (loss, )  # {"loss": loss} for transformers >= 4.0
        else:  # inference
            paths, _ = self.kpe_crf.viterbi_decode(logits, mask=attention_mask)
            return {"pred": paths}
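# Hedged construction sketch: `crf_labels` and `mlp_layer_sizes` are custom
# attributes this model expects on BertConfig; the values below are illustrative,
# and the weights are randomly initialized (nothing is downloaded).
from transformers import BertConfig

config = BertConfig(num_hidden_layers=8)
config.crf_labels = {0: '<pad>', 1: 'S', 2: 'B', 3: 'M', 4: 'E'}
config.mlp_layer_sizes = [config.hidden_size, 384, 128, len(config.crf_labels)]
model = KnowledgePointExtractionModel(config)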
def __init__(self, config: BertConfig):
    super(KnowledgePointExtractionModel, self).__init__(config=config)
    self.bert = BertModel(config=config, add_pooling_layer=False)  # word to vector (embeddings)
    # MLP input/output sizes; mlp_layer_sizes: [hidden_size, middle_size1, middle_size2, len(config.crf_labels)]
    self.kpe_mlp = MLP(size_layer=config.mlp_layer_sizes, activation='relu',
                       output_activation=None)
    # crf_labels = {0: "<pad>", 1: "S", 2: "B", 3: "M", 4: "E"} (id2label)
    tag_labels = {}
    for key, value in config.crf_labels.items():
        if not isinstance(key, int):
            tag_labels[int(key)] = value
    if tag_labels:
        config.crf_labels = tag_labels
    trans = allowed_transitions(tag_vocab=config.crf_labels, include_start_end=True)
    self.kpe_crf = ConditionalRandomField(num_tags=len(config.crf_labels),
                                          include_start_end_trans=True,
                                          allowed_transitions=trans)
def __init__(self, vocab_num, max_len, embed_dim=100, bigram_vocab_num=None, bigram_embed_dim=100,
             num_bigram_per_char=None, hidden_size=200, embed_drop_p=0.3,
             num_layers=2, num_heads=6, tag_size=4):
    super().__init__()

    input_size = embed_dim
    if bigram_vocab_num:
        self.bigram_embedding = nn.Embedding(bigram_vocab_num, bigram_embed_dim)
        input_size += num_bigram_per_char * bigram_embed_dim

    self.drop = nn.Dropout(embed_drop_p, inplace=True)
    self.fc1 = nn.Linear(input_size, hidden_size)

    self.transformer = StarTransEnc(nn.Embedding(vocab_num, embed_dim), num_layers=num_layers,
                                    hidden_size=hidden_size, num_head=num_heads, head_dim=32,
                                    emb_dropout=0.3, dropout=0.1, max_len=max_len)
    self.fc2 = nn.Linear(hidden_size, tag_size)

    # allowed_trans = allowed_transitions({0: 'b', 1: 'm', 2: 'e', 3: 's'}, encoding_type='bmes')
    allowed_trans = None
    self.crf = ConditionalRandomField(num_tags=tag_size, include_start_end_trans=False,
                                      allowed_transitions=allowed_trans)
def __init__(self, tag_vocab, bert_config, bi_embed=None):
    """
    :param tag_vocab: fastNLP Vocabulary
    :param bert_config: name or path of the pretrained BERT model used for embeddings
    :param bi_embed: used in the Chinese scenario
    """
    super().__init__()
    self.embed = BertModel.from_pretrained(bert_config)
    embed_size = self.embed.embeddings.word_embeddings.weight.shape[1]
    self.bi_embed = None
    if bi_embed is not None:
        self.bi_embed = bi_embed
        embed_size += self.bi_embed.embed_size

    self.configuration = TransfoXLConfig(d_model=768, d_head=16, n_head=16, n_layer=4,
                                         mem_len=1000)
    self.xl_model = TransfoXLModel(self.configuration)
    self.liner = nn.Linear(768, len(tag_vocab))

    # trans = allowed_transitions(tag_vocab, include_start_end=True, encoding_type="bioes")
    # TODO: `trans` constrains the allowed transitions and is very useful; add it later.
    self.crf = ConditionalRandomField(len(tag_vocab), include_start_end_trans=True,
                                      allowed_transitions=None)
def __init__(self, embed, tag_vocabs, encoding_type='bio'):
    super().__init__()
    self.embed = embed
    self.tag_vocabs = []
    self.fcs = nn.ModuleList()
    self.crfs = nn.ModuleList()
    for i in range(len(tag_vocabs)):
        self.tag_vocabs.append(tag_vocabs[i])
        linear = nn.Linear(self.embed.embed_size, len(tag_vocabs[i]))
        self.fcs.append(linear)
        trans = allowed_transitions(tag_vocabs[i], encoding_type=encoding_type,
                                    include_start_end=True)
        crf = ConditionalRandomField(len(tag_vocabs[i]), include_start_end_trans=True,
                                     allowed_transitions=trans)
        self.crfs.append(crf)
def __init__(self, batch_size, word_vocab_size, char_vocab_size, pos_vocab_size, spo_vocab_size,
             embed_dim, hidden_dim, id2words, dropout=0.5):
    super().__init__()
    self.batch_size = batch_size
    self.word_embeds = nn.Embedding(word_vocab_size, embed_dim)
    self.char_embeds = nn.Embedding(char_vocab_size, embed_dim)
    self.pos_embeds = nn.Embedding(pos_vocab_size, embed_dim)
    self.spo_embeds = nn.Embedding(spo_vocab_size, embed_dim)

    self.norm1 = torch.nn.LayerNorm(embed_dim)
    self.Rnn = nn.LSTM(embed_dim, hidden_dim, num_layers=2, dropout=dropout,
                       bidirectional=True, batch_first=True)
    self.Linear1 = nn.Linear(hidden_dim * 2, hidden_dim * 2 // 3)
    self.norm2 = torch.nn.LayerNorm(hidden_dim * 2 // 3)
    self.relu = torch.nn.LeakyReLU()
    self.drop = torch.nn.Dropout(dropout)
    self.Linear2 = nn.Linear(hidden_dim * 2 // 3, len(id2words))

    self.Crf = CRF(len(id2words), allowed_transitions=allowed_transitions(id2words))
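# Illustrative check (labels assumed, not from the source): allowed_transitions
# returns the (from_tag_id, to_tag_id) pairs the CRF above is allowed to take.
from fastNLP.modules import allowed_transitions

id2words = {0: 'O', 1: 'B-LOC', 2: 'I-LOC'}
trans = allowed_transitions(id2words, encoding_type='bio', include_start_end=False)
print(trans)  # (1, 2) is present: I-LOC may follow B-LOC; (0, 2) is not: I-LOC may not follow O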
def __init__(self, embed, label_vocab, pos_idx=31, Parsing_rnn_layers=3, Parsing_arc_mlp_size=500,
             Parsing_label_mlp_size=100, Parsing_use_greedy_infer=False, encoding_type='bmeso',
             embedding_dim=768, dropout=0.1, use_pos_embedding=True, use_average=True):
    super().__init__()
    self.embed = embed
    self.use_pos_embedding = use_pos_embedding
    self.use_average = use_average
    self.label_vocab = label_vocab
    self.pos_idx = pos_idx
    self.user_dict_weight = 0.05
    embedding_dim_1 = 512
    embedding_dim_2 = 256
    self.layers_map = {'CWS': '-1', 'POS': '-1', 'Parsing': '-1', 'NER': '-1'}

    # NER
    self.ner_linear = nn.Linear(embedding_dim, len(label_vocab['NER']))
    trans = allowed_transitions(label_vocab['NER'], encoding_type='bmeso', include_start_end=True)
    self.ner_crf = ConditionalRandomField(len(label_vocab['NER']), include_start_end_trans=True,
                                          allowed_transitions=trans)

    # parsing
    self.biaffine_parser = BertCharParser(
        app_index=self.label_vocab['Parsing'].to_index('APP'),
        vector_size=768,
        num_label=len(label_vocab['Parsing']),
        rnn_layers=Parsing_rnn_layers,
        arc_mlp_size=Parsing_arc_mlp_size,
        label_mlp_size=Parsing_label_mlp_size,
        dropout=dropout,
        use_greedy_infer=Parsing_use_greedy_infer)

    if self.use_pos_embedding:
        self.pos_embedding = nn.Embedding(len(self.label_vocab['pos']), embedding_dim,
                                          padding_idx=0)
    self.loss = CrossEntropyLoss(padding_idx=0)

    # CWS
    self.cws_mlp = MLP([embedding_dim, embedding_dim_1, embedding_dim_2, len(label_vocab['CWS'])],
                       'relu', output_activation=None)
    trans = allowed_transitions(label_vocab['CWS'], include_start_end=True)
    self.cws_crf = ConditionalRandomField(len(label_vocab['CWS']), include_start_end_trans=True,
                                          allowed_transitions=trans)

    # POS
    self.pos_mlp = MLP([embedding_dim, embedding_dim_1, embedding_dim_2, len(label_vocab['POS'])],
                       'relu', output_activation=None)
    trans = allowed_transitions(label_vocab['POS'], include_start_end=True)
    self.pos_crf = ConditionalRandomField(len(label_vocab['POS']), include_start_end_trans=True,
                                          allowed_transitions=trans)
class TransformerSeqLabel(nn.Module):
    def __init__(self, char_init_embed, word_init_embed, pos_init_embed, spo_embed_dim,
                 num_classes, num_layers, inner_size, key_size, value_size, num_head,
                 dropout=0.1, id2words=None, encoding_type='bieso'):
        super().__init__()
        self.char_embed = nn.Embedding(char_init_embed[0], char_init_embed[1])
        self.word_embed = nn.Embedding(word_init_embed[0], word_init_embed[1])
        self.pos_embed = nn.Embedding(pos_init_embed[0], pos_init_embed[1])
        # spo embed size: 50
        self.embed_dim = (self.char_embed.embedding_dim + self.word_embed.embedding_dim
                          + self.pos_embed.embedding_dim + spo_embed_dim)

        self.norm1 = torch.nn.LayerNorm(self.embed_dim)
        self.transformer = encoder.TransformerEncoder(num_layers=num_layers,
                                                      model_size=self.embed_dim,
                                                      inner_size=inner_size, key_size=key_size,
                                                      value_size=value_size, num_head=num_head,
                                                      dropout=dropout)
        self.Linear1 = nn.Linear(self.embed_dim, self.embed_dim // 3)
        self.norm2 = torch.nn.LayerNorm(self.embed_dim // 3)
        self.relu = torch.nn.LeakyReLU()
        self.drop = torch.nn.Dropout(dropout)
        self.Linear2 = nn.Linear(self.embed_dim // 3, num_classes)
        self.Linear = nn.Linear(self.embed_dim, num_classes)

        if id2words is None:
            self.Crf = CRF(num_classes, include_start_end_trans=False)
        else:
            self.Crf = CRF(num_classes, include_start_end_trans=False,
                           allowed_transitions=allowed_transitions(id2words,
                                                                   encoding_type=encoding_type))

    def _decode(self, x):
        """
        :param torch.FloatTensor x: [batch_size, max_len, tag_size]
        :return torch.LongTensor, [batch_size, max_len]
        """
        tag_seq, _ = self.Crf.viterbi_decode(x, self.mask)
        return tag_seq

    def _internal_loss(self, x, y):
        """
        Negative log likelihood loss.
        :param x: Tensor, [batch_size, max_len, tag_size]
        :param y: Tensor, [batch_size, max_len]
        :return loss: a scalar Tensor
        """
        x = x.float()
        y = y.long()
        assert x.shape[:2] == y.shape
        assert y.shape == self.mask.shape
        total_loss = self.Crf(x, y, self.mask)
        return torch.mean(total_loss)

    def _make_mask(self, x, seq_len):
        batch_size, max_len = x.size(0), x.size(1)
        mask = seq_len_to_mask(seq_len)
        mask = mask.view(batch_size, max_len)
        mask = mask.to(x).float()
        return mask

    def _forward(self, char, word, pos, spo, seq_len, tag=None):
        """
        :param torch.LongTensor char: [batch_size, max_len]
        :param torch.LongTensor seq_len: [batch_size, ]
        :param torch.LongTensor tag: [batch_size, max_len]
        :return y: if tag is None, a list of decoded paths (used in testing and
            predicting); otherwise a scalar loss (used in training).
        """
        char = char.long()
        seq_len = seq_len.long()
        self.mask = self._make_mask(char, seq_len)
        tag = tag.long() if tag is not None else None

        char = self.char_embed(char)
        word = self.word_embed(word)
        pos = self.pos_embed(pos)
        spo = spo.unsqueeze(1).repeat(1, char.shape[1], 1).float()
        # [batch_size, max_len, char_embed_dim + word_embed_dim + pos_embed_dim + spo_embed_dim]
        x = torch.cat((char, word, pos, spo), dim=2)
        x = self.norm1(x)

        x = self.transformer(x, seq_mask=self.mask)
        # The Linear1/norm2/relu/drop/Linear2 head is bypassed; a single Linear is used.
        x = self.Linear(x)

        if tag is not None:
            return {"loss": self._internal_loss(x, tag)}
        else:
            return {"pred": self._decode(x)}

    def forward(self, char, word, pos, spo, seq_len, tag):
        """
        :param torch.LongTensor char: [batch_size, max_len]
        :param torch.LongTensor seq_len: [batch_size, ]
        :param torch.LongTensor tag: [batch_size, max_len], the target
        :return torch.Tensor: a scalar loss
        """
        return self._forward(char, word, pos, spo, seq_len, tag)

    def predict(self, char, word, pos, spo, seq_len):
        """
        :param torch.LongTensor char: [batch_size, max_len]
        :param torch.LongTensor seq_len: [batch_size, ]
        :return torch.LongTensor: [batch_size, max_len]
        """
        return self._forward(char, word, pos, spo, seq_len)
class AdvSeqLabel(nn.Module):
    """
    Alias: :class:`fastNLP.models.AdvSeqLabel` :class:`fastNLP.models.sequence_labeling.AdvSeqLabel`

    A more elaborate sequence-labelling model. The architecture is Embedding, LayerNorm,
    a two-layer bidirectional LSTM, FC, LayerNorm, Dropout, FC, CRF.

    :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: size of the
        embedding (as tuple(int, int): the first int is vocab_size, the second embed_dim); if a
        Tensor, Embedding, or ndarray is given, it is used to initialize the embedding directly
    :param int hidden_size: hidden size of the LSTM
    :param int num_classes: number of classes
    :param float dropout: drop probability inside the LSTM and in the Dropout layers
    :param dict id2words: mapping from tag id to its tag word, used to forbid illegal sequences
        during CRF decoding; e.g. in the 'BMES' scheme, 'S' may not appear after 'B'. Type
        prefixes such as 'B-NN' are also supported: 'B-NN' may not be followed by 'S-NN', nor by
        any 'M-xx' other than 'M-NN' (or 'E-NN').
    :param str encoding_type: one of "BIO", "BMES", "BEMSO"; only used when id2words is not None.
    """

    def __init__(self, char_init_embed, word_init_embed, pos_init_embed, spo_embed_dim,
                 sentence_length, hidden_size, num_classes, dropout=0.3,
                 id2words=None, encoding_type='bmes'):
        super().__init__()
        self.char_embed = nn.Embedding(char_init_embed[0], char_init_embed[1])
        self.word_embed = nn.Embedding(word_init_embed[0], word_init_embed[1])
        self.pos_embed = nn.Embedding(pos_init_embed[0], pos_init_embed[1])
        # spo embed size: 50
        self.embed_dim = (self.char_embed.embedding_dim + self.word_embed.embedding_dim
                          + self.pos_embed.embedding_dim + spo_embed_dim)

        self.norm1 = torch.nn.LayerNorm(self.embed_dim)
        self.Rnn = encoder.LSTM(input_size=self.embed_dim, hidden_size=hidden_size, num_layers=2,
                                dropout=dropout, bidirectional=True, batch_first=True)
        self.Linear1 = nn.Linear(hidden_size * 2, hidden_size * 2 // 3)
        self.norm2 = torch.nn.LayerNorm(hidden_size * 2 // 3)
        self.relu = torch.nn.LeakyReLU()
        self.drop = torch.nn.Dropout(dropout)
        self.Linear2 = nn.Linear(hidden_size * 2 // 3, num_classes)

        if id2words is None:
            self.Crf = CRF(num_classes, include_start_end_trans=False)
        else:
            self.Crf = CRF(num_classes, include_start_end_trans=False,
                           allowed_transitions=allowed_transitions(id2words,
                                                                   encoding_type=encoding_type))

    def _decode(self, x):
        """
        :param torch.FloatTensor x: [batch_size, max_len, tag_size]
        :return torch.LongTensor, [batch_size, max_len]
        """
        tag_seq, _ = self.Crf.viterbi_decode(x, self.mask)
        return tag_seq

    def _internal_loss(self, x, y):
        """
        Negative log likelihood loss.
        :param x: Tensor, [batch_size, max_len, tag_size]
        :param y: Tensor, [batch_size, max_len]
        :return loss: a scalar Tensor
        """
        x = x.float()
        y = y.long()
        assert x.shape[:2] == y.shape
        assert y.shape == self.mask.shape
        total_loss = self.Crf(x, y, self.mask)
        return torch.mean(total_loss)

    def _make_mask(self, x, seq_len):
        batch_size, max_len = x.size(0), x.size(1)
        mask = seq_len_to_mask(seq_len)
        mask = mask.view(batch_size, max_len)
        mask = mask.to(x).float()
        return mask

    def _forward(self, char, word, pos, spo, seq_len, tag=None):
        """
        :param torch.LongTensor char: [batch_size, max_len]
        :param torch.LongTensor seq_len: [batch_size, ]
        :param torch.LongTensor tag: [batch_size, max_len]
        :return y: if tag is None, a list of decoded paths (used in testing and
            predicting); otherwise a scalar loss (used in training).
        """
        char = char.long()
        seq_len = seq_len.long()
        self.mask = self._make_mask(char, seq_len)
        tag = tag.long() if tag is not None else None

        char = self.char_embed(char)
        word = self.word_embed(word)
        pos = self.pos_embed(pos)
        spo = spo.unsqueeze(1).repeat(1, char.shape[1], 1).float()
        # [batch_size, max_len, char_embed_dim + word_embed_dim + pos_embed_dim + spo_embed_dim]
        x = torch.cat((char, word, pos, spo), dim=2)
        x = self.norm1(x)

        x, _ = self.Rnn(x, seq_len=seq_len)
        x = self.Linear1(x)
        x = self.norm2(x)
        x = self.relu(x)
        x = self.drop(x)
        x = self.Linear2(x)

        if tag is not None:
            return {"loss": self._internal_loss(x, tag)}
        else:
            return {"pred": self._decode(x)}

    def forward(self, char, word, pos, spo, seq_len, tag):
        """
        :param torch.LongTensor char: [batch_size, max_len]
        :param torch.LongTensor seq_len: [batch_size, ]
        :param torch.LongTensor tag: [batch_size, max_len], the target
        :return torch.Tensor: a scalar loss
        """
        return self._forward(char, word, pos, spo, seq_len, tag)

    def predict(self, char, word, pos, spo, seq_len):
        """
        :param torch.LongTensor char: [batch_size, max_len]
        :param torch.LongTensor seq_len: [batch_size, ]
        :return torch.LongTensor: [batch_size, max_len]
        """
        return self._forward(char, word, pos, spo, seq_len)
class CharModel(nn.Module):
    def __init__(self, embed, label_vocab, pos_idx, Parsing_rnn_layers, Parsing_arc_mlp_size,
                 Parsing_label_mlp_size, Parsing_use_greedy_infer=False, encoding_type='bmeso',
                 embedding_dim=768, dropout=0.1, use_pos_embedding=False, use_average=False):
        super().__init__()
        self.embed = embed
        self.use_pos_embedding = use_pos_embedding
        self.use_average = use_average
        self.label_vocab = label_vocab
        self.pos_idx = pos_idx
        embedding_dim_1 = 512
        embedding_dim_2 = 256
        self.layers_map = {'CWS': '1', 'POS': '2', 'Parsing': '3', 'NER': '2'}

        # NER
        self.ner_linear = nn.Linear(embedding_dim, len(label_vocab['NER']))
        trans = allowed_transitions(label_vocab['NER'], encoding_type='bmeso',
                                    include_start_end=True)
        self.ner_crf = ConditionalRandomField(len(label_vocab['NER']), include_start_end_trans=True,
                                              allowed_transitions=trans)

        # parsing
        self.biaffine_parser = BertCharParser(
            vector_size=768,
            num_label=len(label_vocab['Parsing']),
            rnn_layers=Parsing_rnn_layers,
            arc_mlp_size=Parsing_arc_mlp_size,
            label_mlp_size=Parsing_label_mlp_size,
            dropout=dropout,
            use_greedy_infer=Parsing_use_greedy_infer)

        if self.use_pos_embedding:
            self.pos_embedding = nn.Embedding(len(self.label_vocab['pos']), embedding_dim,
                                              padding_idx=0)
        self.loss = CrossEntropyLoss(padding_idx=0)

        # CWS
        self.cws_mlp = MLP([embedding_dim, embedding_dim_1, embedding_dim_2,
                            len(label_vocab['CWS'])], 'relu', output_activation=None)

        # POS
        self.pos_mlp = MLP([embedding_dim, embedding_dim_1, embedding_dim_2,
                            len(label_vocab['POS'])], 'relu', output_activation=None)

    def _generate_embedding(self, feats, word_lens, seq_len, pos):
        new_feats = []
        batch_size = feats.size()[0]
        sentence_length = feats.size()[1]
        device = feats.device
        if self.use_average == False:
            for i in range(batch_size):
                new_feats.append(torch.index_select(feats[i], 0, word_lens[i]))
            new_feats = torch.stack(new_feats, 0)
        else:
            for i in range(batch_size):
                feats_for_one_sample = []
                for j in range(word_lens.size()[1]):
                    if word_lens[i][j] == 0 and j != 0:
                        feats_for_one_word = torch.zeros(feats.size()[-1])
                    else:
                        if j == word_lens.size()[1] - 1 or word_lens[i][j + 1] == 0:
                            index = range(word_lens[i][j], seq_len[i])
                        else:
                            index = range(word_lens[i][j], word_lens[i][j + 1])
                        index = torch.tensor(index).to(device)
                        feats_for_one_word = torch.index_select(feats[i], 0, index)
                        word_len = feats_for_one_word.size()[0]
                        feats_for_one_word = torch.mean(feats_for_one_word, dim=0)
                    feats_for_one_sample.append(feats_for_one_word)
                feats_for_one_sample = torch.stack(feats_for_one_sample, dim=0)
                new_feats.append(feats_for_one_sample)
            new_feats = torch.stack(new_feats, 0)
        if self.use_pos_embedding:
            pos_feats = self.pos_embedding(pos)
            new_feats = new_feats + pos_feats
        return new_feats

    def _generate_from_pos(self, paths, seq_len):
        device = paths.device
        word_lens = []
        batch_size = paths.size()[0]
        new_seq_len = []
        batch_pos = []
        for i in range(batch_size):
            word_len = []
            pos = []
            for j in range(seq_len[i]):
                tag = paths[i][j]
                tag = self.label_vocab['POS'].to_word(int(tag))
                if tag.startswith('<'):
                    continue
                tag1, tag2 = tag.split('-')
                tag2 = self.label_vocab['pos'].to_index(tag2)
                if tag1 == 'S' or tag1 == 'B':
                    word_len.append(j)
                    pos.append(tag2)
            if len(pos) == 1:
                word_len.append(seq_len[i] - 1)
                pos.append(tag2)
            new_seq_len.append(len(pos))
            word_lens.append(word_len)
            batch_pos.append(pos)
        max_len = max(new_seq_len)
        for i in range(batch_size):
            word_lens[i] = word_lens[i] + [0] * (max_len - new_seq_len[i])
            batch_pos[i] = batch_pos[i] + [0] * (max_len - new_seq_len[i])
        word_lens = torch.tensor(word_lens, device=device)
        batch_pos = torch.tensor(batch_pos, device=device)
        new_seq_len = torch.tensor(new_seq_len, device=device)
        return word_lens, batch_pos, new_seq_len

    def _decode_parsing(self, dep_head, dep_label, seq_len, seq_len_for_wordlist, word_lens):
        device = dep_head.device
        heads = []
        labels = []
        batch_size = dep_head.size()[0]
        app_index = self.label_vocab['Parsing'].to_index('APP')
        max_len = seq_len.max()
        for i in range(batch_size):
            head = list(range(1, seq_len[i] + 1))
            label = [app_index] * int(seq_len[i])
            head[0] = 0
            for j in range(1, seq_len_for_wordlist[i]):
                if j + 1 == seq_len_for_wordlist[i]:
                    idx = seq_len[i] - 1
                else:
                    idx = word_lens[i][j + 1] - 1
                label[idx] = int(dep_label[i][j])
                root = dep_head[i][j]
                if root >= seq_len_for_wordlist[i] - 1:
                    head[idx] = int(seq_len[i] - 1)
                else:
                    try:
                        head[idx] = int(word_lens[i][root + 1] - 1)
                    except Exception:
                        print(len(head), idx, word_lens.size(), i, root)
            head = head + [0] * int(max_len - seq_len[i])
            label = label + [0] * int(max_len - seq_len[i])
            heads.append(head)
            labels.append(label)
        heads = torch.tensor(heads, device=device)
        labels = torch.tensor(labels, device=device)
        return heads, labels

    def forward(self, chars, seq_len, task_class, target, seq_len_for_wordlist=None,
                dep_head=None, dep_label=None, pos=None, word_lens=None):
        task = task_class[0]
        mask = chars.ne(0)
        layers = self.layers_map[task]
        feats = self.embed(chars, layers)

        if task == 'Parsing':
            parsing_feats = self._generate_embedding(feats, word_lens, seq_len, pos)
            loss_parsing = self.biaffine_parser(parsing_feats, seq_len_for_wordlist,
                                                dep_head, dep_label)
            return loss_parsing

        if task == 'NER':
            # ? is ReLU needed here
            feats = F.relu(self.ner_linear(feats))
            logits = F.log_softmax(feats, dim=-1)
            loss = self.ner_crf(logits, target, mask)
            return {'loss': loss}

        if task == 'CWS':
            feats = self.cws_mlp(feats)
            loss = self.loss.get_loss(feats, target, seq_len)
            return {'loss': loss}

        if task == 'POS':
            feats = self.pos_mlp(feats)
            loss = self.loss.get_loss(feats, target, seq_len)
            return {'loss': loss}

    def predict(self, chars, seq_len, task_class):
        task = task_class[0]
        mask = chars.ne(0)
        layers = self.layers_map[task]
        feats = self.embed(chars, layers)

        if task == 'Parsing':
            for sample in chars:
                sample[0] = self.pos_idx
            pos_feats = self.embed(chars, '2')
            pos_feats = self.pos_mlp(pos_feats)
            paths = pos_feats.max(dim=-1)[1]
            word_lens, batch_pos, seq_len_for_wordlist = self._generate_from_pos(paths, seq_len)
            parsing_feats = self._generate_embedding(feats, word_lens, seq_len, batch_pos)
            answer = self.biaffine_parser.predict(parsing_feats, seq_len_for_wordlist)
            head_preds = answer['head_preds']
            label_preds = answer['label_preds']
            heads, labels = self._decode_parsing(head_preds, label_preds, seq_len,
                                                 seq_len_for_wordlist, word_lens)
            return {'head_preds': heads, 'label_preds': labels, 'pred': paths}

        if task == 'CWS':
            feats = self.cws_mlp(feats)
            paths = feats.max(dim=-1)[1]
            return {'pred': paths}

        if task == 'POS':
            feats = self.pos_mlp(feats)
            paths = feats.max(dim=-1)[1]
            return {'pred': paths}

        if task == 'NER':
            feats = F.relu(self.ner_linear(feats))
            logits = F.log_softmax(feats, dim=-1)
            paths, _ = self.ner_crf.viterbi_decode(logits, mask)
            return {'pred': paths}
def __init__(self, tag_vocab, embed, num_layers, d_model, n_head, feedforward_dim, dropout,
             after_norm=True, attn_type='adatrans', bi_embed=None, fc_dropout=0.3,
             pos_embed=None, scale=False, dropout_attn=None,
             use_knowledge=False, feature2count=None, vocab_size=None, feature_vocab_size=None,
             kv_attn_type="dot", memory_dropout=0.2, fusion_dropout=0.2, fusion_type='concat',
             highway_layer=0, key_embed_dropout=0.2, knowledge_type="all",
             use_zen=False, zen_model=None):
    """
    :param tag_vocab: fastNLP Vocabulary
    :param embed: fastNLP TokenEmbedding
    :param num_layers: number of self-attention layers
    :param d_model: input size
    :param n_head: number of attention heads
    :param feedforward_dim: the dimension of the FFN
    :param dropout: dropout in self-attention
    :param after_norm: normalization place
    :param attn_type: 'adatrans' or 'naive'
    :param pos_embed: type of position embedding; supports 'sin', 'fix', or None
        (may be None for relative attention)
    :param bi_embed: used in the Chinese scenario
    :param fc_dropout: dropout rate before the fc layer
    :param use_knowledge: whether to use knowledge from Stanford CoreNLP
    :param feature2count: dict, {"gram2count": dict, "pos_tag2count": dict,
        "chunk_tag2count": dict, "dep_tag2count": dict}
    """
    super().__init__()

    self.use_knowledge = use_knowledge
    self.feature2count = feature2count
    self.vocab_size = vocab_size
    self.feature_vocab_size = feature_vocab_size
    # add ZEN
    self.use_zen = use_zen

    self.embed = embed
    embed_size = self.embed.embed_size
    self.bi_embed = None
    if bi_embed is not None:
        self.bi_embed = bi_embed
        embed_size += self.bi_embed.embed_size

    self.in_fc = nn.Linear(embed_size, d_model)
    self.transformer = TransformerEncoder(num_layers, d_model, n_head, feedforward_dim, dropout,
                                          after_norm=after_norm, attn_type=attn_type,
                                          scale=scale, dropout_attn=dropout_attn,
                                          pos_embed=pos_embed)
    self.kv_memory = KeyValueMemoryNetwork(vocab_size=vocab_size,
                                           feature_vocab_size=feature_vocab_size,
                                           attn_type=kv_attn_type, emb_size=d_model, scaled=True,
                                           key_embed_dropout=key_embed_dropout,
                                           knowledge_type=knowledge_type)

    self.output_dim = d_model * _dim_map[fusion_type]
    self.fusion = FusionModule(fusion_type=fusion_type, layer=highway_layer, input_size=d_model,
                               output_size=self.output_dim, dropout=fusion_dropout)
    self.memory_dropout = nn.Dropout(p=memory_dropout)
    self.out_fc = nn.Linear(self.output_dim, len(tag_vocab))
    self.fc_dropout = nn.Dropout(fc_dropout)

    trans = allowed_transitions(tag_vocab, include_start_end=True)
    self.crf = ConditionalRandomField(len(tag_vocab), include_start_end_trans=True,
                                      allowed_transitions=trans)
class BiLSTMCRF(nn.Module):
    def __init__(self, char_embed, hidden_size, num_layers, target_vocab=None,
                 bigram_embed=None, trigram_embed=None, dropout=0.5):
        super().__init__()

        embed_size = char_embed.embed_size
        self.char_embed = char_embed
        if bigram_embed:
            embed_size += bigram_embed.embed_size
            self.bigram_embed = bigram_embed
        if trigram_embed:
            embed_size += trigram_embed.embed_size
            self.trigram_embed = trigram_embed

        self.lstm = LSTM(embed_size, hidden_size=hidden_size // 2, bidirectional=True,
                         batch_first=True, num_layers=num_layers)
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(hidden_size, len(target_vocab))

        transitions = None
        if target_vocab:
            transitions = allowed_transitions(target_vocab, include_start_end=True,
                                              encoding_type='bmes')
        self.crf = ConditionalRandomField(num_tags=len(target_vocab),
                                          allowed_transitions=transitions)

    def _forward(self, chars, bigrams, trigrams, seq_len, target=None):
        chars = self.char_embed(chars)
        if bigrams is not None:
            bigrams = self.bigram_embed(bigrams)
            chars = torch.cat([chars, bigrams], dim=-1)
        if trigrams is not None:
            trigrams = self.trigram_embed(trigrams)
            chars = torch.cat([chars, trigrams], dim=-1)

        output, _ = self.lstm(chars, seq_len)
        output = self.dropout(output)
        output = self.fc(output)
        output = F.log_softmax(output, dim=-1)
        mask = seq_len_to_mask(seq_len)
        if target is None:
            pred, _ = self.crf.viterbi_decode(output, mask)
            return {Const.OUTPUT: pred}
        else:
            loss = self.crf.forward(output, tags=target, mask=mask)
            return {Const.LOSS: loss}

    def forward(self, chars, seq_len, target, bigrams=None, trigrams=None):
        return self._forward(chars, bigrams, trigrams, seq_len, target)

    def predict(self, chars, seq_len, bigrams=None, trigrams=None):
        return self._forward(chars, bigrams, trigrams, seq_len)
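# Hedged sketch of one BiLSTMCRF training step. StaticEmbedding with
# model_dir_or_name=None gives a randomly initialized embedding; the vocabularies,
# shapes, and random tags below are illustrative only.
import torch
from fastNLP import Const, Vocabulary
from fastNLP.embeddings import StaticEmbedding

char_vocab = Vocabulary()
char_vocab.add_word_lst(list('今天天气很好'))
target_vocab = Vocabulary(padding=None, unknown=None)
target_vocab.add_word_lst(['B', 'M', 'E', 'S'])

embed = StaticEmbedding(char_vocab, model_dir_or_name=None, embedding_dim=30)
model = BiLSTMCRF(embed, hidden_size=40, num_layers=1, target_vocab=target_vocab)

chars = torch.randint(2, len(char_vocab), (2, 5))
target = torch.randint(0, len(target_vocab), (2, 5))  # random tags, just to exercise the API
seq_len = torch.tensor([5, 3])
loss = model(chars, seq_len, target)[Const.LOSS]      # per-sample CRF negative log-likelihoods
loss.mean().backward()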
class CNBiLSTMCRFNER(nn.Module):
    def __init__(self, char_embed, num_classes, bigram_embed=None, trigram_embed=None,
                 num_layers=1, hidden_size=100, dropout=0.5, target_vocab=None, encoding_type=None):
        super().__init__()
        self.char_embed = get_embeddings(char_embed)
        embed_size = self.char_embed.embedding_dim
        if bigram_embed:
            self.bigram_embed = get_embeddings(bigram_embed)
            embed_size += self.bigram_embed.embedding_dim
        if trigram_embed:
            self.trigram_embed = get_embeddings(trigram_embed)
            embed_size += self.trigram_embed.embedding_dim

        if num_layers > 1:
            self.lstm = LSTM(embed_size, num_layers=num_layers, hidden_size=hidden_size // 2,
                             bidirectional=True, batch_first=True, dropout=dropout)
        else:
            self.lstm = LSTM(embed_size, num_layers=num_layers, hidden_size=hidden_size // 2,
                             bidirectional=True, batch_first=True)

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, num_classes)

        trans = None
        if target_vocab is not None and encoding_type is not None:
            trans = allowed_transitions(target_vocab.idx2word, encoding_type=encoding_type,
                                        include_start_end=True)
        self.crf = ConditionalRandomField(num_classes, include_start_end_trans=True,
                                          allowed_transitions=trans)

    def _forward(self, chars, bigrams=None, trigrams=None, seq_len=None, target=None):
        chars = self.char_embed(chars)
        if hasattr(self, 'bigram_embed'):
            bigrams = self.bigram_embed(bigrams)
            chars = torch.cat((chars, bigrams), dim=-1)
        if hasattr(self, 'trigram_embed'):
            trigrams = self.trigram_embed(trigrams)
            chars = torch.cat((chars, trigrams), dim=-1)

        feats, _ = self.lstm(chars, seq_len=seq_len)
        feats = self.fc(feats)
        feats = self.dropout(feats)
        logits = F.log_softmax(feats, dim=-1)
        mask = seq_len_to_mask(seq_len)
        if target is None:
            pred, _ = self.crf.viterbi_decode(logits, mask)
            return {C.OUTPUT: pred}
        else:
            loss = self.crf(logits, target, mask).mean()
            return {C.LOSS: loss}

    def forward(self, chars, target, bigrams=None, trigrams=None, seq_len=None):
        return self._forward(chars, bigrams, trigrams, seq_len, target)

    def predict(self, chars, seq_len=None, bigrams=None, trigrams=None):
        return self._forward(chars, bigrams, trigrams, seq_len)
class TransXL(nn.Module):
    def __init__(self, tag_vocab, bert_config, bi_embed=None):
        """
        :param tag_vocab: fastNLP Vocabulary
        :param bert_config: name or path of the pretrained BERT model used for embeddings
        :param bi_embed: used in the Chinese scenario
        """
        super().__init__()
        self.embed = BertModel.from_pretrained(bert_config)
        embed_size = self.embed.embeddings.word_embeddings.weight.shape[1]
        self.bi_embed = None
        if bi_embed is not None:
            self.bi_embed = bi_embed
            embed_size += self.bi_embed.embed_size

        self.configuration = TransfoXLConfig(d_model=768, d_head=16, n_head=16, n_layer=4,
                                             mem_len=1000)
        self.xl_model = TransfoXLModel(self.configuration)
        self.liner = nn.Linear(768, len(tag_vocab))

        # trans = allowed_transitions(tag_vocab, include_start_end=True, encoding_type="bioes")
        # TODO: `trans` constrains the allowed transitions and is very useful; add it later.
        self.crf = ConditionalRandomField(len(tag_vocab), include_start_end_trans=True,
                                          allowed_transitions=None)

    def _forward(self, sentence, target=None, mems=None):
        batch_size = sentence.size(0)
        seq_length = sentence.size(1)
        mask = sentence.ne(0)
        embeds, _ = self.embed(sentence, attention_mask=None, output_all_encoded_layers=False)
        trans_out = self.xl_model(None, mems, inputs_embeds=embeds)[:2]
        feats, mems = trans_out[0], trans_out[1]
        feats = self.liner(feats.contiguous().view(-1, 768))
        feats = feats.contiguous().view(batch_size, seq_length, -1)
        logits = F.log_softmax(feats, dim=-1)
        if target is None:
            paths, _ = self.crf.viterbi_decode(logits, mask)
            return {'pred': [paths, mems]}
        else:
            loss = self.crf(logits, target, mask)
            return {'loss': [loss, mems]}

    def forward(self, chars, target=None, mems=None):
        return self._forward(chars, target, mems)

    def predict(self, chars, mems=None):
        return self._forward(chars, target=None, mems=mems)
class TENER(nn.Module):
    def __init__(self, config, data_bundle, embed, num_layers, d_model, n_head, feedforward_dim,
                 dropout, after_norm=True, attn_type='adatrans', bi_embed=None, fc_dropout=0.3,
                 pos_embed=None, scale=False, dropout_attn=None):
        """
        :param config: experiment configuration dict
        :param data_bundle: fastNLP DataBundle carrying the 'target' vocabulary
        :param embed: fastNLP TokenEmbedding
        :param num_layers: number of self-attention layers
        :param d_model: input size
        :param n_head: number of attention heads
        :param feedforward_dim: the dimension of the FFN
        :param dropout: dropout in self-attention
        :param after_norm: normalization place
        :param attn_type: 'adatrans' or 'naive'
        :param pos_embed: type of position embedding; supports 'sin', 'fix', or None
            (may be None for relative attention)
        :param bi_embed: used in the Chinese scenario
        :param fc_dropout: dropout rate before the fc layer
        """
        super().__init__()
        self.config = config
        self.data_bundle = data_bundle
        tag_vocab = data_bundle.get_vocab('target')

        self.embed = embed
        embed_size = self.embed.embed_size
        self.bi_embed = None
        if bi_embed is not None:
            self.bi_embed = bi_embed
            embed_size += self.bi_embed.embed_size

        self.in_fc = nn.Linear(embed_size, d_model)
        self.transformer = TransformerEncoder(num_layers, d_model, n_head, feedforward_dim, dropout,
                                              after_norm=after_norm, attn_type=attn_type,
                                              scale=scale, dropout_attn=dropout_attn,
                                              pos_embed=pos_embed)
        self.fc_dropout = nn.Dropout(fc_dropout)
        self.out_fc = nn.Linear(d_model, len(tag_vocab))

        trans = allowed_transitions(tag_vocab, include_start_end=True)
        self.crf = ConditionalRandomField(len(tag_vocab), include_start_end_trans=True,
                                          allowed_transitions=trans)

    def _forward(self, chars, target, bigrams=None):
        mask = chars.ne(0)
        chars = self.embed(chars)
        if self.bi_embed is not None:
            bigrams = self.bi_embed(bigrams)
            chars = torch.cat([chars, bigrams], dim=-1)

        chars = self.in_fc(chars)
        chars = self.transformer(chars, mask)
        chars = self.fc_dropout(chars)
        chars = self.out_fc(chars)
        logits = F.log_softmax(chars, dim=-1)
        if target is None:
            paths, _ = self.crf.viterbi_decode(logits, mask)
            return {'pred': paths}
        else:
            loss = self.crf(logits, target, mask)
            return {'loss': loss}

    def forward(self, chars, target, bigrams=None):
        return self._forward(chars, target, bigrams)

    def predict(self, chars, bigrams=None):
        return self._forward(chars, target=None, bigrams=bigrams)

    def _get_trainer(self, models_folder):
        optimizer = optim.SGD(self.parameters(), lr=self.config['lr'], momentum=0.9)

        callbacks = []
        clip_callback = GradientClipCallback(clip_type='value', clip_value=5)
        evaluate_callback = EvaluateCallback(self.data_bundle.get_dataset('test'))
        if self.config['warmup_steps'] > 0:
            warmup_callback = WarmupCallback(self.config['warmup_steps'], schedule='linear')
            callbacks.append(warmup_callback)
        callbacks.extend([clip_callback, evaluate_callback])

        return Trainer(self.data_bundle.get_dataset('train'), self, optimizer,
                       batch_size=self.config['batch_size'], sampler=BucketSampler(),
                       num_workers=2, n_epochs=100,
                       dev_data=self.data_bundle.get_dataset('dev'),
                       metrics=SpanFPreRecMetric(tag_vocab=self.data_bundle.get_vocab('target'),
                                                 encoding_type=self.config['encoding_type']),
                       dev_batch_size=self.config['batch_size'] * 5, callbacks=callbacks,
                       device=self.config['device'], test_use_tqdm=False, use_tqdm=True,
                       print_every=300, save_path=models_folder)

    def train_model(self, models_folder):
        trainer = self._get_trainer(models_folder)
        trainer.train(load_best_model=False)

    def load(self, path):
        self.load_state_dict(torch.load(path).state_dict())
        print("Reloaded trained model.")
        return self

    def test(self, dataset, subset):
        metrics_to_test = [fastNLP.core.metrics.AccuracyMetric()]

        # Load dataset for testing
        databundle_for_test = read_dataset(dataset, self.config)

        # Perform testing
        tester = Tester(databundle_for_test.get_dataset(subset), self, metrics_to_test,
                        batch_size=self.config['batch_size'], num_workers=0, device=None,
                        verbose=1, use_tqdm=True)
        tester.test()

        flattened_true_entities, flattened_predicted_entities = flatten_prediction_results(
            self.data_bundle, databundle_for_test, subset,
            self._predict(subset_for_prediction=databundle_for_test.get_dataset(subset),
                          targets=self.data_bundle.vocabs["target"], filename=None))

        print("Precision per label:")
        labels = get_unique_targets(self.data_bundle.vocabs["target"])
        scores = get_average_precision(y_true=flattened_true_entities,
                                       y_pred=flattened_predicted_entities,
                                       labels=labels, average=None)
        for label, score in zip(labels, scores):
            print(f'{label:10s} {score:.2f}')

    def _predict(self, subset_for_prediction, targets, filename):
        predictor = Predictor(self)
        predictions = predictor.predict(subset_for_prediction)['pred']
        words = list(subset_for_prediction.get_field('raw_words'))

        lines = []
        words_sequence_index = 1
        labels_sequence_index = 0
        for sentence in list(zip(predictions, words)):
            if type(sentence[labels_sequence_index][0]) == int:
                continue
            words = sentence[words_sequence_index]
            labels = map(lambda label: f'{targets.to_word(label)}',
                         sentence[labels_sequence_index][0])
            for pair in zip(words, labels):
                lines.append(' '.join(pair))
            lines.append('')

        if filename is not None:
            write_lines(filename, lines)
        return lines

    def export_predictions(self, dataset, subset, output_file):
        # Load dataset for prediction
        databundle_for_prediction = read_dataset(dataset, self.config)
        # Perform prediction
        return self._predict(databundle_for_prediction.get_dataset(subset),
                             self.data_bundle.vocabs["target"], output_file)
class TENER(nn.Module):
    def __init__(self, tag_vocab, embed, num_layers, d_model, n_head, feedforward_dim, dropout,
                 after_norm=True, attn_type='adatrans', bi_embed=None, fc_dropout=0.3,
                 pos_embed=None, scale=False, dropout_attn=None,
                 use_knowledge=False, feature2count=None, vocab_size=None, feature_vocab_size=None,
                 kv_attn_type="dot", memory_dropout=0.2, fusion_dropout=0.2, fusion_type='concat',
                 highway_layer=0, key_embed_dropout=0.2, knowledge_type="all",
                 use_zen=False, zen_model=None):
        """
        :param tag_vocab: fastNLP Vocabulary
        :param embed: fastNLP TokenEmbedding
        :param num_layers: number of self-attention layers
        :param d_model: input size
        :param n_head: number of attention heads
        :param feedforward_dim: the dimension of the FFN
        :param dropout: dropout in self-attention
        :param after_norm: normalization place
        :param attn_type: 'adatrans' or 'naive'
        :param pos_embed: type of position embedding; supports 'sin', 'fix', or None
            (may be None for relative attention)
        :param bi_embed: used in the Chinese scenario
        :param fc_dropout: dropout rate before the fc layer
        :param use_knowledge: whether to use knowledge from Stanford CoreNLP
        :param feature2count: dict, {"gram2count": dict, "pos_tag2count": dict,
            "chunk_tag2count": dict, "dep_tag2count": dict}
        """
        super().__init__()

        self.use_knowledge = use_knowledge
        self.feature2count = feature2count
        self.vocab_size = vocab_size
        self.feature_vocab_size = feature_vocab_size
        # add ZEN
        self.use_zen = use_zen

        self.embed = embed
        embed_size = self.embed.embed_size
        self.bi_embed = None
        if bi_embed is not None:
            self.bi_embed = bi_embed
            embed_size += self.bi_embed.embed_size

        self.in_fc = nn.Linear(embed_size, d_model)
        self.transformer = TransformerEncoder(num_layers, d_model, n_head, feedforward_dim, dropout,
                                              after_norm=after_norm, attn_type=attn_type,
                                              scale=scale, dropout_attn=dropout_attn,
                                              pos_embed=pos_embed)
        self.kv_memory = KeyValueMemoryNetwork(vocab_size=vocab_size,
                                               feature_vocab_size=feature_vocab_size,
                                               attn_type=kv_attn_type, emb_size=d_model,
                                               scaled=True, key_embed_dropout=key_embed_dropout,
                                               knowledge_type=knowledge_type)

        self.output_dim = d_model * _dim_map[fusion_type]
        self.fusion = FusionModule(fusion_type=fusion_type, layer=highway_layer,
                                   input_size=d_model, output_size=self.output_dim,
                                   dropout=fusion_dropout)
        self.memory_dropout = nn.Dropout(p=memory_dropout)
        self.out_fc = nn.Linear(self.output_dim, len(tag_vocab))
        self.fc_dropout = nn.Dropout(fc_dropout)

        trans = allowed_transitions(tag_vocab, include_start_end=True)
        self.crf = ConditionalRandomField(len(tag_vocab), include_start_end_trans=True,
                                          allowed_transitions=trans)

    def _forward(self, chars, target, bigrams=None, pos_features=None, dep_features=None,
                 chunk_features=None, pos_matrix=None, dep_matrix=None, chunk_matrix=None,
                 nan_matrix=None, zen_input=None):
        # get the hidden state from the transformer encoder
        mask = chars.ne(0)
        hidden = self.embed(chars)
        if self.use_zen:
            hidden_dim = hidden.shape[-1]
            zen_dim = zen_input.shape[-1]
            hidden[:, :, (hidden_dim - zen_dim):] = zen_input
        if self.bi_embed is not None:
            bigrams = self.bi_embed(bigrams)
            hidden = torch.cat([hidden, bigrams], dim=-1)
        hidden = self.in_fc(hidden)
        encoder_output = self.transformer(hidden, mask)

        # kv_output: hidden state of the key-value memory network
        kv_output = self.kv_memory(chars, pos_features, dep_features, chunk_features,
                                   encoder_output, pos_matrix, dep_matrix, chunk_matrix,
                                   nan_matrix)
        kv_output = self.memory_dropout(kv_output)

        # concat: output of the gating/fusion mechanism
        concat = self.fusion(encoder_output, kv_output)
        concat = self.fc_dropout(concat)
        concat = self.out_fc(concat)
        logits = F.log_softmax(concat, dim=-1)

        if target is None:
            paths, _ = self.crf.viterbi_decode(logits, mask)
            return {'pred': paths}
        else:
            loss = self.crf(logits, target, mask)
            return {'loss': loss}

    def forward(self, chars, target, bigrams=None, pos_features=None, dep_features=None,
                chunk_features=None, pos_matrix=None, dep_matrix=None, chunk_matrix=None,
                nan_matrix=None, zen_input=None):
        return self._forward(chars, target, bigrams, pos_features, dep_features, chunk_features,
                             pos_matrix, dep_matrix, chunk_matrix, nan_matrix, zen_input)

    def predict(self, chars, bigrams=None, pos_features=None, dep_features=None,
                chunk_features=None, pos_matrix=None, dep_matrix=None, chunk_matrix=None,
                nan_matrix=None, zen_input=None):
        return self._forward(chars, target=None, bigrams=bigrams, pos_features=pos_features,
                             dep_features=dep_features, chunk_features=chunk_features,
                             pos_matrix=pos_matrix, dep_matrix=dep_matrix,
                             chunk_matrix=chunk_matrix, nan_matrix=nan_matrix,
                             zen_input=zen_input)