class opinionBERT(nn.Module):
    def __init__(self, bert_name: str, num_labels: int, num_layers: int,
                 hidden_size: int, dropout_prob: float, rnn_type: str,
                 bidirectional: bool, use_crf: bool, freeze_bert: bool):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_name)
        if freeze_bert:
            # Freeze the encoder parameter-by-parameter; setting requires_grad
            # on the module object itself does not freeze its weights.
            for param in self.bert.parameters():
                param.requires_grad = False

        if num_layers > 0:
            if rnn_type == "gru":
                self.rnn = nn.GRU(self.bert.config.hidden_size, hidden_size,
                                  num_layers=num_layers,
                                  bidirectional=bidirectional, batch_first=True)
            else:
                self.rnn = nn.LSTM(self.bert.config.hidden_size, hidden_size,
                                   num_layers=num_layers,
                                   bidirectional=bidirectional, batch_first=True)
            classifier_in = (1 + bidirectional) * hidden_size
        else:
            # No recurrent layer: classify directly on the BERT hidden states.
            self.rnn = None
            classifier_in = self.bert.config.hidden_size

        self.classifier = nn.Linear(classifier_in, num_labels)
        self.dropout = nn.Dropout(dropout_prob)

        self.use_crf = use_crf
        if self.use_crf:
            self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attn_mask, crf_attn_mask, tags=None,
                class_weights=None):
        bert_output = self.bert(input_ids, attn_mask)
        bert_output = bert_output.last_hidden_state
        bert_output = self.dropout(bert_output)

        if self.rnn is not None:
            rnn_output, _ = self.rnn(bert_output)
        else:
            rnn_output = bert_output
        logits = self.classifier(rnn_output)

        if self.use_crf:
            pred = self.crf.decode(logits, crf_attn_mask)
        else:
            detached_logits = logits.detach().cpu().numpy()
            pred = [
                list(sentence_pred)
                for sentence_pred in np.argmax(detached_logits, axis=2)
            ]

        if tags is not None:
            if self.use_crf:
                loss = -self.crf(logits, tags, mask=crf_attn_mask,
                                 reduction="mean")
            else:
                num_labels = logits.shape[-1]
                if class_weights is not None:
                    loss_fct = nn.CrossEntropyLoss(weight=class_weights)
                else:
                    loss_fct = nn.CrossEntropyLoss()
                # Only score positions covered by the attention mask.
                active_loss = attn_mask.view(-1) == 1
                active_logits = logits.view(-1, num_labels)
                active_labels = torch.where(
                    active_loss, tags.view(-1),
                    torch.Tensor([loss_fct.ignore_index]).type_as(tags)).long()
                loss = loss_fct(active_logits, active_labels)
            return loss, pred
        else:
            return pred
def __init__(self, config):
    super().__init__()
    self.birnn = BiRNN(config)
    # self.transitions = nn.Parameter(torch.randn(config.num_classes, config.num_classes))  # transition matrix, randomly initialized
    self.crf = CRF(config.num_classes, batch_first=True)
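# NOTE: a minimal sketch (not from the source) of how a BiRNN + CRF wrapper like the
# one above is typically trained and decoded with pytorch-crf. It assumes BiRNN(config)
# returns per-token emission scores of shape [batch, seq_len, num_classes]; that
# interface is an assumption, not something stated in the snippet.
import torch.nn as nn
from torchcrf import CRF


class BiRNNCRFSketch(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.birnn = BiRNN(config)                 # assumed emission scorer
        self.crf = CRF(config.num_classes, batch_first=True)

    def loss(self, x, tags, mask=None):
        emissions = self.birnn(x)                  # [batch, seq_len, num_classes]
        # CRF.forward returns the log-likelihood, so negate it for a training loss.
        return -self.crf(emissions, tags, mask=mask, reduction='mean')

    def predict(self, x, mask=None):
        emissions = self.birnn(x)
        # Viterbi decoding; returns a list of tag-index lists, one per sequence.
        return self.crf.decode(emissions, mask=mask)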
hidden_layers, dropout, output_layers, lemma2synsets, synset2id, known_pos, known_entity_tags, use_flair=use_flair, combine_WN_FN=combine_WN_FN) model.to(device) loss_func_embed = torch.nn.MSELoss() if crf_layer is True: if "classify_wsd" in output_layers: loss_func_classify = torch.nn.CrossEntropyLoss(ignore_index=-100) if "pos_tagger" in output_layers: loss_func_pos = CRF(len(known_pos), batch_first=True) if "ner" in output_layers: loss_func_ner = CRF(len(known_entity_tags), batch_first=True) else: loss_func_classify = torch.nn.CrossEntropyLoss(ignore_index=-100) loss_func_pos = torch.nn.CrossEntropyLoss() loss_func_ner = torch.nn.CrossEntropyLoss() # loss_func_classify = torch.nn.BCEWithLogitsLoss() optimizer = torch.optim.Adam(model.parameters()) # optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) # Eval loop if args.mode == "evaluate": model.load_state_dict(torch.load(args.save_path)) model.eval() test_accuracy_embed, test_accuracy_classify, log = eval_loop(
class CRF_Model(nn.Module): def __init__(self, hparams): super(CRF_Model, self).__init__() self._device = 'cuda' if torch.cuda.is_available() else 'cpu' self.name = hparams.model_name self.word_embedding = nn.Embedding( hparams.vocab_size, hparams.embedding_dim) if hparams.embeddings is not None: print("initializing embeddings from pretrained") self.word_embedding.weight.data.copy_(hparams.embeddings) self.lstm = nn.LSTM(hparams.embedding_dim, hparams.hidden_dim, bidirectional=hparams.bidirectional, num_layers=hparams.num_layers, dropout=hparams.dropout if hparams.num_layers > 1 else 0, batch_first=True) lstm_output_dim = hparams.hidden_dim if hparams.bidirectional is False else hparams.hidden_dim * 2 self.dropout = nn.Dropout(hparams.dropout) self.classifier = nn.Linear(lstm_output_dim, hparams.num_classes) self.crf = CRF(hparams.num_classes, batch_first=True) def forward(self, x): # [Samples_Num, Seq_Len] embeddings = self.word_embedding(x) embeddings = self.dropout(embeddings) # [Samples_Num, Seq_Len] o, _ = self.lstm(embeddings) # [Samples_Num, Seq_Len, Tags_Num] o = self.dropout(o) # [Samples_Num, Seq_Len, Tags_Num] logits = self.classifier(o) # [Samples_Num, Seq_Len] return logits def log_probs(self, x, tags, mask=None): emissions = self(x) return self.crf(emissions, tags, mask=mask) def predict(self, x): emissions = self(x) return self.crf.decode(emissions) def predict_new(self, x, mask=None): emissions = self(x) return self.crf.decode(emissions, mask=mask) def save_checkpoint(self, model_path): """ Saves the model checkpoint Args: model_path: Returns: """ torch.save(self, model_path) model_checkpoint = model_path.replace('.pt', '.pth') torch.save(self.state_dict(), model_checkpoint) def load_model(self, path): """ Loads the model from a given path, loads it to the available device whether its CUDA or CPU Args: path: Returns: """ state_dict = torch.load(path) if self._device == 'cuda' else torch.load(path, map_location=torch.device(self._device)) self.load_state_dict(state_dict) def encode_tokens(self, tokens, word2idx): """ Helper method during prediction Encodes the tokens passed during prediction time, fetches word idx from word2idx Args: tokens: word2idx: Returns: """ data = [] for sentence in tokens: paragraph = [] for i in sentence: paragraph.append(word2idx.get(i, 1)) paragraph = torch.LongTensor(paragraph).to(self._device) data.append(paragraph) return pad_sequence(data, batch_first=True, padding_value=0)
def make_crf(num_tags=5):
    return CRF(num_tags)
class RobertaLSTMCRF(RobertaForTokenClassification): def __init__(self, config, lstm_hidden_size, lstm_layers): super().__init__(config) self.lstm = torch.nn.LSTM( input_size=config.hidden_size, hidden_size=lstm_hidden_size, num_layers=lstm_layers, dropout=0.2, batch_first=True, bidirectional=True, ) self.crf = CRF(config.num_labels, batch_first=True) del self.classifier self.classifier = torch.nn.Linear(2 * lstm_hidden_size, config.num_labels) def forward( self, input_ids, attention_mask=None, token_type_ids=None, labels=None, prediction_mask=None, ): outputs = self.roberta( input_ids, attention_mask, token_type_ids, output_hidden_states=True, return_dict=False, ) # seq_output, all_hidden_states, all_self_attntions, all_cross_attentions sequence_output = outputs[ 0] # outputs[1] is pooled output which is none. sequence_output = self.dropout(sequence_output) lstm_out, *_ = self.lstm(sequence_output) sequence_output = self.dropout(lstm_out) logits = self.classifier(sequence_output) ## CRF mask = prediction_mask mask = mask[:, :logits.size(1)].contiguous() # print(logits) if labels is not None: labels = labels[:, :logits.size(1)].contiguous() loss = -self.crf( logits, labels, mask=mask.bool(), reduction="token_mean") tags = self.crf.decode(logits, mask.bool()) # print(tags) if labels is not None: return (loss, logits, tags) else: return (logits, tags)
def test_full(self):
    crf = CRF(10, batch_first=True)
    assert crf.batch_first
class ElmoLSTMCRF(BaseModel): def __init__(self, config, elmo_model, embedding_path, label_path, pos_path, emb_non_trainable=True, use_crf=False, use_char_cnn=False): super().__init__(config=config) self.config = config self.device = config['opt'].device self.seq_size = config['n_ctx'] pos_emb_dim = config['pos_emb_dim'] elmo_emb_dim = config['elmo_emb_dim'] lstm_hidden_dim = config['lstm_hidden_dim'] lstm_num_layers = config['lstm_num_layers'] lstm_dropout = config['lstm_dropout'] self.use_crf = use_crf self.use_char_cnn = use_char_cnn # elmo embedding self.elmo_model = elmo_model # glove embedding layer weights_matrix = super().load_embedding(embedding_path) vocab_dim, token_emb_dim = weights_matrix.size() padding_idx = config['pad_token_id'] self.embed_token = super().create_embedding_layer( vocab_dim, token_emb_dim, weights_matrix=weights_matrix, non_trainable=emb_non_trainable, padding_idx=padding_idx) # pos embedding layer self.poss = super().load_dict(pos_path) self.pos_vocab_size = len(self.poss) padding_idx = config['pad_pos_id'] self.embed_pos = super().create_embedding_layer( self.pos_vocab_size, pos_emb_dim, weights_matrix=None, non_trainable=False, padding_idx=padding_idx) emb_dim = elmo_emb_dim + token_emb_dim + pos_emb_dim # char embedding layer if self.use_char_cnn: self.charcnn = CharCNN(config) emb_dim = elmo_emb_dim + token_emb_dim + pos_emb_dim + self.charcnn.last_dim # BiLSTM layer self.lstm = nn.LSTM(input_size=emb_dim, hidden_size=lstm_hidden_dim, num_layers=lstm_num_layers, dropout=lstm_dropout, bidirectional=True, batch_first=True) self.dropout = nn.Dropout(config['dropout']) # projection layer self.labels = super().load_dict(label_path) self.label_size = len(self.labels) self.linear = nn.Linear(lstm_hidden_dim * 2, self.label_size) # CRF layer if self.use_crf: self.crf = CRF(num_tags=self.label_size, batch_first=True) def forward(self, x): # x[0,1] : [batch_size, seq_size] # x[2] : [batch_size, seq_size, max_characters_per_token] token_ids = x[0] pos_ids = x[1] char_ids = x[2] mask = torch.sign(torch.abs(token_ids)).to(torch.uint8).to(self.device) # mask : [batch_size, seq_size] lengths = torch.sum(mask.to(torch.long), dim=1) # lengths : [batch_size] # 1. Embedding elmo_embed_out = self.elmo_model(char_ids)['elmo_representations'][0] # elmo_embed_out : [batch_size, seq_size, elmo_emb_dim] ''' masks = mask.unsqueeze(2).to(torch.float) # masks : [batch_size, seq_size, 1] elmo_embed_out *= masks # auto-braodcasting ''' token_embed_out = self.embed_token(token_ids) # token_embed_out : [batch_size, seq_size, token_emb_dim] pos_embed_out = self.embed_pos(pos_ids) # pos_embed_out : [batch_size, seq_size, pos_emb_dim] if self.use_char_cnn: char_ids = x[2] # char_ids : [batch_size, seq_size, char_n_ctx] charcnn_out = self.charcnn(char_ids) # charcnn_out : [batch_size, seq_size, self.charcnn.last_dim] embed_out = torch.cat( [elmo_embed_out, token_embed_out, pos_embed_out, charcnn_out], dim=-1) # embed_out : [batch_size, seq_size, emb_dim] else: embed_out = torch.cat( [elmo_embed_out, token_embed_out, pos_embed_out], dim=-1) # embed_out : [batch_size, seq_size, emb_dim] embed_out = self.dropout(embed_out) # 2. 
LSTM packed_embed_out = torch.nn.utils.rnn.pack_padded_sequence( embed_out, lengths, batch_first=True, enforce_sorted=False) lstm_out, (h_n, c_n) = self.lstm(packed_embed_out) lstm_out, _ = torch.nn.utils.rnn.pad_packed_sequence( lstm_out, batch_first=True, total_length=self.seq_size) # lstm_out : [batch_size, seq_size, lstm_hidden_dim*2] lstm_out = self.dropout(lstm_out) # 3. Output logits = self.linear(lstm_out) # logits : [batch_size, seq_size, label_size] if not self.use_crf: return logits prediction = self.crf.decode(logits) prediction = torch.as_tensor(prediction, dtype=torch.long) # prediction : [batch_size, seq_size] return logits, prediction
class BertLSTMCRF(BaseModel): def __init__(self, config, bert_config, bert_model, bert_tokenizer, label_path, pos_path, use_crf=False, use_pos=False, disable_lstm=False, feature_based=False): super().__init__(config=config) self.config = config self.device = config['opt'].device self.seq_size = config['n_ctx'] pos_emb_dim = config['pos_emb_dim'] lstm_hidden_dim = config['lstm_hidden_dim'] lstm_num_layers = config['lstm_num_layers'] lstm_dropout = config['lstm_dropout'] self.use_crf = use_crf self.use_pos = use_pos self.disable_lstm = disable_lstm # bert embedding layer self.bert_config = bert_config self.bert_model = bert_model self.bert_tokenizer = bert_tokenizer self.bert_feature_based = feature_based self.bert_hidden_size = bert_config.hidden_size self.bert_num_layers = bert_config.num_hidden_layers # DSA layer for bert_feature_based dsa_num_attentions = config['dsa_num_attentions'] dsa_input_dim = self.bert_hidden_size dsa_dim = config['dsa_dim'] dsa_r = config['dsa_r'] self.dsa = DSA(config, dsa_num_attentions, dsa_input_dim, dsa_dim, dsa_r=dsa_r) self.layernorm_dsa = nn.LayerNorm(self.dsa.last_dim) bert_emb_dim = self.bert_hidden_size if self.bert_feature_based: ''' # 1) last layer, 2) mean pooling bert_emb_dim = self.bert_hidden_size ''' # 3) DSA pooling bert_emb_dim = self.dsa.last_dim # pos embedding layer self.poss = super().load_dict(pos_path) self.pos_vocab_size = len(self.poss) padding_idx = config['pad_pos_id'] self.embed_pos = super().create_embedding_layer( self.pos_vocab_size, pos_emb_dim, weights_matrix=None, non_trainable=False, padding_idx=padding_idx) # BiLSTM layer if self.use_pos: emb_dim = bert_emb_dim + pos_emb_dim else: emb_dim = bert_emb_dim if not self.disable_lstm: self.lstm = nn.LSTM(input_size=emb_dim, hidden_size=lstm_hidden_dim, num_layers=lstm_num_layers, dropout=lstm_dropout, bidirectional=True, batch_first=True) self.dropout = nn.Dropout(config['dropout']) # projection layer self.labels = super().load_dict(label_path) self.label_size = len(self.labels) if not self.disable_lstm: self.linear = nn.Linear(lstm_hidden_dim * 2, self.label_size) else: self.linear = nn.Linear(emb_dim, self.label_size) # CRF layer if self.use_crf: self.crf = CRF(num_tags=self.label_size, batch_first=True) def _compute_bert_embedding(self, x): if self.bert_feature_based: # feature-based with torch.no_grad(): if self.config['emb_class'] in ['bart', 'distilbert']: bert_outputs = self.bert_model(input_ids=x[0], attention_mask=x[1]) # bart model's output(output_hidden_states == True) # [0] last decoder layer's output : [batch_size, seq_size, bert_hidden_size] # [1] all hidden states of decoder layer's # [2] last encoder layer's output : [seq_size, batch_size, bert_hidden_size] # [3] all hidden states of encoder layer's all_hidden_states = bert_outputs[1][0:] elif 'electra' in self.config['emb_class']: bert_outputs = self.bert_model(input_ids=x[0], attention_mask=x[1], token_type_ids=x[2]) # electra model's output # list of each layer's hidden states all_hidden_states = bert_outputs else: bert_outputs = self.bert_model( input_ids=x[0], attention_mask=x[1], token_type_ids=None if self.config['emb_class'] in [ 'roberta' ] else x[2]) # RoBERTa don't use segment_ids all_hidden_states = bert_outputs[2][0:] # last hidden states, pooled output, initial embedding layer, 1 ~ last layer's hidden states # bert_outputs[0], bert_outputs[1], bert_outputs[2][0], bert_outputs[2][1:] ''' # 1) last layer embedded = bert_outputs[0] # embedded : [batch_size, seq_size, bert_hidden_size] ''' ''' # 2) mean 
pooling stack = torch.stack(all_hidden_states, dim=-1) embedded = torch.mean(stack, dim=-1) # ([batch_size, seq_size, bert_hidden_size], ..., [batch_size, seq_size, bert_hidden_size]) # -> stack(-1) -> [batch_size, seq_size, bert_hidden_size, *], ex) * == 25 for bert large # -> max/mean(-1) -> [batch_size, seq_size, bert_hidden_size] ''' # 3) DSA pooling stack = torch.stack(all_hidden_states, dim=-2) # stack : [batch_size, seq_size, *, bert_hidden_size] stack = stack.view(-1, self.bert_num_layers + 1, self.bert_hidden_size) # stack : [*, bert_num_layers, bert_hidden_size] dsa_mask = torch.ones(stack.shape[0], stack.shape[1]).to(self.device) # dsa_mask : [*, bert_num_layers] dsa_out = self.dsa(stack, dsa_mask) # dsa_out : [*, self.dsa.last_dim] dsa_out = self.layernorm_dsa(dsa_out) embedded = dsa_out.view(-1, self.seq_size, self.dsa.last_dim) # embedded : [batch_size, seq_size, self.dsa.last_dim] else: # fine-tuning # x[0], x[1], x[2] : [batch_size, seq_size] if self.config['emb_class'] in ['bart', 'distilbert']: bert_outputs = self.bert_model(input_ids=x[0], attention_mask=x[1]) embedded = bert_outputs[0] else: bert_outputs = self.bert_model( input_ids=x[0], attention_mask=x[1], token_type_ids=None if self.config['emb_class'] in ['roberta'] else x[2]) # RoBERTa don't use segment_ids embedded = bert_outputs[0] # embedded : [batch_size, seq_size, bert_hidden_size] return embedded def forward(self, x): # x[0,1,2] : [batch_size, seq_size] mask = x[1].to(torch.uint8).to(self.device) # mask == attention_mask : [batch_size, seq_size] lengths = torch.sum(mask.to(torch.long), dim=1) # lengths : [batch_size] # 1. Embedding bert_embed_out = self._compute_bert_embedding(x) # bert_embed_out : [batch_size, seq_size, *] pos_ids = x[3] pos_embed_out = self.embed_pos(pos_ids) # pos_embed_out : [batch_size, seq_size, pos_emb_dim] if self.use_pos: embed_out = torch.cat([bert_embed_out, pos_embed_out], dim=-1) else: embed_out = bert_embed_out # embed_out : [batch_size, seq_size, emb_dim] embed_out = self.dropout(embed_out) # 2. LSTM if not self.disable_lstm: packed_embed_out = torch.nn.utils.rnn.pack_padded_sequence( embed_out, lengths, batch_first=True, enforce_sorted=False) lstm_out, (h_n, c_n) = self.lstm(packed_embed_out) lstm_out, _ = torch.nn.utils.rnn.pad_packed_sequence( lstm_out, batch_first=True, total_length=self.seq_size) # lstm_out : [batch_size, seq_size, lstm_hidden_dim*2] lstm_out = self.dropout(lstm_out) else: lstm_out = embed_out # lstm_out : [batch_size, seq_size, emb_dim] # 3. Output logits = self.linear(lstm_out) # logits : [batch_size, seq_size, label_size] if not self.use_crf: return logits prediction = self.crf.decode(logits) prediction = torch.as_tensor(prediction, dtype=torch.long) # prediction : [batch_size, seq_size] return logits, prediction
class GloveDensenetCRF(BaseModel): def __init__(self, config, embedding_path, label_path, pos_path, emb_non_trainable=True, use_crf=False, use_char_cnn=False): super().__init__(config=config) self.config = config self.device = config['opt'].device self.seq_size = config['n_ctx'] pos_emb_dim = config['pos_emb_dim'] self.use_crf = use_crf self.use_char_cnn = use_char_cnn # glove embedding layer weights_matrix = super().load_embedding(embedding_path) vocab_dim, token_emb_dim = weights_matrix.size() padding_idx = config['pad_token_id'] self.embed_token = super().create_embedding_layer( vocab_dim, token_emb_dim, weights_matrix=weights_matrix, non_trainable=emb_non_trainable, padding_idx=padding_idx) # pos embedding layer self.poss = super().load_dict(pos_path) self.pos_vocab_size = len(self.poss) padding_idx = config['pad_pos_id'] self.embed_pos = super().create_embedding_layer( self.pos_vocab_size, pos_emb_dim, weights_matrix=None, non_trainable=False, padding_idx=padding_idx) emb_dim = token_emb_dim + pos_emb_dim # char embedding layer if self.use_char_cnn: self.charcnn = CharCNN(config) emb_dim = token_emb_dim + pos_emb_dim + self.charcnn.last_dim # Densenet layer densenet_kernels = config['densenet_kernels'] first_num_filters = config['densenet_first_num_filters'] num_filters = config['densenet_num_filters'] last_num_filters = config['densenet_last_num_filters'] self.densenet = DenseNet(densenet_kernels, emb_dim, first_num_filters, num_filters, last_num_filters, activation=F.relu) self.layernorm_densenet = nn.LayerNorm(self.densenet.last_dim) self.dropout = nn.Dropout(config['dropout']) # projection layer self.labels = super().load_dict(label_path) self.label_size = len(self.labels) self.linear = nn.Linear(last_num_filters, self.label_size) # CRF layer if self.use_crf: self.crf = CRF(num_tags=self.label_size, batch_first=True) def forward(self, x): # x[0, 1] : [batch_size, seq_size] # x[2] : [batch_size, seq_size, char_n_ctx] token_ids = x[0] pos_ids = x[1] mask = torch.sign(torch.abs(token_ids)).to(torch.uint8).to(self.device) # mask : [batch_size, seq_size] # 1. Embedding token_embed_out = self.embed_token(token_ids) # token_embed_out : [batch_size, seq_size, token_emb_dim] pos_embed_out = self.embed_pos(pos_ids) # pos_embed_out : [batch_size, seq_size, pos_emb_dim] if self.use_char_cnn: char_ids = x[2] # char_ids : [batch_size, seq_size, char_n_ctx] charcnn_out = self.charcnn(char_ids) # charcnn_out : [batch_size, seq_size, self.charcnn.last_dim] embed_out = torch.cat( [token_embed_out, pos_embed_out, charcnn_out], dim=-1) # embed_out : [batch_size, seq_size, emb_dim] else: embed_out = torch.cat([token_embed_out, pos_embed_out], dim=-1) # embed_out : [batch_size, seq_size, emb_dim] embed_out = self.dropout(embed_out) # 2. DenseNet densenet_out = self.densenet(embed_out, mask) # densenet_out : [batch_size, seq_size, last_num_filters] densenet_out = self.layernorm_densenet(densenet_out) densenet_out = self.dropout(densenet_out) # 3. Output logits = self.linear(densenet_out) # logits : [batch_size, seq_size, label_size] if not self.use_crf: return logits prediction = self.crf.decode(logits) prediction = torch.as_tensor(prediction, dtype=torch.long) # prediction : [batch_size, seq_size] return logits, prediction
def __init__(self, scorer: EmissionScorer, padding_idx: int = 0) -> None:
    super(CRFTagger, self).__init__()
    self.scorer = scorer
    self.padding_idx = padding_idx
    self.crf = CRF(scorer.num_tags)
    self.reset_parameters()
######## Load the dev set ###########
with open(dev_pkl, "rb") as f:
    dev_features, word_index, char_index = pkl.load(f)
dev_sents = read_data(dev_path)
print('Finished loading the dev set')
dev_count = len(dev_features)

######### Load the initial word-embedding matrix ###############
with open(word_pkl, 'rb') as f:
    word_matrix = pkl.load(f)
print('Word embeddings initialized')

######### Load the model ###############
lstm = cnn_lstm_no_pad_model(word_matrix, word_dim, len(char_index), char_dim,
                             feature_maps, kernels, hidden_dim, tagset_size)
lstm.cuda(device=0)
crf = CRF(tagset_size, batch_first=True)
crf.cuda(device=0)

parameters = []
for param in lstm.parameters():
    parameters.append(param)
for param in crf.parameters():
    parameters.append(param)
optimizer = optim.RMSprop(parameters, lr=learn_rate)
# optimizer = optim.Adam(parameters, lr=learn_rate)
# optimizer = optim.Adagrad(parameters, lr=learn_rate)
# optimizer = optim.SGD(parameters, lr=learn_rate)

######## Training and testing ##############
distant_index = list(range(distant_count))
dev_index = list(range(dev_count))
max_f_dev = 0.0
for epoch in range(epoch_num):
class BertLstmCrf(BertModel):
    """An LSTM layer runs on top of the BERT outputs,
    and a CRF layer sits on top of the LSTM.
    """

    def __init__(self, config, pad_idx, sep_idx, lstm_hidden_dim, num_lstm_layers,
                 bidirectional, num_labels):
        super(BertLstmCrf, self).__init__(config)
        self.dropout_prob = config.hidden_dropout_prob
        self.pad_idx = pad_idx
        self.sep_idx = sep_idx  # required by create_mask_for_crf
        self.lstm_hidden_dim = lstm_hidden_dim
        self.num_lstm_layers = num_lstm_layers
        self.bidirectional = bidirectional
        self.num_labels = num_labels
        self.bert = BertModel(config)

        if self.num_lstm_layers > 1:
            self.lstm = nn.LSTM(input_size=config.hidden_size,
                                hidden_size=self.lstm_hidden_dim,
                                num_layers=self.num_lstm_layers,
                                bidirectional=self.bidirectional,
                                dropout=self.dropout_prob,
                                batch_first=True)
        else:
            self.lstm = nn.LSTM(input_size=config.hidden_size,
                                hidden_size=self.lstm_hidden_dim,
                                num_layers=self.num_lstm_layers,
                                bidirectional=self.bidirectional,
                                batch_first=True)

        if self.bidirectional is True:
            self.linear = nn.Linear(self.lstm_hidden_dim * 2, self.num_labels)
        else:
            self.linear = nn.Linear(self.lstm_hidden_dim, self.num_labels)

        self.crf_layer = CRF(self.num_labels, batch_first=True)
        self.dropout_layer = nn.Dropout(self.dropout_prob)

        self.init_weights()

    def create_mask_for_crf(self, inp):
        """Creates a mask for feeding to the CRF layer.

        Args:
            inp (TYPE): input given to the BERT layer
        """
        mask = (inp != self.pad_idx) & (inp != self.sep_idx)
        # mask = [batch_size, seq_len]
        return mask

    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, labels=None):
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask)
        sequence_output = outputs[0]

        lstm_out, (hidden, cell) = self.lstm(sequence_output)
        logits = self.linear(self.dropout_layer(lstm_out))

        # removing the cls token
        logits = logits[:, 1:, :]
        if labels is not None:
            labels = labels[:, 1:]
        input_ids = input_ids[:, 1:]

        # creating the mask for the crf
        mask = self.create_mask_for_crf(input_ids)

        # crf part
        if labels is not None:
            loss = self.crf_layer(logits, labels, mask=mask) * torch.tensor(
                -1, device=self.device)
        else:
            loss = None

        out = self.crf_layer.decode(logits)
        out = torch.tensor(out, dtype=torch.long, device=self.device)
        # out = [batch_size, seq_len]

        return out, labels, loss
w_tag_pad=w_padding, t_tag_pad=len(id2label)), model, SimpleLossCompute(criterion, optimizer, scheduler), train=False, id2label=id2label) print('Loss:', loss) testResult.append(f) valBest = max(valResult) print('ValBest epoch:', [i for i, j in enumerate(valResult) if j == valBest]) testBest = max(testResult) print('TestBest epoch:', [i for i, j in enumerate(testResult) if j == testBest]) trainSents = preProcess(readData('dps/swbd/train')) valSents = preProcess(readData('dps/swbd/val')) testSents = preProcess(readData('dps/swbd/test')) label2id, id2label = build_vocab(trainSents) print(id2label) tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False) trainData = idData(tokenizer, trainSents, label2id) valData = idData(tokenizer, valSents, label2id) testData = idData(tokenizer, testSents, label2id) encoder = Encoder(len(id2label)).to(device) criterion = CRF(len(id2label), batch_first=True).to(device) run(EPOCH, encoder, BATCH_SIZE, trainData, valData, testData, id2label, tokenizer._convert_token_to_id('[PAD]'), criterion)
class BertCrfForNER(BertModel):
    """This class inherits functionality from huggingface BertModel.
    It applies a CRF layer on top of the BERT outputs.
    """

    def __init__(self, config, pad_idx, sep_idx, num_labels):
        """Initialization.

        Args:
            config (TYPE): model config file (similar to bert_config.json)
            num_labels: total number of labels using the BIO format
            pad_idx (TYPE): pad_idx of the tokenizer
            sep_idx (TYPE): sep_idx of the tokenizer
        """
        super(BertCrfForNER, self).__init__(config)
        self.num_labels = num_labels
        self.pad_idx = pad_idx
        self.sep_idx = sep_idx
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.crf_layer = CRF(self.num_labels, batch_first=True)
        self.linear = nn.Linear(config.hidden_size, self.num_labels)
        self.init_weights()

    def create_mask_for_crf(self, inp):
        """Creates a mask for feeding to the CRF layer.
        Masks <PAD> and <SEP> token positions.

        Args:
            inp (TYPE): input given to the BERT layer
        """
        mask = (inp != self.pad_idx) & (inp != self.sep_idx)
        # mask = [batch_size, seq_len]
        return mask

    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, labels=None):
        """Forward propagation.

        Args:
            input_ids (TYPE): bert input ids
            attention_mask (None, optional): attention mask for bert
            token_type_ids (None, optional): token type ids for bert
            position_ids (None, optional): position ids for bert
            head_mask (None, optional): head mask for bert
            labels (None, optional): labels required while training the crf
        """
        # getting outputs from BERT
        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask)

        # taking token embeddings from the output
        sequence_output = outputs[0]
        # sequence_output = [batch_size, seq_len, hidden_size]

        logits = self.linear(sequence_output)
        # logits = [batch_size, seq_len, num_labels]

        # removing the cls token
        logits = logits[:, 1:, :]
        if labels is not None:
            labels = labels[:, 1:]  # check whether labels include the cls token too or not
        input_ids = input_ids[:, 1:]

        mask = self.create_mask_for_crf(input_ids)

        if labels is not None:
            loss = self.crf_layer(logits, labels, mask=mask) * torch.tensor(
                -1, device=self.device)
        else:
            loss = None  # this is the crf loss

        out = self.crf_layer.decode(logits)
        out = torch.tensor(out, dtype=torch.long, device=self.device)
        # out = [batch_size, seq_length]

        return out, labels, loss
def make_crf(num_tags=5, batch_first=False):
    return CRF(num_tags, batch_first=batch_first)
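# NOTE: a minimal, self-contained usage sketch of the pytorch-crf module returned by
# make_crf above; the shapes and random tensors are illustrative only, not from the source.
import torch


def crf_usage_example():
    crf = make_crf(num_tags=5, batch_first=True)

    batch_size, seq_len, num_tags = 2, 7, 5
    emissions = torch.randn(batch_size, seq_len, num_tags)   # any encoder's per-token scores
    tags = torch.randint(0, num_tags, (batch_size, seq_len))
    mask = torch.ones(batch_size, seq_len, dtype=torch.uint8)
    mask[1, 5:] = 0                                           # second sequence is shorter

    # CRF.forward returns the log-likelihood of the tag sequences,
    # so the usual training loss is its negation.
    loss = -crf(emissions, tags, mask=mask, reduction='mean')

    # Viterbi decoding returns a list of tag-index lists (unpadded lengths).
    best_paths = crf.decode(emissions, mask=mask)
    return loss, best_paths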
def transformation():
    # Do an inference on a single batch of data
    data = None

    # 1) INPUT: convert Korean text input to NER code array
    if flask.request.content_type == 'text/plain':
        '''CHECK file locations'''
        model_config = Config(json_path="config.json")
        tok_path = "./tokenizer_78b3253a26.model"
        ptr_tokenizer = SentencepieceTokenizer(tok_path)

        with open("vocab.pkl", 'rb') as f:
            vocab = pickle.load(f)
        tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer,
                              pad_fn=keras_pad_fn, maxlen=model_config.maxlen)

        with open("ner_to_index.json", 'rb') as f:
            ner_to_index = json.load(f)
            index_to_ner = {v: k for k, v in ner_to_index.items()}
        decoder_from_res = DecoderFromNamedEntitySequence(
            tokenizer=tokenizer, index_to_ner=index_to_ner)

        '''
        Assuming request.data is a string: the name of a txt file
        > NER_OY_data.txt as an example
        > currently under /opt/program (product-tags) HERE:?
        '''
        f = flask.request.data.decode("utf-8")
        lines = f.splitlines(True)

        index = 0
        with open("NER_OY_result.txt", 'w', encoding='utf-8-sig') as w:
            for i in range(len(lines)):
                input_text = ''
                if i % 4 == 1:
                    input_text = lines[i][3:]
                    addInfo = lines[i + 1][3:]
                if input_text == '':
                    continue
                index += 1
                # print("\n## " + str(index) + "\n")

                list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids(
                    [input_text])
                x_input = torch.tensor(list_of_input_ids).long()
                # print(list_of_input_ids)
                # print(x_input)
                data = {"instances": list_of_input_ids}
                predictions = ScoringService.predict(data)

                # 2) OUTPUT: convert NER code to Korean text (FILE)
                emission = torch.tensor(predictions['predictions'])
                num_classes = len(ner_to_index)
                crf = CRF(num_tags=num_classes, batch_first=True)
                list_of_pred_ids = crf.decode(emission)

                input_token, list_of_ner_word, decoding_ner_sentence = decoder_from_res(
                    list_of_input_ids=list_of_input_ids,
                    list_of_pred_ids=list_of_pred_ids,
                    unkTokenList=False)
                unkTokenList = makeUNKTokenList(input_text, input_token)
                input_token, list_of_ner_word, decoding_ner_sentence = decoder_from_res(
                    list_of_input_ids=list_of_input_ids,
                    list_of_pred_ids=list_of_pred_ids,
                    unkTokenList=unkTokenList)

                w.write('## ' + str(index) + '\n')
                w.write(addInfo)
                w.write(str(list_of_ner_word) + '\n')
                w.write(str(decoding_ner_sentence[6:-5]) + '\n')

        '''RETURN a file: NER_OY_result.txt'''
        return flask.Response(response=open("NER_OY_result.txt", 'r'),
                              status=200, mimetype='text/plain')
    else:
        return flask.Response(
            response='This predictor only supports TEXT data',
            status=415, mimetype='text/plain')
def __init__(self, args): super(Aspect_CS_GAT_BERT, self).__init__() self.args = args self.wembeddings = args.bert_model # POS-Tagging Embedding Layer self.pembeddings = EmbeddingLayer(embedding_size=(232, 232), dropout=args.posemb_dp, device=args.device) # Residual POS-Tagging Embedding self.res_posemb = EmbeddingLayer( embedding_size=(2 * args.lstm_hidden_size, 2 * args.lstm_hidden_size), dropout=None, device=args.device) # Bi-LSTM Encoder self.bilstm = DynamicLSTM(input_size=1000, hidden_size=args.lstm_hidden_size, num_layers=args.num_layers, dropout=args.bilstm_dp, bidirectional=True, device=args.device) # GCN self.gcns = nn.ModuleList() for i in range(args.gcn_num_layers): gcn = GraphConvolution( in_features=2 * args.lstm_hidden_size, out_features=2 * args.lstm_hidden_size, edge_types=args.edge_types_num, dropout=args.gcn_dp if i != args.gcn_num_layers - 1 else 0, use_bn=args.gcn_use_bn, device=args.device) self.gcns.append(gcn) # Highway if args.highway_use: self.hws = nn.ModuleList() for i in range(args.gcn_num_layers): hw = HighWay(size=2 * args.lstm_hidden_size, dropout_ratio=args.gcn_dp) self.hws.append(hw) self.sa_output = BottledXavierLinear( in_features=4 * args.lstm_hidden_size, out_features=args.sa_classes).to(device=args.device) # CRF self.CRF_model = CRF(4, batch_first=True) if args.target_method == 'BIO': self.dt_output = nn.Linear(4 * args.lstm_hidden_size, 4) else: self.dt_output = nn.Linear(4 * args.lstm_hidden_size, 3) self.loss_func_sa = FocalLoss(alpha=0.5, num_classes=4) self.dropout_sa = nn.Dropout(0.5) # 0.5 self.dropout_dt = nn.Dropout(0.35) # 0.2 0.35
class BERT_LSTM_CRF2(MODEL_TEMP):
    def __init__(self, config={}, show_param=False):
        '''
        :param - dict
            *param['embedding_dim']
            *param['hidden_dim']
            param['n_ent_tags']
            param['n_rel_tags']
            param['n_rels']
            param['n_words']
            *param['start_ent_idx']  int, <start> tag index for entity tag seq
            *param['end_ent_idx']  int, <end> tag index for entity tag seq
            *param['start_rel_idx']  int, <start> tag index for entity tag seq
            *param['end_rel_idx']  int, <end> tag index for entity tag seq
            param['use_cuda']
            param['dropout_prob']
            param['lstm_layer_num']
        '''
        super(BERT_LSTM_CRF2, self).__init__()
        self.config = config
        self.embedding_dim = self.config.get('embedding_dim', 768)
        self.hidden_dim = self.config.get('hidden_dim', 64)  ##TODO: 128*2
        assert self.hidden_dim % 2 == 0, 'hidden_dim for BLSTM must be even'
        self.n_tags = self.config['n_rel_tags'] - 2
        # self.n_words = self.config.get('n_words', 10000)

        self.dropout_prob = self.config.get('dropout_prob', 0)
        self.lstm_layer_num = self.config.get('lstm_layer_num', 1)

        self.use_cuda = self.config.get('use_cuda', 0)
        self.model_type = 'BERT_LSTM_CRF2'

        self.build_model()
        self.reset_parameters()
        if show_param:
            self.show_model_param()

    def show_model_param(self):
        log('=' * 80, 0)
        log(f'model_type: {self.model_type}', 1)
        log(f'use_cuda: {self.use_cuda}', 1)
        log(f'embedding_dim: {self.embedding_dim}', 1)
        log(f'hidden_dim: {self.hidden_dim}', 1)
        log(f'n_rel_tags: {self.n_tags}', 1)
        # log(f"crf_start_idx: {self.config['start_ent_idx']}", 1)
        # log(f"crf_end_idx: {self.config['end_ent_idx']}", 1)
        log(f'lstm_layer_num: {self.lstm_layer_num}', 1)
        log(f'dropout_prob: {self.dropout_prob}', 1)
        log('=' * 80, 0)

    def build_model(self):
        '''
        Build the BERT layer, LSTM layer and CRF layer.
        '''
        # self.word_embeds = nn.Embedding(self.n_words, self.embedding_dim)
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim // 2, batch_first=True,
                            num_layers=self.lstm_layer_num, dropout=self.dropout_prob,
                            bidirectional=True)
        self.hidden2tag = nn.Linear(self.hidden_dim, self.n_tags)
        self.crf = CRF(self.n_tags, batch_first=True)
        self.bert = transformers.BertModel.from_pretrained('bert-base-chinese')

    def reset_parameters(self):
        # I.xavier_normal_(self.word_embeds.weight.data)
        self.lstm.reset_parameters()
        # stdv = 1.0 / math.sqrt(self.hidden_dim)
        # for weight in self.lstm.parameters():
        #     I.uniform_(weight, -stdv, stdv)
        I.xavier_normal_(self.hidden2tag.weight.data)
        self.crf.reset_parameters()

    def _get_lstm_features(self, x, lens, use_cuda=None):
        '''
        TODO: add handling of sentence lengths
        :param
            @x: word indices; each character is mapped to an index via the vocabulary, (batch_size, T), np.array
            @lens: the actual length of each sentence
        :return
            @lstm_feature: (batch_size, T, n_tags) -- emission-like scores, torch.tensor
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        batch_size, T = x.shape

        words_tensor = self._to_tensor(x, use_cuda)  # (batch_size, T)
        lens = self._to_tensor(lens, use_cuda)
        att_mask = self._generate_mask(lens, max_len=T)
        embeds = self.bert(words_tensor, attention_mask=att_mask)[0]  # (batch_size, T, n_embed)

        ## LSTM layer
        if use_cuda:
            h_0 = torch.randn(2 * self.lstm_layer_num, batch_size, self.hidden_dim // 2).cuda()  # (n_layer*n_dir, N, n_hid)
            c_0 = torch.randn(2 * self.lstm_layer_num, batch_size, self.hidden_dim // 2).cuda()
        else:
            h_0 = torch.randn(2 * self.lstm_layer_num, batch_size, self.hidden_dim // 2)
            c_0 = torch.randn(2 * self.lstm_layer_num, batch_size, self.hidden_dim // 2)
        # c_0 = h_0.clone()
        hidden = (h_0, c_0)
        lstm_out, _hidden = self.lstm(embeds, hidden)  # (batch_size, T, n_dir*n_hid), (h, c)

        ## FC layer
        lstm_feature = self.hidden2tag(lstm_out)  # (batch_size, T, n_tags)
        lstm_feature = torch.tanh(lstm_feature)
        return lstm_feature

    def _loss(self, x, y_rel, lens, use_cuda=None):
        '''
        Loss function: negative log-likelihood.
        :param
            @x: word indices; each character is mapped to an index via the vocabulary, (batch_size, T), np.array
            @y_rel: (batch_size, T), np.array, indexed rel_with_ent sequence, character level
            @lens: (batch_size), list, the actual length of each sentence
        :return
            @loss: (batch_size), torch.tensor
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        T = x.shape[1]
        logits = self._get_lstm_features(x, lens)  # (batch_size, T, n_tags)
        tensor_y_rel = self._to_tensor(y_rel, use_cuda)
        lens = self._to_tensor(lens, use_cuda)
        len_mask = self._generate_mask(lens, max_len=T)  # (batch_size, T)
        log_likelihood_ent = self.crf(emissions=logits, tags=tensor_y_rel,
                                      mask=len_mask, reduction='mean')
        return -log_likelihood_ent

    def _output(self, x, lens, use_cuda=None):
        '''
        Return the CRF-decoded paths.
        :param
            @x: word indices; each character is mapped to an index via the vocabulary, (batch_size, T), np.array
            @lens: (batch_size), list, the actual length of each sentence
        :return
            @paths: (batch_size, T), torch.tensor, the best tag path for each sentence
            @scores: (batch_size), torch.tensor, score of the best path for each sentence
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        T = x.shape[1]
        logits = self._get_lstm_features(x, lens, use_cuda)
        lens = self._to_tensor(lens, use_cuda)
        len_mask = self._generate_mask(lens, max_len=T)  # (batch_size, T)
        # paths = self.crf.decode(logits, len_mask)
        paths = self.crf.decode(logits)
        paths = self._to_tensor(paths, use_cuda)
        return paths
class GPT2LSTMLogRegCRF(nn.Module):
    def __init__(self, freeze_bert, tokenizer, device, bidirectional, class_weights):
        super(GPT2LSTMLogRegCRF, self).__init__()
        # Instantiating the GPT-2 model object
        self.gpt2_layer = GPT2Model.from_pretrained('gpt2',
                                                    output_hidden_states=True,
                                                    output_attentions=False)

        # Freeze encoder layers: if True, freeze the GPT-2 weights
        if freeze_bert:
            for p in self.gpt2_layer.parameters():
                p.requires_grad = False

        self.tokenizer = tokenizer
        self.device = device
        self.bidirectional = bidirectional

        self.dropout = nn.Dropout(0.5)

        # lstm layer
        self.lstm_layer = nn.LSTM(input_size=768, hidden_size=512, num_layers=1,
                                  bidirectional=bidirectional, batch_first=True)

        # log reg
        if bidirectional == True:
            self.hidden2tag = nn.Linear(1024, clf_P_fine_num_labels)
        else:
            self.hidden2tag = nn.Linear(512, clf_P_fine_num_labels)

        # crf
        self.crf_layer = CRF(clf_P_fine_num_labels, batch_first=True)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        # GPT-2
        outputs = self.gpt2_layer(input_ids, attention_mask=attention_mask)
        # output 0 = batch size 6, tokens 512, each token dimension 768
        # output 1 = batch size 6, each token dimension 768
        # output 2 = layers 13, batch 6 (hidden states), tokens 512, each token dimension 768
        sequence_output = outputs[2]

        # stack and average the first num_layer_sum hidden-state layers
        num_layer_sum = 4
        summed_last_4_layers = torch.stack(sequence_output[:num_layer_sum]).mean(0)
        summed_last_4_layers = self.dropout(summed_last_4_layers)  # newly added dropout

        # lstm with masks (same as attention masks)
        packed_input, perm_idx, seq_lengths = get_packed_padded_output(
            summed_last_4_layers, input_ids, attention_mask, self.tokenizer)
        packed_output, (ht, ct) = self.lstm_layer(packed_input)

        # Unpack and reorder the output
        output, input_sizes = pad_packed_sequence(packed_output, batch_first=True)
        _, unperm_idx = perm_idx.sort(0)
        lstm_output = output[unperm_idx]  # shorter than the padded input, e.g. torch.Size([6, 388, 512])
        seq_lengths_ordered = seq_lengths[unperm_idx]

        # truncate the labels to the LSTM output length
        labels = labels[:, :lstm_output.shape[1]]

        # mask the unimportant tokens before log_reg (50256 is the GPT-2 <|endoftext|> id)
        mask = ((input_ids[:, :lstm_output.shape[1]] != 50256)
                & (labels != 100))

        # keep the first time step of every sequence unmasked
        for eachIndex in range(mask.shape[0]):
            mask[eachIndex, 0] = True

        mask_expanded = mask.unsqueeze(-1).expand(lstm_output.size())
        lstm_output *= mask_expanded.float()
        labels *= mask.long()

        # log reg
        probabilities = self.hidden2tag(lstm_output)

        # CRF emissions
        loss = self.crf_layer(probabilities, labels, reduction='token_mean')
        emissions_ = self.crf_layer.decode(probabilities)
        emissions = [item for sublist in emissions_ for item in sublist]  # flatten the nested list of emissions

        return loss, torch.Tensor(emissions_), labels, mask
def __init__(self, config: RobertaConfig, args: Any, intent_label_dict: Dict[str, List[str]], slot_label_dict: Dict[str, List[str]], pos_label_lst: List[str], tasks: List[str]) -> None: super(JointRoberta, self).__init__(config) self.args = args self.tasks = tasks self.intent_label_dict = intent_label_dict self.slot_label_dict = slot_label_dict self.pos_label_lst = pos_label_lst self.num_intent_labels_dict = { k: len(v) for (k, v) in intent_label_dict.items() } self.num_slot_labels_dict = { k: len(v) for (k, v) in slot_label_dict.items() } self.intent_classifiers = {} self.slot_classifiers = {} self.crfs = {} self.num_pos_labels = len(pos_label_lst) self.num_np_labels = 1 # len(np_label_lst) self.num_vp_labels = 1 # len(vp_label_lst) self.num_entity_labels = 1 # len(entity_label_lst) self.num_acronym_labels = 1 # len(acronym_label_lst) self.roberta = RobertaModel(config=config) # Load pretrained bert hidden_size = config.hidden_size # TODO pos_emb = 50 should be an input variable if args.use_pos: pos_dim = 50 hidden_size += pos_dim self.pos_emb = (nn.Embedding(self.num_pos_labels, pos_dim) if pos_dim > 0 else None) if args.use_np: hidden_size += self.num_np_labels if args.use_vp: hidden_size += self.num_vp_labels if args.use_entity: hidden_size += self.num_entity_labels if args.use_acronym: hidden_size += self.num_acronym_labels self.custom_pooler = Pooler(hidden_size=hidden_size) for pred_type in self.tasks: self.intent_classifiers[pred_type] = IntentClassifier( hidden_size, self.num_intent_labels_dict[pred_type], args.dropout_rate) for pred_type in self.tasks: self.slot_classifiers[pred_type] = SlotClassifier( hidden_size, self.num_slot_labels_dict[pred_type], args.dropout_rate) if args.use_crf: self.crfs[pred_type] = CRF( num_tags=self.num_slot_labels_dict[pred_type], batch_first=True) self.intent_classifiers = nn.ModuleDict(self.intent_classifiers) self.slot_classifiers = nn.ModuleDict(self.slot_classifiers) self.crfs = nn.ModuleDict(self.crfs)
def test_nonpositive_num_tags(self):
    with pytest.raises(ValueError) as excinfo:
        CRF(0)
    assert 'invalid number of tags: 0' in str(excinfo.value)
class CRFEvaluateWorkflow(object): def __init__(self): self.__MODE = "CRF_EVALUATE" self.__METRIC = "loss_avg" # Validation parameters self.batch_size = None self.num_workers = None self.seed = None self.model_config = None self.ckpt_dir = None # Data self.dataset = None self.prediction_dir = None self.dtype = None self.class_weight = None # Save self.output_dir = None self.experiment_name = None self.result_dir = None self.tmp_dir = None ######## Configs ######## self.args = None def run(self): # Set up logger and print out configurations self.model_dir = init_model_dir(self.output_dir, self.experiment_name) self.logger = set_logger( self.model_dir, self.experiment_name, self.__MODE, self.dtype ) display_args(self.args, self.logger) # Set up GPUs if not torch.cuda.is_available(): raise Exception("No GPU found.") torch.manual_seed(self.seed) torch.cuda.manual_seed_all(self.seed) self.device = torch.device("cuda") self.logger.info( "Use {} GPU(s) for training".format(torch.cuda.device_count()) ) # Initialize model. self.load_model() self.logger.info("MODEL ARCHITECTURE:\n{}".format(self.model)) # Create evaluation result folders. self.initialize_result_directories() # Start evaluation. self.logger.info("Start evaluating CRF classifier.") try: self.crf_evaluate() self.logger.info("CRF classifier evaluation finished.") except KeyboardInterrupt: self.logger.warning("Evaluation interrupted. Program exit.") ######################## # Multiprocessing : main thread ######################## def crf_evaluate(self): """Evaluate all checkpoints for a trained CRF classifier.""" ckpt_tracker = CheckpointTracker(os.path.join(self.ckpt_path, "*.ckpt")) self.time_tracker = TimeTracker() # Initialize validation dataset params = { "batch_size": self.batch_size, "shuffle": False, "num_workers": self.num_workers, "collate_fn": CRF_collate_samples, } file_path, _ = merge_predictions( self.prediction_dir, self.logger, self.tmp_dir, target_path=self.dataset, ) evaluate_dset = CRFDataset(file_path, self.class_weight) evaluate_iter = DataLoader(evaluate_dset, **params) num_batches = len(evaluate_iter) self.logger.info( "{:,} samples used for evaluation.".format(evaluate_dset.__len__()) ) # data preprocess worker preprocess_queue = mp.JoinableQueue(maxsize=128) preprocess_worker = mp.Process( name="preprocess", target=self.preprocess, args=(preprocess_queue, evaluate_iter), ) preprocess_worker.start() self.logger.info("CRF evaluation data workder started") # Evaluate all checkpoints. while len(ckpt_tracker.remaining) > 0: for ckpt in ckpt_tracker.remaining: self.evaluate_checkpoint(ckpt, preprocess_queue, num_batches) ckpt_tracker.add_evaluated(ckpt) ckpt_tracker.reset_params() # Terminate data worker. preprocess_worker.terminate() def evaluate_checkpoint(self, checkpoint, preprocess_queue, num_batches): """Evaluate one checkpoint of a trained CRF classifier.""" # Load checkpoint step = self.load_checkpoint(checkpoint) self.logger.info("Evaluating CRF step {}".format(step)) # Return if the checkpoint is already evaluated. eval_path = os.path.join( self.eval_path, "{}_{}.json".format(self.experiment_name, step) ) if os.path.exists(eval_path): self.logger.info("Step {} already evaluated".format(step)) return # Initialize evaluation worker. evaluate_queue = mp.JoinableQueue(maxsize=64) evaluate_worker = mp.Process( name="evaluate_{}".format(step), target=self.evaluate, args=(checkpoint, evaluate_queue, eval_path, step, num_batches), ) evaluate_worker.start() # Evaluate checkpoint. 
self.model.eval() with torch.no_grad(): for b in tqdm(range(num_batches)): dset = preprocess_queue.get() feature, target = CRF_push_to_device(dset, self.device) loss = -self.model(feature, target, reduction="mean") evaluate_queue.put(loss.item()) evaluate_queue.join() ######################## # Multiprocessing : workers ######################## # Preprocess worker def preprocess(self, queue, dataloader): """Set up multiprocessing data queue""" while True: for dset in dataloader: queue.put(dset) # Evaluate worker def evaluate(self, ckpt, queue, eval_path, step, num_batches): """Evaluate checkpoint and save evaluation to disk.""" self.loss_avg = AverageMeter() for batch in range(num_batches): loss = queue.get() queue.task_done() self.loss_avg.update(loss) self.display_result(step) # Save evaluation result = { "step": step, "loss_avg": self.loss_avg.avg, } with open(eval_path, "w") as outfile: json.dump(result, outfile, indent=4) # Update best checkpoint self.update_best_ckpt(result, ckpt) ############################ # Display and save evaluations ############################ def display_result(self, step): """Display average evaluation loss.""" self.logger.info( "EVALUATE CRF | step {:8d} | avg loss {:8.4f} " "| time elapse: {:>12} |".format( step, self.loss_avg.avg, self.time_tracker.elapse() ) ) def update_best_ckpt(self, result, checkpoint): """Update best checkpoint metrics.""" result["ckpt_path"] = checkpoint result["metric"] = self.__METRIC path = os.path.join( self.best_ckpt_path, "best_{}.json".format(self.__METRIC) ) if os.path.exists(path): with open(path, "r") as infile: metrics = json.load(infile) if metrics[self.__METRIC] <= result[self.__METRIC]: return with open(path, "w") as outfile: json.dump(result, outfile, indent=4) def initialize_result_directories(self): """Initialize output evaluation and checkpoint directories.""" if self.result_dir is not None: self.eval_path = self.result_dir else: self.eval_path = os.path.join( self.model_dir, "{}_{}".format(self.dtype.lower(), self.__MODE.lower()), ) create_dirs(self.eval_path, logger=self.logger) self.best_ckpt_path = os.path.join(self.eval_path, "best_checkpoint") create_dirs(self.best_ckpt_path, logger=self.logger) if self.ckpt_dir is not None: self.ckpt_path = self.ckpt_dir else: self.ckpt_path = os.path.join(self.model_dir, "crf_checkpoints") ############################ # Loading model and checkpoint ############################ def load_model(self): """Load args from .config file and initialize CRF classifier.""" # Load model params if self.model_config is not None: path = self.model_config else: path = os.path.join( self.model_dir, "{}_crf.config".format(self.experiment_name) ) params = torch.load(path, map_location="cpu") # Initialize classifier self.model = CRF(params["output_size"], batch_first=True) self.model.to(self.device) def load_checkpoint(self, ckpt_path): """Load CRF checkpoint state_dict.""" ckpt_params = torch.load(ckpt_path, map_location=self.device) self.model.load_state_dict(ckpt_params["state_dict"]) return ckpt_params["step"]
    train_list.append(train)
    test = copy.copy(whole_corpus).set_filter(test_ids)
    test_list.append(test)

if args.mode != 'eval':
    training_data = ConcatDataset(train_list)
    testing_data = ConcatDataset(test_list)

print('----------------------------------')
end_loading = time.time()
print("Loading done:", end_loading - start_loading)
time_record['Load'] = end_loading - start_loading

model = CRF(len(valid_types), batch_first=True).to(device)

####################
# Training
####################
if args.mode == 'train':
    classifier.load_state_dict(
        torch.load(join(pre_trained_sherlock_loc, pre_trained_sherlock_path),
                   map_location=device))

    # Set initial transition parameters
    if init_transition is not None:
        model.transitions = torch.nn.Parameter(
            torch.tensor(init_transition).float().to(device))
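# NOTE: the `transitions` parameter overwritten above is pytorch-crf's
# (num_tags x num_tags) matrix of transition scores, indexed [from_tag, to_tag].
# A small, hedged sketch of building such an init_transition matrix that penalizes
# invalid BIO bigrams; the tag list and penalty value are illustrative assumptions,
# not taken from the source.
import numpy as np

example_tags = ["O", "B-TYPE", "I-TYPE"]  # hypothetical tag set
init_transition = np.zeros((len(example_tags), len(example_tags)), dtype=np.float32)
for i, prev_tag in enumerate(example_tags):
    for j, curr_tag in enumerate(example_tags):
        # Forbid I-X unless it follows B-X or I-X of the same type.
        if curr_tag.startswith("I-") and prev_tag[2:] != curr_tag[2:]:
            init_transition[i, j] = -100.0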
class BERT_CRF(nn.Module):
    """
    Based on the official template <https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html>.
    The official version targets CPU; to run on GPU, every separately created tensor
    needs .to(device) to move it onto the GPU.
    """

    def __init__(self, tag_to_ix, mask=False):
        super(BERT_CRF, self).__init__()
        self.hidden_dim = 768  # dimension of the last BERT layer = 768
        self.mask = mask
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.hidden2tag = nn.Linear(self.hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def _get_sentence_features(self, sentences):
        """
        Features are extracted with BERT upstream and passed through unchanged here
        to keep the interface uniform, [time_step, 768].
        :param sentences:
        :return:
        """
        if self.mask:
            mask_idx = ~torch.eq(sentences, 0)
            mask_idx = (mask_idx.sum(dim=2) > 0)
            self.mask_idx = mask_idx
        return sentences

    def _get_sentence_feats(self, features):
        feats = self.hidden2tag(features)
        return feats

    def neg_log_likelihood(self, sentences, tags):
        """
        Loss = score of all sequences - score of the correct sequence.
        :param sentences:
        :param tags:
        :return:
        """
        features = self._get_sentence_features(sentences)
        feats = self._get_sentence_feats(features)
        if self.mask:
            loss = -self.crf(feats, tags, self.mask_idx, reduction='mean')
        else:
            loss = -self.crf(feats, tags, reduction='mean')
        return loss

    def _viterbi_decode(self, batch_feats):
        """
        Find the highest-scoring tag sequence with the Viterbi algorithm; used for inference.
        :param batch_feats:
        :return:
        """
        best_path = self.crf.decode(batch_feats)
        return best_path

    def forward(self, sentences):
        """
        Forward pass.
        :param sentences:
        :return:
        """
        features = self._get_sentence_features(sentences)
        feats = self._get_sentence_feats(features)
        tags = self._viterbi_decode(feats)
        return tags
class EnsembleCRFModel:
    def __init__(self, model_path_list, bert_dir_list, num_tags, device, lamb=1/3):
        self.models = []
        self.crf_module = CRF(num_tags=num_tags, batch_first=True)
        self.lamb = lamb

        for idx, _path in enumerate(model_path_list):
            print(f'Load model from {_path}')
            print(f'Load model type: {bert_dir_list[0]}')
            model = CRFModel(bert_dir=bert_dir_list[0], num_tags=num_tags)
            model.load_state_dict(
                torch.load(_path, map_location=torch.device('cpu')))
            model.eval()
            model.to(device)
            self.models.append(model)
            if idx == 0:
                print(f'Load CRF weight from {_path}')
                self.crf_module.load_state_dict(model.crf_module.state_dict())
                self.crf_module.to(device)

    def weight(self, t):
        """
        Weighted fusion based on Newton's law of cooling.
        """
        return math.exp(-self.lamb * t)

    def predict(self, model_inputs):
        weight_sum = 0.
        logits = None
        attention_masks = model_inputs['attention_masks']

        for idx, model in enumerate(self.models):
            # fuse probabilities with Newton-cooling weights
            weight = self.weight(idx)

            # fuse probabilities with a simple average
            # weight = 1 / len(self.models)

            tmp_logits = model(**model_inputs)[1] * weight
            weight_sum += weight

            if logits is None:
                logits = tmp_logits
            else:
                logits += tmp_logits

        logits = logits / weight_sum

        tokens_out = self.crf_module.decode(emissions=logits,
                                            mask=attention_masks.byte())
        return tokens_out

    def vote_entities(self, model_inputs, sent, id2ent, threshold):
        entities_ls = []
        for idx, model in enumerate(self.models):
            tmp_tokens = model(**model_inputs)[0][0]
            tmp_entities = crf_decode(tmp_tokens, sent, id2ent)
            entities_ls.append(tmp_entities)
        return vote(entities_ls, threshold)
class BertTagger_with_LSTMCRF(nn.Module):
    def __init__(self, args, model):  # an existing model is passed in as an argument
        super(BertTagger_with_LSTMCRF, self).__init__()
        self.embedding = model.embedding
        self.encoder = model.encoder
        self.target = model.target
        self.args = args
        self.need_birnn = args.need_birnn
        self.labels_num = args.labels_num
        out_dim = args.hidden_size

        # if need_birnn is False, the BiLSTM layer is skipped
        if self.need_birnn:
            self.birnn = nn.LSTM(args.hidden_size, args.rnn_dim, num_layers=1,
                                 bidirectional=True, batch_first=True)
            out_dim = args.rnn_dim * 2

        self.output_layer = nn.Linear(out_dim, self.labels_num)
        self.dropout = nn.Dropout(args.dropout)
        self.crf = CRF(args.labels_num, batch_first=True)

    def forward(self, src, label, mask, pos=None, vm=None):
        """
        Args:
            src: [batch_size x seq_length]
            label: [batch_size x seq_length]
            mask: [batch_size x seq_length]

        Returns:
            loss: Sequence labeling loss.
            correct: Number of labels that are predicted correctly.
            predict: Predicted label.
            label: Gold label.

        example:
            src size: torch.Size([8, 128])
            output size: torch.Size([8, 128, 768])
            output size: torch.Size([8, 128, 256])
            output size: torch.Size([8, 128, 256])
            output size: torch.Size([8, 128, 15])
            output size: torch.Size([8, 128])
            output size: torch.Size([1024, 1])
            label size: torch.Size([1024, 1])
            label size: torch.Size([1024])
        """
        # Embedding.
        emb = self.embedding(src, mask, pos)
        # Encoder.
        output = self.encoder(emb, mask, vm)

        if self.need_birnn:
            output, _ = self.birnn(output)

        # Target.
        output = self.dropout(output)
        output = self.output_layer(output)

        loss = -1 * self.crf(output, label, mask=mask.byte())
        output = torch.LongTensor(np.array(self.crf.decode(output))).to(self.args.device)

        output = output.contiguous().view(-1, 1)
        label = label.contiguous().view(-1, 1)

        label_mask = (label > 0).float().to(torch.device(label.device))
        label_mask = label_mask.contiguous().view(-1)
        label = label.contiguous().view(-1)
        predict = output.contiguous().view(-1)
        correct = torch.sum(label_mask * (predict.eq(label)).float())

        return loss, correct, predict, label