def _run_mst_decoding(
    batch_energy: torch.Tensor, lengths: np.ndarray,
) -> Tuple[torch.Tensor, torch.Tensor]:
    heads = []
    head_tags = []
    for energy, length in zip(batch_energy.detach().cpu(), lengths):
        scores, tag_ids = energy.max(dim=0)
        # Although we need to include the root node so that the MST includes it,
        # we do not want any word to be the parent of the root node.
        # Here, we enforce this by setting the scores for all word -> ROOT edges
        # to be 0.
        scores[0, :] = 0
        # Decode the heads. Because we modify the scores to prevent
        # adding in word -> ROOT edges, we need to find the labels ourselves.
        instance_heads, _ = decode_mst(scores.numpy(), length, has_labels=False)

        # Find the labels which correspond to the edges in the max spanning tree.
        instance_head_tags = []
        for child, parent in enumerate(instance_heads):
            instance_head_tags.append(tag_ids[parent, child].item())
        # We don't care what the head or tag is for the root token, but by default
        # it's not necessarily the same in the batched vs unbatched case, which is
        # annoying. Here we'll just set them to zero.
        instance_heads[0] = 0
        instance_head_tags[0] = 0
        heads.append(instance_heads)
        head_tags.append(instance_head_tags)
    return (
        torch.from_numpy(np.stack(heads)).to(batch_energy.device),
        torch.from_numpy(np.stack(head_tags)).to(batch_energy.device),
    )
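# Minimal usage sketch for the batched decoder above (shapes and the import
# path are assumptions; AllenNLP ships decode_mst in allennlp.nn.chu_liu_edmonds):
import numpy as np
import torch
from allennlp.nn.chu_liu_edmonds import decode_mst

batch_size, num_tags, seq_len = 2, 3, 5
# batch_energy[b, tag, head, child] scores the edge head -> child with that tag.
batch_energy = torch.randn(batch_size, num_tags, seq_len, seq_len)
lengths = np.array([5, 4])  # per-sentence lengths, including the ROOT token
heads, head_tags = _run_mst_decoding(batch_energy, lengths)
print(heads.shape, head_tags.shape)  # torch.Size([2, 5]) torch.Size([2, 5])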
def loss_aug_inf(true_tree, scores_tensor, edges_map):
    # todo: make this method more efficient and generic
    true_edges = {(int(true_h), true_m) for true_m, true_h in enumerate(true_tree[1:], 1)}
    n_edges = len(true_edges)
    n_words = n_edges + 1
    fine = 1  # cost added to every edge that is not in the gold tree

    # Populate the score matrix, cost-augmenting the edges outside the gold tree.
    scores_np_matrix = np.zeros((n_words, n_words))
    for (h, m), indx in edges_map.items():
        scores_np_matrix[h][m] = scores_tensor[indx].item()
        if (h, m) not in true_edges:
            scores_np_matrix[h][m] += fine

    # Cost-augmented decoding: the maximum spanning tree of the augmented scores.
    predicted_tree = decode_mst(scores_np_matrix, n_words, has_labels=False)[0]

    # Collect the (augmented) scores of the predicted tree. Building a list and
    # stacking keeps the computation graph intact; in-place writes into a leaf
    # tensor that requires grad are not allowed.
    pred_scores = []
    for pred_m, pred_h in enumerate(predicted_tree[1:], 1):
        edge_score = scores_tensor[edges_map[(int(pred_h), pred_m)]]
        if (int(pred_h), pred_m) not in true_edges:
            edge_score = edge_score + fine
        pred_scores.append(edge_score)
    pred_scores = torch.stack(pred_scores)

    # Collect the scores of the gold tree.
    true_scores = torch.stack([scores_tensor[edges_map[(true_h, true_m)]]
                               for (true_h, true_m) in true_edges])

    # Structured hinge loss: clamp(margin + augmented predicted score - gold score, 0),
    # returned as a positive quantity to be minimised.
    loss = torch.clamp(1 + torch.sum(pred_scores) - torch.sum(true_scores), min=0.0)
    return loss
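# Hypothetical usage of loss_aug_inf (names, shapes, and the import path are
# assumptions): a 3-token sentence (ROOT + 2 words) whose gold tree is ROOT -> 1 -> 2.
import numpy as np
import torch
from allennlp.nn.chu_liu_edmonds import decode_mst

true_tree = [-1, 0, 1]  # true_tree[m] is the head of token m; ROOT has none
valid_edges = [(h, m) for m in range(1, 3) for h in range(3) if h != m]
edges_map = {edge: i for i, edge in enumerate(valid_edges)}
scores_tensor = torch.randn(len(valid_edges), 1, requires_grad=True)
loss = loss_aug_inf(true_tree, scores_tensor, edges_map)
loss.backward()  # gradients reach scores_tensor through the hinge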
def evaluate(model, dataloader):
    acc = 0
    loss_value = 0
    # disable gradient tracking: the model must not learn during evaluation
    with torch.no_grad():
        loss_function = nn.NLLLoss(ignore_index=-1, reduction='mean')
        for batch_idx, input_data in enumerate(dataloader):
            MLP_scores_mat = model(input_data)
            gold_heads = input_data[2]
            # prepend -1 to the gold heads; the loss ignores this target value
            target = torch.cat((torch.tensor([-1]), gold_heads[0])).to(model.device)
            # negative log likelihood loss, with log-softmax over the rows
            # (modifiers are in the rows)
            loss = loss_function(F.log_softmax(MLP_scores_mat, dim=1), target)
            loss_value += loss.item()
            # Use Chu-Liu-Edmonds to get the predicted parse tree T' from the score
            # matrix. The result looks like [-1, 5, 0, ..., 4]: position 0 is always
            # -1 because every sentence starts with the '<root>' token.
            predicted_tree = decode_mst(MLP_scores_mat.data.cpu().numpy().T,
                                        length=MLP_scores_mat.shape[0],
                                        has_labels=False)[0]
            acc += sum(gold_heads[0].numpy() == predicted_tree[1:]) / len(gold_heads[0])
    acc = acc / len(dataloader)
    loss_value = loss_value / len(dataloader)
    return acc, loss_value
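# Toy check of the transpose convention used above (illustrative only): the
# model's score matrix has modifiers in rows (scores[m, h] = score of h -> m),
# while decode_mst expects energy[head, modifier], hence the .T before decoding.
# decode_mst is assumed to come from allennlp.nn.chu_liu_edmonds.
import numpy as np
from allennlp.nn.chu_liu_edmonds import decode_mst

scores = np.array([[0.0, 0.0, 0.0],   # row 0: the '<root>' token (never a modifier)
                   [9.0, 0.0, 1.0],   # token 1 prefers head 0 (ROOT)
                   [2.0, 8.0, 0.0]])  # token 2 prefers head 1
heads, _ = decode_mst(scores.T, length=3, has_labels=False)
print(heads)  # [-1  0  1]: -1 for ROOT, token 1 headed by ROOT, token 2 by token 1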
def get_acc(edge_scores, headers_idx_tensors, batch_size, max_length, sentence_length):
    """
    Uses the Chu-Liu-Edmonds algorithm to infer a parse tree and calculates the
    accuracy of the current batch.

    Args:
        edge_scores: Edge score matrix produced by our chosen model.
        headers_idx_tensors: The gold heads to compare against.
        batch_size: The number of sentences in the batch.
        max_length: The maximum sentence length in the batch.
        sentence_length: List of each sentence's length.

    Returns:
        The summed accuracy over the current batch.
    """
    acc = 0
    trees = []
    for i in range(batch_size):
        # Crop the i-th sentence's square score matrix to its true length before decoding.
        scores_i = np.array(edge_scores[:, i].detach().cpu()).reshape(
            (max_length, max_length))[:sentence_length[i], :sentence_length[i]]
        trees.append(decode_mst(scores_i, sentence_length[i], has_labels=False)[0])
    for i in range(batch_size):
        # Count matching heads; position 0 (the root token) is excluded.
        acc += torch.sum(torch.tensor(
            headers_idx_tensors[i][1:].tolist() == trees[i][1:],
            dtype=torch.float, requires_grad=False))
    return acc
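# Hypothetical usage of get_acc (all shapes and values here are assumptions):
# edge scores are stored one column per sentence and reshaped to
# (max_length, max_length) inside the function.
import numpy as np
import torch
from allennlp.nn.chu_liu_edmonds import decode_mst  # assumed import path

batch_size, max_length = 2, 4
edge_scores = torch.randn(max_length * max_length, batch_size)
gold = [torch.tensor([0, 0, 1, 2]), torch.tensor([0, 0, 1, 2])]  # toy gold heads
acc = get_acc(edge_scores, gold, batch_size, max_length, [4, 4])
print(float(acc))  # correctly attached tokens, summed over both sentences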
def tag_file_save_output(model, dataloader, original_unlabeled_file, result_path):
    # read the whole file we wish to tag into a list of lines
    with open(original_unlabeled_file) as file_to_tag:
        lines = file_to_tag.readlines()
    # run inference and write the output to a file in the expected format
    with open(result_path, 'w') as result:
        with torch.no_grad():
            for batch_idx, input_data in enumerate(dataloader):
                MLP_scores_mat = model(input_data)
                # The result looks like [-1, 5, 0, ..., 4]: position 0 is always -1
                # because every sentence starts with the '<root>' token.
                predicted_tree = decode_mst(
                    MLP_scores_mat.data.cpu().numpy().T,
                    length=MLP_scores_mat.shape[0],
                    has_labels=False)[0]
                for head in predicted_tree[1:]:
                    original_line = lines[0]
                    tabs_locs = [idx for idx, char in enumerate(original_line)
                                 if char == "\t"]
                    # splice the predicted head in between the 6th and 7th '\t'
                    # (the HEAD column of the CoNLL format)
                    line_to_save = (original_line[:tabs_locs[5] + 1] + str(head)
                                    + original_line[tabs_locs[6]:])
                    result.write(line_to_save)
                    del lines[0]
                result.write(lines[0])
                del lines[0]  # the separating '\n' between sentences
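# Toy illustration of the column splice above: CoNLL-style lines are tab-
# separated and the predicted head goes into field 7 (between the 6th and 7th tabs).
line = "1\tThe\t_\tDET\tDT\t_\t0\t_\t_\t_\n"
tabs = [idx for idx, char in enumerate(line) if char == "\t"]
patched = line[:tabs[5] + 1] + "3" + line[tabs[6]:]
print(patched)  # the HEAD column now reads 3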
def evaluate(dataloader, model, pretrained_embeds=None, ix_to_word=None):
    model.eval()  # put the model in eval mode to disable dropout
    true_positives = 0
    total_tokens = 0
    with torch.no_grad():
        for batch_idx, input_data in enumerate(dataloader):
            if len(input_data) == 4:
                word_idx, pos_idx, gold, word_embeds_idx = input_data
            else:
                word_idx, pos_idx, gold = input_data
                word_embeds_idx = word_idx
            if pretrained_embeds and ix_to_word:
                external_embeds = get_pretrained_vector(
                    pretrained_embeds, word_embeds_idx, ix_to_word)
                scores = model(word_idx, pos_idx, external_embeds)
            else:
                scores = model(word_idx, pos_idx)
            scores = scores.cpu().detach().numpy().T
            gold = gold.squeeze(0)[1:].detach().numpy()
            predicted_heads, _ = decode_mst(scores, len(scores[0]), False)
            true_positives += np.sum(np.equal(predicted_heads[1:], gold))
            total_tokens += len(gold)
    uas = true_positives / total_tokens
    return uas
def predict_dep(scores):
    predictions = []
    for sentence_scores in scores:
        score_matrix = sentence_scores.cpu().detach().numpy()
        # forbid edges into the root token: no word may become ROOT's parent
        score_matrix[:, 0] = float("-inf")
        mst, _ = decode_mst(score_matrix, len(score_matrix), has_labels=False)
        predictions.append(mst)
    # note: stacking into one array assumes all sentences share the same length
    return np.array(predictions)
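# Hypothetical usage (shapes are assumptions): `scores` holds one
# (seq_len, seq_len) matrix per sentence, indexed as energy[head, modifier];
# banning column 0 keeps every word from becoming the root token's parent.
import torch

scores = torch.randn(4, 6, 6)  # a batch of 4 equal-length sentences
heads = predict_dep(scores)    # -> np.ndarray of shape (4, 6)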
def forward(self, word_idx_tensor, tag_idx_tensor, calc_mst=False):
    # embed the input
    word_embeds = self.word_embedding(
        word_idx_tensor.to(self.device))  # [batch_size, seq_length, word_emb_dim]
    tag_embeds = self.tag_embedding(
        tag_idx_tensor.to(self.device))  # [batch_size, seq_length, tag_emb_dim]
    concat_emb = torch.cat(
        [word_embeds, tag_embeds],
        dim=2)  # [batch_size, seq_length, word_emb_dim+tag_emb_dim]
    lstm_out, _ = self.encoder(
        concat_emb.view(concat_emb.shape[1], 1, -1))  # [seq_length, batch_size, 2*hidden_dim]
    lstm_out_b_first = lstm_out.permute(1, 0, 2)
    # Apply the first MLP layer in two halves: one half of fc1's weight transforms
    # each token for one role of the (head, modifier) pair, the other half for the
    # other role. (Note that fc1's bias is added in both halves, so it is
    # effectively counted twice in Z.)
    first_part_out = (
        lstm_out_b_first @ self.fc1.weight.T[:lstm_out_b_first.shape[2], :]
        + self.fc1.bias).squeeze(0)
    second_part_out = (
        lstm_out_b_first @ self.fc1.weight.T[lstm_out_b_first.shape[2]:, :]
        + self.fc1.bias).squeeze(0)
    # Combine the two parts into a seq_length x seq_length grid of pair vectors:
    # Z[i, j] = first_part_out[j] + second_part_out[i].
    first_part_out2 = first_part_out.unsqueeze(0).repeat(second_part_out.shape[0], 1, 1)
    second_part_out2 = second_part_out.unsqueeze(1).repeat(1, first_part_out.shape[0], 1)
    Z = first_part_out2 + second_part_out2
    out_1 = Z.view(-1, Z.shape[-1])  # [seq_length**2, hidden_dim_mlp]
    scores = self.fc2(self.tan(out_1)).view(lstm_out.shape[0],
                                            lstm_out.shape[0]).squeeze(0)
    tmp_scores = F.log_softmax(scores, dim=1)
    # decode the tree if requested
    our_heads = None
    if calc_mst:
        with torch.no_grad():
            dep_scores = scores.unsqueeze(0).permute(0, 2, 1)
            dep_scores_2d = dep_scores.squeeze(0)
            # TODO: add zeros on diagonal
            our_heads, _ = decode_mst(energy=dep_scores_2d.cpu().numpy(),
                                      length=tmp_scores.shape[0],
                                      has_labels=False)
    return tmp_scores, our_heads, scores
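# Standalone sketch of the pairwise-combination trick above: two (n, d)
# matrices combine into an (n, n, d) grid with Z[i, j] = head_part[j] + dep_part[i].
# Broadcasting makes the explicit .repeat() calls unnecessary.
import torch

n, d = 4, 3
head_part = torch.randn(n, d)
dep_part = torch.randn(n, d)
Z = head_part.unsqueeze(0) + dep_part.unsqueeze(1)  # (n, n, d) by broadcasting
assert torch.allclose(Z[2, 1], head_part[1] + dep_part[2])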
def parser(dataloader, model, pretrained_embeds=None, ix_to_word=None):
    predicted_list = []
    model.eval()  # put the model in eval mode to disable dropout
    with torch.no_grad():
        for batch_idx, input_data in enumerate(dataloader):
            if len(input_data) == 4:
                word_idx, pos_idx, gold, word_embeds_idx = input_data
            else:
                word_idx, pos_idx, gold = input_data
                word_embeds_idx = word_idx
            if pretrained_embeds and ix_to_word:
                external_embeds = get_pretrained_vector(
                    pretrained_embeds, word_embeds_idx, ix_to_word)
                scores = model(word_idx, pos_idx, external_embeds)
            else:
                scores = model(word_idx, pos_idx)
            scores = scores.cpu().detach().numpy().T
            predicted_heads, _ = decode_mst(scores, len(scores[0]), False)
            predicted_list.append(predicted_heads[1:])
    return predicted_list
def forward(self, sentence, word_dropout=False):
    # Decompose the input
    word_indx_tensor, pos_indx_tensor, true_tree_heads = sentence
    n_words = word_indx_tensor.shape[1]

    # Word dropout: replace a word with <unk> with probability 0.25 / (count + 0.25),
    # so rare words are dropped more often. The tensor is [1, n_words], so we
    # iterate over its single row.
    if word_dropout:
        for cell_indx, word_indx in enumerate(word_indx_tensor[0]):
            unk_prob = 0.25 / (self.w_indx_counter[int(word_indx)] + 0.25)
            bernoulli_rv = np.random.binomial(1, unk_prob, 1)
            if bernoulli_rv:
                word_indx_tensor[0, cell_indx] = self.w2i['<unk>']

    # Word & POS embedding
    word_emb_tensor = self.word_embedding(word_indx_tensor.to(self.device))
    pos_emb_tensor = self.pos_embedding(pos_indx_tensor.to(self.device))

    # Embeddings concatenation
    if self.ex_emb_flag:
        ex_word_em_tensor = self.ex_word_embedding(word_indx_tensor)
        input_vectors = torch.cat((word_emb_tensor, ex_word_em_tensor, pos_emb_tensor), dim=-1)
    else:
        input_vectors = torch.cat((word_emb_tensor, pos_emb_tensor), dim=-1)

    hidden_vectors, _ = self.encoder(input_vectors)
    hidden_vectors = hidden_vectors.squeeze()
    heads_tensor = self.mlp_head(hidden_vectors)
    dep_tensor = self.mlp_dep(hidden_vectors)
    # Append a bias column to the dependent representations before the bilinear scorer
    pad_dep_tensor = torch.cat(
        (dep_tensor, torch.ones(dep_tensor.shape[0]).to(self.device).unsqueeze(1)), dim=1)
    scores = torch.matmul(torch.matmul(pad_dep_tensor, self.weights), heads_tensor.T).T

    # Prediction & loss calculation (scores must be on the CPU before decoding)
    predicted_tree = decode_mst(scores.detach().cpu().numpy(), n_words, has_labels=False)
    scores_s_max = self.s_max(scores)
    loss = self.loss(true_tree_heads, scores_s_max)
    return loss, predicted_tree[0]
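# Worked example of the word-dropout rate used above (alpha = 0.25, as in
# Kiperwasser & Goldberg, 2016): the rarer the word, the more often it is
# replaced by <unk>.
for count in (1, 10, 100):
    print(count, 0.25 / (count + 0.25))  # 0.2, ~0.0244, ~0.0025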
def train_kiperwasser_parser(model, train_dataloader, test_dataloader, epochs, learning_rate, weight_decay, alpha):
    start = time.time()
    total_test_time = 0
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    if use_cuda:
        model.cuda()

    # Define the loss function as the Negative Log Likelihood loss (NLLLoss)
    loss_function = nn.NLLLoss(ignore_index=-1, reduction='mean')
    # Use an Adam optimizer to minimize the loss function
    optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                           weight_decay=weight_decay)  # TODO optimize learning rate 'lr'
    acumulate_grad_steps = 50  # the effective batch size, while we officially use batch_size=1

    # Training start
    print("Training Started")
    train_accuracy_list, train_loss_list = [], []
    test_accuracy_list, test_loss_list = [], []
    model.zero_grad()
    for epoch in range(epochs):
        train_acc = 0  # to keep track of accuracy
        train_loss = 0  # to keep track of the loss value
        mst_trees_calculated = 0  # number of trees decoded, for the accuracy estimate
        i = 0  # number of samples processed
        data = list(enumerate(train_dataloader))  # materialize so we can apply word dropout
        word_dropout(model, data, alpha=alpha)
        for batch_idx, input_data in data:
            i += 1
            # size = [sentence_length + 1, sentence_length + 1]
            MLP_scores_mat = model(input_data)  # forward is activated inside
            gold_heads = input_data[2]
            # prepend -1 to the gold heads; the loss ignores this target value
            target = torch.cat((torch.tensor([-1]), gold_heads[0])).to(device)
            # negative log likelihood loss, log-softmax over the rows (modifiers in rows)
            loss = loss_function(F.log_softmax(MLP_scores_mat, dim=1), target)
            loss = loss / acumulate_grad_steps
            loss.backward()
            train_loss += loss.item()
            # decode sampled trees - only for accuracy tracking during training
            if i > 0.9 * len(train_dataloader):  # predict trees on the last 10% of the train data
                # The result looks like [-1, 5, 0, ..., 4]: position 0 is always -1
                # because every sentence starts with the '<root>' token.
                predicted_tree = decode_mst(MLP_scores_mat.cpu().data.numpy().T,
                                            length=MLP_scores_mat.shape[0],
                                            has_labels=False)[0]
                train_acc += sum(gold_heads[0].numpy() == predicted_tree[1:]) / len(gold_heads[0])
                mst_trees_calculated += 1
            # perform an optimization step
            if i % acumulate_grad_steps == 0 or i == len(train_dataloader):
                optimizer.step()
                model.zero_grad()
        train_loss = acumulate_grad_steps * train_loss / len(train_dataloader)
        train_acc = train_acc / mst_trees_calculated if mst_trees_calculated != 0 else 0
        train_loss_list.append(train_loss)
        train_accuracy_list.append(train_acc)

        start_test_time = time.time()
        # calculate test accuracy >>> skip the next 3 lines if the test accuracy during training is not needed
        test_acc, test_loss = evaluate(model, test_dataloader)
        test_accuracy_list.append(test_acc)
        test_loss_list.append(test_loss)
        stop_test_time = time.time()
        total_test_time += stop_test_time - start_test_time

        print(f"Epoch {epoch + 1} Completed,\tTrain Loss {train_loss}\t Train Accuracy: {train_acc}\t "
              f"Test Loss {test_loss}\t Test Accuracy: {test_acc}")
        # print the time at the end of the epoch
        print(f"Epoch {epoch + 1} Time "
              f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(time.time())))}")
    stop = time.time()
    total_train_time = stop - start - total_test_time
    print(f'\n\n\ntotal_train_time = {int(total_train_time)} SECS \t total_test_time = {int(total_test_time)} SECS')
    return train_accuracy_list, train_loss_list, test_accuracy_list, test_loss_list
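# Worked check of the loss bookkeeping above (toy numbers): each per-sentence
# loss is divided by acumulate_grad_steps before being summed, so multiplying
# the sum back by acumulate_grad_steps and dividing by the dataset size
# recovers the plain mean per-sentence loss.
acumulate_grad_steps = 50
per_sentence_losses = [2.0, 4.0, 6.0]  # hypothetical NLL values
accumulated = sum(l / acumulate_grad_steps for l in per_sentence_losses)
train_loss = acumulate_grad_steps * accumulated / len(per_sentence_losses)
print(train_loss)  # 4.0 == mean of the raw losses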
def forward(self, sentence, word_dropout=False):
    # Decompose the input
    if self.labels_flag:
        word_indx_tensor, pos_indx_tensor, true_tree_heads, true_edges_labels = sentence
    else:
        word_indx_tensor, pos_indx_tensor, true_tree_heads = sentence
    n_words = word_indx_tensor.shape[1]

    # Word dropout: replace a word with <unk> with probability 0.25 / (count + 0.25).
    # The tensor is [1, n_words], so iterate over its single row.
    if word_dropout:
        for cell_indx, word_indx in enumerate(word_indx_tensor[0]):
            unk_prob = 0.25 / (self.w_indx_counter[int(word_indx)] + 0.25)
            bernoulli_rv = np.random.binomial(1, unk_prob, 1)
            if bernoulli_rv:
                word_indx_tensor[0, cell_indx] = self.w2i['<unk>']

    # Word & POS embedding
    word_emb_tensor = self.word_embedding(word_indx_tensor.to(self.device))
    pos_emb_tensor = self.pos_embedding(pos_indx_tensor.to(self.device))

    # Embeddings concatenation
    if self.ex_emb_flag:
        ex_word_em_tensor = self.ex_word_embedding(word_indx_tensor)
        input_vectors = torch.cat((word_emb_tensor, ex_word_em_tensor, pos_emb_tensor), dim=-1)
    else:
        input_vectors = torch.cat((word_emb_tensor, pos_emb_tensor), dim=-1)

    hidden_vectors, _ = self.encoder(input_vectors)

    # Create 'edge vectors' by concatenating pairs of hidden vectors
    edges_map, true_edges_map = {}, {}
    true_indx = 0
    valid_edges = [(h, m) for m in range(1, n_words) for h in range(0, n_words) if h != m]
    heads, mods, true_edges_indx = [], [], []
    for indx, (h, m) in enumerate(valid_edges):
        heads.append(h)
        mods.append(m)
        edges_map[(h, m)] = indx
        if true_tree_heads[m] == h:
            true_edges_indx.append(indx)
            true_edges_map[(h, m)] = true_indx
            true_indx += 1
    edges_tensor = torch.cat((hidden_vectors[0, heads, :], hidden_vectors[0, mods, :]), dim=-1)
    true_edges_tensor = edges_tensor[true_edges_indx, :]

    # Activate the scorer
    scores_tensor = self.edge_scorer(edges_tensor)
    if self.labels_flag:
        l_softmax_tensor = self.labels_mlp(true_edges_tensor)

    # Represent the scores as a 2-dimensional numpy array
    scores_np_matrix = np.zeros((n_words, n_words))
    for (h, m), indx in edges_map.items():
        scores_np_matrix[h][m] = scores_tensor[indx].item()

    # Prediction & loss calculation
    predicted_tree = decode_mst(scores_np_matrix, n_words, has_labels=False)
    if self.labels_flag:
        loss = self.loss(true_tree_heads, scores_tensor, edges_map,
                         true_edges_labels, l_softmax_tensor, true_edges_map)
    else:
        loss = self.loss(true_tree_heads, scores_tensor, edges_map)
    return loss, predicted_tree[0]
def unlabeled_attachment_score(scores, heads):
    # Decode the tree over the score matrix with its first row dropped; the
    # decoded head indices are therefore offset by one relative to `heads`.
    parse_tree, _ = decode_mst(scores[1:, :].detach().cpu().numpy(),
                               heads.shape[0] - 1, has_labels=False)
    # UAS: the fraction of tokens whose predicted head (shifted back by +1)
    # matches the gold head.
    return sum(parse_tree[i] + 1 == heads[i].item()
               for i in range(heads.shape[0] - 1)) / (heads.shape[0] - 1)
def train_epoch(train, dl):
    global max_uas
    losses_test = []
    losses_train = []
    UAS_train = []
    UAS_test = []
    loss = None
    acumulate_grad_steps = 256
    for i, data_batch in enumerate(dl):
        curr_sentence = data_batch
        curr_sentence[0] = curr_sentence[0].squeeze().to(device)
        curr_sentence[1] = curr_sentence[1].squeeze().to(device)
        curr_sentence[2] = curr_sentence[2].squeeze().to(device)
        curr_sentence[3] = curr_sentence[3].squeeze().to(device)
        sentence_inputs = curr_sentence[0:3]
        sentence_len = curr_sentence[2].item()
        sentence_labels = curr_sentence[3].to(device)
        score_mat = curr_model.forward(sentence_inputs)  # do not forward the labels
        score_mat = score_mat.to(device)
        '''
        # disabled sanity-check block: overwrite the score matrix with an
        # oracle that gives every gold edge a score of 100
        for head_idx in range(sentence_len):
            for modifier_idx in range(sentence_len):
                if head_idx == modifier_idx:
                    score_mat[head_idx][modifier_idx] = 0
                    continue
                if modifier_idx == sentence_labels[head_idx]:
                    score_mat[modifier_idx][head_idx] = 100
                else:
                    score_mat[modifier_idx][head_idx] = 0
        '''
        # Calculate the negative log likelihood loss described above
        loss = nll_loss(score_mat, sentence_labels, sentence_len)
        loss = loss / acumulate_grad_steps
        if train:
            loss.backward()
            if i % acumulate_grad_steps == 0:
                optimizer.step()
                curr_model.zero_grad()
            losses_train.append(loss.item())
            if i % acumulate_grad_steps == 0:
                # decode only occasionally during training to save time
                predicted_tree, _ = decode_mst(energy=score_mat.cpu().detach().numpy(),
                                               length=score_mat.shape[0],
                                               has_labels=False)
                uas_score = UAS(predicted_tree, sentence_labels)
                UAS_train.append(uas_score)
        else:
            # Use Chu-Liu-Edmonds to get the predicted parse tree T' from the score matrix
            predicted_tree, _ = decode_mst(energy=score_mat.cpu().detach().numpy(),
                                           length=score_mat.shape[0],
                                           has_labels=False)
            uas_score = UAS(predicted_tree, sentence_labels)
            losses_test.append(loss.item())
            UAS_test.append(uas_score)
    if train:
        print("\nTrain: epoch number", epoch, ": loss = ", np.mean(losses_train),
              ": UAS = ", np.mean(UAS_train), "%")
    else:
        print("Test: epoch number", epoch, ": loss = ", np.mean(losses_test),
              ": UAS = ", np.mean(UAS_test), "%")
        if np.mean(UAS_test) > max_uas:
            print("saving the model")
            max_uas = np.mean(UAS_test)
            torch.save(curr_model.state_dict(),
                       "model" + str(model_choosed) + "_epoch" + str(epoch) + ".pt")
    # note: in train mode the test lists are empty (and vice versa), so their means are nan
    return np.mean(losses_train), np.mean(UAS_train), np.mean(losses_test), np.mean(UAS_test)
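# Minimal, self-contained sketch of the gradient-accumulation pattern used
# above (the toy model and data are stand-ins): dividing each loss by the
# accumulation factor makes the accumulated gradient match one averaged
# mini-batch of `accumulate_grad_steps` examples.
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(4, 1)
optimizer = optim.SGD(model.parameters(), lr=0.1)
accumulate_grad_steps = 8

for i in range(1, 33):  # 32 single-example "batches"
    x, y = torch.randn(1, 4), torch.randn(1, 1)
    loss = nn.functional.mse_loss(model(x), y) / accumulate_grad_steps
    loss.backward()  # gradients accumulate in the parameters' .grad buffers
    if i % accumulate_grad_steps == 0:
        optimizer.step()
        optimizer.zero_grad()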