Example #1
    def _run_mst_decoding(
        batch_energy: torch.Tensor, lengths: np.ndarray,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        heads = []
        head_tags = []
        for energy, length in zip(batch_energy.detach().cpu(), lengths):
            scores, tag_ids = energy.max(dim=0)
            # Although we need to include the root node so that the MST includes it,
            # we do not want any word to be the parent of the root node.
            # Here, we enforce this by setting the scores for all word -> ROOT
            # edges to be 0.
            scores[0, :] = 0
            # Decode the heads. Because we modify the scores to prevent
            # adding in word -> ROOT edges, we need to find the labels ourselves.
            instance_heads, _ = decode_mst(scores.numpy(), length, has_labels=False)

            # Find the labels which correspond to the edges in the max spanning tree.
            instance_head_tags = []
            for child, parent in enumerate(instance_heads):
                instance_head_tags.append(tag_ids[parent, child].item())
            # We don't care what the head or tag is for the root token, but by default
            # it's not necessarily the same in the batched vs unbatched case, which is
            # annoying. Here we'll just set them to zero.
            instance_heads[0] = 0
            instance_head_tags[0] = 0
            heads.append(instance_heads)
            head_tags.append(instance_head_tags)
        return (
            torch.from_numpy(np.stack(heads)).to(batch_energy.device),
            torch.from_numpy(np.stack(head_tags)).to(batch_energy.device),
        )
Example #2
def loss_aug_inf(true_tree, scores_tensor, edges_map):
    # todo: make this method more efficient and generic
    true_edges = set([(true_h, true_m) for true_m, true_h in enumerate(true_tree[1:], 1)])
    n_edges = len(true_edges)
    n_words = n_edges + 1
    fine = 1  # margin penalty added to every edge that is not in the gold tree

    # Populate score matrix - add a constant for edges that aren't part of the true tree
    scores_np_matrix = np.zeros((n_words, n_words))
    for (h, m) in edges_map.keys():
        if (h, m) in true_edges:
            scores_np_matrix[h][m] = scores_tensor[edges_map[(h, m)]].item()
        else:
            scores_np_matrix[h][m] = scores_tensor[edges_map[(h, m)]].item() + fine

    # Get the maximum spanning tree
    predicted_tree = decode_mst(scores_np_matrix, n_words, has_labels=False)[0]

    # Collect the predicted tree scores (augmented by the fine for non-gold edges);
    # building a list and stacking keeps the scores inside the autograd graph
    pred_score_list = []
    for pred_m, pred_h in enumerate(predicted_tree[1:], 1):
        edge_score = scores_tensor[edges_map[(pred_h, pred_m)]]
        if (pred_h, pred_m) not in true_edges:
            edge_score = edge_score + fine
        pred_score_list.append(edge_score)
    pred_scores = torch.stack(pred_score_list).view(-1)

    # Collect the gold tree scores in the same order
    true_scores = torch.stack(
        [scores_tensor[edges_map[(true_h, true_m)]]
         for true_m, true_h in enumerate(true_tree[1:], 1)]).view(-1)

    # Loss calculation; torch.clamp keeps the result connected to the computation graph
    loss = torch.clamp(1 + torch.sum(true_scores) - torch.sum(pred_scores), min=0)

    return -1 * loss  # todo: maybe we should multiply the loss by -1
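For context, the standard loss-augmented hinge objective that a function like loss_aug_inf serves can be stated compactly (this is an assumption about the intended objective, not a transcription of the exact expression returned above): decoding runs over scores augmented by the per-edge fine, and the loss compares the augmented predicted tree against the gold tree.

    \Delta(y, y') = \text{fine} \cdot \lvert y' \setminus y \rvert, \qquad
    \hat{y} = \arg\max_{y'} \bigl( s(y') + \Delta(y, y') \bigr), \qquad
    \mathcal{L} = \max\bigl( 0,\; s(\hat{y}) + \Delta(y, \hat{y}) - s(y) \bigr)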
Example #3
def evaluate(model, dataloader):
    acc = 0
    loss_value = 0

    # disable gradient tracking during evaluation
    with torch.no_grad():
        loss_function = nn.NLLLoss(ignore_index=-1, reduction='mean')
        for batch_idx, input_data in enumerate(dataloader):
            MLP_scores_mat = model(input_data)
            gold_heads = input_data[2]

            # prepend -1 (the root position) to the gold heads; the loss ignores targets of -1
            target = torch.cat((torch.tensor([-1]), gold_heads[0])).to(model.device)

            # calculate negative log likelihood loss
            # log softmax over the rows (modifiers in rows)
            loss = loss_function(F.log_softmax(MLP_scores_mat, dim=1), target)
            loss_value += loss.item()

            # Use Chu-Liu-Edmonds to get the predicted parse tree T' given the calculated score matrix
            # e.g. res = [-1, 5, 0, ..., 4] - always -1 at index 0, because every sentence starts with the '<root>' token
            predicted_tree = decode_mst(MLP_scores_mat.data.cpu().numpy().T, length=MLP_scores_mat.shape[0],
                                        has_labels=False)[0]

            acc += sum(gold_heads[0].numpy() == predicted_tree[1:]) / len(gold_heads[0])
        acc = acc / len(dataloader)
        loss_value = loss_value / len(dataloader)

    return acc, loss_value
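The ignore_index=-1 trick above (also used in the training loop of a later example) simply masks the root position out of the loss. A minimal, self-contained sketch with made-up scores and gold heads:

import torch
import torch.nn as nn
import torch.nn.functional as F

loss_function = nn.NLLLoss(ignore_index=-1, reduction='mean')

# 4 positions (root + 3 words), 4 candidate heads per position
scores = torch.randn(4, 4)
# -1 marks the root row, which the loss ignores; the rest are gold head indices
target = torch.tensor([-1, 2, 0, 2])

loss = loss_function(F.log_softmax(scores, dim=1), target)
print(loss.item())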
Example #4
def get_acc(edge_scores, headers_idx_tensors, batch_size, max_length,
            sentence_length):
    """
    Uses the Chu-Liu-Edmonds algorithm to infer a parse tree and calculates the current batch accuracy.
    Args:
        edge_scores: Edge scores matrix produced by our chosen model.
        headers_idx_tensors: The gold heads to compare against.
        batch_size: The number of sentences in a batch.
        max_length: The maximum sentence length in the batch.
        sentence_length: List of the lengths of all sentences in the batch.

    Returns:
        The summed accuracy of the current batch.

    """
    acc = 0
    trees = []
    for i in range(batch_size):
        trees.append(
            decode_mst(np.array(edge_scores[:, i].detach().cpu()).reshape(
                (max_length,
                 max_length))[:sentence_length[i], :sentence_length[i]],
                       sentence_length[i],
                       has_labels=False)[0])

    for i in range(batch_size):
        acc += torch.sum(
            torch.tensor(headers_idx_tensors[i][1:].tolist() == trees[i][1:],
                         dtype=torch.float,
                         requires_grad=False))
    return acc
Example #5
def tag_file_save_output(model, dataloader, original_unlabeled_file,
                         result_path):
    # read the whole file we wish to tag into a list of lines
    with open(original_unlabeled_file) as file_to_tag:
        lines = file_to_tag.readlines()

    # run inference and write the output to file in the desired format
    with open(result_path, 'w') as result:
        with torch.no_grad():
            for batch_idx, input_data in enumerate(dataloader):
                MLP_scores_mat = model(input_data)

                # e.g. res = [-1, 5, 0, ..., 4] - always -1 at index 0, because every sentence starts with the '<root>' token
                predicted_tree = decode_mst(
                    MLP_scores_mat.data.cpu().numpy().T,
                    length=MLP_scores_mat.shape[0],
                    has_labels=False)[0]

                for head in predicted_tree[1:]:
                    original_line = lines[0]
                    tabs_locs = [
                        idx for idx, char in enumerate(original_line)
                        if char == "\t"
                    ]
                    # insert the predicted head between the 6th and 7th '\t' (the head column)
                    line_to_save = original_line[:tabs_locs[5] + 1] + str(
                        head) + original_line[tabs_locs[6]:]
                    result.write(line_to_save)
                    del lines[0]

                result.write(lines[0])
                del lines[0]  # the separating \n
Example #6
def evaluate(dataloader, model, pretrained_embeds=None, ix_to_word=None):
    model.eval()  # put the model in eval mode to disable dropout
    true_positives = 0
    total_tokens = 0
    with torch.no_grad():
        for batch_idx, input_data in enumerate(dataloader):
            if len(input_data) == 4:
                word_idx, pos_idx, gold, word_embeds_idx = input_data
            else:
                word_idx, pos_idx, gold = input_data
                word_embeds_idx = word_idx
            if pretrained_embeds and ix_to_word:
                external_embeds = get_pretrained_vector(
                    pretrained_embeds, word_embeds_idx, ix_to_word)
                scores = model(word_idx, pos_idx, external_embeds)
            else:
                scores = model(word_idx, pos_idx)
            scores = scores.cpu().detach().numpy().T
            gold = gold.squeeze(0)[1:].detach().numpy()

            predicted_heads, _ = decode_mst(scores, len(scores[0]), False)
            true_positives += np.sum(np.equal(predicted_heads[1:], gold))
            total_tokens += len(gold)
    uas = true_positives / total_tokens
    return uas
Example #7
def predict_dep(scores):
    predictions = []
    for sentence_scores in scores:
        score_matrix = sentence_scores.cpu().detach().numpy()
        score_matrix[:, 0] = float("-inf")  # mask out column 0 (the root position) before decoding
        mst, _ = decode_mst(score_matrix, len(score_matrix), has_labels=False)
        predictions.append(mst)
    return np.array(predictions)
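Every snippet in this collection funnels its score matrix through the same decode_mst call. Below is a minimal standalone sketch of that call on a toy four-position sentence (root + three words), assuming decode_mst comes from AllenNLP's allennlp.nn.chu_liu_edmonds (several snippets may instead use a local copy of the same function); the random scores are purely illustrative.

import numpy as np
from allennlp.nn.chu_liu_edmonds import decode_mst

# decode_mst expects a 2-D numpy score matrix of shape (length, length)
# when has_labels=False, plus the sentence length.
rng = np.random.default_rng(0)
toy_scores = rng.normal(size=(4, 4))  # root + 3 words

heads, _ = decode_mst(toy_scores, length=4, has_labels=False)
# `heads` has one entry per position; index 0 is the root. Real usage
# typically masks edges into the root first, as Examples #1 and #7 do,
# and reads heads[1:] as the predicted head of each word.
print(heads)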
Example #8
    def forward(self, word_idx_tensor, tag_idx_tensor, calc_mst=False):
        # get embedding of input
        word_embeds = self.word_embedding(word_idx_tensor.to(
            self.device))  # [batch_size, seq_length, word_emb_dim]
        tag_embeds = self.tag_embedding(tag_idx_tensor.to(
            self.device))  # [batch_size, seq_length, tag_emb_dim]
        concat_emb = torch.cat(
            [word_embeds, tag_embeds],
            dim=2)  # [batch_size, seq_length, word_emb_dim+tag_emb_dim]
        lstm_out, _ = self.encoder(
            concat_emb.view(concat_emb.shape[1], 1,
                            -1))  # [seq_length, batch_size, 2*hidden_dim]

        lstm_out_b_first = lstm_out.permute(1, 0, 2)
        first_part_out = (
            lstm_out_b_first @ self.fc1.weight.T[:lstm_out_b_first.shape[2], :]
            + self.fc1.bias.T).squeeze(0)
        second_part_out = (
            lstm_out_b_first @ self.fc1.weight.T[lstm_out_b_first.shape[2]:, :]
            + self.fc1.bias.T).squeeze(0)
        first_part_out1 = first_part_out.unsqueeze(0)
        second_part_out1 = second_part_out.unsqueeze(1)
        first_part_out2 = first_part_out1.repeat(second_part_out.shape[0], 1,
                                                 1)
        second_part_out2 = second_part_out1.repeat(1, first_part_out.shape[0],
                                                   1)
        Z = first_part_out2 + second_part_out2
        out_1 = Z.view(-1, Z.shape[-1])  # [seq_length**2,hidden_dim_mlp]

        scores = self.fc2(self.tan(out_1)).view(lstm_out.shape[0],
                                                lstm_out.shape[0]).squeeze(0)
        tmp_scores = F.log_softmax(scores, dim=1)
        # calc tree
        our_heads = None
        if calc_mst:
            with torch.no_grad():
                dep_scores = scores.unsqueeze(0).permute(0, 2, 1)
                dep_scores_2d = dep_scores.squeeze(0)
                # TODO: add zeros on diagonal
                our_heads, _ = decode_mst(energy=dep_scores_2d.cpu().numpy(),
                                          length=tmp_scores.shape[0],
                                          has_labels=False)
                # print(f'our heads: {our_heads}')
        # print(f'tmp_scores.device: {tmp_scores.device}')
        # print(f'our_heads type: {type(our_heads)}')
        # print(f'scores.device: {scores.device}')
        return tmp_scores, our_heads, scores
Example #9
def parser(dataloader, model, pretrained_embeds=None, ix_to_word=None):
    predicted_list = []
    model.eval()  # put the model in eval mode to disable dropout
    with torch.no_grad():
        for batch_idx, input_data in enumerate(dataloader):
            if len(input_data) == 4:
                word_idx, pos_idx, gold, word_embeds_idx = input_data
            else:
                word_idx, pos_idx, gold = input_data
                word_embeds_idx = word_idx
            if pretrained_embeds and ix_to_word:
                external_embeds = get_pretrained_vector(
                    pretrained_embeds, word_embeds_idx, ix_to_word)
                scores = model(word_idx, pos_idx, external_embeds)
            else:
                scores = model(word_idx, pos_idx)
            scores = scores.cpu().detach().numpy().T
            predicted_heads, _ = decode_mst(scores, len(scores[0]), False)
            predicted_list.append(predicted_heads[1:])
    return predicted_list
Example #10
    def forward(self, sentence, word_dropout=False):

        # Decompose the input
        word_indx_tensor, pos_indx_tensor, true_tree_heads = sentence
        n_words = word_indx_tensor.shape[1]

        # Word dropout
        if word_dropout:
            # word_indx_tensor is [1, n_words]; iterate over the word dimension
            for cell_indx, word_indx in enumerate(word_indx_tensor[0]):
                unk_prob = 0.25 / (self.w_indx_counter[word_indx.item()] + 0.25)
                bernoulli_rv = np.random.binomial(1, unk_prob, 1)
                if bernoulli_rv:
                    word_indx_tensor[0, cell_indx] = self.w2i['<unk>']

        # Word & POS embedding
        word_emb_tensor = self.word_embedding(word_indx_tensor.to(self.device))
        pos_emb_tensor = self.pos_embedding(pos_indx_tensor.to(self.device))
        # Embeddings concatenation
        if self.ex_emb_flag:
            ex_word_em_tensor = self.ex_word_embedding(word_indx_tensor)
            input_vectors = torch.cat((word_emb_tensor, ex_word_em_tensor, pos_emb_tensor), dim=-1)
        else:
            input_vectors = torch.cat((word_emb_tensor, pos_emb_tensor), dim=-1)

        hidden_vectors, _ = self.encoder(input_vectors)
        hidden_vectors = hidden_vectors.squeeze()

        heads_tensor = self.mlp_head(hidden_vectors)
        dep_tensor = self.mlp_dep(hidden_vectors)
        pad_dep_tensor = torch.cat((dep_tensor, torch.ones(dep_tensor.shape[0]).to(self.device).unsqueeze(1)), dim=1)

        scores = torch.matmul(torch.matmul(pad_dep_tensor, self.weights), heads_tensor.T).T

        # Prediction & loss calculation
        predicted_tree = decode_mst(scores.detach().cpu().numpy(), n_words, has_labels=False)

        scores_s_max = self.s_max(scores)
        loss = self.loss(true_tree_heads, scores_s_max)
        return loss, predicted_tree[0]
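As a sanity check on the bilinear scoring step in the forward pass above, here is a minimal shape sketch; the variable names mirror the code above, but the sizes are illustrative only.

import torch

n_words, hidden = 5, 8  # illustrative sizes only

heads_tensor = torch.randn(n_words, hidden)  # stands in for mlp_head output
dep_tensor = torch.randn(n_words, hidden)    # stands in for mlp_dep output
weights = torch.randn(hidden + 1, hidden)    # stands in for self.weights

# pad the dependent representations with a ones column, as in forward()
pad_dep_tensor = torch.cat((dep_tensor, torch.ones(n_words, 1)), dim=1)  # (n_words, hidden + 1)

# (n_words, hidden + 1) @ (hidden + 1, hidden) @ (hidden, n_words), transposed to (n_words, n_words)
scores = torch.matmul(torch.matmul(pad_dep_tensor, weights), heads_tensor.T).T
assert scores.shape == (n_words, n_words)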
Example #11
def train_kiperwasser_parser(model, train_dataloader, test_dataloader, epochs, learning_rate, weight_decay, alpha):
    start = time.time()
    total_test_time = 0

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")

    if use_cuda:
        model.cuda()

    # Define the loss function as the Negative Log Likelihood loss (NLLLoss)
    loss_function = nn.NLLLoss(ignore_index=-1, reduction='mean')

    # We will be using the Adam optimizer to minimize the loss function
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    # TODO optimize learning rate 'lr'
    acumulate_grad_steps = 50  # This is the actual batch_size, while we officially use batch_size=1

    # Training start
    print("Training Started")
    train_accuracy_list, train_loss_list = [], []
    test_accuracy_list, test_loss_list = [], []

    model.zero_grad()
    for epoch in range(epochs):
        train_acc = 0  # to keep track of accuracy
        train_loss = 0  # to keep track of the loss value
        mst_trees_calculated = 0  # keep track of the number of trees decoded, to plot the accuracy graph
        i = 0  # keep track of samples processed

        # print(f'word embedding <root token>: {model.word_embedding(torch.tensor([[0]]).to(model.device))}')
        # print(f'word embedding <unk token>: {model.word_embedding(torch.tensor([[1]]).to(model.device))}')
        data = list(enumerate(train_dataloader))  # save this so we can modify it to introduce word-dropout
        word_dropout(model, data, alpha=alpha)

        for batch_idx, input_data in data:
            i += 1
            # size = [sentence_length + 1, sentence_length + 1]
            MLP_scores_mat = model(input_data)  # forward activated inside

            gold_heads = input_data[2]

            # prepend -1 (the root position) to the gold heads; the loss ignores targets of -1
            target = torch.cat((torch.tensor([-1]), gold_heads[0])).to(device)

            # calculate negative log likelihood loss
            # log softmax over the rows (modifiers in rows)
            loss = loss_function(F.log_softmax(MLP_scores_mat, dim=1), target)
            loss = loss / acumulate_grad_steps
            loss.backward()
            train_loss += loss.item()

            # decode trees on sampled sentences - only for accuracy calculation during training
            if i > 0.9 * len(train_dataloader):  # predict trees on 10% of train data
                # e.g. res = [-1, 5, 0, ..., 4] - always -1 at index 0, because every sentence starts with the '<root>' token
                predicted_tree = decode_mst(MLP_scores_mat.cpu().data.numpy().T, length=MLP_scores_mat.shape[0],
                                            has_labels=False)[0]

                train_acc += sum(gold_heads[0].numpy() == predicted_tree[1:]) / len(gold_heads[0])
                mst_trees_calculated += 1

            # perform optimization step
            if i % acumulate_grad_steps == 0 or i == len(train_dataloader):
                optimizer.step()
                model.zero_grad()

        train_loss = acumulate_grad_steps * train_loss / len(train_dataloader)
        train_acc = train_acc / mst_trees_calculated if mst_trees_calculated != 0 else 0
        train_loss_list.append(train_loss)
        train_accuracy_list.append(train_acc)

        start_test_time = time.time()
        # calculate test accuracy >>> skip the next 3 lines if test accuracy during training is not needed
        test_acc, test_loss = evaluate(model, test_dataloader)
        test_accuracy_list.append(test_acc)
        test_loss_list.append(test_loss)
        stop_test_time = time.time()
        total_test_time += stop_test_time - start_test_time

        print(f"Epoch {epoch + 1} Completed,\tTrain Loss {train_loss}\t Train Accuracy: {train_acc}\t "
              f"Test Loss {test_loss}\t Test Accuracy: {test_acc}")

        # print time for the end of epoch
        print(f"Epoch {epoch + 1} Time "
              f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(time.time())))}")

    stop = time.time()

    total_train_time = stop - start - total_test_time

    print(f'\n\n\ntotal_train_time = {int(total_train_time)} SECS \t total_test_time = {int(total_test_time)} SECS')
    return train_accuracy_list, train_loss_list, test_accuracy_list, test_loss_list
Example #12
    def forward(self, sentence, word_dropout=False):

        # Decompose the input
        if self.labels_flag:
            word_indx_tensor, pos_indx_tensor, true_tree_heads, true_edges_labels = sentence
        else:
            word_indx_tensor, pos_indx_tensor, true_tree_heads = sentence
        n_words = word_indx_tensor.shape[1]

        # Word dropout
        if word_dropout:
            # word_indx_tensor is [1, n_words]; iterate over the word dimension
            for cell_indx, word_indx in enumerate(word_indx_tensor[0]):
                unk_prob = 0.25 / (self.w_indx_counter[word_indx.item()] + 0.25)
                bernoulli_rv = np.random.binomial(1, unk_prob, 1)
                if bernoulli_rv:
                    word_indx_tensor[0, cell_indx] = self.w2i['<unk>']

        # Word & POS embedding
        word_emb_tensor = self.word_embedding(word_indx_tensor.to(self.device))
        pos_emb_tensor = self.pos_embedding(pos_indx_tensor.to(self.device))
        # Embeddings concatenation
        if self.ex_emb_flag:
            ex_word_em_tensor = self.ex_word_embedding(word_indx_tensor)
            input_vectors = torch.cat((word_emb_tensor, ex_word_em_tensor, pos_emb_tensor), dim=-1)
        else:
            input_vectors = torch.cat((word_emb_tensor, pos_emb_tensor), dim=-1)

        hidden_vectors, _ = self.encoder(input_vectors)

        # Create 'edge vectors' by concatenating couples of hidden vectors
        edges_map, true_edges_map = {}, {}
        true_indx = 0
        valid_edges = [(h, m) for m in range(1, n_words) for h in range(0, n_words) if h != m]
        heads, mods, true_edges_indx = [], [], []

        for indx, (h, m) in enumerate(valid_edges):
            heads.append(h)
            mods.append(m)
            edges_map[(h, m)] = indx

            if true_tree_heads[m] == h:
                true_edges_indx.append(indx)
                true_edges_map[(h, m)] = true_indx
                true_indx += 1

        edges_tensor = torch.cat((hidden_vectors[0, heads, :], hidden_vectors[0, mods, :]), dim=-1)
        true_edges_tensor = edges_tensor[true_edges_indx, :]

        # Activate the scorer
        scores_tensor = self.edge_scorer(edges_tensor)
        if self.labels_flag:
            l_softmax_tensor = self.labels_mlp(true_edges_tensor)

        # Represent the scores as a 2-dimensional numpy array
        scores_np_matrix = np.zeros((n_words, n_words))
        for (h, m) in edges_map.keys():
            scores_np_matrix[h][m] = scores_tensor[edges_map[(h, m)]].item()

        # Prediction & loss calculation
        predicted_tree = decode_mst(scores_np_matrix, n_words, has_labels=False)

        if self.labels_flag:
            loss = self.loss(true_tree_heads, scores_tensor, edges_map,
                             true_edges_labels, l_softmax_tensor, true_edges_map)
        else:
            loss = self.loss(true_tree_heads, scores_tensor, edges_map)
        return loss, predicted_tree[0]
Example #13
def unlabeled_attachment_score(scores, heads):
    # decode without the root row, then shift the predicted head indices by 1 to match the gold numbering
    parse_tree, _ = decode_mst(scores[1:, :].detach().cpu().numpy(), heads.shape[0] - 1, has_labels=False)
    return sum(parse_tree[i] + 1 == heads[i].item() for i in range(heads.shape[0] - 1)) / (heads.shape[0] - 1)
Example #14
def train_epoch(train, dl):
    global max_uas
    losses_test = []
    losses_train = []
    UAS_train = []
    UAS_test = []
    loss = None
    acumulate_grad_steps = 256

    for i, data_batch in enumerate(dl):
        curr_sentence = data_batch

        curr_sentence[0] = curr_sentence[0].squeeze().to(device)
        curr_sentence[1] = curr_sentence[1].squeeze().to(device)
        curr_sentence[2] = curr_sentence[2].squeeze().to(device)
        curr_sentence[3] = curr_sentence[3].squeeze().to(device)

        sentence_inputs = curr_sentence[0:3]
        sentence_len = curr_sentence[2].item()
        sentence_labels = curr_sentence[3].to(device)
        score_mat = curr_model.forward(
            sentence_inputs)  # do not forward the labels.
        score_mat = score_mat.to(device)
        '''
        for head_idx in range(sentence_len):
            for modifyer_idx in range(sentence_len):
                if head_idx == modifyer_idx:
                    score_mat[head_idx][modifyer_idx] = 0
                    continue
                if modifyer_idx == sentence_labels[head_idx]:
                    score_mat[modifyer_idx][head_idx] = 100
                else:
                    score_mat[modifyer_idx][head_idx] = 0
        '''

        # Calculate the negative log likelihood loss described above
        loss = nll_loss(score_mat, sentence_labels, sentence_len)
        loss = loss / acumulate_grad_steps

        if train is True:
            # optimizer.zero_grad()
            loss.backward()
            if i % acumulate_grad_steps == 0:
                optimizer.step()
                curr_model.zero_grad()

            losses_train.append(loss.item())
            if i % acumulate_grad_steps == 0:
                predicted_tree, _ = decode_mst(energy=score_mat.cpu().detach().numpy(),
                                               length=score_mat.shape[0],
                                               has_labels=False)
                uas_score = UAS(predicted_tree, sentence_labels)
                UAS_train.append(uas_score)
        else:
            # Use Chu-Liu-Edmonds to get the predicted parse tree T' given the calculated score matrix
            predicted_tree, _ = decode_mst(energy=score_mat.cpu().detach().numpy(),
                                           length=score_mat.shape[0],
                                           has_labels=False)
            uas_score = UAS(predicted_tree, sentence_labels)

            losses_test.append(loss.item())
            UAS_test.append(uas_score)

    if train is True:
        print("\nTrain: epoch number", epoch, ":  loss = ",
              np.mean(losses_train), ": UAS = ", np.mean(UAS_train), "%")
    else:
        print("Test: epoch number", epoch, ":  loss = ", np.mean(losses_test),
              ": UAS = ", np.mean(UAS_test), "%")

        if np.mean(UAS_test) > max_uas:
            print("saving the model")
            max_uas = np.mean(UAS_test)
            torch.save(
                curr_model.state_dict(),
                "model" + str(model_choosed) + "_epoch" + str(epoch) + ".pt")

    return np.mean(losses_train), np.mean(UAS_train), np.mean(
        losses_test), np.mean(UAS_test)