Example #1
def extract_structures(model, test_batches, device, vocab, dirName):
    model.eval()
    dirName = dirName + "/structures"
    if not os.path.exists(dirName):
        os.mkdir(dirName)
        print("Directory ", dirName, " Created ")
    count = 0
    for ct, batch in test_batches:
        value, feed_dict = get_feed_dict(batch, device)
        if not value:
            continue
        output, sent_attention_matrix, doc_attention_matrix = model.forward(
            feed_dict)

        for i in range(len(batch)):
            fileName = dirName + "/" + str(count) + ".txt"
            count += 1
            fp = open(fileName, "w")
            #print("\nDoc: "+str(count)+"\n")
            fp.write("Doc: " + str(count) + "\n")

            l = len(batch[i].token_idxs)
            sent_no = 0
            for sent in batch[i].token_idxs:
                printstr = ''
                #scores = str_scores_sent[sent_no][0:l, 0:l]
                token_count = 0
                for token in sent:
                    printstr += vocab[token] + " "
                    token_count = token_count + 1
                #print(printstr)
                fp.write(printstr + "\n")

                scores = sent_attention_matrix[sent_no][0:token_count,
                                                        0:token_count]
                shape2 = sent_attention_matrix[sent_no][0:token_count,
                                                        0:token_count].size()
                row = torch.ones([1, shape2[1] + 1]).to(device)
                column = torch.zeros([shape2[0], 1]).to(device)
                new_scores = torch.cat([column, scores], dim=1)
                new_scores = torch.cat([row, new_scores], dim=0)
                heads, tree_score = chu_liu_edmonds(
                    new_scores.data.cpu().numpy().astype(np.float64))
                #print(heads, tree_score)
                fp.write(str(heads) + " ")
                fp.write(str(tree_score) + "\n")

            shape2 = doc_attention_matrix[i][0:l, 0:l].size()
            row = torch.ones([1, shape2[1] + 1]).to(device)
            column = torch.zeros([shape2[0], 1]).to(device)
            scores = doc_attention_matrix[i][0:l, 0:l]
            new_scores = torch.cat([column, scores], dim=1)
            new_scores = torch.cat([row, new_scores], dim=0)
            heads, tree_score = chu_liu_edmonds(
                new_scores.data.cpu().numpy().astype(np.float64))
            #print(heads, tree_score)
            fp.write("\n")
            fp.write(str(heads) + " ")
            fp.write(str(tree_score) + "\n")
            fp.close()
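The root-padding trick used above can be tried in isolation. A minimal sketch, assuming dependency_decoding's chu_liu_edmonds and a random stand-in for the attention matrix:

import numpy as np
import torch
from dependency_decoding import chu_liu_edmonds

attn = torch.rand(5, 5)                  # token-to-token attention for a 5-token sentence
row = torch.ones([1, attn.size(1) + 1])  # scores of the artificial root over all tokens
column = torch.zeros([attn.size(0), 1])  # no token may become the head of the root
padded = torch.cat([row, torch.cat([column, attn], dim=1)], dim=0)
heads, tree_score = chu_liu_edmonds(padded.numpy().astype(np.float64))
print(heads, tree_score)                 # index 0 is the artificial root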
Example #2
    def predict(self, dataset, args):
        import io
        conllu, sentences = io.StringIO(), 0

        while not dataset.epoch_finished():
            sentence_lens, word_ids, charseq_ids, charseqs, charseq_lens = dataset.next_batch(
                args.batch_size)

            feeds = {
                self.is_training: False,
                self.sentence_lens: sentence_lens,
                self.charseqs: charseqs[train.FORMS],
                self.charseq_lens: charseq_lens[train.FORMS],
                self.word_ids: word_ids[train.FORMS],
                self.charseq_ids: charseq_ids[train.FORMS]
            }
            for tag in args.tags:
                feeds[self.tags[tag]] = word_ids[train.FACTORS_MAP[tag]]
            if args.parse: feeds[self.heads] = word_ids[train.HEAD]

            if args.parse:
                predictions, heads, _ = self.session.run(
                    [self.predictions, self.heads_logs, self.update_loss],
                    feeds)
            else:
                predictions, _ = self.session.run(
                    [self.predictions, self.update_loss], feeds)

            for i in range(len(sentence_lens)):
                overrides = [None] * dataset.FACTORS
                for tag in args.tags:
                    overrides[dataset.FACTORS_MAP[tag]] = predictions[tag][i]
                if args.parse:
                    padded_heads = np.pad(
                        heads[i][:sentence_lens[i], :sentence_lens[i] + 1].astype(np.float64),
                        ((1, 0), (0, 0)), mode="constant")
                    roots, _ = dependency_decoding.chu_liu_edmonds(
                        padded_heads)
                    if np.count_nonzero(roots) != len(roots) - 1:
                        best_score = None
                        padded_heads[:, 0] = np.nan
                        for r in range(len(roots)):
                            if roots[r] == 0:
                                padded_heads[r, 0] = heads[i][r - 1, 0]
                                current_roots, current_score = dependency_decoding.chu_liu_edmonds(
                                    padded_heads)
                                padded_heads[r, 0] = np.nan
                                if best_score is None or current_score > best_score:
                                    best_score, best_roots = current_score, current_roots
                        roots = best_roots

                    overrides[dataset.HEAD] = roots[1:]
                dataset.write_sentence(conllu, sentences, overrides)
                sentences += 1

        return conllu.getvalue()
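The single-root repair above can be exercised on its own. A minimal sketch with a made-up padded score matrix; as in the snippet, dependency_decoding is assumed and np.nan marks forbidden root attachments:

import numpy as np
from dependency_decoding import chu_liu_edmonds

padded = np.random.rand(5, 5)                      # row/column 0 belong to the artificial root
roots, _ = chu_liu_edmonds(padded)
if np.count_nonzero(roots) != len(roots) - 1:      # more than one word attached to the root
    saved = padded[:, 0].copy()
    padded[:, 0] = np.nan                          # forbid all root attachments ...
    best_score, best_roots = None, roots
    for r in range(1, len(roots)):
        if roots[r] == 0:                          # ... then re-allow one candidate at a time
            padded[r, 0] = saved[r]
            current_roots, current_score = chu_liu_edmonds(padded)
            padded[r, 0] = np.nan
            if best_score is None or current_score > best_score:
                best_score, best_roots = current_score, current_roots
    roots = best_roots
print(roots)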
Example #3
def mst_decode(scores):
    """Decode arc-factored parse using maximum spanning tree."""
    device = scores.device
    scores = scores.cpu().double().numpy()
    heads, _ = dependency_decoding.chu_liu_edmonds(scores)
    heads[0] = 0  # Set root to itself
    return torch.LongTensor(heads).to(device)
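A minimal usage sketch for mst_decode, assuming the numpy, torch and dependency_decoding imports it relies on; the square score matrix includes the artificial root at position 0:

import torch

scores = torch.rand(5, 5)   # 4 tokens plus the artificial root at index 0
heads = mst_decode(scores)
print(heads)                # LongTensor of head indices; the root points to itself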
Example #4
def cle_loss(scores: torch.Tensor, lengths: torch.Tensor,
             gold_heads: torch.Tensor, normalize_wrt_seq_len: bool):
    """
        Parses a batch of sentences and computes a hinge loss (see code by Eliyahu Kiperwasser: https://github.com/elikip/bist-parser)
        :param scores torch.Tensor of shape (batch_size,tokens, tokens), the length of the sentences is an array of length batch_size that specifies how long the sentences are
        :param gold_heads: Tensor of shape (batch_size, tokens) that contains the correct head for every word.
        :param lengths: actual lengths of the sentences, tensor of shape (batch_size,)
        :return: a scalar torch.Tensor with the hinge loss
        """
    losses: torch.Tensor = 0
    device = get_device_of(scores)
    scores = scores.cpu()
    #scores_np = scores.detach().double().numpy()

    gold_heads = gold_heads.cpu().numpy()
    lengths = lengths.cpu().numpy()

    for m, g, l in zip(scores, gold_heads, lengths):
        #m: shape (tokens, tokens)
        #g: shape (tokens,)
        #l: scalar, sentence length
        token_range = np.arange(l)
        # remove padding at the end:
        m = m[:l, :l]
        g = g[:l]  # -> shape (l,)

        # make the gold solution look worse by cost augmentation (in the original, non-gold arcs are made to look better); this introduces the margin:
        m[token_range, g] -= 1.0  # cost augmentation

        # discard the _score_ of the solution; r has shape (l,)
        r, _ = chu_liu_edmonds(m.detach().double().numpy())
        # this implementation says the head of the artificial root is -1, but the rest of the pipeline
        # says the head of the artificial root is the artificial root itself (i.e. 0):
        r[0] = 0
        r = np.array(r)

        # extract the scores belonging to the decoded edges -> shape (l,)
        scores_of_solution = m[token_range, r]
        # extract the scores belonging to the gold edges -> shape (l,)
        scores_of_gold = m[token_range, g]
        r = torch.from_numpy(r)
        g = torch.from_numpy(g)
        zero = torch.zeros(1, dtype=torch.float32)
        #where predicted head differs from gold head, add the score difference to the loss term:
        loss_term = torch.sum(
            torch.where(torch.eq(r, g), zero,
                        scores_of_solution - scores_of_gold))
        if normalize_wrt_seq_len:
            loss_term /= l
        losses += loss_term
    if device < 0:
        return losses
    return losses.to(device)
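A toy call of cle_loss, assuming allennlp's get_device_of and the chu_liu_edmonds import used above are in scope; the gold heads follow the convention that position 0 is the artificial root pointing to itself:

import torch

scores = torch.rand(1, 4, 4)              # one sentence, 4 positions including the root
lengths = torch.tensor([4])
gold_heads = torch.tensor([[0, 0, 1, 1]])
loss = cle_loss(scores, lengths, gold_heads, normalize_wrt_seq_len=True)
print(loss)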
Example #5
def cle_decode(scores, lengths):
    """
    Parses a batch of sentences
    :param scores torch.Tensor of shape (batch_size,tokens, tokens), the length of the sentences is an array of length batch_size that specifies how long the sentences are
    :param lengths: actual lengths of the sentences, tensor of shape (batch_size,)
    :return: a tensor of shape (batch_size, tokens) that contains the heads of the tokens. Positions that go over the sentence length are filled with -1.
    """
    heads = []
    scores = scores.detach().cpu().double().numpy()
    lengths = lengths.cpu().numpy()
    bs, toks, _ = scores.shape
    for m, l in zip(scores, lengths):
        r, _ = chu_liu_edmonds(m[:l, :l])  #discard _score_ of solution
        h = np.concatenate([r, -np.ones(toks - l, dtype=np.int64)])
        heads.append(h)
    return torch.from_numpy(np.stack(heads))
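A minimal usage sketch for cle_decode, assuming its chu_liu_edmonds and numpy imports; the second sentence is shorter than the padded length, so its last head is -1:

import torch

scores = torch.rand(2, 4, 4)    # batch of two sentences, padded to 4 positions
lengths = torch.tensor([4, 3])
heads = cle_decode(scores, lengths)
print(heads)                    # shape (2, 4); padded positions are -1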
Example #6
    def __call__(self, probs: List[np.ndarray]) -> List[List[int]]:
        """Applies Chu-Liu-Edmonds algorithm to the matrix of head probabilities.

        probs: a 3D-array of probabilities of shape B*L*(L+1)
        """
        answer = []
        for elem in probs:
            m, n = elem.shape
            assert n == m + 1
            elem = np.log10(np.maximum(self.min_edge_prob, elem)) - np.log10(self.min_edge_prob)
            elem = np.concatenate([np.zeros_like(elem[:1, :]), elem], axis=0)
            # penalize every root attachment so heavily that the decoder keeps only one 0->i edge
            elem[1:, 0] += np.log10(self.min_edge_prob) * len(elem)
            chl_data = chu_liu_edmonds(elem.astype("float64"))
            answer.append(chl_data[0][1:])
        return answer
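The same log-domain preprocessing can be tried outside the class. A sketch; min_edge_prob and the L*(L+1) probability matrix are made up here, and dependency_decoding is assumed:

import numpy as np
from dependency_decoding import chu_liu_edmonds

min_edge_prob = 1e-6
probs = np.random.dirichlet(np.ones(4), size=3)     # 3 tokens; head candidates: root + 3 tokens
elem = np.log10(np.maximum(min_edge_prob, probs)) - np.log10(min_edge_prob)
elem = np.concatenate([np.zeros_like(elem[:1, :]), elem], axis=0)   # add a row for the root
elem[1:, 0] += np.log10(min_edge_prob) * len(elem)  # keep at most one edge out of the root
heads, _ = chu_liu_edmonds(elem.astype("float64"))
print(heads[1:])                                     # heads of the 3 real tokens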
Example #7
def calculate(targets):
    for target in targets:
        target.T = softmax(np.nansum(target.T, axis=2))
        # target.T = eliminate_all_nan_rows(target.T)

        target.T = chu_liu_edmonds(target.T)[0]

        pos_tags = []
        for pos_projection in target.pos_tags:
            # TODO: handle ties (several equally frequent tags)
            if len(pos_projection) != 0:
                counts = Counter(pos_projection)
                most_common = counts.most_common(1)[0][0]
                if most_common == '_' and len(counts) != 1:
                    # '_' is a placeholder; fall back to the second most common tag
                    most_common = counts.most_common(2)[1][0]
            else:
                most_common = '_'
            pos_tags.append(most_common)

        target.pos_tags = pos_tags
Example #8
def solve_mst(g):
    start_overall = time.time()
    # Create dummy root node
    nodes = list(g.nodes())
    gc = g.copy()
    gc.add_node('ROOT')
    for w in g.nodes():
        gc.add_edge(w, 'ROOT', weight=0.1)

    # Create matrix
    N = len(gc)
    gcmat = np.zeros((N, N))
    newnodes = ['ROOT'] + nodes
    for i, w1 in enumerate(newnodes):
        for j, w2 in enumerate(newnodes):
            if w1 == w2: continue
            gcmat[i][j] = gc[w1].get(w2, {}).get('weight', 0.)

    # Solve MST
    heads, score = chu_liu_edmonds(gcmat)

    # Keep only edges in MST
    keptedges = []  # [(i,j,{'relation':'hypernym','weight':X.X})]
    for i, headidx in enumerate(heads[1:]):  # skip ROOT
        if headidx == 0:
            continue
        w1 = nodes[i]
        w2 = nodes[headidx - 1]
        keptedges.append((w1, w2, g[w1][w2]))

    # Calculate stats
    end_overall = time.time()
    stats = {
        'node_cnt': len(nodes),
        'runtime': end_overall - start_overall,
        'keptedge_cnt': len(keptedges)
    }
    return keptedges, stats
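A toy run of solve_mst on a made-up weighted relation graph; networkx plus the time, numpy and dependency_decoding imports of the original module are assumed:

import random
import networkx as nx

random.seed(0)
words = ['animal', 'dog', 'cat']
g = nx.DiGraph()
for w1 in words:
    for w2 in words:
        if w1 != w2:
            g.add_edge(w1, w2, weight=random.random())

keptedges, stats = solve_mst(g)
print(keptedges)
print(stats)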
Example #9
    def predict(self, dataset, evaluating, args):
        import io
        conllu, sentences = io.StringIO(), 0

        if evaluating: self.session.run(self.reset_metrics)
        while not dataset.epoch_finished():
            sentence_lens, word_ids, charseq_ids, charseqs, charseq_lens = dataset.next_batch(
                args.batch_size)

            feeds = {
                self.is_training: False,
                self.sentence_lens: sentence_lens,
                self.charseqs: charseqs[train.FORMS],
                self.charseq_lens: charseq_lens[train.FORMS],
                self.word_ids: word_ids[train.FORMS],
                self.charseq_ids: charseq_ids[train.FORMS]
            }
            if args.embeddings:
                embeddings = np.zeros([
                    word_ids[train.EMBEDDINGS].shape[0],
                    word_ids[train.EMBEDDINGS].shape[1], args.embeddings_size
                ])
                for i in range(embeddings.shape[0]):
                    for j in range(embeddings.shape[1]):
                        if word_ids[train.EMBEDDINGS][i, j]:
                            embeddings[i, j] = args.embeddings_data[
                                word_ids[train.EMBEDDINGS][i, j] - 1]
                feeds[self.embeddings] = embeddings
            if args.elmo_size:
                feeds[self.elmo] = word_ids[train.ELMO]
            if evaluating:
                for tag in args.tags:
                    feeds[self.tags[tag]] = word_ids[train.FACTORS_MAP[tag]]
                if args.parse:
                    feeds[self.heads] = word_ids[train.HEAD]
                    feeds[self.deprels] = word_ids[train.DEPREL]

            targets = [self.predictions]
            if args.parse:
                targets.extend([self.heads_logs, self.deprel_hidden_layer])
            if evaluating: targets.append(self.update_loss)
            predictions, *other_values = self.session.run(targets, feeds)
            if args.parse: prior_heads, deprel_hidden_layer, *_ = other_values

            if args.parse:
                heads = np.zeros(prior_heads.shape[:2], dtype=np.int32)
                for i in range(len(sentence_lens)):
                    padded_heads = np.pad(
                        prior_heads[i][:sentence_lens[i], :sentence_lens[i] + 1].astype(np.float64),
                        ((1, 0), (0, 0)), mode="constant")
                    padded_heads[:, 0] = np.nan
                    padded_heads[1 + np.argmax(prior_heads[i][:sentence_lens[i], 0]), 0] = 0
                    chosen_heads, _ = dependency_decoding.chu_liu_edmonds(
                        padded_heads)
                    heads[i, :sentence_lens[i]] = chosen_heads[1:]
                deprels = self.session.run(
                    self.predictions_deprel, {
                        self.is_training: False,
                        self.deprel_hidden_layer: deprel_hidden_layer,
                        self.deprel_heads: heads
                    })

            for i in range(len(sentence_lens)):
                overrides = [None] * dataset.FACTORS
                for tag in args.tags:
                    overrides[dataset.FACTORS_MAP[tag]] = predictions[tag][i]
                if args.parse:
                    overrides[dataset.HEAD] = heads[i]
                    overrides[dataset.DEPREL] = deprels[i]
                dataset.write_sentence(conllu, sentences, overrides)
                sentences += 1

        return conllu.getvalue()
Example #10
import numpy as np
from dependency_decoding import chu_liu_edmonds

np.random.seed(43)
score_matrix = np.random.rand(3, 3)
heads, tree_score = chu_liu_edmonds(score_matrix)
print(score_matrix)
print(heads, tree_score)
Example #11
import numpy as np
from dependency_decoding import chu_liu_edmonds

np.random.seed(43)
score_arc = np.random.rand(2, 3, 3)
score_root = np.random.rand(2, 3)
lengths = np.array([3, 2], dtype=np.int32)
heads, scores = chu_liu_edmonds(score_arc, score_root, lengths)
# print(score_arc)
print(f'heads => {heads}')
print(f'scores => {scores}')

score_arc = np.array([[[1, 1], [2, 1]], [[1, 2], [1, 1]]], dtype=np.float64)
score_arc = np.log(score_arc)
score_root = np.array([[3, 2], [1, 2]], dtype=np.float64)
score_root = np.log(score_root)
lengths = np.array([2, 2], dtype=np.int32)
heads, scores = chu_liu_edmonds(score_arc, score_root, lengths)
scores = np.exp(np.array(scores))
# print(score_arc)
print(f'heads => {heads}')
print(f'scores => {scores}')
Example #12
                if args.parse:
                    prior_heads = np.log(head_probs / len(networks))
                    heads = np.zeros(prior_heads.shape[:2], dtype=np.int32)
                    for i in range(len(sentence_lens)):
                        padded_heads = np.pad(
                            prior_heads[i][:sentence_lens[i], :sentence_lens[i] + 1].astype(np.float64),
                            ((1, 0), (0, 0)), mode="constant")
                        padded_heads[:, 0] = np.nan
                        padded_heads[1 + np.argmax(prior_heads[i][:sentence_lens[i], 0]), 0] = 0
                        chosen_heads, _ = dependency_decoding.chu_liu_edmonds(
                            padded_heads)
                        heads[i, :sentence_lens[i]] = chosen_heads[1:]

                    deprel_probs = None
                    for network, deprel_hidden_layer in zip(
                            networks, deprel_hidden_layers):
                        deprels = network.session.run(
                            network.predictions_deprel_probs, {
                                network.is_training: False,
                                network.deprel_hidden_layer:
                                deprel_hidden_layer,
                                network.deprel_heads: heads
                            })
                        deprel_probs = deprel_probs + deprels if deprel_probs is not None else deprels

                for i in range(len(sentence_lens)):
Example #13
    def extract_structures(self, batch, sent_attention_matrix,
                           doc_attention_matrix, count, use_cuda, sent_scores):
        fileName = os.path.join(self._structures_dir,
                                "%06d_struct.txt" % count)
        fp = open(fileName, "w")
        fp.write("Doc: " + str(count) + "\n")
        #exit(0)
        doc_attention_matrix = doc_attention_matrix[:, :]  # this change is yet to be tested!
        l = batch.enc_doc_lens[0].item()
        doc_sent_no = 0

        # for i in range(l):
        #     printstr = ''
        #     sent = batch.enc_batch[0][i]
        #     #scores = str_scores_sent[sent_no][0:l, 0:l]
        #     token_count = 0
        #     for j in range(batch.enc_sent_lens[0][i].item()):
        #         token = sent[j].item()
        #         printstr += self.vocab.id2word(token)+" "
        #         token_count = token_count + 1
        #     #print(printstr)
        #     fp.write(printstr+"\n")
        #
        #     scores = sent_attention_matrix[doc_sent_no][0:token_count, 0:token_count]
        #     shape2 = sent_attention_matrix[doc_sent_no][0:token_count,0:token_count].size()
        #     row = torch.ones([1, shape2[1]+1]).cuda()
        #     column = torch.zeros([shape2[0], 1]).cuda()
        #     new_scores = torch.cat([column, scores], dim=1)
        #     new_scores = torch.cat([row, new_scores], dim=0)
        #
        #     heads, tree_score = chu_liu_edmonds(new_scores.data.cpu().numpy().astype(np.float64))
        #     #print(heads, tree_score)
        #     fp.write(str(heads)+" ")
        #     fp.write(str(tree_score)+"\n")
        #     doc_sent_no+=1

        shape2 = doc_attention_matrix[0:l, 0:l + 1].size()
        row = torch.zeros([1, shape2[1]]).cuda()
        #column = torch.zeros([shape2[0], 1]).cuda()
        scores = doc_attention_matrix[0:l, 0:l + 1]
        #new_scores = torch.cat([column, scores], dim=1)
        new_scores = torch.cat([row, scores], dim=0)
        val, root_edge = torch.max(new_scores[:, 0], dim=0)
        root_score = torch.zeros([shape2[0] + 1, 1]).cuda()
        root_score[root_edge] = 1
        new_scores[:, 0] = root_score.squeeze()
        #print(new_scores)
        #print(new_scores.sum(dim=0))
        #print(new_scores.sum(dim=1))
        #print(new_scores.size())
        heads, tree_score = chu_liu_edmonds(
            new_scores.data.cpu().numpy().astype(np.float64))
        height = find_height(heads)
        leaf_nodes = leaf_node_proportion(heads)
        #print(heads, tree_score)
        fp.write("\n")
        sentences = str(batch.original_articles[0]).split("<split1>")
        for idx, sent in enumerate(sentences):
            fp.write(str(idx) + "\t" + str(sent) + "\n")
        #fp.write(str("\n".join(batch.original_articles[0].split("<split1>"))+"\n")
        fp.write(str(heads) + " ")
        fp.write(str(tree_score) + "\n")
        fp.write(str(height) + "\n")
        s = sent_scores[0].data.cpu().numpy()
        for val in s:
            fp.write(str(val))
        fp.close()
        #exit()
        structure_info = {
            'heads': heads,
            'height': height,
            'leaf_nodes': leaf_nodes,
        }
        return structure_info