예제 #1
0
    def test_from_eflomal_outputs(self):
        """
        Check GDFA symmetrization on the first 10 eflomal outputs reported in
        issue #1829: https://github.com/nltk/nltk/issues/1829

        Each case pairs a forward and a backward alignment string with the
        source/target lengths passed to ``grow_diag_final_and`` and the
        alignment list the call is expected to return.
        """
        # Forward-direction alignment strings, one per sentence pair.
        forward_alignments = [
            '0-0 1-2',
            '0-0 1-1',
            '0-0 2-1 3-2 4-3 5-4 6-5 7-6 8-7 7-8 9-9 10-10 9-11 11-12 12-13 13-14',
            '0-0 1-1 1-2 2-3 3-4 4-5 4-6 5-7 6-8 8-9 9-10',
            '0-0 14-1 15-2 16-3 20-5 21-6 22-7 5-8 6-9 7-10 8-11 9-12 10-13 11-14 12-15 13-16 14-17 17-18 18-19 19-20 20-21 23-22 24-23 25-24 26-25 27-27 28-28 29-29 30-30 31-31',
            '0-0 1-1 0-2 2-3',
            '0-0 2-2 4-4',
            '0-0 1-1 2-3 3-4 5-5 7-6 8-7 9-8 10-9 11-10 12-11 13-12 14-13 15-14 16-16 17-17 18-18 19-19 20-20',
            '3-0 4-1 6-2 5-3 6-4 7-5 8-6 9-7 10-8 11-9 16-10 9-12 10-13 12-14',
            '1-0',
        ]
        # Backward-direction alignment strings for the same sentence pairs.
        backward_alignments = [
            '0-0 1-2',
            '0-0 1-1',
            '0-0 2-1 3-2 4-3 5-4 6-5 7-6 8-7 9-8 10-10 11-12 12-11 13-13',
            '0-0 1-2 2-3 3-4 4-6 6-8 7-5 8-7 9-8',
            '0-0 1-8 2-9 3-10 4-11 5-12 6-11 8-13 9-14 10-15 11-16 12-17 13-18 14-19 15-20 16-21 17-22 18-23 19-24 20-29 21-30 22-31 23-2 24-3 25-4 26-5 27-5 28-6 29-7 30-28 31-31',
            '0-0 1-1 2-3',
            '0-0 1-1 2-3 4-4',
            '0-0 1-1 2-3 3-4 5-5 7-6 8-7 9-8 10-9 11-10 12-11 13-12 14-13 15-14 16-16 17-17 18-18 19-19 20-16 21-18',
            '0-0 1-1 3-2 4-1 5-3 6-4 7-5 8-6 9-7 10-8 11-9 12-8 13-9 14-8 15-9 16-10',
            '1-0',
        ]
        # Lengths handed to grow_diag_final_and for each case.
        src_lengths = [2, 3, 3, 15, 11, 33, 4, 6, 23, 18]
        trg_lengths = [2, 4, 3, 16, 12, 33, 5, 6, 22, 16]

        # Expected symmetrized alignments.
        # NOTE: this list holds 11 entries while there are only 10 inputs;
        # zip() below stops at the shortest sequence, so the final entry is
        # never compared.
        gold_alignments = [
            [(0, 0), (1, 2)],
            [(0, 0), (1, 1)],
            [(0, 0), (2, 1), (3, 2), (4, 3), (5, 4),
             (6, 5), (7, 6), (8, 7), (10, 10), (11, 12)],
            [(0, 0), (1, 1), (1, 2), (2, 3), (3, 4), (4, 5), (4, 6),
             (5, 7), (6, 8), (7, 5), (8, 7), (8, 9), (9, 8), (9, 10)],
            [(0, 0), (1, 8), (2, 9), (3, 10), (4, 11), (5, 8),
             (6, 9), (6, 11), (7, 10), (8, 11), (31, 31)],
            [(0, 0), (0, 2), (1, 1), (2, 3)],
            [(0, 0), (1, 1), (2, 2), (2, 3), (4, 4)],
            [(0, 0), (1, 1), (2, 3), (3, 4), (5, 5), (7, 6),
             (8, 7), (9, 8), (10, 9), (11, 10), (12, 11), (13, 12),
             (14, 13), (15, 14), (16, 16), (17, 17), (18, 18), (19, 19)],
            [(0, 0), (1, 1), (3, 0), (3, 2), (4, 1), (5, 3), (6, 2),
             (6, 4), (7, 5), (8, 6), (9, 7), (9, 12), (10, 8), (10, 13),
             (11, 9), (12, 8), (12, 14), (13, 9), (14, 8), (15, 9), (16, 10)],
            [(1, 0)],
            [(0, 0), (1, 1), (3, 2), (4, 3), (5, 4), (6, 5),
             (7, 6), (9, 10), (10, 12), (11, 13), (12, 14), (13, 15)],
        ]

        # Walk the cases in lockstep and compare each result with its gold list.
        cases = zip(forward_alignments, backward_alignments,
                    src_lengths, trg_lengths, gold_alignments)
        for e2f, f2e, n_src, n_trg, gold in cases:
            actual = grow_diag_final_and(n_src, n_trg, e2f, f2e)
            self.assertListEqual(gold, actual)
예제 #2
0
def _drop_unpaired_links(alignment_line):
    """Return ``alignment_line`` without the links that end in ``-``.

    The aligners occasionally emit links that name a word in only one of the
    two languages; such links would break the ``"i-j"`` parsing downstream.
    """
    return " ".join(
        link for link in alignment_line.split() if not link.endswith("-"))


def _parse_links(alignment_line):
    """Parse ``"i-j i-j ..."`` text into a list of ``(int, int)`` tuples."""
    return [(int(src), int(trg))
            for (src, trg) in (link.split("-")
                               for link in alignment_line.split())]


def _format_links(links):
    """Serialize index pairs as sorted, space-separated ``"i-j"`` text."""
    return " ".join(
        str(src) + "-" + str(trg) for (src, trg) in sorted(links))


def do_process_alignments(corpus_lang_name, mongo_connector, *,
                          use_eflomal=True):
    """Align every sentence pair of a corpus and store the results.

    For each sentence pair the function computes, in this order:

    1. the grow-diag-final-and symmetrization of the two aligner directions,
    2. the intersection of the two aligner directions,
    3. the union of that intersection with the embedding-space alignment,
    4. the grow-diag-final-and symmetrization of the aligner links that are
       also present in the embedding-space alignment,
    5. a confidence score derived from the intersection size.

    The five values are collected as one tuple per sentence pair and handed
    to ``mongo_connector.insert_automatic_alignments``.

    :param corpus_lang_name: corpus identifier forwarded to every
        ``mongo_connector`` call.
    :param mongo_connector: object providing the sentence pairs, the
        vocabularies, the embedding-space alignments and the insert method.
    :param use_eflomal: when true (the default, matching the previous
        hard-coded behaviour) run ``run_aligner_on_fastalign_eflomal``;
        otherwise run ``run_aligner_on_fastalign``.
    """
    # One direction
    fastalign_format = (
        mongo_connector.get_all_sentence_pairs_in_corpus_fastalign_format(
            corpus_lang_name))
    vocabulary = mongo_connector.get_extracted_vocabulary_in_fastalign_format(
        corpus_lang_name)

    # The other direction
    fastalign_format_reversed = (
        mongo_connector.get_all_sentence_pairs_in_corpus_fastalign_format(
            corpus_lang_name, reverse=True))
    vocabulary_reverse = (
        mongo_connector.get_extracted_vocabulary_in_fastalign_format(
            corpus_lang_name, reverse=True))

    # Run aligner
    run_aligner = (run_aligner_on_fastalign_eflomal
                   if use_eflomal else run_aligner_on_fastalign)
    res, res_reversed = run_aligner(fastalign_format, vocabulary,
                                    fastalign_format_reversed,
                                    vocabulary_reverse)

    # Only the first len(fastalign_format) aligner lines are paired with the
    # input sentences (same slicing as before).
    pair_count = len(fastalign_format)
    result_list = []
    for nr, (res_el, res_reversed_el, text) in enumerate(
            zip(res[:pair_count], res_reversed[:pair_count],
                fastalign_format)):
        lang_1, lang_2 = text.split("|||")
        # str.split() without arguments already ignores surrounding blanks.
        lang_1_len = len(lang_1.split())
        lang_2_len = len(lang_2.split())

        # For some reason, alignments with a word in only one of the
        # languages are returned. Remove these.
        res_el_filtered = _drop_unpaired_links(res_el)
        res_reversed_el_filtered = _drop_unpaired_links(res_reversed_el)

        gdfa_output = sorted(
            gdfa.grow_diag_final_and(lang_1_len, lang_2_len, res_el_filtered,
                                     res_reversed_el_filtered))
        intersection_output = sorted(
            get_intersection(res_el_filtered, res_reversed_el_filtered))

        # Guard against a pair where both sentences are empty, which used to
        # raise ZeroDivisionError and abort the whole corpus.
        # NOTE(review): the expression divides by the summed length and then
        # by 2 again. If the intent was to normalize by the *average*
        # sentence length the divisor should be ((len1 + len2) / 2); the
        # formula is kept as-is because stored scores depend on it - confirm.
        total_len = lang_1_len + lang_2_len
        if total_len:
            confidence = len(intersection_output) / total_len / 2
        else:
            confidence = 0.0

        embedding_alignment = (
            mongo_connector.get_automatic_alignment_from_embedding_space(
                corpus_lang_name, nr))
        embedding_alignment_reversed = [(b, a)
                                        for (a, b) in embedding_alignment]

        print("embedding_alignment", embedding_alignment)
        print("intersection_output", intersection_output)
        embedding_union_intersection_output = sorted(
            set(embedding_alignment + intersection_output))
        print("embedding_union_intersection_output",
              embedding_union_intersection_output)
        print("gdfa_output", gdfa_output)

        # Keep only the aligner links that the embedding-space alignment
        # agrees with (per direction), then run GDFA on those.
        res_and_embedding = _format_links(
            set(_parse_links(res_el_filtered)) & set(embedding_alignment))
        res_and_embedding_reversed = _format_links(
            set(_parse_links(res_reversed_el_filtered))
            & set(embedding_alignment_reversed))

        gdfa_res_and_embedding = sorted(
            gdfa.grow_diag_final_and(lang_1_len, lang_2_len, res_and_embedding,
                                     res_and_embedding_reversed))
        print("embedding_gdfa", gdfa_res_and_embedding)
        result_list.append((gdfa_output, intersection_output,
                            embedding_union_intersection_output,
                            gdfa_res_and_embedding, confidence))

    mongo_connector.insert_automatic_alignments(corpus_lang_name, result_list)
예제 #3
0
파일: test_gdfa.py 프로젝트: rmalouf/nltk
    def test_from_eflomal_outputs(self):
        """
        Run grow-diag-final-and over the first 10 eflomal outputs taken from
        issue #1829 (https://github.com/nltk/nltk/issues/1829) and compare
        every result with its recorded alignment list.
        """
        # Inputs: forward (e2f) and backward (f2e) alignment strings.
        e2f_lines = [
            '0-0 1-2',
            '0-0 1-1',
            '0-0 2-1 3-2 4-3 5-4 6-5 7-6 8-7 7-8 9-9 10-10 9-11 11-12 12-13 13-14',
            '0-0 1-1 1-2 2-3 3-4 4-5 4-6 5-7 6-8 8-9 9-10',
            '0-0 14-1 15-2 16-3 20-5 21-6 22-7 5-8 6-9 7-10 8-11 9-12 10-13 11-14 12-15 13-16 14-17 17-18 18-19 19-20 20-21 23-22 24-23 25-24 26-25 27-27 28-28 29-29 30-30 31-31',
            '0-0 1-1 0-2 2-3',
            '0-0 2-2 4-4',
            '0-0 1-1 2-3 3-4 5-5 7-6 8-7 9-8 10-9 11-10 12-11 13-12 14-13 15-14 16-16 17-17 18-18 19-19 20-20',
            '3-0 4-1 6-2 5-3 6-4 7-5 8-6 9-7 10-8 11-9 16-10 9-12 10-13 12-14',
            '1-0',
        ]
        f2e_lines = [
            '0-0 1-2',
            '0-0 1-1',
            '0-0 2-1 3-2 4-3 5-4 6-5 7-6 8-7 9-8 10-10 11-12 12-11 13-13',
            '0-0 1-2 2-3 3-4 4-6 6-8 7-5 8-7 9-8',
            '0-0 1-8 2-9 3-10 4-11 5-12 6-11 8-13 9-14 10-15 11-16 12-17 13-18 14-19 15-20 16-21 17-22 18-23 19-24 20-29 21-30 22-31 23-2 24-3 25-4 26-5 27-5 28-6 29-7 30-28 31-31',
            '0-0 1-1 2-3',
            '0-0 1-1 2-3 4-4',
            '0-0 1-1 2-3 3-4 5-5 7-6 8-7 9-8 10-9 11-10 12-11 13-12 14-13 15-14 16-16 17-17 18-18 19-19 20-16 21-18',
            '0-0 1-1 3-2 4-1 5-3 6-4 7-5 8-6 9-7 10-8 11-9 12-8 13-9 14-8 15-9 16-10',
            '1-0',
        ]
        # Source / target lengths passed alongside each pair of strings.
        src_sizes = [2, 3, 3, 15, 11, 33, 4, 6, 23, 18]
        trg_sizes = [2, 4, 3, 16, 12, 33, 5, 6, 22, 16]

        # Recorded outputs.
        # NOTE: 11 entries are listed for 10 inputs; zip() truncates to the
        # shortest sequence, so the last entry is never checked.
        recorded = [
            [(0, 0), (1, 2)],
            [(0, 0), (1, 1)],
            [(0, 0), (2, 1), (3, 2), (4, 3), (5, 4), (6, 5), (7, 6), (8, 7),
             (10, 10), (11, 12)],
            [(0, 0), (1, 1), (1, 2), (2, 3), (3, 4), (4, 5), (4, 6), (5, 7),
             (6, 8), (7, 5), (8, 7), (8, 9), (9, 8), (9, 10)],
            [(0, 0), (1, 8), (2, 9), (3, 10), (4, 11), (5, 8), (6, 9), (6, 11),
             (7, 10), (8, 11), (31, 31)],
            [(0, 0), (0, 2), (1, 1), (2, 3)],
            [(0, 0), (1, 1), (2, 2), (2, 3), (4, 4)],
            [(0, 0), (1, 1), (2, 3), (3, 4), (5, 5), (7, 6), (8, 7), (9, 8),
             (10, 9), (11, 10), (12, 11), (13, 12), (14, 13), (15, 14),
             (16, 16), (17, 17), (18, 18), (19, 19)],
            [(0, 0), (1, 1), (3, 0), (3, 2), (4, 1), (5, 3), (6, 2), (6, 4),
             (7, 5), (8, 6), (9, 7), (9, 12), (10, 8), (10, 13), (11, 9),
             (12, 8), (12, 14), (13, 9), (14, 8), (15, 9), (16, 10)],
            [(1, 0)],
            [(0, 0), (1, 1), (3, 2), (4, 3), (5, 4), (6, 5), (7, 6), (9, 10),
             (10, 12), (11, 13), (12, 14), (13, 15)],
        ]

        # Compare case by case; the first mismatch fails the test.
        for case in zip(e2f_lines, f2e_lines, src_sizes, trg_sizes, recorded):
            e2f, f2e, n_src, n_trg, want = case
            got = grow_diag_final_and(n_src, n_trg, e2f, f2e)
            self.assertListEqual(want, got)
예제 #4
0
 def gdfa_wrapper(self, srclen, tgtlen, e2f_str, f2e_str):
     """Delegate to ``grow_diag_final_and`` and return its result unchanged.

     The four arguments are forwarded positionally in the order
     ``(srclen, tgtlen, e2f_str, f2e_str)``; ``self`` is not used.
     """
     symmetrized = grow_diag_final_and(srclen, tgtlen, e2f_str, f2e_str)
     return symmetrized