Пример #1
0
def db_phrase_extract(lang1, lang2,
                      lang1method=lambda x: x,
                      lang2method=lambda x: x,
                      init_val=1.0e-10,
                      db="sqlite:///:memory:"):
    """Extract phrase pairs from a single sentence pair.

    Each sentence is run through its own preprocessing callable and then
    split on whitespace. The resulting token lists are aligned with
    ``_db_symmetrization`` and handed to ``phrase_extract.phrase_extract``.

    :param lang1: first sentence, as a string.
    :param lang2: second sentence, as a string.
    :param lang1method: callable applied to ``lang1`` before splitting;
        must return an object with ``split()``. Defaults to identity.
    :param lang2method: callable applied to ``lang2`` before splitting;
        must return an object with ``split()``. Defaults to identity.
    :param init_val: forwarded unchanged to ``_db_symmetrization``.
    :param db: forwarded unchanged to ``_db_symmetrization``
        (presumably a database URL; confirm against that helper).
    :return: whatever ``phrase_extract.phrase_extract`` returns for the
        two token lists and the computed alignment.
    """
    lang1s = lang1method(lang1).split()
    # Each language has its own preprocessor; lang2 must go through
    # lang2method, not lang1method, or a custom lang2method is ignored.
    lang2s = lang2method(lang2).split()
    alignment = _db_symmetrization(lang1s, lang2s,
                                   init_val=init_val,
                                   db=db)
    return phrase_extract.phrase_extract(lang1s, lang2s, alignment)
Пример #2
0
    def test_phrase_extract(self):
        """Check phrase_extract with a fixed and a derived alignment."""
        # The hand-written alignment below, drawn as a matrix. Rows follow
        # the words of ``es``, columns the words of ``fs``; the index pairs
        # in ``alignment`` are 1-based (row, column).
        #
        # |x| | | | | | | | | |
        # | |x|x|x| | | | | | |
        # | | | | | |x| | | | |
        # | | | | | | |x| | | |
        # | | | | | | | | | |x|
        # | | | | | | | | | |x|
        # | | | | | | | |x| | |
        # | | | | | | | |x| | |
        # | | | | | | | | |x| |
        #
        es = "michael assumes that he will stay in the house".split()
        fs = "michael geht davon aus , dass er im haus bleibt".split()
        alignment = {
            (1, 1), (2, 2), (2, 3), (2, 4), (3, 6), (4, 7),
            (5, 10), (6, 10), (7, 8), (8, 8), (9, 9),
        }
        ans = {
            (('assumes',), ('geht', 'davon', 'aus')),
            (('assumes',), ('geht', 'davon', 'aus', ',')),
            (('assumes', 'that'), ('geht', 'davon', 'aus', ',', 'dass')),
            (('assumes', 'that', 'he'),
             ('geht', 'davon', 'aus', ',', 'dass', 'er')),
            (('assumes', 'that', 'he', 'will', 'stay', 'in', 'the', 'house'),
             ('geht', 'davon', 'aus', ',', 'dass', 'er', 'im', 'haus',
              'bleibt')),
            (('he',), ('er',)),
            (('he', 'will', 'stay', 'in', 'the', 'house'),
             ('er', 'im', 'haus', 'bleibt')),
            (('house',), ('haus',)),
            (('in', 'the'), ('im',)),
            (('in', 'the', 'house'), ('im', 'haus')),
            (('michael',), ('michael',)),
            (('michael', 'assumes'), ('michael', 'geht', 'davon', 'aus')),
            (('michael', 'assumes'),
             ('michael', 'geht', 'davon', 'aus', ',')),
            (('michael', 'assumes', 'that'),
             ('michael', 'geht', 'davon', 'aus', ',', 'dass')),
            (('michael', 'assumes', 'that', 'he'),
             ('michael', 'geht', 'davon', 'aus', ',', 'dass', 'er')),
            (('michael', 'assumes', 'that', 'he', 'will', 'stay', 'in',
              'the', 'house'),
             ('michael', 'geht', 'davon', 'aus', ',', 'dass', 'er', 'im',
              'haus', 'bleibt')),
            (('that',), (',', 'dass')),
            (('that',), ('dass',)),
            (('that', 'he'), (',', 'dass', 'er')),
            (('that', 'he'), ('dass', 'er')),
            (('that', 'he', 'will', 'stay', 'in', 'the', 'house'),
             (',', 'dass', 'er', 'im', 'haus', 'bleibt')),
            (('that', 'he', 'will', 'stay', 'in', 'the', 'house'),
             ('dass', 'er', 'im', 'haus', 'bleibt')),
            (('will', 'stay'), ('bleibt',)),
            (('will', 'stay', 'in', 'the', 'house'),
             ('im', 'haus', 'bleibt')),
        }
        self.assertEqual(phrase_extract(es, fs, alignment), ans)

        # Second case: the alignment is produced by symmetrization() from a
        # small corpus rather than written out by hand.
        es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
        sentences = [
            ("僕 は 男 です", "I am a man"),
            ("私 は 女 です", "I am a girl"),
            ("私 は 先生 です", "I am a teacher"),
            ("彼女 は 先生 です", "She is a teacher"),
            ("彼 は 先生 です", "He is a teacher"),
        ]
        corpus = mkcorpus(sentences)
        alignment = symmetrization(es, fs, corpus)
        # Expected tokens are written as literal characters, the same way as
        # the input sentence above, so both sides have the same string type
        # and compare equal whether ``str`` is bytes or text.
        ans = {
            (('は', '先生', 'です'), ('a', 'teacher')),
            (('先生',), ('teacher',)),
            (('私',), ('I', 'am')),
            (('私', 'は', '先生', 'です'), ('I', 'am', 'a', 'teacher')),
        }
        self.assertEqual(phrase_extract(es, fs, alignment), ans)
Пример #3
0
    def test_phrase_extract(self):
        """Verify phrase_extract output for two sentence pairs."""

        def pair(e_phrase, f_phrase):
            # Build one expected (es-tokens, fs-tokens) entry from two
            # whitespace-separated phrases.
            return tuple(e_phrase.split()), tuple(f_phrase.split())

        # First case: a hand-written alignment. As a matrix, with one row
        # per word of ``es`` and one column per word of ``fs`` (the index
        # pairs in ``alignment`` are 1-based), it looks like this:
        #
        # |x| | | | | | | | | |
        # | |x|x|x| | | | | | |
        # | | | | | |x| | | | |
        # | | | | | | |x| | | |
        # | | | | | | | | | |x|
        # | | | | | | | | | |x|
        # | | | | | | | |x| | |
        # | | | | | | | |x| | |
        # | | | | | | | | |x| |
        #
        es = "michael assumes that he will stay in the house".split()
        fs = "michael geht davon aus , dass er im haus bleibt".split()
        alignment = {
            (1, 1),
            (2, 2), (2, 3), (2, 4),
            (3, 6),
            (4, 7),
            (5, 10),
            (6, 10),
            (7, 8),
            (8, 8),
            (9, 9),
        }
        ans = {
            pair("assumes", "geht davon aus"),
            pair("assumes", "geht davon aus ,"),
            pair("assumes that", "geht davon aus , dass"),
            pair("assumes that he", "geht davon aus , dass er"),
            pair("assumes that he will stay in the house",
                 "geht davon aus , dass er im haus bleibt"),
            pair("he", "er"),
            pair("he will stay in the house", "er im haus bleibt"),
            pair("house", "haus"),
            pair("in the", "im"),
            pair("in the house", "im haus"),
            pair("michael", "michael"),
            pair("michael assumes", "michael geht davon aus"),
            pair("michael assumes", "michael geht davon aus ,"),
            pair("michael assumes that", "michael geht davon aus , dass"),
            pair("michael assumes that he",
                 "michael geht davon aus , dass er"),
            pair("michael assumes that he will stay in the house",
                 "michael geht davon aus , dass er im haus bleibt"),
            pair("that", ", dass"),
            pair("that", "dass"),
            pair("that he", ", dass er"),
            pair("that he", "dass er"),
            pair("that he will stay in the house",
                 ", dass er im haus bleibt"),
            pair("that he will stay in the house",
                 "dass er im haus bleibt"),
            pair("will stay", "bleibt"),
            pair("will stay in the house", "im haus bleibt"),
        }
        self.assertEqual(phrase_extract(es, fs, alignment), ans)

        # Second case: the alignment is obtained from symmetrization() over
        # a small corpus built with mkcorpus().
        es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
        sentences = [
            ("僕 は 男 です", "I am a man"),
            ("私 は 女 です", "I am a girl"),
            ("私 は 先生 です", "I am a teacher"),
            ("彼女 は 先生 です", "She is a teacher"),
            ("彼 は 先生 です", "He is a teacher"),
        ]
        corpus = mkcorpus(sentences)
        alignment = symmetrization(es, fs, corpus)
        # The expected tokens come from literals written the same way as the
        # input sentence, so they have the same string type as the tokens in
        # ``es`` and compare equal whether ``str`` is bytes or text.
        ans = {
            pair("は 先生 です", "a teacher"),
            pair("先生", "teacher"),
            pair("私", "I am"),
            pair("私 は 先生 です", "I am a teacher"),
        }
        self.assertEqual(phrase_extract(es, fs, alignment), ans)