예제 #1
0
    def test_best_model2_alignment_handles_fertile_words(self):
        # One source word may have to cover several target positions; here
        # 'really' occurs twice in the target sentence.

        # arrange
        trg_words = ['i', 'really', ',', 'really', 'love', 'ham']
        sentence_pair = AlignedSent(
            trg_words, TestIBMModel.__TEST_SRC_SENTENCE)

        # Every probability row below is listed in this column order.
        columns = ("j'", 'aime', 'bien', 'jambon', None)

        def row(*probs):
            return dict(zip(columns, probs))

        ibm_model = IBMModel([])
        # 'bien' produces 2 target words: 'really' and another 'really'
        ibm_model.translation_table = {
            'i': row(0.9, 0.05, 0.02, 0.03, 0),
            'really': row(0, 0, 0.9, 0.01, 0.09),
            ',': row(0, 0, 0.3, 0, 0.7),
            'love': row(0.05, 0.9, 0.01, 0.01, 0.03),
            'ham': row(0, 0.01, 0, 0.99, 0),
        }
        # Four nested levels of defaultdict; every leaf lookup yields 0.2.
        ibm_model.alignment_table = defaultdict(
            lambda: defaultdict(
                lambda: defaultdict(
                    lambda: defaultdict(lambda: 0.2))))

        # act
        result = ibm_model.best_model2_alignment(sentence_pair)

        # assert
        expected_alignment = (1, 3, 0, 3, 2, 4)
        expected_cepts = [[3], [1], [5], [2, 4], [6]]
        self.assertEqual(result.alignment[1:], expected_alignment)
        self.assertEqual(result.cepts, expected_cepts)
예제 #2
0
    def test_best_model2_alignment(self):
        # Baseline case: checks the alignment tuple and the cepts reported
        # by best_model2_alignment for the shared test sentence pair.

        # arrange
        sentence_pair = AlignedSent(
            TestIBMModel.__TEST_TRG_SENTENCE,
            TestIBMModel.__TEST_SRC_SENTENCE)

        # Every probability row below is listed in this column order.
        columns = ("j'", 'aime', 'bien', 'jambon', None)

        def row(*probs):
            return dict(zip(columns, probs))

        ibm_model = IBMModel([])
        # None and 'bien' have zero fertility
        ibm_model.translation_table = {
            'i': row(0.9, 0.05, 0.02, 0.03, 0),
            'love': row(0.05, 0.9, 0.01, 0.01, 0.03),
            'ham': row(0, 0.01, 0, 0.99, 0),
        }
        # Four nested levels of defaultdict; every leaf lookup yields 0.2.
        ibm_model.alignment_table = defaultdict(
            lambda: defaultdict(
                lambda: defaultdict(
                    lambda: defaultdict(lambda: 0.2))))

        # act
        result = ibm_model.best_model2_alignment(sentence_pair)

        # assert
        # 0th element of the alignment is unused, hence the slice
        self.assertEqual(result.alignment[1:], (1, 2, 4))
        self.assertEqual(result.cepts, [[], [1], [2], [], [3]])
예제 #3
0
    def test_best_model2_alignment_does_not_change_pegged_alignment(self):
        # A pegged alignment point passed to best_model2_alignment must come
        # back unchanged in the result.

        # arrange
        sentence_pair = AlignedSent(
            TestIBMModel.__TEST_TRG_SENTENCE,
            TestIBMModel.__TEST_SRC_SENTENCE)

        # Every probability row below is listed in this column order.
        columns = ("j'", 'aime', 'bien', 'jambon', None)

        def row(*probs):
            return dict(zip(columns, probs))

        ibm_model = IBMModel([])
        ibm_model.translation_table = {
            'i': row(0.9, 0.05, 0.02, 0.03, 0),
            'love': row(0.05, 0.9, 0.01, 0.01, 0.03),
            'ham': row(0, 0.01, 0, 0.99, 0),
        }
        # Four nested levels of defaultdict; every leaf lookup yields 0.2.
        ibm_model.alignment_table = defaultdict(
            lambda: defaultdict(
                lambda: defaultdict(
                    lambda: defaultdict(lambda: 0.2))))

        # act: force 'love' to be pegged to 'jambon'
        pegged_j, pegged_i = 2, 4
        result = ibm_model.best_model2_alignment(
            sentence_pair, pegged_j, pegged_i)

        # assert
        self.assertEqual(result.alignment[1:], (1, 4, 4))
        self.assertEqual(result.cepts, [[], [1], [], [], [2, 3]])
예제 #4
0
    def test_neighboring_returns_neighbors_with_pegged_alignment(self):
        # With position 2 pegged, every expected neighbor keeps its value
        # at index 2 of the alignment tuple.

        # arrange
        src_words = (None, 'des', 'œufs', 'verts')
        trg_words = ('UNUSED', 'green', 'eggs')
        cepts = [[], [], [2], [1]]
        a_info = AlignmentInfo((0, 3, 2), src_words, trg_words, cepts)
        ibm_model = IBMModel([])

        # act: peg 'eggs' to align with 'œufs'
        neighbors = ibm_model.neighboring(a_info, 2)

        # assert
        actual = {neighbor.alignment for neighbor in neighbors}
        moves = {(0, 0, 2), (0, 1, 2), (0, 2, 2)}
        # no swaps are expected
        original = {(0, 3, 2)}
        self.assertEqual(actual, moves | original)
예제 #5
0
    def test_neighboring_finds_neighbor_alignments(self):
        # Without pegging, the neighbor set is expected to hold the moves,
        # the swap and the original alignment listed below.

        # arrange
        src_words = (None, 'des', 'œufs', 'verts')
        trg_words = ('UNUSED', 'green', 'eggs')
        cepts = [[], [], [2], [1]]
        a_info = AlignmentInfo((0, 3, 2), src_words, trg_words, cepts)
        ibm_model = IBMModel([])

        # act
        neighbors = ibm_model.neighboring(a_info)

        # assert
        actual = {neighbor.alignment for neighbor in neighbors}
        moves = {
            (0, 0, 2), (0, 1, 2), (0, 2, 2),
            (0, 3, 0), (0, 3, 1), (0, 3, 3),
        }
        swaps = {(0, 2, 3)}
        original = {(0, 3, 2)}
        self.assertEqual(actual, moves | swaps | original)
예제 #6
0
    def test_best_model2_alignment_handles_empty_trg_sentence(self):
        # An empty first sentence should give an empty alignment and five
        # empty cepts.

        # arrange
        no_words = []
        sentence_pair = AlignedSent(
            no_words, TestIBMModel.__TEST_SRC_SENTENCE)
        ibm_model = IBMModel([])

        # act
        result = ibm_model.best_model2_alignment(sentence_pair)

        # assert
        expected_cepts = [[] for _ in range(5)]
        self.assertEqual(result.alignment[1:], ())
        self.assertEqual(result.cepts, expected_cepts)
예제 #7
0
    def test_best_model2_alignment_handles_empty_src_sentence(self):
        # With an empty second sentence, every position is expected to align
        # to index 0, whose cept then holds positions 1..3.

        # arrange
        no_words = []
        sentence_pair = AlignedSent(
            TestIBMModel.__TEST_TRG_SENTENCE, no_words)
        ibm_model = IBMModel([])

        # act
        result = ibm_model.best_model2_alignment(sentence_pair)

        # assert
        expected_alignment = (0, 0, 0)
        expected_cepts = [[1, 2, 3]]
        self.assertEqual(result.alignment[1:], expected_alignment)
        self.assertEqual(result.cepts, expected_cepts)
예제 #8
0
    def test_sample(self):
        # Stub the probability function with a constant and check how many
        # samples ``sample`` returns for the shared sentence pair.

        # arrange
        def constant_probability(x):
            return 0.001

        ibm_model = IBMModel([])
        ibm_model.prob_t_a_given_s = constant_probability
        sentence_pair = AlignedSent(
            TestIBMModel.__TEST_TRG_SENTENCE,
            TestIBMModel.__TEST_SRC_SENTENCE)

        # act: the second element of the result is not inspected here
        samples, _ = ibm_model.sample(sentence_pair)

        # assert
        sample_count = len(samples)
        self.assertEqual(sample_count, 61)
    def test_vocabularies_are_initialized(self):
        # Build a model from three sentence pairs and check the sizes of the
        # vocabularies it collected.
        word_lists = (
            (['one', 'two', 'three', 'four'], ['un', 'deux', 'trois']),
            (['five', 'one', 'six'], ['quatre', 'cinq', 'six']),
            ([], ['sept']),
        )
        parallel_corpora = [
            AlignedSent(first, second) for first, second in word_lists
        ]

        ibm_model = IBMModel(parallel_corpora)

        src_vocab_size = len(ibm_model.src_vocab)
        trg_vocab_size = len(ibm_model.trg_vocab)
        self.assertEqual(src_vocab_size, 8)
        self.assertEqual(trg_vocab_size, 6)
    def test_best_model2_alignment(self):
        # Baseline case: checks the alignment tuple and the cepts reported
        # by best_model2_alignment for the shared test sentence pair.

        # arrange
        sentence_pair = AlignedSent(TestIBMModel.__TEST_TRG_SENTENCE,
                                    TestIBMModel.__TEST_SRC_SENTENCE)

        # None and 'bien' have zero fertility
        src_keys = ("j'", 'aime', 'bien', 'jambon', None)
        prob_rows = (
            ('i', (0.9, 0.05, 0.02, 0.03, 0)),
            ('love', (0.05, 0.9, 0.01, 0.01, 0.03)),
            ('ham', (0, 0.01, 0, 0.99, 0)),
        )
        translation_table = {
            trg_word: dict(zip(src_keys, probs))
            for trg_word, probs in prob_rows
        }

        def nested_default(depth):
            # ``depth`` levels of defaultdict above a constant 0.2 leaf.
            if depth == 0:
                return 0.2
            return defaultdict(lambda: nested_default(depth - 1))

        alignment_table = nested_default(4)

        ibm_model = IBMModel([])
        ibm_model.translation_table = translation_table
        ibm_model.alignment_table = alignment_table

        # act
        result = ibm_model.best_model2_alignment(sentence_pair)

        # assert
        # 0th element of the alignment is unused, hence the slice
        self.assertEqual(result.alignment[1:], (1, 2, 4))
        self.assertEqual(result.cepts, [[], [1], [2], [], [3]])
    def test_best_model2_alignment_does_not_change_pegged_alignment(self):
        # A pegged alignment point passed to best_model2_alignment must come
        # back unchanged in the result.

        # arrange
        sentence_pair = AlignedSent(TestIBMModel.__TEST_TRG_SENTENCE,
                                    TestIBMModel.__TEST_SRC_SENTENCE)

        src_keys = ("j'", 'aime', 'bien', 'jambon', None)
        prob_rows = (
            ('i', (0.9, 0.05, 0.02, 0.03, 0)),
            ('love', (0.05, 0.9, 0.01, 0.01, 0.03)),
            ('ham', (0, 0.01, 0, 0.99, 0)),
        )
        translation_table = {
            trg_word: dict(zip(src_keys, probs))
            for trg_word, probs in prob_rows
        }

        def nested_default(depth):
            # ``depth`` levels of defaultdict above a constant 0.2 leaf.
            if depth == 0:
                return 0.2
            return defaultdict(lambda: nested_default(depth - 1))

        alignment_table = nested_default(4)

        ibm_model = IBMModel([])
        ibm_model.translation_table = translation_table
        ibm_model.alignment_table = alignment_table

        # act: force 'love' to be pegged to 'jambon'
        result = ibm_model.best_model2_alignment(sentence_pair, 2, 4)

        # assert
        expected_alignment = (1, 4, 4)
        expected_cepts = [[], [1], [], [], [2, 3]]
        self.assertEqual(result.alignment[1:], expected_alignment)
        self.assertEqual(result.cepts, expected_cepts)
예제 #12
0
    def test_neighboring_sets_neighbor_alignment_info(self):
        # Check that neighbors returned by ``neighboring`` carry cepts that
        # match their alignment, for two hand-picked alignments.

        # arrange
        a_info = AlignmentInfo(
            (0, 3, 2),
            (None, 'des', 'œufs', 'verts'),
            ('UNUSED', 'green', 'eggs'),
            [[], [], [2], [1]]
        )
        ibm_model = IBMModel([])

        # act
        neighbors = ibm_model.neighboring(a_info)

        # assert: select a few particular alignments
        # Index the neighbors by alignment so that a missing neighbor shows
        # up as an assertion failure instead of an UnboundLocalError.
        by_alignment = {neighbor.alignment: neighbor for neighbor in neighbors}

        moved_key = (0, 2, 2)
        # (0, 3, 2) is identical to the input alignment, i.e. this entry is
        # the unchanged alignment rather than a swapped one.
        unchanged_key = (0, 3, 2)
        self.assertIn(moved_key, by_alignment)
        self.assertIn(unchanged_key, by_alignment)

        moved_alignment = by_alignment[moved_key]
        unchanged_alignment = by_alignment[unchanged_key]
        self.assertEqual(moved_alignment.cepts, [[], [], [1, 2], []])
        self.assertEqual(unchanged_alignment.cepts, [[], [], [2], [1]])
    def test_neighboring_returns_neighbors_with_pegged_alignment(self):
        # With position 2 pegged, every expected neighbor keeps its value
        # at index 2 of the alignment tuple.

        # arrange
        a_info = AlignmentInfo(
            (0, 3, 2),
            (None, 'des', 'œufs', 'verts'),
            ('UNUSED', 'green', 'eggs'),
            [[], [], [2], [1]],
        )
        ibm_model = IBMModel([])

        # act: peg 'eggs' to align with 'œufs'
        pegged_position = 2
        neighbors = ibm_model.neighboring(a_info, pegged_position)

        # assert
        found = {candidate.alignment for candidate in neighbors}
        expected = {
            # moves
            (0, 0, 2),
            (0, 1, 2),
            (0, 2, 2),
            # no swaps
            # original alignment
            (0, 3, 2),
        }
        self.assertEqual(found, expected)
    def test_hillclimb(self):
        # Drive ``hillclimb`` with stubbed neighbor and probability functions
        # and check which alignment it settles on.

        # arrange
        def make_info(alignment):
            return AlignmentInfo(alignment, None, None, None)

        # Alignment -> alignments of its neighbors; anything else has none.
        successors = {
            (0, 3, 2): [(0, 2, 2), (0, 1, 1)],
            (0, 2, 2): [(0, 3, 3), (0, 4, 4)],
        }
        # Alignment -> probability; unlisted alignments score 0.01.
        prob_values = {
            (0, 3, 2): 0.5,
            (0, 2, 2): 0.6,
            (0, 1, 1): 0.4,
            (0, 3, 3): 0.6,
            (0, 4, 4): 0.7,
        }

        def neighboring_mock(a, j):
            return set(
                make_info(nxt) for nxt in successors.get(a.alignment, ()))

        def prob_t_a_given_s_mock(a):
            return prob_values.get(a.alignment, 0.01)

        initial_alignment = make_info((0, 3, 2))
        ibm_model = IBMModel([])
        ibm_model.neighboring = neighboring_mock
        ibm_model.prob_t_a_given_s = prob_t_a_given_s_mock

        # act
        climbed = ibm_model.hillclimb(initial_alignment)

        # assert: hill climbing goes from (0, 3, 2) -> (0, 2, 2) -> (0, 4, 4)
        self.assertEqual(climbed.alignment, (0, 4, 4))
예제 #15
0
    def test_hillclimb(self):
        # Drive ``hillclimb`` with stubbed neighbor and probability functions
        # and check which alignment it settles on.

        # arrange
        start = (0, 3, 2)
        middle = (0, 2, 2)
        initial_alignment = AlignmentInfo(start, None, None, None)

        def neighboring_mock(a, j):
            # Only the start and middle alignments have any neighbors.
            if a.alignment == start:
                candidates = [(0, 2, 2), (0, 1, 1)]
            elif a.alignment == middle:
                candidates = [(0, 3, 3), (0, 4, 4)]
            else:
                candidates = []
            return set(
                AlignmentInfo(c, None, None, None) for c in candidates)

        def prob_t_a_given_s_mock(a):
            # Unlisted alignments fall back to a probability of 0.01.
            table = {
                (0, 3, 2): 0.5,
                (0, 2, 2): 0.6,
                (0, 1, 1): 0.4,
                (0, 3, 3): 0.6,
                (0, 4, 4): 0.7,
            }
            if a.alignment in table:
                return table[a.alignment]
            return 0.01

        ibm_model = IBMModel([])
        ibm_model.neighboring = neighboring_mock
        ibm_model.prob_t_a_given_s = prob_t_a_given_s_mock

        # act
        best_alignment = ibm_model.hillclimb(initial_alignment)

        # assert: hill climbing goes from (0, 3, 2) -> (0, 2, 2) -> (0, 4, 4)
        expected = (0, 4, 4)
        self.assertEqual(best_alignment.alignment, expected)
    def test_vocabularies_are_initialized_even_with_empty_corpora(self):
        # A model built from no sentence pairs should still expose both
        # vocabularies.
        ibm_model = IBMModel([])

        src_vocab_size = len(ibm_model.src_vocab)
        trg_vocab_size = len(ibm_model.trg_vocab)

        # addition of NULL token accounts for the single source entry
        self.assertEqual(src_vocab_size, 1)
        self.assertEqual(trg_vocab_size, 0)