Exemplo n.º 1
0
    def test_morph_init(self):
        """Check the table built by ``initialize_translation_probs``.

        The expected sizes and the ``1.0 / 4`` probability depend on the
        contents of the fixture files produced by ``get_two_tmp_files``
        (not visible here).
        """
        ibm_model = IBMModel1()

        tmp_dir, f1, f2 = get_two_tmp_files()
        try:
            ibm_model.initialize_translation_probs(f1, f2)
            assert len(ibm_model.translation_prob) == 10
            assert len(ibm_model.translation_prob[ibm_model.null_str]) == 9
            assert len(ibm_model.translation_prob["345"]) == 6
            assert ibm_model.translation_prob["122"]["123"] == 1.0 / 4
        finally:
            # Clean up even when an assertion fails, so a failing run does
            # not leave the temporary fixture directory behind.
            shutil.rmtree(tmp_dir)
Exemplo n.º 2
0
 def test_str2int(self):
     """Check that ``str2int`` hands out stable ids and ``int2str`` inverts them."""
     model = IBMModel1()
     # Two passes over the same words: the second pass must return the ids
     # assigned during the first one.
     for _ in range(2):
         for word, expected_id in (("hello", 1), ("bye", 2)):
             assert model.str2int(word) == expected_id
     # Both lookup structures hold the null token plus the two words.
     for table in (model._str2int, model._int2str):
         assert len(table) == 3
     assert model._int2str == [model.null_str, "hello", "bye"]
     assert model.int2str(2) == "bye"
Exemplo n.º 3
0
    def test_ibm_train(self):
        """Train for three iterations and spot-check the learned table.

        The expected values depend on the fixture files produced by
        ``morph_utils.get_two_tmp_files`` (not visible here).
        """
        ibm_model = IBMModel1()

        tmp_dir, f1, f2 = morph_utils.get_two_tmp_files()
        try:
            ibm_model.learn_ibm_parameters(src_path=f1, dst_path=f2, num_iters=3)

            assert ibm_model.translation_prob["456789"]["345"] == 0
            assert ibm_model.translation_prob["456789"]["456789"] == 0.5
            assert (
                ibm_model.translation_prob[ibm_model.null_str]["124"]
                < ibm_model.translation_prob[ibm_model.null_str]["456789"]
            )
        finally:
            # Clean up even when an assertion fails, so a failing run does
            # not leave the temporary fixture directory behind.
            shutil.rmtree(tmp_dir)
Exemplo n.º 4
0
    def test_e_step(self):
        """Run ``e_step`` on one sentence pair and check one accumulated count.

        The model is first initialised from the fixture files so that
        ``e_step`` has translation probabilities to work with; the expected
        ``1.0 / 4`` depends on those fixtures (not visible here).
        """
        ibm_model = IBMModel1()

        tmp_dir, f1, f2 = morph_utils.get_two_tmp_files()
        try:
            ibm_model.initialize_translation_probs(f1, f2)
            # Nested defaultdict so e_step can accumulate into unseen pairs.
            translation_counts = defaultdict(lambda: defaultdict(float))

            ibm_model.e_step(
                ["123", "124", "234", "345", ibm_model.null_str],
                ["123", "124", "234", "345"],
                translation_counts,
            )
            assert translation_counts["123"]["345"] == 1.0 / 4
        finally:
            # Clean up even when an assertion fails, so a failing run does
            # not leave the temporary fixture directory behind.
            shutil.rmtree(tmp_dir)
Exemplo n.º 5
0
    def test_em_step(self):
        """Run one ``em_step`` with a 3-worker pool and spot-check the table.

        The expected values depend on the fixture files produced by
        ``morph_utils.get_two_same_tmp_files`` (not visible here).
        """
        ibm_model = IBMModel1()

        tmp_dir, f1, f2 = morph_utils.get_two_same_tmp_files()
        try:
            ibm_model.initialize_translation_probs(f1, f2)

            pool = Pool(3)
            try:
                ibm_model.em_step(src_path=f1, dst_path=f2, num_cpus=3, pool=pool)
            finally:
                # Release the workers whether or not em_step succeeded; the
                # original never closed the pool.
                # NOTE(review): assumes Pool exposes close()/join() as
                # multiprocessing.Pool does -- confirm against the import.
                pool.close()
                pool.join()

            assert ibm_model.translation_prob["456789"]["345"] == 0
            assert ibm_model.translation_prob["456789"]["456789"] == 0.5
            assert (
                ibm_model.translation_prob[ibm_model.null_str]["124"]
                < ibm_model.translation_prob[ibm_model.null_str]["456789"]
            )
        finally:
            # Clean up even when an assertion fails, so a failing run does
            # not leave the temporary fixture directory behind.
            shutil.rmtree(tmp_dir)
Exemplo n.º 6
0
    def test_em_step(self):
        """Run one ``em_step`` after initialisation and spot-check the table.

        The expected values depend on the fixture files produced by
        ``morph_utils.get_two_tmp_files`` (not visible here).
        """
        ibm_model = IBMModel1()

        tmp_dir, f1, f2 = morph_utils.get_two_tmp_files()
        try:
            ibm_model.initialize_translation_probs(f1, f2)

            ibm_model.em_step(f1, f2)

            assert ibm_model.translation_prob["456789"]["345"] == 0
            assert ibm_model.translation_prob["456789"]["456789"] == 0.5
            assert (
                ibm_model.translation_prob[ibm_model.null_str]["124"]
                < ibm_model.translation_prob[ibm_model.null_str]["456789"]
            )
        finally:
            # Clean up even when an assertion fails, so a failing run does
            # not leave the temporary fixture directory behind.
            shutil.rmtree(tmp_dir)
Exemplo n.º 7
0
    def test_morph_init(self):
        """Check the table built by ``initialize_translation_probs``.

        ``translation_prob`` is indexed here with the values returned by
        ``str2int``. The expected sizes and the ``1.0 / 4`` probability
        depend on the fixture files produced by
        ``morph_utils.get_two_same_tmp_files`` (not visible here).
        """
        ibm_model = IBMModel1()

        tmp_dir, f1, f2 = morph_utils.get_two_same_tmp_files()
        try:
            ibm_model.initialize_translation_probs(f1, f2)
            assert len(ibm_model.translation_prob) == 10
            assert (
                len(
                    ibm_model.translation_prob[
                        ibm_model.str2int(ibm_model.null_str)
                    ]
                )
                == 9
            )
            assert len(ibm_model.translation_prob[ibm_model.str2int("345")]) == 6
            assert (
                ibm_model.translation_prob[ibm_model.str2int("122")][
                    ibm_model.str2int("123")
                ]
                == 1.0 / 4
            )
        finally:
            # Clean up even when an assertion fails, so a failing run does
            # not leave the temporary fixture directory behind.
            shutil.rmtree(tmp_dir)
Exemplo n.º 8
0
    def test_ibm_train(self):
        """Train for three iterations and spot-check two learned probabilities.

        ``translation_prob`` is indexed here with the values returned by
        ``str2int``. The expected values depend on the fixture files produced
        by ``morph_utils.get_two_same_tmp_files`` (not visible here).
        """
        ibm_model = IBMModel1()

        tmp_dir, f1, f2 = morph_utils.get_two_same_tmp_files()
        try:
            ibm_model.learn_ibm_parameters(src_path=f1, dst_path=f2, num_iters=3)

            assert (
                ibm_model.translation_prob[ibm_model.str2int("456789")][
                    ibm_model.str2int("345")
                ]
                == 0
            )
            assert (
                ibm_model.translation_prob[ibm_model.str2int("456789")][
                    ibm_model.str2int("456789")
                ]
                == 0.5
            )
        finally:
            # Clean up even when an assertion fails, so a failing run does
            # not leave the temporary fixture directory behind.
            shutil.rmtree(tmp_dir)
Exemplo n.º 9
0
    def test_expectation_for_one_sentence(self):
        """Run ``expectation_for_one_sentence`` once and check one count.

        Both sentences are passed as ``Counter`` objects over the values
        returned by ``str2int``. The expected ``0.176`` (after rounding to
        three decimals) depends on the fixture files produced by
        ``morph_utils.get_two_same_tmp_files`` (not visible here).
        """
        ibm_model = IBMModel1()

        tmp_dir, f1, f2 = morph_utils.get_two_same_tmp_files()
        try:
            ibm_model.initialize_translation_probs(f1, f2)
            # Nested defaultdict so counts can be accumulated for unseen pairs.
            translation_counts = defaultdict(lambda: defaultdict(float))

            ibm_model.expectation_for_one_sentence(
                Counter(
                    ibm_model.str2int(w)
                    for w in ["123", "124", "234", "345", ibm_model.null_str]
                ),
                Counter(ibm_model.str2int(w) for w in ["123", "124", "234", "345"]),
                translation_counts,
            )
            assert (
                round(
                    translation_counts[ibm_model.str2int("123")][
                        ibm_model.str2int("345")
                    ],
                    3,
                )
                == 0.176
            )
        finally:
            # Clean up even when an assertion fails, so a failing run does
            # not leave the temporary fixture directory behind.
            shutil.rmtree(tmp_dir)