def test_likeness_probs(self):
        """Check the morph-likeness scores after uniform initialisation.

        ``open`` is patched so that iterating the "file" yields a small
        in-memory corpus.  The model is built with ``use_morph_likeness=True``
        and the ``morph_likeness`` table is compared, per tag and morph,
        against reference values rounded to ``float_precision`` decimals.
        """
        corpus_lines = [
            "123 124 234 345",
            "112 122 123 345",
            "123456789",
            "123456 456789",
        ]
        # Reference scores, keyed by morph and then by tag.
        expected_likeness = {
            "23": {
                "prefix": 0.016515200181119086,
                "stem": 0.9824677845172917,
                "suffix": 0.0010170153015892497,
            },
            "789": {
                "prefix": 0.00024574366719647703,
                "stem": 0.9957636518152019,
                "suffix": 0.003990604517601711,
            },
        }
        # Single source of truth for the rounding applied to BOTH sides of
        # each comparison (previously the actual side hard-coded ``3``).
        float_precision = 3

        morph_hmm_model = unsupervised_morphology.MorphologyHMMParams(
            use_morph_likeness=True)
        with patch("builtins.open") as mock_open:
            # ``with open(...) as f`` hands back the mocked file object, and
            # iterating it walks the corpus lines (one pass only).
            mock_open.return_value.__enter__ = mock_open
            mock_open.return_value.__iter__ = Mock(
                return_value=iter(corpus_lines))
            morph_hmm_model.init_uniform_params_from_data("no_exist_file.txt")

            for morph, scores_by_tag in expected_likeness.items():
                for tag, expected in scores_by_tag.items():
                    actual = morph_hmm_model.morph_likeness[tag][morph]
                    assert round(actual, float_precision) == round(
                        expected, float_precision)
    def test_zero_out_params(self):
        """Verify that ``zero_out_parmas`` resets emission and transition tables.

        After uniform initialisation from a mocked corpus every emission
        probability must be strictly positive; after calling
        ``zero_out_parmas`` (spelling as exposed by the model) every emission
        and every affix transition probability must equal zero.
        """
        model = unsupervised_morphology.MorphologyHMMParams()
        corpus = [
            "123 124 234 345",
            "112 122 123 345",
            "123456789",
            "123456 456789",
        ]
        with patch("builtins.open") as fake_open:
            fake_file = fake_open.return_value
            fake_file.__enter__ = fake_open
            fake_file.__iter__ = Mock(return_value=iter(corpus))
            model.init_uniform_params_from_data("no_exist_file.txt")

            # Freshly initialised emissions are all non-zero.
            for emissions in model.morph_emit_probs.values():
                for prob in emissions.values():
                    assert prob > 0

            model.zero_out_parmas()

            # Emissions are wiped ...
            for emissions in model.morph_emit_probs.values():
                for prob in emissions.values():
                    assert prob == 0

            # ... and so are the tag-to-tag transitions.
            for transitions in model.affix_trans_probs.values():
                for prob in transitions.values():
                    assert prob == 0
    def test_morph_init(self):
        """Check emission and transition tables after uniform initialisation.

        The model is initialised from a mocked four-line corpus.  The test
        pins the number of candidate morphs per tag, a few morphs that must be
        absent, one uniform stem emission value, and the complete 5x5 affix
        transition matrix.
        """
        corpus = [
            "123 124 234 345",
            "112 122 123 345",
            "123456789",
            "123456 456789",
        ]
        third = 1.0 / 3
        # Expected transition matrix: outer key is the previous tag, inner key
        # is the next tag.
        expected_transitions = {
            "START": {"START": 0, "prefix": 0.5, "stem": 0.5,
                      "suffix": 0, "END": 0},
            "prefix": {"START": 0, "prefix": 0.5, "stem": 0.5,
                       "suffix": 0, "END": 0},
            "stem": {"START": 0, "prefix": 0, "stem": third,
                     "suffix": third, "END": third},
            "suffix": {"START": 0, "prefix": 0, "stem": 0,
                       "suffix": 0.5, "END": 0.5},
            "END": {"START": 0, "prefix": 0, "stem": 0,
                    "suffix": 0, "END": 0},
        }

        model = unsupervised_morphology.MorphologyHMMParams()
        with patch("builtins.open") as fake_open:
            fake_file = fake_open.return_value
            fake_file.__enter__ = fake_open
            fake_file.__iter__ = Mock(return_value=iter(corpus))
            model.init_uniform_params_from_data("no_exist_file.txt")

            emissions = model.morph_emit_probs
            assert len(emissions["prefix"]) == 28
            assert "9" not in emissions["prefix"]
            assert len(emissions["stem"]) == 42
            assert "689" not in emissions["stem"]
            assert len(emissions["suffix"]) == 29
            assert "1" not in emissions["suffix"]
            assert emissions["stem"]["1234"] == 1.1 / (42 * 1.1)

            for prev_tag, row in expected_transitions.items():
                for next_tag, expected in row.items():
                    assert model.affix_trans_probs[prev_tag][
                        next_tag] == expected
    def test_segment_viterbi_w_smoothing(self):
        """Segment a word with Viterbi using default (smoothed) parameters.

        The model is initialised from a mocked corpus via
        ``init_params_from_data`` and the resulting segmentor must return the
        expected list of boundary indices for ``"123123789"``.
        """
        corpus = [
            "123 124 234 345",
            "112 122 123 345",
            "123456789",
            "123456 456789",
        ]
        expected_boundaries = [0, 2, 3, 5, 6, 9]

        model = unsupervised_morphology.MorphologyHMMParams()
        with patch("builtins.open") as fake_open:
            fake_file = fake_open.return_value
            fake_file.__enter__ = fake_open
            fake_file.__iter__ = Mock(return_value=iter(corpus))
            model.init_params_from_data("no_exist_file.txt")

            viterbi = unsupervised_morphology.MorphologySegmentor(model)
            boundaries = viterbi.segment_viterbi("123123789")
            assert boundaries == expected_boundaries
    # Example #5
    # 0
    def test_emission_probs(self):
        """Check ``emission_prob`` for a seen stem, an unseen suffix and END.

        Parameters are initialised from a mocked corpus via
        ``init_params_from_data``; each ``(tag, morph)`` pair is then compared
        exactly against its reference probability.
        """
        corpus = [
            "123 124 234 345",
            "112 122 123 345",
            "123456789",
            "123456 456789",
        ]
        # (tag, morph, expected probability)
        cases = (
            ("stem", "1234", 1.1 / (42 * 1.1)),
            ("suffix", "1", 0.1 / (29 * 1.1)),
            ("END", "1", 0),
        )

        model = unsupervised_morphology.MorphologyHMMParams()
        with patch("builtins.open") as fake_open:
            fake_file = fake_open.return_value
            fake_file.__enter__ = fake_open
            fake_file.__iter__ = Mock(return_value=iter(corpus))
            model.init_params_from_data("no_exist_file.txt")

            for tag, morph, expected in cases:
                assert model.emission_prob(tag, morph) == expected
    def test_emission_probs(self):
        """Check ``emission_prob`` for the morph ``"1234"``.

        Parameters are initialised from a mocked corpus via
        ``init_params_from_data``.  The reference value is the base
        probability ``e`` scaled by ``exp(-9)``.

        The reference is on the order of 1e-6, so comparing values rounded to
        three decimals would compare ``0.0 == 0.0`` and accept almost any
        small result.  A relative tolerance is used instead so the assertion
        actually constrains the value.
        """
        corpus_lines = [
            "123 124 234 345",
            "112 122 123 345",
            "123456789",
            "123456 456789",
        ]
        morph_hmm_model = unsupervised_morphology.MorphologyHMMParams()
        with patch("builtins.open") as mock_open:
            mock_open.return_value.__enter__ = mock_open
            mock_open.return_value.__iter__ = Mock(
                return_value=iter(corpus_lines))
            morph_hmm_model.init_params_from_data("no_exist_file.txt")

            # TODO: add more tests
            e = 0.014141414141414142
            e_r = e * math.exp(-9)
            actual = morph_hmm_model.emission_prob("1234")
            # Three significant digits of agreement, independent of magnitude.
            assert math.isclose(actual, e_r, rel_tol=1e-3)
    def test_morph_init(self):
        """Check the flat emission table built by ``init_params_from_data``.

        With the mocked four-line corpus the table must hold 51 entries, and
        the entry for ``"1234"`` must match the reference value to three
        decimal places.
        """
        corpus = [
            "123 124 234 345",
            "112 122 123 345",
            "123456789",
            "123456 456789",
        ]
        model = unsupervised_morphology.MorphologyHMMParams()
        with patch("builtins.open") as fake_open:
            fake_file = fake_open.return_value
            fake_file.__enter__ = fake_open
            fake_file.__iter__ = Mock(return_value=iter(corpus))
            model.init_params_from_data("no_exist_file.txt")

            emissions = model.morph_emit_probs
            assert len(emissions) == 51

            reference = round(0.014141414141414142, 3)
            assert round(emissions["1234"], 3) == reference
    def test_transition_log_probs(self):
        """Check ``transition_log_prob`` for one allowed and one zero transition.

        After uniform initialisation from a mocked corpus, ``stem -> END``
        must equal ``log(1/3)`` and ``suffix -> START`` must equal the model's
        ``SMALL_CONST``.
        """
        corpus = [
            "123 124 234 345",
            "112 122 123 345",
            "123456789",
            "123456 456789",
        ]
        model = unsupervised_morphology.MorphologyHMMParams()
        with patch("builtins.open") as fake_open:
            fake_file = fake_open.return_value
            fake_file.__enter__ = fake_open
            fake_file.__iter__ = Mock(return_value=iter(corpus))
            model.init_uniform_params_from_data("no_exist_file.txt")

            stem_to_end = model.transition_log_prob("stem", "END")
            assert stem_to_end == math.log(1.0 / 3)

            suffix_to_start = model.transition_log_prob("suffix", "START")
            assert suffix_to_start == model.SMALL_CONST
    def test_segment_viterbi_no_smoothing(self):
        """Segment a word with Viterbi when smoothing and likeness are off.

        The model is built with ``smoothing_const=0.0`` and
        ``use_morph_likeness=False`` and initialised uniformly from a mocked
        corpus.  ``segment_viterbi`` must return the expected pair of
        (tag sequence, boundary indices) for ``"123123789"``.
        """
        corpus = [
            "123 124 234 345",
            "112 122 123 345",
            "123456789",
            "123456 456789",
        ]
        expected = (
            ["prefix", "prefix", "stem"],
            [0, 3, 6, 9],
        )

        model = unsupervised_morphology.MorphologyHMMParams(
            smoothing_const=0.0, use_morph_likeness=False)
        with patch("builtins.open") as fake_open:
            fake_file = fake_open.return_value
            fake_file.__enter__ = fake_open
            fake_file.__iter__ = Mock(return_value=iter(corpus))
            model.init_uniform_params_from_data("no_exist_file.txt")

            viterbi = unsupervised_morphology.MorphologySegmentor(model)
            result = viterbi.segment_viterbi("123123789")
            assert result == expected
    def test_emission_log_probs(self):
        """Check ``emission_log_probs`` with morph-likeness disabled.

        After uniform initialisation from a mocked corpus, a seen stem and an
        unseen suffix must yield the log of their reference probabilities,
        and an emission from ``END`` must yield the model's ``SMALL_CONST``.
        """
        corpus = [
            "123 124 234 345",
            "112 122 123 345",
            "123456789",
            "123456 456789",
        ]
        model = unsupervised_morphology.MorphologyHMMParams(
            use_morph_likeness=False)
        with patch("builtins.open") as fake_open:
            fake_file = fake_open.return_value
            fake_file.__enter__ = fake_open
            fake_file.__iter__ = Mock(return_value=iter(corpus))
            model.init_uniform_params_from_data("no_exist_file.txt")

            stem_log_prob = model.emission_log_probs("stem", "1234")
            assert stem_log_prob == math.log(1.1 / (42 * 1.1))

            suffix_log_prob = model.emission_log_probs("suffix", "1")
            assert suffix_log_prob == math.log(0.1 / (29 * 1.1))

            end_log_prob = model.emission_log_probs("END", "1")
            assert end_log_prob == model.SMALL_CONST
    def test_morph_normal_init(self):
        """Check that normal-distribution initialisation does not break.

        A synthetic corpus of 1000 words is generated by joining an optional
        prefix, a stem and an optional suffix (each affix is present with
        probability 3/5).  After ``init_params_with_normal_distribution`` the
        structurally impossible transitions (anything out of ``END``, and
        ``START`` to ``START``/``suffix``/``END``) must still be exactly zero.
        """
        stems = ["jump", "say", "work", "play"]
        prefixes = ["re"]
        suffixes = ["ing", "s", "ed"]

        txt_content = []
        for _ in range(1000):
            # Draw order (prefix, suffix, stem) is kept as-is so the random
            # stream is consumed in the same sequence as before.
            prefix = random.choice(prefixes) if random.randint(1, 5) > 2 else ""
            suffix = random.choice(suffixes) if random.randint(1, 5) > 2 else ""
            stem = random.choice(stems)
            txt_content.append(prefix + stem + suffix)

        morph_hmm_model = unsupervised_morphology.MorphologyHMMParams(
            use_morph_likeness=False)
        with patch("builtins.open") as mock_open:
            mock_open.return_value.__enter__ = mock_open
            mock_open.return_value.__iter__ = Mock(
                return_value=iter(txt_content))
            morph_hmm_model.init_params_with_normal_distribution(
                "no_exist_file.txt")

        # Nothing may follow END.
        for next_tag in ("START", "prefix", "stem", "suffix", "END"):
            assert morph_hmm_model.affix_trans_probs["END"][next_tag] == 0

        # A word cannot restart, begin with a suffix, or be empty.
        for next_tag in ("START", "suffix", "END"):
            assert morph_hmm_model.affix_trans_probs["START"][next_tag] == 0
    def test_segment_word_no_smoothing(self):
        """Check ``segment_word`` output with and without affix symbols.

        The model is built with ``smoothing_const=0.0`` and
        ``use_morph_likeness=False`` and initialised uniformly from a mocked
        corpus.  The test pins the plain and the affix-marked segmentation of
        ``"123123789789"`` and checks that both forms agree for ``"123"``.
        """
        corpus = [
            "123 124 234 345",
            "112 122 123 345",
            "123456789",
            "123456 456789",
        ]
        model = unsupervised_morphology.MorphologyHMMParams(
            smoothing_const=0.0, use_morph_likeness=False)
        with patch("builtins.open") as fake_open:
            fake_file = fake_open.return_value
            fake_file.__enter__ = fake_open
            fake_file.__iter__ = Mock(return_value=iter(corpus))
            model.init_uniform_params_from_data("no_exist_file.txt")

            splitter = unsupervised_morphology.MorphologySegmentor(model)

            plain = splitter.segment_word("123123789789")
            assert plain == "123 123 789 789"

            marked = splitter.segment_word(
                "123123789789", add_affix_symbols=True)
            assert marked == "123+ 123+ 789 +789"

            # Both forms must agree for the word "123".
            short_plain = splitter.segment_word("123")
            short_marked = splitter.segment_word(
                "123", add_affix_symbols=True)
            assert short_plain == short_marked