def test_get_expectations_from_viterbi(self):
        """Hard-EM: Viterbi segmentation of "123123789" and the emission /
        transition expectations derived from that single best path."""
        corpus_lines = [
            "123 124 234 345",
            "112 122 123 345",
            "123456789",
            "123456 456789",
        ]
        with patch("builtins.open") as open_mock:
            # Make the mocked file handle usable as a context manager that
            # iterates over the fake corpus lines.
            open_mock.return_value.__enter__ = open_mock
            open_mock.return_value.__iter__ = Mock(return_value=iter(corpus_lines))

            model = unsupervised_morphology.UnsupervisedMorphology(
                "no_exist_file.txt",
                smoothing_const=0.0,
                use_hardEM=True,
                use_morph_likeness=False,
            )
            expected_segmentation = (["prefix", "prefix", "stem"], [0, 3, 6, 9])
            assert model.segmentor.segment_viterbi("123123789") == expected_segmentation

            e, t = model.get_expectations_from_viterbi("123123789")
            # Emission counts along the Viterbi path: "123" twice as prefix,
            # "789" once as stem and never as suffix.
            assert e[("prefix", "123")] == 2
            assert e[("stem", "789")] == 1
            assert e[("suffix", "789")] == 0
            # Transition counts along the same path.
            assert t[("START", "prefix")] == 1
            assert t[("prefix", "prefix")] == 1
            assert t[("prefix", "stem")] == 1
            assert t[("stem", "END")] == 1
 def test_forward_backward_long_str(self):
     """Soft-EM forward-backward on "1232345": the emission table is checked
     against the expected set of morphemes."""
     corpus_lines = [
         "123 124 234 345",
         "112 122 123 345",
         "123456789",
         "123456 456789",
     ]
     with patch("builtins.open") as open_mock:
         open_mock.return_value.__enter__ = open_mock
         open_mock.return_value.__iter__ = Mock(return_value=iter(corpus_lines))
         model = unsupervised_morphology.UnsupervisedMorphology(
             "no_exist_file.txt", smoothing_const=0.0
         )
         e = model.forward_backward("1232345")
         # Morphemes expected to show up in the emission expectations.
         expected_morphs = {
             "1", "2", "3", "4", "5",
             "12", "23", "34", "45",
             "123", "234", "345",
             "2345",
         }
         self.check_emission_after_forward_backward("1232345", e, expected_morphs)
 def test_forward_backward_with_smoothing(self):
     """
     Making sure that the algorithm works end-to-end.
     """
     with patch("builtins.open") as open_mock:
         corpus_lines = ["123 12123"]
         open_mock.return_value.__enter__ = open_mock
         open_mock.return_value.__iter__ = Mock(return_value=iter(corpus_lines))
         model = unsupervised_morphology.UnsupervisedMorphology(
             "no_exist_file.txt", smoothing_const=0.1
         )
         # Smoke test only: no assertions, just verify it runs to completion.
         e, t = model.forward_backward("123")
    def test_forward_backward(self):
        """Forward-backward on "123" without smoothing; the resulting emission
        expectations are checked against all substrings of "123"."""
        with patch("builtins.open") as open_mock:
            open_mock.return_value.__enter__ = open_mock
            open_mock.return_value.__iter__ = Mock(
                return_value=iter(["123 12123"]))
            model = unsupervised_morphology.UnsupervisedMorphology(
                "no_exist_file.txt", smoothing_const=0.0)
            e = model.forward_backward("123")

            # checking emission parameters
            self.check_emission_after_forward_backward(
                "123", e, get_all_substrings("123"))
    def test_EM(self):
        """Smoke test: expectation-maximization completes on a tiny corpus with
        both soft EM (forward-backward) and hard EM (Viterbi)."""
        words = [
            "work", "works", "worked", "working",
            "go", "goes", "gone", "going",
            "do", "does", "did", "doing",
            "see", "saw", "seen", "seeing",
        ]
        with patch("builtins.open") as open_mock:
            open_mock.return_value.__enter__ = open_mock
            open_mock.return_value.__iter__ = Mock(return_value=iter(words))

            # Soft EM (forward-backward).
            soft_model = unsupervised_morphology.UnsupervisedMorphology(
                "no_exist_file.txt",
                smoothing_const=0.0,
                use_morph_likeness=False)
            soft_model.expectation_maximization(10, 10)

            # Hard EM (Viterbi).
            hard_model = unsupervised_morphology.UnsupervisedMorphology(
                "no_exist_file.txt",
                smoothing_const=0.0,
                use_hardEM=True,
                use_morph_likeness=False,
            )
            hard_model.expectation_maximization(10, 10)
    def test_forward_backward_long_str(self):
        """Forward-backward on "1232345": checks the emission tables per tag
        and the full 5x5 transition-expectation structure."""
        corpus_lines = [
            "123 124 234 345",
            "112 122 123 345",
            "123456789",
            "123456 456789",
        ]
        with patch("builtins.open") as open_mock:
            open_mock.return_value.__enter__ = open_mock
            open_mock.return_value.__iter__ = Mock(return_value=iter(corpus_lines))
            model = unsupervised_morphology.UnsupervisedMorphology(
                "no_exist_file.txt", smoothing_const=0.0)
            e, t = model.forward_backward("1232345")

            expected_prefixes = {"1", "12", "123", "2", "23", "3"}
            expected_stems = {"12", "123", "23", "234", "2345", "345", "45", "34"}
            expected_suffixes = {"2", "3", "4", "5", "34", "45", "345"}
            self.check_emission_after_forward_backward(
                "1232345", e,
                expected_prefixes, expected_stems, expected_suffixes)

            # Tag transitions that can occur in a valid segmentation must get
            # positive expected counts; every other pair must be exactly zero.
            positive_pairs = {
                ("START", "prefix"), ("START", "stem"),
                ("prefix", "prefix"), ("prefix", "stem"),
                ("stem", "stem"), ("stem", "suffix"), ("stem", "END"),
                ("suffix", "suffix"), ("suffix", "END"),
            }
            tags = ("START", "END", "prefix", "stem", "suffix")
            for src in tags:
                for dst in tags:
                    if (src, dst) in positive_pairs:
                        assert t[(src, dst)] > 0
                    else:
                        assert t[(src, dst)] == 0
    def test_save_load(self):
        """Train briefly, then verify all parameters survive a save/load
        round trip.

        The open() patch is exited before saving so the real filesystem is
        used for the pickle file. The temporary directory is removed in a
        ``finally`` block so it is cleaned up even when an assertion fails
        (the original leaked the directory on failure).
        """
        txt_content = [
            "work", "works", "worked", "working",
            "go", "goes", "gone", "going",
            "do", "does", "did", "doing",
            "see", "saw", "seen", "seeing",
        ]
        with patch("builtins.open") as mock_open:
            mock_open.return_value.__enter__ = mock_open
            mock_open.return_value.__iter__ = Mock(return_value=iter(txt_content))
            unsupervised_model = unsupervised_morphology.UnsupervisedMorphology(
                "no_exist_file.txt", smoothing_const=0.0
            )
            # Short EM run (3 iterations, 2 workers) just to populate params.
            unsupervised_model.expectation_maximization(3, 2)

        test_dir = tempfile.mkdtemp()
        try:
            pickle_file = path.join(test_dir, "test.pickle")
            unsupervised_model.params.save(pickle_file)

            loaded_params = unsupervised_morphology.MorphologyHMMParams.load(
                pickle_file
            )

            assert (
                unsupervised_model.params.morph_emit_probs
                == loaded_params.morph_emit_probs
            )
            assert unsupervised_model.params.word_counts == loaded_params.word_counts
            assert (
                unsupervised_model.params.smoothing_const
                == loaded_params.smoothing_const
            )
            assert unsupervised_model.params.SMALL_CONST == loaded_params.SMALL_CONST
            assert unsupervised_model.params.len_cost_pow == loaded_params.len_cost_pow
        finally:
            # Always remove the temp dir, even if an assertion above failed.
            shutil.rmtree(test_dir)
    def test_get_expectations_from_viterbi(self):
        """Hard-EM: expectations equal morpheme counts along the single
        Viterbi segmentation of "123123789"."""
        corpus_lines = [
            "123 124 234 345",
            "112 122 123 345",
            "123456789",
            "123456 456789",
        ]
        with patch("builtins.open") as open_mock:
            open_mock.return_value.__enter__ = open_mock
            open_mock.return_value.__iter__ = Mock(return_value=iter(corpus_lines))

            model = unsupervised_morphology.UnsupervisedMorphology(
                "no_exist_file.txt", smoothing_const=0.0, use_hardEM=True
            )
            # Segmentation is returned as a list of split-point indices here.
            assert model.segmentor.segment_viterbi("123123789") == [0, 2, 3, 5, 6, 9]
            e = model.get_expectations_from_viterbi("123123789")
            # "12" occurs twice on the best path, "789" once, "89" never.
            assert e["12"] == 2
            assert e["789"] == 1
            assert e["89"] == 0
# --- Example #9: unrelated scraped snippet follows (truncated at both ends) ---
        default=False,
    )
    return parser


if __name__ == "__main__":
    arg_parser = get_arg_parser()
    options, args = arg_parser.parse_args()
    # A model path is required for every training mode below.
    if options.model_path is None:
        print("Model path not specified")
        sys.exit(0)

    # Monolingual training: a source training file but no target file.
    if options.train_file is not None and options.target_train_file is None:
        model = unsupervised_morphology.UnsupervisedMorphology(
            input_file=options.train_file,
            smoothing_const=options.smooth_const,
            use_hardEM=options.use_hardEM,
            len_cost_pow=options.len_cost_pow,
        )
        print("Number of training words", len(model.params.word_counts))
        model.expectation_maximization(
            options.em_iter,
            options.num_cpus,
            # Pass the model path so EM can checkpoint during training;
            # otherwise the model is saved once at the end.
            options.model_path if options.save_checkpoint else None,
        )
        if not options.save_checkpoint:
            model.params.save(options.model_path)
    # Bilingual training: both source and target training files given.
    # NOTE(review): this snippet is truncated here — the constructor call
    # below is cut off mid-arguments.
    elif options.train_file is not None and options.target_train_file is not None:
        model = unsupervised_bilingual_morphology.UnsupervisedBilingualMorphology(
            src_file=options.train_file,
            dst_file=options.target_train_file,
            smoothing_const=options.smooth_const,
# --- Example #10: unrelated scraped snippet follows (truncated at the start) ---
        default=False,
    )
    return parser


if __name__ == "__main__":
    arg_parser = get_arg_parser()
    options, args = arg_parser.parse_args()
    # Train only when both a training file and a model output path were given.
    if options.train_file is not None and options.model_path is not None:
        model = unsupervised_morphology.UnsupervisedMorphology(
            input_file=options.train_file,
            smoothing_const=options.smooth_const,
            use_normal_init=options.normal_init,
            normal_mean=options.normal_mean,
            normal_stddev=options.normal_stddev,
            use_hardEM=options.use_hardEM,
            use_morph_likeness=options.use_morph_likeness,
            perplexity_threshold=options.perplexity_threshold,
            perplexity_slope=options.perplexity_slope,
            length_threshold=options.length_threshold,
            length_slope=options.length_slope,
        )
        print("Number of training words", len(model.params.word_counts))
        model.expectation_maximization(
            options.em_iter,
            options.num_cpus,
            # Checkpoint during EM when requested; otherwise save at the end.
            options.model_path if options.save_checkpoint else None,
        )
        if not options.save_checkpoint:
            model.params.save(options.model_path)