Example #1
0
    def test_only_tagging(self, run_tokenization_mock, run_tagging_mock,
                          parse_mock, *args):
        """With only --tagged set, each sentence is written twice:
        first one token per line, then one token<TAB>SUC-tag per line,
        each section terminated by a blank line.
        """
        options = self._default_options(tagged=True)
        models = self._default_models()
        # Tokenizer yields two sentences.
        run_tokenization_mock.return_value = [
            ["Hej", "mitt", "namn", "är"],
            ["Hej", "mitt", "namn", "är", "Slim", "Shady"],
        ]
        # Tagger returns a 4-tuple per sentence — apparently
        # (lemmas, UD tags, SUC tags, NER tags); only the SUC tags (index 2)
        # matter for tagged-only output. TODO confirm against run_tagging.
        run_tagging_mock.side_effect = [
            (
                [],
                [],
                ["IN", "PS|NEU|SIN|DEF", "NN|NEU|SIN|IND|NOM", "VB|PRS|AKT"],
                [],
            ),
            (
                [],
                [],
                [
                    "IN", "PS|NEU|SIN|DEF", "NN|NEU|SIN|IND|NOM", "VB|PRS|AKT",
                    "PM|NOM", "PM|NOM"
                ],
                [],
            ),
        ]
        open_mock = mock_open()
        # Silence stderr via patch() so the real sys.stderr is restored on
        # exit; the previous `with open(...) as sys.stderr` idiom left
        # sys.stderr bound to a closed file after the block, breaking any
        # later stderr write in the process.
        with open(os.devnull, 'w') as devnull, \
                patch("sys.stderr", devnull), \
                patch("swe_pipeline.open", open_mock, create=True):
            process_file(options, "file.txt", "", models)

        written_to_file = "".join(
            [call[0][0] for call in open_mock().write.call_args_list])
        self.assertEqual(
            written_to_file,
            dedent("""
                Hej
                mitt
                namn
                är

                Hej\tIN
                mitt\tPS|NEU|SIN|DEF
                namn\tNN|NEU|SIN|IND|NOM
                är\tVB|PRS|AKT

                Hej
                mitt
                namn
                är
                Slim
                Shady

                Hej\tIN
                mitt\tPS|NEU|SIN|DEF
                namn\tNN|NEU|SIN|IND|NOM
                är\tVB|PRS|AKT
                Slim\tPM|NOM
                Shady\tPM|NOM

            """).lstrip("\n"))
    def test_only_tokenization(
        self, run_tokenization_mock, run_tagging_mock, parse_mock, *args
    ):
        """Default options (tokenization only): each sentence is written
        one token per line, terminated by a blank line.
        """
        options = self._default_options()
        models = self._default_models()
        # Tokenizer yields two sentences; no tagging output is configured.
        run_tokenization_mock.return_value = [
            ["Hej", "mitt", "namn", "är"],
            ["Hej", "mitt", "namn", "är", "Slim", "Shady"],
        ]
        open_mock = mock_open()
        # Silence stderr via patch() so the real sys.stderr is restored on
        # exit; the previous `with open(...) as sys.stderr` idiom left
        # sys.stderr bound to a closed file after the block.
        with open(os.devnull, 'w') as devnull, \
                patch("sys.stderr", devnull), \
                patch("swe_pipeline.open", open_mock, create=True):
            process_file(options, "file.txt", "", models)

        written_to_file = "".join([
            call[0][0]
            for call in open_mock().write.call_args_list
        ])
        self.assertEqual(
            written_to_file,
            dedent("""
                Hej
                mitt
                namn
                är

                Hej
                mitt
                namn
                är
                Slim
                Shady

            """).lstrip("\n")
        )
Example #3
0
    def test_only_tokenization(self, run_tokenization_mock, run_tagging_mock,
                               parse_mock, *args):
        """Default options (tokenization only): each sentence is written
        one token per line, terminated by a blank line.
        """
        options = self._default_options()
        models = self._default_models()
        # Tokenizer yields two sentences; no tagging output is configured.
        run_tokenization_mock.return_value = [
            ["Hej", "mitt", "namn", "är"],
            ["Hej", "mitt", "namn", "är", "Slim", "Shady"],
        ]
        open_mock = mock_open()
        # Silence stderr via patch() so the real sys.stderr is restored on
        # exit; the previous `with open(...) as sys.stderr` idiom left
        # sys.stderr bound to a closed file after the block.
        with open(os.devnull, 'w') as devnull, \
                patch("sys.stderr", devnull), \
                patch("swe_pipeline.open", open_mock, create=True):
            process_file(options, "file.txt", "", models)

        written_to_file = "".join(
            [call[0][0] for call in open_mock().write.call_args_list])
        self.assertEqual(
            written_to_file,
            dedent("""
                Hej
                mitt
                namn
                är

                Hej
                mitt
                namn
                är
                Slim
                Shady

            """).lstrip("\n"))
Example #4
0
    def test_empty_options(self, run_tokenization_mock, run_tagging_mock,
                           parse_mock, open_mock, *args):
        """With no output options enabled, nothing is written to file."""
        options = self._default_options()
        models = self._default_models()
        # Silence stderr via patch() so the real sys.stderr is restored on
        # exit; the previous `with open(...) as sys.stderr` idiom left
        # sys.stderr bound to a closed file after the block.
        with open(os.devnull, 'w') as devnull, \
                patch("sys.stderr", devnull):
            process_file(options, "file.txt", "", models)

        self.assertEqual(open_mock().write.call_count, 0)
    def test_empty_options(
        self, run_tokenization_mock, run_tagging_mock, parse_mock,
        open_mock, *args
    ):
        """With no output options enabled, nothing is written to file."""
        options = self._default_options()
        models = self._default_models()
        # Silence stderr via patch() so the real sys.stderr is restored on
        # exit; the previous `with open(...) as sys.stderr` idiom left
        # sys.stderr bound to a closed file after the block.
        with open(os.devnull, 'w') as devnull, \
                patch("sys.stderr", devnull):
            process_file(options, "file.txt", "", models)

        self.assertEqual(open_mock().write.call_count, 0)
    def test_parsing(
        self, run_tokenization_mock, run_tagging_mock, parse_mock, *args
    ):
        """With --parsed set, tagging runs once per sentence but parsing
        runs once for the whole file.
        """
        options = self._default_options(parsed=True)
        models = self._default_models()
        # Tokenizer yields two sentences.
        run_tokenization_mock.return_value = [
            ["Hej", "mitt", "namn", "är"],
            ["Hej", "mitt", "namn", "är", "Slim", "Shady"],
        ]
        # Tagger returns a 4-tuple per sentence — apparently
        # (lemmas, UD tags, SUC tags, NER tags). TODO confirm against
        # run_tagging.
        run_tagging_mock.side_effect = [
            (
                ["hej", "min", "namn", "vara"],
                [
                    "INTJ|_",
                    "DET|Definite=Def|Gender=Neut|Number=Sing|Poss=Yes",
                    "NOUN|Case=Nom|Definite=Ind|Gender=Neut|Number=Sing",
                    "AUX|Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act",
                ],
                ["IN", "PS|NEU|SIN|DEF", "NN|NEU|SIN|IND|NOM", "VB|PRS|AKT"],
                [],
            ),
            (
                ["hej", "min", "namn", "vara", "Slim", "Shady"],
                [
                    "INTJ|_",
                    "DET|Definite=Def|Gender=Neut|Number=Sing|Poss=Yes",
                    "NOUN|Case=Nom|Definite=Ind|Gender=Neut|Number=Sing",
                    "AUX|Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act",
                    "PROPN|Case=Nom",
                    "PROPN|Case=Nom",
                ],
                [
                    "IN", "PS|NEU|SIN|DEF", "NN|NEU|SIN|IND|NOM", "VB|PRS|AKT",
                    "PM|NOM", "PM|NOM"
                ],
                [],
            ),
        ]
        open_mock = mock_open()
        # Silence stderr via patch() so the real sys.stderr is restored on
        # exit; the previous `with open(...) as sys.stderr` idiom left
        # sys.stderr bound to a closed file after the block.
        with open(os.devnull, 'w') as devnull, \
                patch("sys.stderr", devnull), \
                patch("swe_pipeline.open", open_mock, create=True):
            process_file(options, "file.txt", "", models)

        # Tagging is per sentence (2), parsing is per file (1).
        self.assertEqual(run_tagging_mock.call_count, 2)
        self.assertEqual(parse_mock.call_count, 1)
Example #7
0
    def test_parsing(self, run_tokenization_mock, run_tagging_mock, parse_mock,
                     *args):
        """With --parsed set, tagging runs once per sentence but parsing
        runs once for the whole file.
        """
        options = self._default_options(parsed=True)
        models = self._default_models()
        # Tokenizer yields two sentences.
        run_tokenization_mock.return_value = [
            ["Hej", "mitt", "namn", "är"],
            ["Hej", "mitt", "namn", "är", "Slim", "Shady"],
        ]
        # Tagger returns a 4-tuple per sentence — apparently
        # (lemmas, UD tags, SUC tags, NER tags). TODO confirm against
        # run_tagging.
        run_tagging_mock.side_effect = [
            (
                ["hej", "min", "namn", "vara"],
                [
                    "INTJ|_",
                    "DET|Definite=Def|Gender=Neut|Number=Sing|Poss=Yes",
                    "NOUN|Case=Nom|Definite=Ind|Gender=Neut|Number=Sing",
                    "AUX|Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act",
                ],
                ["IN", "PS|NEU|SIN|DEF", "NN|NEU|SIN|IND|NOM", "VB|PRS|AKT"],
                [],
            ),
            (
                ["hej", "min", "namn", "vara", "Slim", "Shady"],
                [
                    "INTJ|_",
                    "DET|Definite=Def|Gender=Neut|Number=Sing|Poss=Yes",
                    "NOUN|Case=Nom|Definite=Ind|Gender=Neut|Number=Sing",
                    "AUX|Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act",
                    "PROPN|Case=Nom",
                    "PROPN|Case=Nom",
                ],
                [
                    "IN", "PS|NEU|SIN|DEF", "NN|NEU|SIN|IND|NOM", "VB|PRS|AKT",
                    "PM|NOM", "PM|NOM"
                ],
                [],
            ),
        ]
        open_mock = mock_open()
        # Silence stderr via patch() so the real sys.stderr is restored on
        # exit; the previous `with open(...) as sys.stderr` idiom left
        # sys.stderr bound to a closed file after the block.
        with open(os.devnull, 'w') as devnull, \
                patch("sys.stderr", devnull), \
                patch("swe_pipeline.open", open_mock, create=True):
            process_file(options, "file.txt", "", models)

        # Tagging is per sentence (2), parsing is per file (1).
        self.assertEqual(run_tagging_mock.call_count, 2)
        self.assertEqual(parse_mock.call_count, 1)
Example #8
0
    def test_tagging_and_lemmatization_and_ner(self, run_tokenization_mock,
                                               run_tagging_mock, parse_mock,
                                               *args):
        """With --tagged, --lemmatized and --ner set, each sentence yields
        three sections: plain tokens, then token<TAB>SUC<TAB>UD-POS<TAB>lemma,
        then token<TAB>NER-tag, each terminated by a blank line.
        """
        options = self._default_options(tagged=True, lemmatized=True, ner=True)
        models = self._default_models()
        # Tokenizer yields two sentences.
        run_tokenization_mock.return_value = [
            ["Hej", "mitt", "namn", "är"],
            ["Hej", "mitt", "namn", "är", "Slim", "Shady"],
        ]
        # Tagger returns a 4-tuple per sentence — apparently
        # (lemmas, UD tags, SUC tags, NER tags). TODO confirm against
        # run_tagging.
        run_tagging_mock.side_effect = [
            (
                ["hej", "min", "namn", "vara"],
                [
                    "INTJ|_",
                    "DET|Definite=Def|Gender=Neut|Number=Sing|Poss=Yes",
                    "NOUN|Case=Nom|Definite=Ind|Gender=Neut|Number=Sing",
                    "AUX|Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act",
                ],
                ["IN", "PS|NEU|SIN|DEF", "NN|NEU|SIN|IND|NOM", "VB|PRS|AKT"],
                ["O", "O", "O", "O"],
            ),
            (
                ["hej", "min", "namn", "vara", "Slim", "Shady"],
                [
                    "INTJ|_",
                    "DET|Definite=Def|Gender=Neut|Number=Sing|Poss=Yes",
                    "NOUN|Case=Nom|Definite=Ind|Gender=Neut|Number=Sing",
                    "AUX|Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act",
                    "PROPN|Case=Nom",
                    "PROPN|Case=Nom",
                ],
                [
                    "IN", "PS|NEU|SIN|DEF", "NN|NEU|SIN|IND|NOM", "VB|PRS|AKT",
                    "PM|NOM", "PM|NOM"
                ],
                ["O", "O", "O", "O", "B-person", "I-person"],
            ),
        ]
        open_mock = mock_open()
        # Silence stderr via patch() so the real sys.stderr is restored on
        # exit; the previous `with open(...) as sys.stderr` idiom left
        # sys.stderr bound to a closed file after the block.
        with open(os.devnull, 'w') as devnull, \
                patch("sys.stderr", devnull), \
                patch("swe_pipeline.open", open_mock, create=True):
            process_file(options, "file.txt", "", models)

        written_to_file = "".join(
            [call[0][0] for call in open_mock().write.call_args_list])
        self.assertEqual(
            written_to_file,
            dedent("""
                Hej
                mitt
                namn
                är

                Hej\tIN\tINTJ\thej
                mitt\tPS|NEU|SIN|DEF\tDET\tmin
                namn\tNN|NEU|SIN|IND|NOM\tNOUN\tnamn
                är\tVB|PRS|AKT\tAUX\tvara

                Hej\tO
                mitt\tO
                namn\tO
                är\tO

                Hej
                mitt
                namn
                är
                Slim
                Shady

                Hej\tIN\tINTJ\thej
                mitt\tPS|NEU|SIN|DEF\tDET\tmin
                namn\tNN|NEU|SIN|IND|NOM\tNOUN\tnamn
                är\tVB|PRS|AKT\tAUX\tvara
                Slim\tPM|NOM\tPROPN\tSlim
                Shady\tPM|NOM\tPROPN\tShady

                Hej\tO
                mitt\tO
                namn\tO
                är\tO
                Slim\tB-person
                Shady\tI-person

            """).lstrip("\n"))
    def test_only_tagging(
        self, run_tokenization_mock, run_tagging_mock, parse_mock, *args
    ):
        """With only --tagged set, each sentence is written twice:
        first one token per line, then one token<TAB>SUC-tag per line,
        each section terminated by a blank line.
        """
        options = self._default_options(tagged=True)
        models = self._default_models()
        # Tokenizer yields two sentences.
        run_tokenization_mock.return_value = [
            ["Hej", "mitt", "namn", "är"],
            ["Hej", "mitt", "namn", "är", "Slim", "Shady"],
        ]
        # Tagger returns a 4-tuple per sentence — apparently
        # (lemmas, UD tags, SUC tags, NER tags); only the SUC tags (index 2)
        # matter for tagged-only output. TODO confirm against run_tagging.
        run_tagging_mock.side_effect = [
            (
                [],
                [],
                ["IN", "PS|NEU|SIN|DEF", "NN|NEU|SIN|IND|NOM", "VB|PRS|AKT"],
                [],
            ),
            (
                [],
                [],
                [
                    "IN", "PS|NEU|SIN|DEF", "NN|NEU|SIN|IND|NOM", "VB|PRS|AKT",
                    "PM|NOM", "PM|NOM"
                ],
                [],
            ),
        ]
        open_mock = mock_open()
        # Silence stderr via patch() so the real sys.stderr is restored on
        # exit; the previous `with open(...) as sys.stderr` idiom left
        # sys.stderr bound to a closed file after the block.
        with open(os.devnull, 'w') as devnull, \
                patch("sys.stderr", devnull), \
                patch("swe_pipeline.open", open_mock, create=True):
            process_file(options, "file.txt", "", models)

        written_to_file = "".join([
            call[0][0]
            for call in open_mock().write.call_args_list
        ])
        self.assertEqual(
            written_to_file,
            dedent("""
                Hej
                mitt
                namn
                är

                Hej\tIN
                mitt\tPS|NEU|SIN|DEF
                namn\tNN|NEU|SIN|IND|NOM
                är\tVB|PRS|AKT

                Hej
                mitt
                namn
                är
                Slim
                Shady

                Hej\tIN
                mitt\tPS|NEU|SIN|DEF
                namn\tNN|NEU|SIN|IND|NOM
                är\tVB|PRS|AKT
                Slim\tPM|NOM
                Shady\tPM|NOM

            """).lstrip("\n")
        )
    def test_tagging_and_lemmatization_and_ner(
        self, run_tokenization_mock, run_tagging_mock, parse_mock, *args
    ):
        """With --tagged, --lemmatized and --ner set, each sentence yields
        three sections: plain tokens, then token<TAB>SUC<TAB>UD-POS<TAB>lemma,
        then token<TAB>NER-tag, each terminated by a blank line.
        """
        options = self._default_options(tagged=True, lemmatized=True, ner=True)
        models = self._default_models()
        # Tokenizer yields two sentences.
        run_tokenization_mock.return_value = [
            ["Hej", "mitt", "namn", "är"],
            ["Hej", "mitt", "namn", "är", "Slim", "Shady"],
        ]
        # Tagger returns a 4-tuple per sentence — apparently
        # (lemmas, UD tags, SUC tags, NER tags). TODO confirm against
        # run_tagging.
        run_tagging_mock.side_effect = [
            (
                ["hej", "min", "namn", "vara"],
                [
                    "INTJ|_",
                    "DET|Definite=Def|Gender=Neut|Number=Sing|Poss=Yes",
                    "NOUN|Case=Nom|Definite=Ind|Gender=Neut|Number=Sing",
                    "AUX|Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act",
                ],
                ["IN", "PS|NEU|SIN|DEF", "NN|NEU|SIN|IND|NOM", "VB|PRS|AKT"],
                ["O", "O", "O", "O"],
            ),
            (
                ["hej", "min", "namn", "vara", "Slim", "Shady"],
                [
                    "INTJ|_",
                    "DET|Definite=Def|Gender=Neut|Number=Sing|Poss=Yes",
                    "NOUN|Case=Nom|Definite=Ind|Gender=Neut|Number=Sing",
                    "AUX|Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act",
                    "PROPN|Case=Nom",
                    "PROPN|Case=Nom",
                ],
                [
                    "IN", "PS|NEU|SIN|DEF", "NN|NEU|SIN|IND|NOM", "VB|PRS|AKT",
                    "PM|NOM", "PM|NOM"
                ],
                ["O", "O", "O", "O", "B-person", "I-person"],
            ),
        ]
        open_mock = mock_open()
        # Silence stderr via patch() so the real sys.stderr is restored on
        # exit; the previous `with open(...) as sys.stderr` idiom left
        # sys.stderr bound to a closed file after the block.
        with open(os.devnull, 'w') as devnull, \
                patch("sys.stderr", devnull), \
                patch("swe_pipeline.open", open_mock, create=True):
            process_file(options, "file.txt", "", models)

        written_to_file = "".join([
            call[0][0]
            for call in open_mock().write.call_args_list
        ])
        self.assertEqual(
            written_to_file,
            dedent("""
                Hej
                mitt
                namn
                är

                Hej\tIN\tINTJ\thej
                mitt\tPS|NEU|SIN|DEF\tDET\tmin
                namn\tNN|NEU|SIN|IND|NOM\tNOUN\tnamn
                är\tVB|PRS|AKT\tAUX\tvara

                Hej\tO
                mitt\tO
                namn\tO
                är\tO

                Hej
                mitt
                namn
                är
                Slim
                Shady

                Hej\tIN\tINTJ\thej
                mitt\tPS|NEU|SIN|DEF\tDET\tmin
                namn\tNN|NEU|SIN|IND|NOM\tNOUN\tnamn
                är\tVB|PRS|AKT\tAUX\tvara
                Slim\tPM|NOM\tPROPN\tSlim
                Shady\tPM|NOM\tPROPN\tShady

                Hej\tO
                mitt\tO
                namn\tO
                är\tO
                Slim\tB-person
                Shady\tI-person

            """).lstrip("\n")
        )