示例#1
0
    def test_nlp_latin(self):
        """End-to-end run of ``NLP()`` for Latin ('lat').

        Checks pipeline metadata types, analyzes the canned example text,
        and type-checks every populated attribute of the first ``Word``.
        """
        start = time.time()
        print("Starting complete `NLP()` test for 'lat' ...")

        iso: str = "lat"

        # Pipeline-level metadata must be populated with the expected types.
        self.assertIsInstance(LatinPipeline.description, str)
        self.assertIsInstance(LatinPipeline.language, Language)
        lang_meta = LatinPipeline.language
        for attr_name, expected_type in [
            ("family_id", str),
            ("glottolog_id", str),
            ("iso_639_3_code", str),
            ("latitude", float),
            ("level", str),
            ("longitude", float),
            ("parent_id", str),
            ("type", str),
        ]:
            self.assertIsInstance(getattr(lang_meta, attr_name), expected_type)

        sample = get_example_text(iso_code=iso)
        self.assertIsInstance(sample, str)

        nlp_obj: NLP = NLP(language=iso)
        self.assertIsInstance(nlp_obj, NLP)

        analyzed = nlp_obj.analyze(text=sample)
        self.assertIsInstance(analyzed, Doc)
        self.assertIsInstance(analyzed.raw, str)
        self.assertEqual(analyzed.language, iso)
        self.assertIsInstance(analyzed.stanza_doc, Document)

        # Every token produced must be a `Word`.
        self.assertTrue(len(analyzed.words) > 0)
        self.assertTrue(all(isinstance(w, Word) for w in analyzed.words))

        # Type-check the attributes of the first token.
        first = analyzed.words[0]
        for attr_name, expected_type in [
            ("category", MorphosyntacticFeatureBundle),
            ("dependency_relation", str),
            ("embedding", np.ndarray),
            ("governor", int),
            ("index_token", int),
            ("lemma", str),
            # ("named_entity", str),
            ("pos", POS),
            ("stanza_features", str),
            ("stop", bool),
            ("string", str),
            ("upos", str),
            ("xpos", str),
        ]:
            self.assertIsInstance(getattr(first, attr_name), expected_type)

        print(
            f"Finished complete test of `NLP()` in {time.time() - start} secs."
        )
示例#2
0
    def test_dependency_tree(self):
        """Check ``Form`` rendering and ``DependencyTree`` building for Latin."""
        nlp_obj = NLP(language="lat")
        analyzed = nlp_obj.analyze(text=get_example_text("lat"))

        # Blank the embedding vector so the rendered string is stable.
        first = analyzed.words[0]
        first.embedding = list()
        rendered = Form.to_form(word=first).full_str()
        expected = "Gallia_0 [lemma=mallis,pos=noun,upos=NOUN,xpos=A1|grn1|casA|gen2,Case=nominative,Degree=positive,Gender=feminine,Number=singular]"
        self.assertEqual(rendered, expected)

        # A tree built from a full sentence yields a known dependency count.
        tree = DependencyTree.to_tree(analyzed.sentences[0])
        self.assertEqual(len(tree.get_dependencies()), 28)

        # A tree built from a word slice supports XPath-style queries.
        tree = DependencyTree.to_tree(analyzed.words[:25])
        matches = tree.findall(".")
        self.assertIsInstance(matches, list)
        self.assertIsInstance(matches[0], Form)
 def test_nlp_latin_stops(self):
     """Verify the Latin pipeline includes ``StopsProcess`` and that running
     it marks every token with a boolean ``stop`` flag.
     """
     iso: str = "lat"
     nlp_obj: NLP = NLP(language=iso)
     self.assertIsInstance(nlp_obj, NLP)
     pipeline: Pipeline = nlp_obj.pipeline

     # Exactly one stops process must be registered in the pipeline.
     stops_procs: List[Process] = [
         p for p in pipeline.processes if p.__name__ == "StopsProcess"
     ]
     self.assertEqual(len(stops_procs), 1)
     stops_cls: StopsProcess = stops_procs[0]
     self.assertIs(stops_cls, StopsProcess)

     # Build a bare Doc of tokens and run the process over it.
     tokens = split_punct_ws(get_example_text(iso))
     doc_in = Doc(words=[Word(string=tok) for tok in tokens])
     result = stops_cls(language=iso).run(input_doc=doc_in)

     # One boolean flag per input token.
     stop_flags: List[bool] = [w.stop for w in result.words]
     self.assertEqual(len(doc_in.words), len(stop_flags))
     self.assertIsInstance(stop_flags[0], bool)
示例#4
0
    def test_embeddings_processes(self):
        """Run each language's embeddings process over its example text.

        Fixes a latent defect: the original called bare ``isinstance(...)``,
        whose boolean result was discarded, so the checks asserted nothing.
        ``self.assertIsInstance`` now makes a wrong embedding type fail the
        test. The seven copy-pasted sections are collapsed into one
        data-driven loop.
        """
        cases = [
            ("arc", AramaicEmbeddingsProcess),
            ("got", GothicEmbeddingsProcess),
            ("grc", GreekEmbeddingsProcess),
            ("lat", LatinEmbeddingsProcess),
            ("ang", OldEnglishEmbeddingsProcess),
            ("pli", PaliEmbeddingsProcess),
            ("san", SanskritEmbeddingsProcess),
        ]
        for language, process_cls in cases:
            example_text = get_example_text(language)
            tokens = [Word(string=token) for token in example_text.split(" ")]
            a_process = process_cls(
                input_doc=Doc(raw=example_text, words=tokens)
            )
            a_process.run()
            # Real assertion (the original's bare `isinstance` was a no-op).
            self.assertIsInstance(
                a_process.output_doc.words[1].embedding, numpy.ndarray
            )
示例#5
0
    def test_main_analyze(self):
        """Testing methods from ``cltk/nlp.py``. Note that we
        change ``first_word.embedding`` into an empty list because
        otherwise we would have to add a long vector into our tests.

        Each section runs the full ``NLP.analyze()`` pipeline for one
        language and compares the first token of the example text against
        a hand-built golden ``Word``.
        """
        # Ancient Greek: pipeline yields embeddings, so blank them before comparing.
        lang = "grc"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        self.assertIsInstance(first_word.embedding, numpy.ndarray)
        first_word.embedding = list()
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="ὅτι",
            pos="ADV",
            lemma="ὅτι",
            scansion=None,
            xpos="Df",
            upos="ADV",
            dependency_relation="advmod",
            governor=6,
            features={},
            embedding=[],
            stop=True,
            named_entity=False,
        )
        self.assertEqual(first_word, target)

        # Old Church Slavonic: no embeddings/stops, so those fields stay None.
        lang = "chu"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="отьчє",
            pos="NOUN",
            lemma="отьць",
            scansion=None,
            xpos="Nb",
            upos="NOUN",
            dependency_relation="vocative",
            governor=7,
            features={"Case": "Voc", "Gender": "Masc", "Number": "Sing"},
            embedding=None,
            stop=None,
            named_entity=None,
        )
        self.assertEqual(first_word, target)

        # Old French: note governor=-1 (no head assigned to the first token).
        lang = "fro"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="Une",
            pos="DET",
            lemma=None,
            scansion=None,
            xpos="DETndf",
            upos="DET",
            dependency_relation=None,
            governor=-1,
            features={"Definite": "Ind", "PronType": "Art"},
            embedding=None,
            stop=False,
            named_entity=False,
        )
        self.assertEqual(first_word, target)

        # Gothic: has embeddings (blanked), and sentence segmentation is checked too.
        lang = "got"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        self.assertIsInstance(first_word.embedding, numpy.ndarray)
        first_word.embedding = list()
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="swa",
            pos="ADV",
            lemma="swa",
            scansion=None,
            xpos="Df",
            upos="ADV",
            dependency_relation="advmod",
            governor=1,
            features={},
            embedding=[],
            stop=None,
            named_entity=None,
        )
        self.assertEqual(first_word, target)
        self.assertEqual(len(cltk_doc.sentences), 3)

        # TODO: Re-enable coptic
        # raises ``KeyError: 'pretrain_path'`` from ``_set_up_model``
        # lang = "cop"
        # cltk_nlp = NLP(language=lang)
        # cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        # first_word = cltk_doc.words[0]
        # target = Word(
        #     index_char_start=None,
        #     index_char_stop=None,
        #     index_token=0,
        #     index_sentence=0,
        #     string="ⲧⲏⲛ",
        #     pos="VERB",
        #     lemma="ⲧⲏⲛ",
        #     scansion=None,
        #     xpos="VSTAT",
        #     upos="VERB",
        #     dependency_relation="root",
        #     governor=-1,
        #     features={"VerbForm": "Fin"},
        #     embedding=None,
        #     stop=None,
        #     named_entity=None,
        # )
        # self.assertEqual(first_word, target)

        # Classical Chinese: xpos carries the treebank's CJK tag string verbatim.
        lang = "lzh"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="黃",
            pos="NOUN",
            lemma="黃",
            scansion=None,
            xpos="n,名詞,描写,形質",
            upos="NOUN",
            dependency_relation="nmod",
            governor=1,
            features={},
            embedding=None,
            stop=None,
            named_entity=None,
        )
        self.assertEqual(first_word, target)
示例#6
0
    def test_embeddings_processes(self):
        """Run each language's embeddings process over its example text.

        Fixes a latent defect: the original called bare ``isinstance(...)``,
        whose boolean result was discarded, so the checks asserted nothing.
        ``self.assertIsInstance`` now makes a wrong embedding type fail the
        test. The seven copy-pasted sections are collapsed into one
        data-driven loop; interface and per-language behavior are unchanged.
        """
        cases = [
            ("ang", OldEnglishEmbeddingsProcess),
            ("arc", AramaicEmbeddingsProcess),
            ("got", GothicEmbeddingsProcess),
            ("grc", GreekEmbeddingsProcess),
            ("lat", LatinEmbeddingsProcess),
            ("pli", PaliEmbeddingsProcess),
            ("san", SanskritEmbeddingsProcess),
        ]
        for language, process_cls in cases:
            example_text = get_example_text(language)  # type: str
            word_objs = [
                Word(string=word_obj) for word_obj in example_text.split(" ")
            ]  # type: List[Word]
            a_process = process_cls()
            a_doc = a_process.run(
                input_doc=Doc(raw=example_text, words=word_objs)
            )  # type: Doc
            # Real assertion (the original's bare `isinstance` was a no-op).
            self.assertIsInstance(a_doc.words[1].embedding, numpy.ndarray)
示例#7
0
    def test_main_analyze(self):
        """Testing methods from ``cltk/nlp.py``. Note that we
        change ``first_word.embedding`` into an empty list because
        otherwise we would have to add a long vector into our tests.

        Each section runs the full ``NLP.analyze()`` pipeline for one
        language and compares the first token of the example text against
        a hand-built golden ``Word`` via ``self._word_assertions``.
        """

        # Old Church Slavonic: no embeddings/stops, so those fields stay None.
        lang = "chu"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="отьчє",
            pos="noun",
            lemma="отьць",
            stem=None,
            scansion=None,
            xpos="Nb",
            upos="NOUN",
            dependency_relation="vocative",
            governor=7,
            features={"Case": "Voc", "Gender": "Masc", "Number": "Sing"},
            embedding=None,
            stop=None,
            named_entity=None,
            syllables=None,
            phonetic_transcription=None,
        )
        self._word_assertions(first_word, target)

        # Re-enable later. Raises error upon run, at least on build server
        # Should probably be reported back to Stanza
        # https://travis-ci.org/github/cltk/cltk/jobs/721808293#L636
        # lang = "cop"
        # cltk_nlp = NLP(language=lang)
        # cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        # first_word = cltk_doc.words[0]
        # target = Word(
        #     index_char_start=None,
        #     index_char_stop=None,
        #     index_token=0,
        #     index_sentence=0,
        #     string="ⲧⲏⲛ",
        #     pos="VERB",
        #     lemma="ⲧⲏⲛ",
        #     stem=None,
        #     scansion=None,
        #     xpos="VSTAT",
        #     upos="VERB",
        #     dependency_relation="root",
        #     governor=-1,
        #     features={"VerbForm": "Fin"},
        #     embedding=None,
        #     stop=None,
        #     named_entity=None,
        # )
        # self.assertEqual(first_word, target)

        # Old French: note governor=-1 (no head assigned to the first token).
        lang = "fro"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="Une",
            pos="DET",
            lemma="Une",
            stem=None,
            scansion=None,
            xpos="DETndf",
            upos="DET",
            dependency_relation=None,
            governor=-1,
            features={"Definite": "Ind", "PronType": "Art"},
            embedding=None,
            stop=False,
            named_entity=False,
            syllables=None,
            phonetic_transcription=None,
        )
        self._word_assertions(first_word, target)

        # Gothic: has embeddings (blanked), and sentence count is checked too.
        lang = "got"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        self.assertIsInstance(first_word.embedding, numpy.ndarray)
        first_word.embedding = list()
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="swa",
            pos="ADV",
            lemma="swa",
            stem=None,
            scansion=None,
            xpos="Df",
            upos="ADV",
            dependency_relation="advmod",
            governor=1,
            features={},
            embedding=[],
            stop=None,
            named_entity=None,
            syllables=None,
            phonetic_transcription=None,
        )
        self._word_assertions(first_word, target)
        self.assertEqual(len(cltk_doc.sentences), 3)

        # Ancient Greek: embeddings blanked; stop/named_entity are populated here.
        lang = "grc"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        self.assertIsInstance(first_word.embedding, numpy.ndarray)
        first_word.embedding = list()  # clear out the array, for easier checking
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="ὅτι",
            pos="ADV",
            lemma="ὅτι",
            stem=None,
            scansion=None,
            xpos="Df",
            upos="ADV",
            dependency_relation="advmod",
            governor=6,
            features={},
            embedding=[],
            stop=False,
            named_entity=False,
            syllables=None,
            phonetic_transcription=None,
        )
        self._word_assertions(first_word, target)

        # Classical Chinese: xpos carries the treebank's CJK tag string verbatim.
        lang = "lzh"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="黃",
            pos="NOUN",
            lemma="黃",
            stem=None,
            scansion=None,
            xpos="n,名詞,描写,形質",
            upos="NOUN",
            dependency_relation="nmod",
            governor=1,
            features={},
            embedding=None,
            stop=None,
            named_entity=None,
            syllables=None,
            phonetic_transcription=None,
        )
        self._word_assertions(first_word, target)