def test_nlp_latin(self):
    """End-to-end check of ``NLP()`` for Latin ('lat').

    First verifies the static ``LatinPipeline`` metadata types, then runs a
    full analysis over the Latin example text and spot-checks the types of
    the resulting ``Doc`` and of its first ``Word``.
    """
    time_0 = time.time()
    print("Starting complete `NLP()` test for 'lat' ...")
    iso_code: str = "lat"
    # Static pipeline/language metadata must carry the expected types.
    self.assertIsInstance(LatinPipeline.description, str)
    self.assertIsInstance(LatinPipeline.language, Language)
    self.assertIsInstance(LatinPipeline.language.family_id, str)
    self.assertIsInstance(LatinPipeline.language.glottolog_id, str)
    self.assertIsInstance(LatinPipeline.language.iso_639_3_code, str)
    self.assertIsInstance(LatinPipeline.language.latitude, float)
    self.assertIsInstance(LatinPipeline.language.level, str)
    self.assertIsInstance(LatinPipeline.language.longitude, float)
    self.assertIsInstance(LatinPipeline.language.parent_id, str)
    self.assertIsInstance(LatinPipeline.language.type, str)
    example_text = get_example_text(iso_code=iso_code)
    self.assertIsInstance(example_text, str)
    nlp_pipeline: NLP = NLP(language=iso_code)
    self.assertIsInstance(nlp_pipeline, NLP)
    analyzed_doc = nlp_pipeline.analyze(text=example_text)
    self.assertIsInstance(analyzed_doc, Doc)
    self.assertIsInstance(analyzed_doc.raw, str)
    self.assertEqual(analyzed_doc.language, iso_code)
    self.assertIsInstance(analyzed_doc.stanza_doc, Document)
    self.assertTrue(len(analyzed_doc.words) > 0)
    # Every token produced by the pipeline must be a ``Word``.
    self.assertTrue(
        all(isinstance(token, Word) for token in analyzed_doc.words)
    )
    first_word = analyzed_doc.words[0]
    self.assertIsInstance(first_word.category, MorphosyntacticFeatureBundle)
    self.assertIsInstance(first_word.dependency_relation, str)
    self.assertIsInstance(first_word.embedding, np.ndarray)
    self.assertIsInstance(first_word.governor, int)
    self.assertIsInstance(first_word.index_token, int)
    self.assertIsInstance(first_word.lemma, str)
    # self.assertIsInstance(first_word.named_entity, str)
    self.assertIsInstance(first_word.pos, POS)
    self.assertIsInstance(first_word.stanza_features, str)
    self.assertIsInstance(first_word.stop, bool)
    self.assertIsInstance(first_word.string, str)
    self.assertIsInstance(first_word.upos, str)
    self.assertIsInstance(first_word.xpos, str)
    print(
        f"Finished complete test of `NLP()` in {time.time() - time_0} secs."
    )
def test_dependency_tree(self):
    """Exercise ``Form`` and ``DependencyTree`` over an analyzed Latin doc."""
    nlp = NLP(language="lat")
    analyzed = nlp.analyze(text=get_example_text("lat"))
    first = analyzed.words[0]
    # Clear the embedding so the rendered form string stays short and stable.
    first.embedding = list()
    form = Form.to_form(word=first)
    expected = "Gallia_0 [lemma=mallis,pos=noun,upos=NOUN,xpos=A1|grn1|casA|gen2,Case=nominative,Degree=positive,Gender=feminine,Number=singular]"
    self.assertEqual(form.full_str(), expected)
    sentence_tree = DependencyTree.to_tree(analyzed.sentences[0])
    self.assertEqual(len(sentence_tree.get_dependencies()), 28)
    # A tree can also be built from a plain word slice.
    word_tree = DependencyTree.to_tree(analyzed.words[:25])
    matches = word_tree.findall(".")
    self.assertIsInstance(matches, list)
    self.assertIsInstance(matches[0], Form)
def test_nlp_latin_stops(self):
    """The Latin pipeline must contain exactly one ``StopsProcess``, and
    running it must attach a boolean ``stop`` flag to every word.
    """
    iso_code = "lat"  # type: str
    nlp = NLP(language=iso_code)  # type: NLP
    self.assertIsInstance(nlp, NLP)
    latin_pipeline = nlp.pipeline  # type: Pipeline
    stops_processes = [
        process
        for process in latin_pipeline.processes
        if process.__name__ == "StopsProcess"
    ]  # type: List[Process]
    self.assertEqual(len(stops_processes), 1)
    stops_process_class = stops_processes[0]  # type: StopsProcess
    # The pipeline stores the class itself, not an instance.
    self.assertIs(stops_process_class, StopsProcess)
    tokens = [
        Word(string=tok)
        for tok in split_punct_ws(get_example_text(iso_code))
    ]
    input_doc = Doc(words=tokens)
    process_instance = stops_process_class(language=iso_code)
    output_doc = process_instance.run(input_doc=input_doc)
    stop_flags = [word.stop for word in output_doc.words]  # type: List[bool]
    # One flag per input token, each a real bool.
    self.assertEqual(len(tokens), len(stop_flags))
    self.assertIsInstance(stop_flags[0], bool)
def test_embeddings_processes(self):
    """Run each language-specific embeddings process over its example text
    and verify that an embedding vector is attached to the words.

    Bug fixed: the original called bare ``isinstance(...)`` and discarded
    the result, so none of the seven checks actually asserted anything.
    They are now real ``assertIsInstance`` assertions, and the seven
    copy-pasted stanzas are collapsed into one loop with ``subTest`` so a
    failure reports which language broke.
    """
    process_classes = {
        "arc": AramaicEmbeddingsProcess,
        "got": GothicEmbeddingsProcess,
        "grc": GreekEmbeddingsProcess,
        "lat": LatinEmbeddingsProcess,
        "ang": OldEnglishEmbeddingsProcess,
        "pli": PaliEmbeddingsProcess,
        "san": SanskritEmbeddingsProcess,
    }
    for language, process_class in process_classes.items():
        with self.subTest(language=language):
            example_text = get_example_text(language)
            # Naive whitespace tokenization, as in the original test.
            tokens = [Word(string=token) for token in example_text.split(" ")]
            a_process = process_class(
                input_doc=Doc(raw=example_text, words=tokens)
            )
            a_process.run()
            self.assertIsInstance(
                a_process.output_doc.words[1].embedding, numpy.ndarray
            )
def test_main_analyze(self):
    """Testing methods from ``cltk/nlp.py``.

    Note that we change ``first_word.embedding`` into an empty list
    because otherwise we would have to add a long vector into our tests.
    """
    # --- Ancient Greek ---
    iso = "grc"
    nlp = NLP(language=iso)
    doc = nlp.analyze(text=get_example_text(iso))
    word0 = doc.words[0]
    self.assertIsInstance(word0.embedding, numpy.ndarray)
    word0.embedding = list()  # drop the long vector before comparing
    expected = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="ὅτι",
        pos="ADV",
        lemma="ὅτι",
        scansion=None,
        xpos="Df",
        upos="ADV",
        dependency_relation="advmod",
        governor=6,
        features={},
        embedding=[],
        stop=True,
        named_entity=False,
    )
    self.assertEqual(word0, expected)

    # --- Old Church Slavonic ---
    iso = "chu"
    nlp = NLP(language=iso)
    doc = nlp.analyze(text=get_example_text(iso))
    word0 = doc.words[0]
    expected = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="отьчє",
        pos="NOUN",
        lemma="отьць",
        scansion=None,
        xpos="Nb",
        upos="NOUN",
        dependency_relation="vocative",
        governor=7,
        features={"Case": "Voc", "Gender": "Masc", "Number": "Sing"},
        embedding=None,
        stop=None,
        named_entity=None,
    )
    self.assertEqual(word0, expected)

    # --- Old French ---
    iso = "fro"
    nlp = NLP(language=iso)
    doc = nlp.analyze(text=get_example_text(iso))
    word0 = doc.words[0]
    expected = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="Une",
        pos="DET",
        lemma=None,
        scansion=None,
        xpos="DETndf",
        upos="DET",
        dependency_relation=None,
        governor=-1,
        features={"Definite": "Ind", "PronType": "Art"},
        embedding=None,
        stop=False,
        named_entity=False,
    )
    self.assertEqual(word0, expected)

    # --- Gothic ---
    iso = "got"
    nlp = NLP(language=iso)
    doc = nlp.analyze(text=get_example_text(iso))
    word0 = doc.words[0]
    self.assertIsInstance(word0.embedding, numpy.ndarray)
    word0.embedding = list()
    expected = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="swa",
        pos="ADV",
        lemma="swa",
        scansion=None,
        xpos="Df",
        upos="ADV",
        dependency_relation="advmod",
        governor=1,
        features={},
        embedding=[],
        stop=None,
        named_entity=None,
    )
    self.assertEqual(word0, expected)
    self.assertEqual(len(doc.sentences), 3)

    # TODO: Re-enable coptic
    # raises ``KeyError: 'pretrain_path'`` from ``_set_up_model``
    # lang = "cop"
    # cltk_nlp = NLP(language=lang)
    # cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
    # first_word = cltk_doc.words[0]
    # target = Word(
    #     index_char_start=None,
    #     index_char_stop=None,
    #     index_token=0,
    #     index_sentence=0,
    #     string="ⲧⲏⲛ",
    #     pos="VERB",
    #     lemma="ⲧⲏⲛ",
    #     scansion=None,
    #     xpos="VSTAT",
    #     upos="VERB",
    #     dependency_relation="root",
    #     governor=-1,
    #     features={"VerbForm": "Fin"},
    #     embedding=None,
    #     stop=None,
    #     named_entity=None,
    # )
    # self.assertEqual(first_word, target)

    # --- Classical Chinese ---
    iso = "lzh"
    nlp = NLP(language=iso)
    doc = nlp.analyze(text=get_example_text(iso))
    word0 = doc.words[0]
    expected = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="黃",
        pos="NOUN",
        lemma="黃",
        scansion=None,
        xpos="n,名詞,描写,形質",
        upos="NOUN",
        dependency_relation="nmod",
        governor=1,
        features={},
        embedding=None,
        stop=None,
        named_entity=None,
    )
    self.assertEqual(word0, expected)
def test_embeddings_processes(self):
    """Run each embeddings process (constructed with no arguments, the doc
    passed to ``run``) and verify embeddings come back as ndarrays.

    Bug fixed: the original called bare ``isinstance(...)`` and discarded
    the result, so the checks asserted nothing; they are now real
    ``assertIsInstance`` assertions, and the seven copy-pasted stanzas are
    one loop with ``subTest``.

    NOTE(review): this method name duplicates an earlier
    ``test_embeddings_processes`` — if both live in the same test class,
    only the later definition runs; consider renaming one.
    """
    process_classes = {
        "ang": OldEnglishEmbeddingsProcess,
        "arc": AramaicEmbeddingsProcess,
        "got": GothicEmbeddingsProcess,
        "grc": GreekEmbeddingsProcess,
        "lat": LatinEmbeddingsProcess,
        "pli": PaliEmbeddingsProcess,
        "san": SanskritEmbeddingsProcess,
    }
    for language, process_class in process_classes.items():
        with self.subTest(language=language):
            example_text = get_example_text(language)  # type: str
            # Naive whitespace tokenization, as in the original test.
            word_objs = [
                Word(string=word_str) for word_str in example_text.split(" ")
            ]  # type: List[Word]
            a_process = process_class()
            a_doc = a_process.run(
                input_doc=Doc(raw=example_text, words=word_objs)
            )  # type: Doc
            self.assertIsInstance(a_doc.words[1].embedding, numpy.ndarray)
def test_main_analyze(self):
    """Testing methods from ``cltk/nlp.py``.

    Note that we change ``first_word.embedding`` into an empty list
    because otherwise we would have to add a long vector into our tests.

    NOTE(review): this method name duplicates an earlier
    ``test_main_analyze`` — if both live in the same test class, only the
    later definition runs; consider renaming one.
    """
    # --- Old Church Slavonic ---
    iso = "chu"
    nlp = NLP(language=iso)
    doc = nlp.analyze(text=get_example_text(iso))
    word0 = doc.words[0]
    expected = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="отьчє",
        pos="noun",
        lemma="отьць",
        stem=None,
        scansion=None,
        xpos="Nb",
        upos="NOUN",
        dependency_relation="vocative",
        governor=7,
        features={"Case": "Voc", "Gender": "Masc", "Number": "Sing"},
        embedding=None,
        stop=None,
        named_entity=None,
        syllables=None,
        phonetic_transcription=None,
    )
    self._word_assertions(word0, expected)

    # Re-enable later. Raises error upon run, at least on build server
    # Should probably be reported back to Stanza
    # https://travis-ci.org/github/cltk/cltk/jobs/721808293#L636
    # lang = "cop"
    # cltk_nlp = NLP(language=lang)
    # cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
    # first_word = cltk_doc.words[0]
    # target = Word(
    #     index_char_start=None,
    #     index_char_stop=None,
    #     index_token=0,
    #     index_sentence=0,
    #     string="ⲧⲏⲛ",
    #     pos="VERB",
    #     lemma="ⲧⲏⲛ",
    #     stem=None,
    #     scansion=None,
    #     xpos="VSTAT",
    #     upos="VERB",
    #     dependency_relation="root",
    #     governor=-1,
    #     features={"VerbForm": "Fin"},
    #     embedding=None,
    #     stop=None,
    #     named_entity=None,
    # )
    # self.assertEqual(first_word, target)

    # --- Old French ---
    iso = "fro"
    nlp = NLP(language=iso)
    doc = nlp.analyze(text=get_example_text(iso))
    word0 = doc.words[0]
    expected = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="Une",
        pos="DET",
        lemma="Une",
        stem=None,
        scansion=None,
        xpos="DETndf",
        upos="DET",
        dependency_relation=None,
        governor=-1,
        features={"Definite": "Ind", "PronType": "Art"},
        embedding=None,
        stop=False,
        named_entity=False,
        syllables=None,
        phonetic_transcription=None,
    )
    self._word_assertions(word0, expected)

    # --- Gothic ---
    iso = "got"
    nlp = NLP(language=iso)
    doc = nlp.analyze(text=get_example_text(iso))
    word0 = doc.words[0]
    self.assertIsInstance(word0.embedding, numpy.ndarray)
    word0.embedding = list()  # drop the long vector before comparing
    expected = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="swa",
        pos="ADV",
        lemma="swa",
        stem=None,
        scansion=None,
        xpos="Df",
        upos="ADV",
        dependency_relation="advmod",
        governor=1,
        features={},
        embedding=[],
        stop=None,
        named_entity=None,
        syllables=None,
        phonetic_transcription=None,
    )
    self._word_assertions(word0, expected)
    self.assertEqual(len(doc.sentences), 3)

    # --- Ancient Greek ---
    iso = "grc"
    nlp = NLP(language=iso)
    doc = nlp.analyze(text=get_example_text(iso))
    word0 = doc.words[0]
    self.assertIsInstance(word0.embedding, numpy.ndarray)
    word0.embedding = list()  # clear out the array, for easier checking
    expected = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="ὅτι",
        pos="ADV",
        lemma="ὅτι",
        stem=None,
        scansion=None,
        xpos="Df",
        upos="ADV",
        dependency_relation="advmod",
        governor=6,
        features={},
        embedding=[],
        stop=False,
        named_entity=False,
        syllables=None,
        phonetic_transcription=None,
    )
    self._word_assertions(word0, expected)

    # --- Classical Chinese ---
    iso = "lzh"
    nlp = NLP(language=iso)
    doc = nlp.analyze(text=get_example_text(iso))
    word0 = doc.words[0]
    expected = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="黃",
        pos="NOUN",
        lemma="黃",
        stem=None,
        scansion=None,
        xpos="n,名詞,描写,形質",
        upos="NOUN",
        dependency_relation="nmod",
        governor=1,
        features={},
        embedding=None,
        stop=None,
        named_entity=None,
        syllables=None,
        phonetic_transcription=None,
    )
    self._word_assertions(word0, expected)