def transliterate(self, text, mode="Latin"):
    """Transliterate Anglo-Saxon runes into Latin and vice versa.

    Sources:
    http://www.arild-hauge.com/eanglor.htm
    https://en.wikipedia.org/wiki/Anglo-Saxon_runes

    :param text: str: The text to be transcribed
    :param mode: Specifies transliteration mode, options:
        ``Latin`` (default): Transliterates Anglo-Saxon runes into the
        Latin alphabet, using the Dickins system.
        ``Anglo-Saxon``/``Anglo-Frisian``: Transliterates Latin text into
        Anglo-Saxon runes.
    :raises UnimplementedAlgorithmError: if ``mode`` is not one of the
        supported options above.

    Examples:
    >>> Transliterate().transliterate("Hƿæt Ƿe Gardena in geardagum", "Anglo-Saxon")
    'ᚻᚹᚫᛏ ᚹᛖ ᚷᚪᚱᛞᛖᚾᚪ ᛁᚾ ᚷᛠᚱᛞᚪᚷᚢᛗ'
    >>> Transliterate().transliterate("ᚩᚠᛏ ᛋᚳᚣᛚᛞ ᛋᚳᛖᚠᛁᛝ ᛋᚳᛠᚦᛖᚾᚪ ᚦᚱᛠᛏᚢᛗ", "Latin")
    'oft scyld scefin sceathena threatum'
    """
    if mode == "Latin":
        # Rune -> Latin character map (Dickins system).
        return Transliterate.__transliterate_helper(
            text, L_Transliteration)
    elif mode in ["Anglo-Saxon", "Anglo-Frisian"]:
        # Latin -> rune character map.
        return Transliterate.__transliterate_helper(
            text, R_Transliteration)
    else:
        LOG.error("The specified mode is currently not supported")
        raise UnimplementedAlgorithmError(
            "The specified mode is currently not supported")
def tag_ner(iso_code: str, input_tokens: List[str]) -> List[Union[bool, str]]:
    """Run NER for chosen language. Some languages return boolean True/False,
    others give string of entity type (e.g., ``LOC``).

    :param iso_code: ISO 639-3 code of the language to run NER for.
    :param input_tokens: Pre-tokenized text, one token per list item.
    :return: One item per input token: either a bool (entity yes/no) or a
        string naming the entity kind, depending on the language's model.
    :raises UnknownLanguageError: via ``get_lang`` for an unknown ISO code.
    :raises UnimplementedAlgorithmError: if no NER model exists for the language.

    >>> from cltkv1.ner.ner import tag_ner
    >>> from cltkv1.languages.example_texts import get_example_text
    >>> from boltons.strutils import split_punct_ws
    >>> tokens = split_punct_ws(get_example_text(iso_code="lat"))
    >>> are_words_entities = tag_ner(iso_code="lat", input_tokens=tokens)
    >>> tokens[:5]
    ['Gallia', 'est', 'omnis', 'divisa', 'in']
    >>> are_words_entities[:5]
    [True, False, False, False, False]
    >>> tokens = split_punct_ws(get_example_text(iso_code="fro"))
    >>> are_words_entities = tag_ner(iso_code="fro", input_tokens=tokens)
    >>> are_words_entities[30:50]
    ['LOC', False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, 'CHI']
    """
    get_lang(iso_code=iso_code)  # raises UnknownLanguageError for bad codes
    if iso_code not in NER_DICT:
        msg = f"NER unavailable for language ``{iso_code}``."
        raise UnimplementedAlgorithmError(msg)
    ner_file_path = os.path.expanduser(NER_DICT[iso_code])
    if iso_code == "fro":
        # Old French: the model file is a Python module exposing ``entities``,
        # a sequence of (token, entity_kind) pairs.
        # TODO: ``load_module()`` is deprecated; migrate to
        # ``importlib.util.spec_from_file_location`` + ``exec_module``.
        loader = importlib.machinery.SourceFileLoader("entities", ner_file_path)
        module = loader.load_module()
        entities = module.entities  # type: List[Tuple[str, str]]
        entities_type_list = list()  # type: List[Union[str, bool]]
        for input_token in input_tokens:
            for entity_token, kind in entities:
                if input_token == entity_token:
                    entities_type_list.append(kind)
                    break
            else:
                # ``for``/``else``: only append False when NO entity matched;
                # without the else, a matched token would get a spurious
                # extra False appended after its kind.
                entities_type_list.append(False)
        return entities_type_list
    # Other languages: the model file is a newline-separated list of entity
    # tokens; membership alone marks a token as an entity.
    with open(ner_file_path) as file_open:
        ner_str = file_open.read()
    # A set gives O(1) membership tests instead of O(n) per token.
    ner_set = set(ner_str.split("\n"))
    return [word_token in ner_set for word_token in input_tokens]
def _check_input_params(self):
    """Look at the combination of parameters given to the class and
    determine if any combination is invalid or any model is missing.

    :raises UnknownLanguageError: via ``get_lang`` if ``iso_code`` is unknown
    :raises UnimplementedAlgorithmError: if no fastText embedding exists
        for the language
    :raises CLTKException: if the model type or training set is invalid,
        or the training set is unavailable for the language
    """
    # 1. check if lang valid
    get_lang(self.iso_code)  # check if iso_code valid
    # 2. check if any fasttext embeddings for this lang
    if not self._is_fasttext_lang_available():
        available_embeddings_str = "', '".join(
            self.MAP_LANGS_CLTK_FASTTEXT.keys())
        raise UnimplementedAlgorithmError(
            f"No embedding available for language '{self.iso_code}'. FastTextEmbeddings available for: '{available_embeddings_str}'."
        )
    # 3. check if requested model type is available for fasttext
    valid_model_types = ["bin", "vec"]
    if self.model_type not in valid_model_types:
        # Fix: join the valid types — previously this was set to the bare
        # separator string "', '", so the message never listed the choices.
        valid_model_types_str = "', '".join(valid_model_types)
        raise CLTKException(
            f"Invalid model type '{self.model_type}'. Choose: '{valid_model_types_str}'."
        )
    # 4. check if requested training set is available for language for fasttext
    training_sets = ["common_crawl", "wiki"]
    if self.training_set not in training_sets:
        training_sets_str = "', '".join(training_sets)
        raise CLTKException(
            f"Invalid ``training_set`` '{self.training_set}'. Available: '{training_sets_str}'."
        )
    if self.training_set == "wiki":
        available_vectors = ["ang", "arb", "arc", "got", "lat", "pli", "san"]
    elif self.training_set == "common_crawl":
        available_vectors = ["arb", "lat", "san"]
    else:
        # Fix: the exception was previously constructed but never raised.
        raise CLTKException("Unanticipated exception.")
    if self.iso_code not in available_vectors:
        available_vectors_str = "', '".join(available_vectors)
        raise CLTKException(
            f"Training set '{self.training_set}' not available for language '{self.iso_code}'. Languages available for this training set: '{available_vectors_str}'."
        )
def _check_input_params(self) -> None:
    """Confirm that input parameters are valid and in a valid configuration.

    :raises UnknownLanguageError: via ``get_lang`` for an unknown ISO code
    :raises UnimplementedAlgorithmError: if no Word2Vec model exists for
        the language
    :raises ValueError: if the requested model type is unsupported
    """
    # Raises UnknownLanguageError when the ISO code is not recognized.
    get_lang(self.iso_code)
    # A Word2Vec model must exist for this language.
    if self.iso_code not in self.MAP_LANG_TO_URL:
        supported_langs = "', '".join(self.MAP_LANG_TO_URL.keys())
        raise UnimplementedAlgorithmError(
            f"No embedding available for language '{self.iso_code}'. Word2Vec models available for: '{supported_langs}'."
        )
    # Only binary and plain-text model formats are supported.
    allowed_types = ["bin", "txt"]
    if self.model_type not in allowed_types:
        allowed_types_str = "', '".join(allowed_types)
        raise ValueError(
            f"Invalid ``model_type`` {self.model_type}. Valid model types: {allowed_types_str}."
        )
def get_example_text(iso_code: str) -> str:
    """Return the stored example text for a given ISO 639-3 language code.

    >>> from cltkv1.languages.example_texts import get_example_text
    >>> get_example_text("got")[:25]
    'swa liuhtjai liuhaþ izwar'
    >>> get_example_text("zkz")
    Traceback (most recent call last):
    ...
    cltkv1.core.exceptions.UnimplementedAlgorithmError: Example text unavailable for ISO 639-3 code 'zkz'.
    >>> get_example_text("xxx")
    Traceback (most recent call last):
    ...
    cltkv1.core.exceptions.UnknownLanguageError: Unknown ISO language code 'xxx'.
    """
    # Validate the code first; get_lang raises for unknown languages.
    get_lang(iso_code=iso_code)
    if iso_code not in EXAMPLE_TEXTS:
        # Known language, but no sample text has been added for it yet.
        raise UnimplementedAlgorithmError(
            f"Example text unavailable for ISO 639-3 code '{iso_code}'.")
    return EXAMPLE_TEXTS[iso_code]
def _get_pipeline(self) -> Pipeline:
    """Select the processing pipeline appropriate to this object's language.

    >>> from cltkv1 import NLP
    >>> from cltkv1.core.data_types import Pipeline
    >>> cltk_nlp = NLP(language="lat")
    >>> lat_pipeline = cltk_nlp._get_pipeline()
    >>> isinstance(cltk_nlp.pipeline, Pipeline)
    True
    >>> isinstance(lat_pipeline, Pipeline)
    True
    >>> cltk_nlp = NLP(language="axm")
    Traceback (most recent call last):
    ...
    cltkv1.core.exceptions.UnimplementedAlgorithmError: Valid ISO language code, however this algorithm is not available for ``axm``.
    """
    iso = self.language.iso_639_3_code
    try:
        pipeline_cls = iso_to_pipeline[iso]
    except KeyError:
        # The ISO code is valid but no pipeline has been written for it.
        raise UnimplementedAlgorithmError(
            f"Valid ISO language code, however this algorithm is not available for ``{iso}``."
        )
    return pipeline_cls()
def __init__(self, language: str, treebank: Optional[str] = None, stanza_debug_level="ERROR") -> None:
    """Constructor for ``get_stanza_models`` wrapper class.

    TODO: Do tests for all langs and available models for each

    >>> stanza_wrapper = StanzaWrapper(language='grc', stanza_debug_level="INFO")
    >>> isinstance(stanza_wrapper, StanzaWrapper)
    True
    >>> stanza_wrapper.language
    'grc'
    >>> stanza_wrapper.treebank
    'proiel'
    >>> stanza_wrapper = StanzaWrapper(language="grc", treebank="perseus", stanza_debug_level="INFO")
    >>> isinstance(stanza_wrapper, StanzaWrapper)
    True
    >>> stanza_wrapper.language
    'grc'
    >>> stanza_wrapper.treebank
    'perseus'
    >>> from cltkv1.languages.example_texts import get_example_text
    >>> stanza_doc = stanza_wrapper.parse(get_example_text("grc"))
    >>> StanzaWrapper(language="xxx", stanza_debug_level="INFO")
    Traceback (most recent call last):
    ...
    cltkv1.core.exceptions.UnknownLanguageError: Language 'xxx' either not in scope for CLTK or not supported by Stanza.
    >>> stanza_wrapper = StanzaWrapper(language="grc", treebank="proiel", stanza_debug_level="INFO")
    >>> stanza_doc = stanza_wrapper.parse(get_example_text("grc"))
    >>> stanza_wrapper = StanzaWrapper(language="lat", treebank="perseus", stanza_debug_level="INFO")
    >>> stanza_doc = stanza_wrapper.parse(get_example_text("lat"))
    >>> stanza_wrapper = StanzaWrapper(language="lat", treebank="proiel", stanza_debug_level="INFO")
    >>> stanza_doc = stanza_wrapper.parse(get_example_text("lat"))
    >>> stanza_wrapper = StanzaWrapper(language="chu", stanza_debug_level="INFO")
    >>> stanza_doc = stanza_wrapper.parse(get_example_text("chu"))
    >>> stanza_wrapper = StanzaWrapper(language="cop", stanza_debug_level="INFO")
    >>> stanza_doc = stanza_wrapper.parse(get_example_text("cop"))
    >>> stanza_wrapper = StanzaWrapper(language="lzh", stanza_debug_level="INFO")
    >>> stanza_doc = stanza_wrapper.parse(get_example_text("lzh"))
    >>> stanza_wrapper = StanzaWrapper(language="lat", treebank="xxx", stanza_debug_level="INFO")
    Traceback (most recent call last):
    ...
    cltkv1.core.exceptions.UnimplementedAlgorithmError: Invalid treebank 'xxx' for language 'lat'.
    """
    self.language = language
    self.treebank = treebank
    self.stanza_debug_level = stanza_debug_level

    # Setup language
    # Maps CLTK's ISO 639-3 codes to the language names used by Stanza.
    self.map_langs_cltk_stanza = {
        "chu": "Old_Church_Slavonic",
        "cop": "Coptic",
        "fro": "Old_French",
        "grc": "Ancient_Greek",
        "got": "Gothic",
        "lat": "Latin",
        "lzh": "Classical_Chinese",
    }
    # Fail fast if this language is not supported by the wrapper.
    self.wrapper_available = self.is_wrapper_available()  # type: bool
    if not self.wrapper_available:
        raise UnknownLanguageError(
            "Language '{}' either not in scope for CLTK or not supported by Stanza."
            .format(self.language))
    self.stanza_code = self._get_stanza_code()

    # Setup optional treebank if specified
    # TODO: Write tests for all treebanks
    # NOTE(review): keys here are Stanza codes ('la'), not ISO 639-3 ('lat') —
    # presumably matched against self.stanza_code inside _is_valid_treebank;
    # confirm.
    self.map_code_treebanks = dict(grc=["proiel", "perseus"],
                                   la=["perseus", "proiel", "ittb"])
    # if not specified, will use the default treebank chosen by stanza
    if self.treebank:
        valid_treebank = self._is_valid_treebank()
        if not valid_treebank:
            raise UnimplementedAlgorithmError(
                f"Invalid treebank '{self.treebank}' for language '{self.language}'."
            )
    else:
        self.treebank = self._get_default_treebank()

    # check if model present
    # this fp is just to confirm that some model has already been downloaded.
    # TODO: This is a weak check for the models actually being downloaded and valid
    # TODO: Use ``models_dir`` var from below and make self. or global to module
    self.model_path = os.path.expanduser(
        f"~/stanza_resources/{self.stanza_code}/tokenize/{self.treebank}.pt"
    )
    if not self._is_model_present():
        # download model if necessary
        self._download_model()

    # instantiate actual stanza class
    # Note: `suppress_stdout` is used to prevent `stanza`
    # from printing a long log of its parameters to screen.
    # Though we should capture these, within `_load_pipeline()`,
    # for the log file.
    with suppress_stdout():
        self.nlp = self._load_pipeline()
def _check_input_params(self):
    """Look at combination of parameters give to class and determine if
    any invalid combination or missing models.

    >>> from cltkv1.embeddings.embeddings import FastTextEmbeddings
    >>> fasttext_model = FastTextEmbeddings(iso_code="lat", interactive=False, overwrite=False, silent=True)
    >>> type(fasttext_model)
    <class 'cltkv1.embeddings.embeddings.FastTextEmbeddings'>
    >>> fasttext_model = FastTextEmbeddings(iso_code="ave", interactive=False, overwrite=False, silent=True)  # doctest: +ELLIPSIS
    Traceback (most recent call last):
    ...
    cltkv1.core.exceptions.UnimplementedAlgorithmError: No embedding available for language 'ave'. FastTextEmbeddings available for: ...
    >>> fasttext_model = FastTextEmbeddings(iso_code="xxx", interactive=False, overwrite=False, silent=True)
    Traceback (most recent call last):
    ...
    cltkv1.core.exceptions.UnknownLanguageError: Unknown ISO language code 'xxx'.
    >>> fasttext_model = FastTextEmbeddings(iso_code="got", training_set="wiki", interactive=False, overwrite=False, silent=True)  # doctest: +ELLIPSIS
    >>> type(fasttext_model)
    <class 'cltkv1.embeddings.embeddings.FastTextEmbeddings'>
    >>> fasttext_model = FastTextEmbeddings(iso_code="got", training_set="common_crawl", interactive=False, overwrite=False, silent=True)  # doctest: +ELLIPSIS
    Traceback (most recent call last):
    ...
    cltkv1.core.exceptions.CLTKException: Training set 'common_crawl' not available for language 'got'. Languages available for this training set: ...

    TODO: Add tests for ``.bin`` files, too
    """
    # 1. check if lang valid
    get_lang(self.iso_code)  # check if iso_code valid
    # 2. check if any fasttext embeddings for this lang
    if not self._is_fasttext_lang_available():
        available_embeddings_str = "', '".join(
            self.MAP_LANGS_CLTK_FASTTEXT.keys())
        raise UnimplementedAlgorithmError(
            f"No embedding available for language '{self.iso_code}'. FastTextEmbeddings available for: '{available_embeddings_str}'."
        )
    # 3. check if requested model type is available for fasttext
    valid_model_types = ["bin", "vec"]
    if self.model_type not in valid_model_types:
        # Fix: join the valid types — previously this was set to the bare
        # separator string "', '", so the message never listed the choices.
        valid_model_types_str = "', '".join(valid_model_types)
        raise CLTKException(
            f"Invalid model type '{self.model_type}'. Choose: '{valid_model_types_str}'."
        )
    # 4. check if requested training set is available for language for fasttext
    training_sets = ["common_crawl", "wiki"]
    if self.training_set not in training_sets:
        training_sets_str = "', '".join(training_sets)
        raise CLTKException(
            f"Invalid ``training_set`` '{self.training_set}'. Available: '{training_sets_str}'."
        )
    if self.training_set == "wiki":
        available_vectors = ["ang", "arb", "arc", "got", "lat", "pli", "san"]
    elif self.training_set == "common_crawl":
        available_vectors = ["arb", "lat", "san"]
    else:
        # Fix: the exception was previously constructed but never raised.
        raise CLTKException("Unanticipated exception.")
    if self.iso_code not in available_vectors:
        available_vectors_str = "', '".join(available_vectors)
        raise CLTKException(
            f"Training set '{self.training_set}' not available for language '{self.iso_code}'. Languages available for this training set: '{available_vectors_str}'."
        )