Example #1
def download_fasttext_model(iso_code: str,
                            model_source: str = "wiki",
                            interactive: bool = False) -> None:
    """Download fasttext model.

    TODO: Add way to specify a Common Crawl model.
    """
    print(f"Going to download fasttext model for '{iso_code}'.")
    avail_sources = ["wiki", "common_crawl"]
    assert (
        model_source in avail_sources
    ), f"Invalid `model_source`. Choose from: {', '.join(avail_sources)}."
    all_wiki_models = ["ang", "arb", "arc", "got", "lat", "pli", "san"]
    if model_source == "wiki" and iso_code not in all_wiki_models:
        raise CLTKException(
            f"Language '{iso_code}' not available for `model_source` '{model_source}'. Choose from: {', '.join(all_wiki_models)}."
        )
    all_common_crawl_models = ["arb", "lat", "san"]
    if model_source == "common_crawl" and iso_code not in all_common_crawl_models:
        raise CLTKException(
            f"Language '{iso_code}' not available for `model_source` '{model_source}'. Choose from: {', '.join(all_common_crawl_models)}."
        )
    FastTextEmbeddings(iso_code=iso_code,
                       interactive=interactive,
                       overwrite=False,
                       silent=False)
    print(f"Finished downloading fasttext for '{iso_code}'.")
Example #2
    def _check_input_params(self):
        """Look at combination of parameters give to class
        and determine if any invalid combination or missing
        models.
        """

        # 1. check if lang valid
        get_lang(self.iso_code)  # check if iso_code valid

        # 2. check if any fasttext embeddings for this lang
        if not self._is_fasttext_lang_available():
            available_embeddings_str = "', '".join(
                self.MAP_LANGS_CLTK_FASTTEXT.keys())
            raise UnimplementedAlgorithmError(
                f"No embedding available for language '{self.iso_code}'. FastTextEmbeddings available for: '{available_embeddings_str}'."
            )

        # 3. check if requested model type is available for fasttext
        valid_model_types = ["bin", "vec"]
        if self.model_type not in valid_model_types:
            valid_model_types_str = "', '"
            raise CLTKException(
                f"Invalid model type '{self.model_type}'. Choose: '{valid_model_types_str}'."
            )

        # 4. check if requested training set is available for language for fasttext
        training_sets = ["common_crawl", "wiki"]
        if self.training_set not in training_sets:
            training_sets_str = "', '".join(training_sets)
            raise CLTKException(
                f"Invalid ``training_set`` '{self.training_set}'. Available: '{training_sets_str}'."
            )
        available_vectors = list()
        if self.training_set == "wiki":
            available_vectors = [
                "ang", "arb", "arc", "got", "lat", "pli", "san"
            ]
        elif self.training_set == "common_crawl":
            available_vectors = ["arb", "lat", "san"]
        else:
            raise CLTKException("Unanticipated exception.")
        if self.iso_code not in available_vectors:
            available_vectors_str = "', '".join(available_vectors)
            raise CLTKException(
                f"Training set '{self.training_set}' not available for language '{self.iso_code}'. Languages available for this training set: '{available_vectors_str}'."
            )
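For orientation, an illustrative summary of the combinations these checks accept, derived from the lists hard-coded in the method (the names below are not part of the original class):

# Summary of valid parameter values per the checks above (illustrative only)
WIKI_LANGS = ["ang", "arb", "arc", "got", "lat", "pli", "san"]  # training_set="wiki"
COMMON_CRAWL_LANGS = ["arb", "lat", "san"]                      # training_set="common_crawl"
VALID_MODEL_TYPES = ["bin", "vec"]
# e.g. iso_code="ang" with training_set="common_crawl" raises CLTKException,
# while iso_code="lat" passes with either training set.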
Example #3
    def lookup(self, lemma: str) -> str:
        """Perform match of a lemma against headwords. This is case sensitive.
        If more than one match, then return the concatenated entries. For example:

        >>> onzl = OldNorseZoegaLexicon()
        >>> onzl.lookup("sonr")
        '(gen. sonar, dat. syni and søni; pl. synir, sønir; ace. sonu and syni), m. son.'
        """
        if not self.entries:
            raise CLTKException(
                "No dictionary entries found in the .yaml file. This should never happen."
            )

        if regex.match(r"^[0-9\.\?,\:;\!\<\>\-]*$", lemma) is not None:
            return ""

        lemma = regex.escape(lemma)  # escape any regex metacharacters in the lemma

        keys = self.entries.keys()
        matches = [
            key for key in keys if regex.match(rf"^{lemma}[0-9]?$", key)
        ]
        n_matches = len(matches)
        if n_matches > 1:
            return "\n".join([self.entries[key] for key in matches])
        elif n_matches == 1:
            return self.entries[matches[0]]
        else:
            return ""
Example #4
def download_prompt(
    iso_code: str,
    message: str,
    model_url: str,
    interactive: bool = True,
    silent: bool = False,
) -> None:
    """Ask user whether to download files.

    TODO: Make ft and stanza use this fn. Consider moving to other module.
    """
    fetch_corpus = FetchCorpus(language=iso_code)
    if not interactive:
        if not silent:
            print(message)
        fetch_corpus.import_corpus(corpus_name=f"{iso_code}_models_cltk")
        # get_file_with_progress_bar(model_url=model_url, file_path=self.fp_zip)
    else:
        print(message)
        dl_is_allowed = query_yes_no(
            f"Do you want to download '{model_url}' to '~/cltk_data/{iso_code}'?"
        )  # type: bool
        if dl_is_allowed:
            fetch_corpus.import_corpus(corpus_name=f"{iso_code}_models_cltk")
            # get_file_with_progress_bar(model_url=model_url, file_path=self.fp_zip)
        else:
            raise CLTKException(
                f"Download of necessary model declined for '{iso_code}'. Following functions will likely fail."
            )
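A non-interactive usage sketch, mirroring how Example #5 below calls this helper (assuming ``download_prompt`` and the CLTK fetch machinery are importable; the message and URL follow that example):

dl_msg = "This part of the CLTK depends upon models from the CLTK project."
download_prompt(
    iso_code="lat",
    message=dl_msg,
    model_url="https://github.com/cltk/lat_models_cltk",
    interactive=False,
)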
Example #5
def spacy_tag_ner(iso_code: str, text_tokens: List[str],
                  model_path: str) -> List[Union[str, bool]]:
    """Take a list of tokens and return label or None.

    >>> text_tokens = ["Gallia", "est", "omnis", "divisa", "in", "partes", "tres", ",", "quarum", "unam", "incolunt", "Belgae", ",", "aliam", "Aquitani", ",", "tertiam", "qui", "ipsorum", "lingua", "Celtae", ",", "nostra", "Galli", "appellantur", "."]
    >>> from cltk.utils import CLTK_DATA_DIR
    >>> spacy_tag_ner('lat', text_tokens=text_tokens, model_path=os.path.join(CLTK_DATA_DIR, "lat/model/lat_models_cltk/ner/spacy_model/"))
    ['LOCATION', False, False, False, False, False, False, False, False, False, False, 'LOCATION', False, False, 'LOCATION', False, False, False, False, False, 'LOCATION', False, False, 'LOCATION', False, False]
    """
    # make sure that we have a List[str]
    if not isinstance(text_tokens[0], str):
        raise CLTKException("`spacy_tag_ner()` requires `List[str]`.")
    if not os.path.isdir(model_path):
        msg = f"spaCy model path '{model_path}' not found. Going to try to download it ..."
        logging.warning(msg)
        dl_msg = f"This part of the CLTK depends upon models from the CLTK project."
        model_url = f"https://github.com/cltk/{iso_code}_models_cltk"
        download_prompt(iso_code=iso_code, message=dl_msg, model_url=model_url)
    spacy_nlp = spacy.load(model_path)
    # Create the tokenizer for the spacy model
    spacy_nlp.tokenizer = CustomTokenizer(vocab=spacy_nlp.vocab)
    # Create the spacy Doc Object that contains the metadata for entities
    spacy_doc = spacy_nlp(text_tokens)  # type: Doc
    # generate the final output
    token_labels = list()  # type: List[Union[str, bool]]
    for word in spacy_doc:
        if word.ent_type_:
            # word.ent_type_  # type: str
            token_labels.append(word.ent_type_)
        else:
            token_labels.append(False)
    return token_labels
Example #6
 def _download_model(self) -> None:
     """Interface with the `stanza` model downloader."""
     if not self.interactive:
         if not self.silent:
             print(
                 f"CLTK message: Going to download required Stanza models to ``{self.model_path}`` ..."
             )  # pragma: no cover
         stanza.download(lang=self.stanza_code, package=self.treebank)
     else:
         print(  # pragma: no cover
             "CLTK message: This part of the CLTK depends upon the Stanza NLP library."
         )  # pragma: no cover
         dl_is_allowed = query_yes_no(
             f"CLTK message: Allow download of Stanza models to ``{self.model_path}``?"
         )  # type: bool
         if dl_is_allowed:
             stanza.download(lang=self.stanza_code, package=self.treebank)
         else:
             raise CLTKException(
                 f"Download of necessary Stanza model declined for '{self.language}'. Unable to continue with Stanza's processing."
             )
     # if file model still not available after attempted DL, then raise error
     if not file_exists(self.model_path):
         raise FileNotFoundError(
             "Missing required models for ``stanza`` at ``{0}``.".format(
                 self.model_path
             )
         )
Example #7
 def algorithm(self):
     if self.language == "lat":
         lex_class = LatinLewisLexicon()
     else:
         raise CLTKException(
             f"No lookup algorithm for language '{self.language}'.")
     return lex_class
Example #8
def from_ud(feature_name: str,
            feature_value: str) -> Optional[MorphosyntacticFeature]:
    """For a given Universal Dependencies feature name and value,
    return the appropriate feature class/value.
    >>> from_ud('Case', 'Abl')
    ablative
    >>> from_ud('Abbr', 'Yes')
    pos
    >>> from_ud('PronType', 'Ind')
    indefinite
    """
    if feature_name in from_ud_map:
        feature_map = from_ud_map[feature_name]
    else:
        msg = f"{feature_name}: Unrecognized UD feature name"
        print("From `from_ud():`", msg)
        # raise CLTKException(msg)
        return None

    values = feature_value.split(",")
    for value in values:
        if value in feature_map:
            return feature_map[value]
        else:
            raise CLTKException(
                f"{value}: Unrecognized value for UD feature {feature_name}")
Example #9
    def __getitem__(
        self, feature_name: Union[str, Type[MorphosyntacticFeature]]
    ) -> List[MorphosyntacticFeature]:
        """
        Use dict-type syntax for accessing the values of features.
        >>> f1 = f(F.pos, N.pos)
        >>> f1[F]
        [pos]
        >>> f1[V]
        Traceback (most recent call last):
        cltk.core.exceptions.CLTKException: {F: [pos], N: [pos]} unspecified for V
        >>> f1['F']
        [pos]
        """
        if isinstance(feature_name, str):
            if feature_name not in globals():
                raise TypeError(feature_name +
                                " is not a morphosyntactic feature")
            feature_name = globals()[feature_name]

        if not issubclass(feature_name, MorphosyntacticFeature):
            raise TypeError(
                str(feature_name) + " is not a morphosyntactic feature")

        if feature_name in self.features:
            return self.features[feature_name]
        else:
            raise CLTKException(f"{self} unspecified for {feature_name}")
Example #10
    def download_fasttext_models(self):
        """Perform complete download of fastText models and save
        them in appropriate ``cltk_data`` dir.

        TODO: Add tests
        TODO: Implement ``overwrite``
        TODO: error out better or continue to _load_model?
        """
        model_url = self._build_fasttext_url()
        if not self.interactive:
            if not self.silent:
                print(
                    f"CLTK message: Going to download file '{model_url}' to '{self.model_fp} ..."
                )  # pragma: no cover
            get_file_with_progress_bar(model_url=model_url,
                                       file_path=self.model_fp)
        else:
            print(  # pragma: no cover
                "CLTK message: This part of the CLTK depends upon word embedding models from the Fasttext project."
            )  # pragma: no cover
            dl_is_allowed = query_yes_no(
                f"Do you want to download file '{model_url}' to '{self.model_fp}'?"
            )  # type: bool
            if dl_is_allowed:
                get_file_with_progress_bar(model_url=model_url,
                                           file_path=self.model_fp)
            else:
                raise CLTKException(
                    f"Download of necessary fastText model declined for '{self.iso_code}'. Unable to continue with fastText embeddings."
                )
Example #11
 def _download_nlpl_models(self) -> None:
     """Perform complete download of Word2Vec models and save
     them in appropriate ``cltk_data`` dir.
     """
     model_url = self.MAP_LANG_TO_URL[self.iso_code]
     if not self.interactive:
         if not self.silent:
             print(
                 f"CLTK message: Going to download file '{model_url}' to '{self.fp_zip} ..."
             )  # pragma: no cover
         get_file_with_progress_bar(model_url=model_url,
                                    file_path=self.fp_zip)
     else:
         print(  # pragma: no cover
             "CLTK message: This part of the CLTK depends upon word embedding models from the NLPL project."
         )  # pragma: no cover
         dl_is_allowed = query_yes_no(
             f"Do you want to download file '{model_url}' to '{self.fp_zip}'?"
         )  # type: bool
         if dl_is_allowed:
             get_file_with_progress_bar(model_url=model_url,
                                        file_path=self.fp_zip)
         else:
              raise CLTKException(
                  f"Download of necessary Word2Vec model declined for '{self.language}'. Unable to continue with Word2Vec embeddings."
              )
Example #12
    def divide_works(self, corpus):
        """Use the work-breaking option.
        TODO: Maybe incorporate this into ``convert_corpus()``
        TODO: Write test for this

        """
        if corpus == "tlg":
            orig_dir = make_cltk_path("originals/tlg")
            works_dir = make_cltk_path("grc/text/tlg/individual_works")
            file_prefix = "TLG"
            lat = False
        elif corpus == "phi5":
            orig_dir = make_cltk_path("originals/phi5")
            works_dir = make_cltk_path("lat/text/phi5/individual_works")
            file_prefix = "LAT"
            lat = True  # this is for the optional TLGU argument to convert()
        elif corpus == "phi7":
            raise CLTKException(
                "``phi7`` cannot be divided into individual works.")
        else:
            raise CLTKException(
                f"Invalid corpus '{corpus}'. This should never happen.")

        if not os.path.exists(works_dir):
            os.makedirs(works_dir)

        files = os.listdir(orig_dir)
        texts = [
            x for x in files
            if x.endswith(".TXT") and x.startswith(file_prefix)
        ]

        for file in texts:
            orig_file_path = os.path.join(orig_dir, file)
            new_file_path = os.path.join(works_dir, file)

            try:
                self.convert(orig_file_path,
                             new_file_path,
                             divide_works=True,
                             lat=lat)
                logger.info("Writing files at %s to %s.", orig_file_path,
                            works_dir)
            except Exception as err:
                logger.error("Failed to convert files: %s.", err)
Example #13
 def algorithm(self):
     valid_variants = ["fasttext", "nlpl"]
     if self.variant == "fasttext":
         return FastTextEmbeddings(iso_code=self.language)
     elif self.variant == "nlpl":
         return Word2VecEmbeddings(iso_code=self.language)
     else:
         valid_variants_str = "', '".join(valid_variants)
         raise CLTKException(
             f"Invalid embeddings ``variant`` ``{self.variant}``. Available: '{valid_variants_str}'."
         )
Example #14
def download_stanza_model(iso_code: str) -> None:
    """Download language models, from the ``stanza`` project,
    that are supported by the CLTK or in scope. More here:
    `<https://stanfordnlp.github.io/stanza/models.html>`_.

    TODO: Re-enable `treebank` parameter
    """
    print(f"Going to download Stanza model for '{iso_code}'.")
    if iso_code not in AVAIL_STANZA_LANGS:
        raise CLTKException(f"Language '{iso_code}' not available for Stanza.")
    StanzaWrapper(language=iso_code, interactive=False, silent=False)
    print(f"Finished downloading Stanza for '{iso_code}'.")
Example #15
 def run(self, input_doc: Doc) -> Doc:
     lookup_algo = self.algorithm
     output_doc = deepcopy(input_doc)
     for word in output_doc.words:
         if self.language == "lat":
             word.definition = lookup_algo.lookup(word.lemma)
         elif self.language == "non":
             word.definition = lookup_algo.lookup(word.string)
         else:
             raise CLTKException(
                 f"``LexiconProcess()`` not available for language '{self.language}' This should never happen."
             )
     return output_doc
Example #16
    def _build_fasttext_filepath(self):
        """Create filepath at which to save a downloaded
        fasttext model.

        .. todo::
           Do better than test for just name. Try trimming up to user home dir.

        >>> from cltk.embeddings.embeddings import FastTextEmbeddings  # doctest: +SKIP
        >>> embeddings_obj = FastTextEmbeddings(iso_code="lat", silent=True)  # doctest: +SKIP
        >>> vec_fp = embeddings_obj._build_fasttext_filepath()  # doctest: +SKIP
        >>> os.path.split(vec_fp)[1]  # doctest: +SKIP
        'wiki.la.vec'
        >>> embeddings_obj = FastTextEmbeddings(iso_code="lat", model_type="bin", silent=True)  # doctest: +SKIP
        >>> bin_fp = embeddings_obj._build_fasttext_filepath()  # doctest: +SKIP
        >>> os.path.split(bin_fp)[1]  # doctest: +SKIP
        'wiki.la.bin'
        >>> embeddings_obj = FastTextEmbeddings(iso_code="lat", training_set="common_crawl", model_type="vec", silent=True)  # doctest: +SKIP
        >>> vec_fp = embeddings_obj._build_fasttext_filepath()  # doctest: +SKIP
        >>> os.path.split(vec_fp)[1]  # doctest: +SKIP
        'cc.la.300.vec'
        >>> embeddings_obj = FastTextEmbeddings(iso_code="lat", training_set="common_crawl", model_type="bin", silent=True)  # doctest: +SKIP
        >>> bin_fp = embeddings_obj._build_fasttext_filepath()  # doctest: +SKIP
        >>> os.path.split(bin_fp)[1]  # doctest: +SKIP
        'cc.la.300.bin'
        """
        fasttext_code = MAP_LANGS_CLTK_FASTTEXT[self.iso_code]

        fp_model = None
        if self.training_set == "wiki":
            fp_model = os.path.join(
                CLTK_DATA_DIR,
                self.iso_code,
                "embeddings",
                "fasttext",
                f"wiki.{fasttext_code}.{self.model_type}",
            )
        elif self.training_set == "common_crawl":
            fp_model = os.path.join(
                CLTK_DATA_DIR,
                self.iso_code,
                "embeddings",
                "fasttext",
                f"cc.{fasttext_code}.300.{self.model_type}",
            )
        else:
            raise CLTKException(
                f"Unexpected ``training_set`` ``{self.training_set}``.")
        return fp_model
Example #17
 def _build_fasttext_url(self):
     """Make the URL at which the requested model may be
     downloaded."""
     fasttext_code = self.MAP_LANGS_CLTK_FASTTEXT[self.iso_code]
     if self.training_set == "wiki":
         if self.model_type == "vec":
             ending = "vec"
         else:
             # for .bin
             ending = "zip"
         url = f"https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.{fasttext_code}.{ending}"
     elif self.training_set == "common_crawl":
         url = f"https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.{fasttext_code}.300.{self.model_type}.gz"
     else:
         raise CLTKException("Unexpected exception.")
     return url
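For Latin (fastText language code ``la``, as seen in the doctest of Example #16), the URL patterns built above look like the following; this is an illustrative trace only, nothing is fetched here:

# wiki + vec:          https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.la.vec
# wiki + bin:          https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.la.zip
# common_crawl + bin:  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.la.300.bin.gz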
Example #18
    def syllabify(self, word: str, mode: str = "SSP") -> Union[List[str], str]:
        """Syllabify a word.

        :param word: word to syllabify
        :param mode: syllabification algorithm, SSP (Sonority Sequence Principle)
         or MOP (Maximum Onset Principle)
        :return: the syllabified word, as a list of syllables or as a single
         string joined on ``self.sep`` when a separator is set
        """
        if mode == "SSP":
            res = self.syllabify_ssp(word)
        elif mode == "MOP":
            res = self.syllabify_mop(word)
        else:
            raise CLTKException("Wrong given mode")

        if self.sep:
            return self.sep.join(res)
        return res
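A hypothetical usage sketch; ``syllabifier`` stands in for an instance of the (unnamed) class this method belongs to:

syllables = syllabifier.syllabify("example", mode="SSP")  # a List[str] when ``sep`` is unset
# With ``sep`` set (e.g. "-"), the same call returns one joined string instead.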
Example #19
    def lookup(self, lemma: str) -> str:
        """Perform match of a lemma against headwords. If more than one match,
        then return the concatenated entries. For example:

        >>> from cltk.lexicon.lat import LatinLewisLexicon
        >>> lll = LatinLewisLexicon(interactive=False)
        >>> lll.lookup("clemens")[:50]
        'clēmēns entis (abl. -tī; rarely -te, L.), adj. wit'
        >>> all(word in lll.lookup("levis") for word in ["levis","lēvis"]) # Test for concatenated entries
        True
        >>> lll.lookup("omnia")
        ''
        >>> lll.lookup(".")
        ''
        >>> lll.lookup("123")
        ''
        >>> lll.lookup("175.")
        ''
        >>> lll.lookup("(") # Test for regex special character
        ''
        """
        if not self.entries:
            raise CLTKException(
                "No lexicon entries found in the .yaml file. This should never happen."
            )

        if regex.match(r"^[0-9\.\?,\:;\!\<\>\-]*$", lemma) is not None:
            return ""

        lemma = regex.escape(lemma.lower())

        keys = self.entries.keys()
        matches = [
            key for key in keys if regex.match(rf"^{lemma}[0-9]?$", key)
        ]
        n_matches = len(matches)
        if n_matches > 1:
            return "\n".join([self.entries[key] for key in matches])
        elif n_matches == 1:
            return self.entries[matches[0]]
        else:
            return ""
Example #20
 def _check_and_download_tlgu_source(self):
     """Check if tlgu downloaded, if not download it."""
     path = make_cltk_path("grc/software/grc_software_tlgu/tlgu.h")
     if not os.path.isfile(path):
         dl_msg = f"This part of the CLTK depends upon TLGU, software written by Dimitri Marinakis `<http://tlgu.carmen.gr/>`_."
         print(dl_msg)
         repo_url = "https://github.com/cltk/grc_software_tlgu.git"
         dl_dir = os.path.split(path)[0]
         dl_question = (
             f"Do you want to download TLGU from '{repo_url}' to '{dl_dir}'?"
         )
         if self.interactive:
             do_download = query_yes_no(question=dl_question)
         else:
             do_download = True
         if do_download:
             fetch_corpus = FetchCorpus(language="grc")
             fetch_corpus.import_corpus(corpus_name="grc_software_tlgu")
         else:
              raise CLTKException(
                  "TLGU software required for this class to work.")
Example #21
def from_ud(feature_name: str,
            feature_value: str) -> Optional[MorphosyntacticFeature]:
    """For a given Universal Dependencies feature name and value,
    return the appropriate feature class/value.
    >>> from_ud('Case', 'Abl')
    ablative
    >>> from_ud('Abbr', 'Yes')
    pos
    >>> from_ud('PronType', 'Ind')
    indefinite
    """
    # Do cleanup on certain inputs that look like ``Number[psor]``.
    # Such a name is rewritten so that ``feature_name`` becomes ``Number``
    # and ``feature_value`` becomes ``Psor``.
    if "[" in feature_name and "]" in feature_name:
        feature_name_split: List[str] = feature_name.split("[", maxsplit=1)
        feature_name = feature_name_split[0]
        feature_value = feature_name_split[1][:-1]
        feature_value = feature_value.title()

    if feature_name in from_ud_map:
        feature_map = from_ud_map[feature_name]
    else:
        msg1: str = f"Unrecognized UD `feature_name` ('{feature_name}') with `feature_value` ('{feature_value}')."
        msg2: str = f"Please raise an issue at <https://github.com/cltk/cltk/issues> and include a small sample to reproduce the error."
        print(msg1)
        print(msg2)
        # raise CLTKException(msg)
        return None

    values = feature_value.split(",")
    for value in values:
        if value in feature_map:
            return feature_map[value]
        else:
            raise CLTKException(
                f"{value}: Unrecognized value for UD feature {feature_name}")
Example #22
    def lookup(self, lemma: str) -> str:
        """Perform match of a lemma against headwords. If more than one match,
        then return the concatenated entries. For example:

        >>> lll = LatinLewisLexicon()
        >>> lll.lookup("clemens")[:50]
        'clēmēns entis (abl. -tī; rarely -te, L.), adj. wit'
        >>> lll.lookup("omnia")
        ''
        >>> lll.lookup(".")
        ''
        >>> lll.lookup("123")
        ''
        >>> lll.lookup("175.")
        ''
        """
        if not self.entries:
            raise CLTKException(
                "No lexicon entries found in the .yaml file. This should never happen."
            )

        if regex.match(r"^[0-9\.\?,\:;\!\<\>\-]*$", lemma) is not None:
            return ""

        lemma = lemma.lower()

        keys = self.entries.keys()
        matches = [
            key for key in keys if regex.match(rf"^{lemma}[0-9]?$", key)
        ]
        n_matches = len(matches)
        if n_matches > 1:
            return "\n".join([self.entries[key] for key in matches])
        elif n_matches == 1:
            return self.entries[matches[0]]
        else:
            return ""
Example #23
 def __init__(self, interactive: bool = True):
     self.interactive = interactive
     self.lewis_yaml_fp = make_cltk_path(
         "lat", "lexicon", "cltk_lat_lewis_elementary_lexicon",
         "lewis.yaml")
     try:
         self.entries = self._load_entries()
     except FileNotFoundError:
         if self.interactive:
             dl_msg = f"This part of the CLTK depends upon Lewis's *An Elementary Latin Dictionary* (1890)."
             print(dl_msg)
             dl_question = "Do you want to download this?"
             do_download = query_yes_no(question=dl_question)
         else:
             do_download = True
         if do_download:
             fetch_corpus = FetchCorpus(language="lat")
             fetch_corpus.import_corpus(
                 corpus_name="cltk_lat_lewis_elementary_lexicon")
         else:
             raise CLTKException(
                 f"File '{self.lewis_yaml_fp}' is not found. It is required for this class."
             )
         self.entries = self._load_entries()
Example #24
 def __init__(self, interactive: bool = True):
     self.interactive = interactive
     self.zoega_yaml_fp = make_cltk_path("non", "dictionary",
                                         "cltk_non_zoega_dictionary",
                                         "dictionary.yaml")
     try:
         self.entries = self._load_entries()
     except FileNotFoundError:
         if self.interactive:
             dl_msg = f"This part of the CLTK depends upon Zoëga's *A Concise Old Norse Dictionary* (1890)."
             print(dl_msg)
             dl_question = "Do you want to download this?"
             do_download = query_yes_no(question=dl_question)
         else:
             do_download = True
         if do_download:
             fetch_corpus = FetchCorpus(language="non")
             fetch_corpus.import_corpus(
                 corpus_name="cltk_non_zoega_dictionary")
         else:
             raise CLTKException(
                 f"File '{self.zoega_yaml_fp}' is not found. It is required for this class."
             )
         self.entries = self._load_entries()
Example #25
def cltk_doc_to_features_table(
    cltk_doc: Doc,
) -> Tuple[List[str], List[List[Union[str, int, float, None]]]]:
    """Take a CLTK ``Doc`` and return a list of lists ready for
    machine learning.

    This expects the default features available for Greek and Latin
    (word embeddings, morphology, syntax, lemmata). It should be
    improved to fail gracefully when fewer features are available in
    the input ``Doc``.

    TODO: Fail gracefully when missing info in ``Doc``.
    """

    if len(cltk_doc.sentences) < 1:
        raise CLTKException("Must contain at least one ``Doc.sentence``.")

    list_of_list_features = (
        list()
    )  # type: List[List[Union[str, int, float, None, np.ndarray]]]

    for sentence in cltk_doc.sentences:
        for word in sentence:
            word_features_list = (
                list()
            )  # type: List[Union[str, int, float, None, np.ndarray]]
            # note: this gets made and remade; only needs to be done once, at beginning or at end; need to add check that len == the actual instance row
            variable_names = list()  # type: List[str]
            # Get word token chars
            word_features_list.append(word.string)
            variable_names.append("string")
            # Get lemma
            word_features_list.append(word.lemma)
            variable_names.append("lemma")
            # Get embedding
            word_features_list.append(word.embedding)
            variable_names.append("embedding")
            # Get stopword binary
            word_features_list.append(word.stop)
            variable_names.append("is_stop")
            # Get NER binary
            word_features_list.append(word.named_entity)
            variable_names.append("named_entity")

            # Get morphological info
            pos_label = get_pos(word=word)
            word_features_list.append(
                pos_label
            )  # note: incorrectly labels upper-cased words as proper_noun, eg 'Βίβλος'
            variable_names.append("pos")
            feature_names, features_present = get_features(word=word)
            word_features_list += (
                features_present  # add the features list to the big list
            )
            variable_names += feature_names

            # Get dependency info
            governing_word = get_governor_word(word=word, sentence=sentence)
            pos_label_governor = get_pos(word=governing_word)
            word_features_list.append(pos_label_governor)
            variable_names.append("governing_word")
            feature_names_governor, features_present_governor = get_features(
                word=governing_word, prepend_to_label="governor_"
            )
            word_features_list += (
                features_present_governor  # add the features list to the big list
            )
            variable_names += feature_names_governor
            # governor_edge = get_governor_relationship(word=word, sentence=sentence)
            # word_features_list.append(governor_edge)
            relation_type = word.dependency_relation
            word_features_list.append(relation_type)
            variable_names.append("governing_relationship")

            list_of_list_features.append(word_features_list)

    assert len(variable_names) == len(
        list_of_list_features[0]
    ), f"The number of variable names ({len(variable_names)}) does not match the actual number of variables ({len(list_of_list_features[0])}). These must be equal."

    return variable_names, list_of_list_features
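Because the function returns column names plus one row per word, a natural next step is to build a table from them. A sketch, assuming pandas is installed and ``cltk_doc`` is a parsed ``Doc`` produced by a CLTK pipeline for Greek or Latin:

import pandas as pd

# ``cltk_doc`` is assumed to come from an NLP pipeline run; it is not built here.
variable_names, rows = cltk_doc_to_features_table(cltk_doc=cltk_doc)
dataframe = pd.DataFrame(data=rows, columns=variable_names)  # one row per word token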
Example #26
    def decline(self,
                lemma: str,
                flatten: bool = False,
                collatinus_dict: bool = False) -> List[Tuple[str, str]]:
        """ Decline a lemma

        .. warning:: POS tags are incomplete, as we do not detect the type outside of verbs, participles and adjectives.

        :raise CLTKException: When the lemma is unknown to our data

        :param lemma: Lemma (Canonical form) to decline
        :type lemma: str
        :param flatten: If set to True, returns a list of forms without natural language information about them
        :type flatten: bool
        :param collatinus_dict: If set to True, returns a dictionary of grammatically valid forms, including variants, with keys\
         corresponding to morphological information.
        :type collatinus_dict: bool
        :return: List of tuples where the first value is the form and the second the POS, e.g. [("sum", "v1ppip---")]
        :rtype: list or dict

        """

        if lemma in self._lemmas:
            # Get data information
            lemma_entry = self._lemmas[lemma]
        elif lemma in self._mapped and self._mapped[lemma] in self._lemmas:
            # Get data information
            lemma = self._mapped[lemma]
            lemma_entry = self._lemmas[lemma]
        else:
            raise CLTKException("%s is unknown" % lemma)
        model = self._models[lemma_entry["model"]]

        # Get the roots
        roots = self._getRoots(lemma, model=model)
        # Get the known forms in order
        keys = sorted([int(key) for key in model["des"].keys()])
        forms_data = [(key, model["des"][str(key)]) for key in keys]

        # Generate the return dict
        forms = {key: [] for key in keys}
        for key, form_list in forms_data:
            for form in form_list:
                root_id, endings = tuple(form)
                for root in roots[root_id]:
                    for ending in endings:
                        forms[key].append(root + ending)

        # sufd means we have the original forms of the parent but we add a suffix
        if len(model["sufd"]):
            # For each constant form
            for key, iter_forms in forms.items():
                new_forms = []
                # We add the constant suffix
                for sufd in model["sufd"]:
                    new_forms += [form + sufd for form in iter_forms]
                forms[key] = new_forms

        # If we need a secure version of the forms. For example, if we have variants
        if len(model["suf"]):
            cached_forms = {k: v + []
                            for k, v in forms.items()
                            }  # Making cache without using copy

            # For each suffix
            # The format is [suffix characters, [modified forms]]
            for suffixes in model["suf"]:
                suffix, modified_forms = suffixes[0], suffixes[1]
                for modified_form in modified_forms:
                    forms[modified_form] += [
                        f + suffix for f in cached_forms[modified_form]
                    ]
            # We update with the new roots

        # If some forms do not exist, we delete them pre-emptively
        if len(model["abs"]):
            for abs_form in model["abs"]:
                if abs_form in forms:
                    del forms[abs_form]

        if flatten:
            return list(
                [form for case_forms in forms.values() for form in case_forms])
        elif collatinus_dict:
            return forms
        else:
            return list([(form, self.__getPOS(key))
                         for key, case_forms in forms.items()
                         for form in case_forms])
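A usage sketch, assuming ``decliner`` is an instance of the class above with its Collatinus data loaded (the example output follows the docstring):

forms_with_pos = decliner.decline("sum")            # e.g. [("sum", "v1ppip---"), ...]
flat_forms = decliner.decline("sum", flatten=True)  # the same forms without POS strings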
Example #27
    def _getRoots(self, lemma, model):
        """ Retrieve the known roots of a lemma

        :param lemma: Canonical form of the word (lemma)
        :type lemma: str
        :param model: Model data from the loaded self.__data__. Can be passed by decline()
        :type model: dict
        :return: Dictionary of roots with their root identifier as key
        :rtype: dict
        """

        if lemma not in self._lemmas:
            raise CLTKException("%s is unknown" % lemma)

        ROOT_IDS = {"K": "lemma", "1": "geninf", "2": "perf"}

        lemma_entry = self._lemmas[lemma]
        if "quantity" in lemma_entry and lemma_entry["quantity"]:
            lemma_in_lemma_entry = lemma_entry["quantity"]
        else:
            lemma_in_lemma_entry = self._remove_disambiguation(
                lemma_entry["lemma"])

        original_roots = {
            root_id: lemma_entry[root_name].split(",")
            for root_id, root_name in ROOT_IDS.items()
            if root_id != "K" and lemma_entry[root_name]
        }
        returned_roots = {}

        if not model:
            model = self._models[lemma_entry["model"]]

        # For each registered root in the model,
        for model_root_id, model_root_data in model["R"].items():

            # If we have K, it's equivalent to canonical form
            if model_root_data[0] == "K":
                returned_roots[model_root_id] = lemma_in_lemma_entry.split(",")
            # Otherwise we have deletion number and addition char
            else:
                deletion, addition = int(
                    model_root_data[0]), model_root_data[1] or ""

                # If the root is declared already,
                # we retrieve the information
                if model_root_id != "1" and model_root_id in returned_roots:
                    lemma_roots = returned_roots[model_root_id]
                else:
                    lemma_roots = lemma_in_lemma_entry.split(",")
                # We construct the roots
                returned_roots[model_root_id] = [
                    lemma_root[:-deletion] + addition
                    for lemma_root in lemma_roots
                ]

            if model_root_id in original_roots:
                returned_roots[model_root_id].extend(
                    original_roots[model_root_id])
            returned_roots[model_root_id] = list(
                set(returned_roots[model_root_id]))
        original_roots.update(returned_roots)

        return original_roots
Example #28
 def _check_install(self):
     """Check if tlgu installed, if not install it."""
     try:
         subprocess.check_output(["which", "tlgu"])
     except subprocess.SubprocessError as sub_err:
         print("TLGU not installed.")
         logger.info("TLGU not installed: %s", sub_err)
         logger.info("Installing TLGU.")
         if not subprocess.check_output(["which", "gcc"]):
             logger.error("GCC seems not to be installed.")
         else:
             tlgu_path = make_cltk_path("grc/software/grc_software_tlgu")
             if self.interactive:
                 install_question = "Do you want to install TLGU?"
                 do_install = query_yes_no(question=install_question)
                 if not do_install:
                     raise CLTKException(
                         "TLGU installation required for this class to work."
                     )
             else:
                 print("Non-interactive installation. Continuing ...")
             command = "cd {0} && make install".format(tlgu_path)
             print(f"Going to run command: ``{command}``")
             try:
                 p_out = subprocess.call(command, shell=True)
             except subprocess.SubprocessError as sub_err:
                 print(
                     "Error executing installation. Going to check output of ``subprocess.call()`` ..."
                 )
                 raise CLTKException(sub_err)
             if p_out == 0:
                 msg = "TLGU installed."
                 print(msg)
                 logger.info(msg)
                 return True
             else:
                 msg = "TLGU install without sudo failed. Going to try again with sudo (usually required for Linux) ..."
                 print(msg)
                 logger.error(msg)
             command = "cd {0} && sudo make install".format(tlgu_path)
             if self.interactive:
                 install_question = "Do you want to install TLGU? with sudo?"
                 do_install = query_yes_no(question=install_question)
                 if not do_install:
                     raise CLTKException(
                         "TLGU installation required for this class to work."
                     )
                 p_out = subprocess.call(command, shell=True)
             else:
                 print("Going to run command:", command)
                 p_out = subprocess.call(command, shell=True)
             if p_out == 0:
                 msg = "TLGU installed."
                 print(msg)
                 logger.info(msg)
             else:
                 msg = "TLGU install with sudo failed."
                 print(msg)
                 logger.error(msg)
                 raise CLTKException(
                     "TLGU installation required for this class to work.")
Example #29
PARSER = argparse.ArgumentParser()
PARSER.add_argument(
    "--languages",
    help="What languages to download. Comma separated, no spaces.")
ARGS = PARSER.parse_args()
SELECTED_LANGS = list()  # type: List[str]
ALL_AVAILABLE_LANGS = list(iso_to_pipeline.keys())  # type: List[str]
if not ARGS.languages:
    SELECTED_LANGS = ALL_AVAILABLE_LANGS
else:
    SELECTED_LANGS_SPLIT = ARGS.languages.split(",")
    for LANG in SELECTED_LANGS_SPLIT:
        if LANG not in ALL_AVAILABLE_LANGS:
            raise CLTKException(
                f"Unavailable language '{LANG}' chosen. Choose from: {', '.join(ALL_AVAILABLE_LANGS)}"
            )
    SELECTED_LANGS = SELECTED_LANGS_SPLIT


def download_stanza_model(iso_code: str) -> None:
    """Download language models, from the ``stanza`` project,
    that are supported by the CLTK or in scope. More here:
    `<https://stanfordnlp.github.io/stanza/models.html>`_.

    TODO: Re-enable `treebank` parameter
    """
    print(f"Going to download Stanza model for '{iso_code}'.")
    if iso_code not in AVAIL_STANZA_LANGS:
        raise CLTKException(f"Language '{iso_code}' not available for Stanza.")
    StanzaWrapper(language=iso_code, interactive=False, silent=False)
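A plausible driver loop for the script above (purely illustrative; it is not part of the original excerpt) would iterate the selected languages and call the downloader for each:

if __name__ == "__main__":
    for LANG in SELECTED_LANGS:
        try:
            download_stanza_model(iso_code=LANG)
        except CLTKException as err:
            print(err)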