Example No. 1
 def tag_bigram(self, untagged_string: str):
     """Tag POS with bigram tagger.
     :type untagged_string: str
     :param : An untagged, untokenized string of text.
     :rtype tagged_text: str
     """
     untagged_tokens = wordpunct_tokenize(untagged_string)
     pickle_path = self.available_taggers["bigram"]
     tagger = open_pickle(pickle_path)  # load the pre-trained bigram tagger
     tagged_text = tagger.tag(untagged_tokens)
     return tagged_text
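
A minimal usage sketch for the method above, assuming it belongs to CLTK's ``POSTag`` class (the import path below reflects pre-1.0 CLTK and is an assumption; the ``latin_models_cltk`` corpus must already be downloaded):

from cltk.tag.pos import POSTag  # assumed pre-1.0 CLTK import path

tagger = POSTag("latin")
# Returns a list of (token, tag) tuples from the pre-trained bigram tagger.
print(tagger.tag_bigram("Gallia est omnis divisa in partes tres"))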
Example No. 2
    def __init__(self: object,
                 train: List[list] = None,
                 seed: int = 3,
                 verbose: bool = False):
        self.models_path = BackoffLatinLemmatizer.models_path

        missing_models_message = "BackoffLatinLemmatizer requires the ```latin_models_cltk``` corpus to be in cltk_data. Please load this corpus."

        try:
            self.train = open_pickle(
                os.path.join(self.models_path,
                             "latin_pos_lemmatized_sents.pickle"))
            self.LATIN_OLD_MODEL = open_pickle(
                os.path.join(self.models_path, "latin_lemmata_cltk.pickle"))
            self.LATIN_MODEL = open_pickle(
                os.path.join(self.models_path, "latin_model.pickle"))
        except FileNotFoundError as err:
            raise type(err)(missing_models_message) from err

        self.latin_sub_patterns = latin_sub_patterns  # Move to latin_models_cltk

        self.seed = seed
        self.VERBOSE = verbose

        def _randomize_data(train: List[list], seed: int):
            import random

            random.seed(seed)
            random.shuffle(train)
            # Keep full sentences (with all fields) for POS training.
            pos_train_sents = train[:4000]
            # Reduce each token to a (form, lemma) pair for lemmatizer data.
            lem_train_sents = [[(item[0], item[1]) for item in sent]
                               for sent in train]
            train_sents = lem_train_sents[:4000]
            test_sents = lem_train_sents[4000:5000]

            return pos_train_sents, train_sents, test_sents

        self.pos_train_sents, self.train_sents, self.test_sents = _randomize_data(
            self.train, self.seed)
        self._define_lemmatizer()
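
A short usage sketch for this lemmatizer; the import path is an assumption based on the pre-1.0 CLTK layout, and the ``latin_models_cltk`` corpus must be present in ``cltk_data``:

from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer  # assumed path

lemmatizer = BackoffLatinLemmatizer(seed=3)
tokens = ["arma", "virumque", "cano"]
# lemmatize() takes a list of tokens and returns (token, lemma) tuples.
print(lemmatizer.lemmatize(tokens))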
Example No. 3
 def __init__(self, language: str = None, lang_vars: object = None):
     """
     :param language: language for sentence tokenization
     :type language: str
     :param lang_vars: optional language variables passed to the Punkt tokenizer
     :type lang_vars: object
     """
     self.language = language
     if self.language == "lat":
         self.language_old = "latin"
     else:
         # Fall back to the given code so the pickle filename below resolves.
         self.language_old = self.language
     self.lang_vars = lang_vars
     super().__init__(language=self.language)
     if self.language:
         self.models_path = self._get_models_path(self.language)
         try:
             self.model = open_pickle(
                 os.path.join(
                     os.path.expanduser(self.models_path),
                     f"{self.language_old}_punkt.pickle",
                 ))
         except FileNotFoundError as err:
             raise type(err)(
                 BasePunktSentenceTokenizer.missing_models_message) from err
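
The pickle this constructor loads is an NLTK Punkt model, so sentence splitting ultimately delegates to it. A hedged sketch of typical use through a concrete wrapper class; the class name and import path below are assumptions:

from cltk.sentence.sentence import PunktSentenceTokenizer  # assumed path

tok = PunktSentenceTokenizer(language="lat")  # loads latin_punkt.pickle
text = "Gallia est omnis divisa in partes tres. Quarum unam incolunt Belgae."
# self.model is an NLTK Punkt tokenizer; tokenize() splits at sentence ends.
print(tok.model.tokenize(text))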
Example No. 4
    def __init__(self: object, strict: bool = False):
        """Constructor for ``LatinPunktSentenceTokenizer``.

        :param strict: use a stricter set of end-of-sentence punctuation marks
        :type strict: bool
        """
        self.lang_vars = LatinLanguageVars()
        self.strict = strict
        super().__init__(language="lat", lang_vars=self.lang_vars)

        fp_sentence_tok_model_dir = "lat/model/lat_models_cltk/tokenizers/sentence/"
        models_path = os.path.join(CLTK_DATA_DIR, fp_sentence_tok_model_dir)
        self.models_path = os.path.join(models_path, "latin_punkt.pickle")

        try:
            self.model = open_pickle(self.models_path)
        except FileNotFoundError as err:
            msg = f"``LatinPunktSentenceTokenizer`` could not find the required file ``{self.models_path}``. Download the corpus ``lat_models_cltk``."
            raise FileNotFoundError(msg) from err

        # Note: this assigns the attribute on the PunktLanguageVars class
        # itself, so it affects all tokenizers sharing that class.
        if self.strict:
            PunktLanguageVars.sent_end_chars = STRICT_PUNCTUATION
        else:
            PunktLanguageVars.sent_end_chars = PUNCTUATION
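
A minimal usage sketch; the import path is an assumption (recent CLTK releases keep this class under ``cltk.sentence.lat``), and ``lat_models_cltk`` must be downloaded first:

from cltk.sentence.lat import LatinPunktSentenceTokenizer  # assumed path

splitter = LatinPunktSentenceTokenizer(strict=True)
text = "Gallia est omnis divisa in partes tres; quarum unam incolunt Belgae."
# With strict=True a wider set of end-of-sentence marks is recognized.
print(splitter.tokenize(text))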