Example #1
File: backoff.py, Project: cltk/cltk
    def __init__(self: object, train: List[list] = None, seed: int = 3, verbose: bool = False):
        self.models_path = BackoffGreekLemmatizer.models_path

        missing_models_message = "BackoffGreekLemmatizer requires the ```greek_models_cltk``` to be in cltk_data. Please load this corpus."

        try:
            self.train = open_pickle(os.path.join(self.models_path, 'greek_lemmatized_sents.pickle'))
            self.GREEK_OLD_MODEL = open_pickle(os.path.join(self.models_path, 'greek_lemmata_cltk.pickle'))
            self.GREEK_MODEL = open_pickle(os.path.join(self.models_path, 'greek_model.pickle'))
        except FileNotFoundError as err:
            raise type(err)(missing_models_message)

        self.greek_sub_patterns = greek_sub_patterns # Move to greek_models_cltk

        self.seed = seed
        self.VERBOSE = verbose

        def _randomize_data(train: List[list], seed: int):
            import random
            random.seed(seed)
            random.shuffle(train)
            pos_train_sents = train[:4000]
            lem_train_sents = [[(item[0], item[1]) for item in sent] for sent in train]
            train_sents = lem_train_sents[:4000]
            test_sents = lem_train_sents[4000:5000]

            return pos_train_sents, train_sents, test_sents

        self.pos_train_sents, self.train_sents, self.test_sents = _randomize_data(self.train, self.seed)
        self._define_lemmatizer()
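
Every snippet on this page centers on open_pickle from cltk.utils.file_operations. As a rough mental model (a minimal sketch, not the library's actual implementation), the helper behaves like a thin wrapper around pickle.load, which is consistent with the tests further down expecting FileNotFoundError for missing files and EOFError for corrupt ones:

import os
import pickle
from typing import Any


def open_pickle_sketch(path: str) -> Any:
    """Illustrative stand-in for cltk.utils.file_operations.open_pickle.

    Opens a pickle file in binary mode and returns the deserialized object.
    A missing file surfaces as FileNotFoundError from open(); a truncated or
    empty file surfaces as EOFError from pickle.load().
    """
    with open(os.path.expanduser(path), 'rb') as pickle_file:
        return pickle.load(pickle_file)
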
Example #2
File: backoff.py, Project: groschene/cltk
    def __init__(self: object, train: List[list] = None, seed: int = 3, verbose: bool = False):
        self.models_path = BackoffLatinLemmatizer.models_path

        missing_models_message = "BackoffLatinLemmatizer requires the ```latin_models_cltk``` to be in cltk_data. Please load this corpus."

        try:
            self.train = open_pickle(os.path.join(self.models_path, 'latin_pos_lemmatized_sents.pickle'))
            self.LATIN_OLD_MODEL = open_pickle(os.path.join(self.models_path, 'latin_lemmata_cltk.pickle'))
            self.LATIN_MODEL = open_pickle(os.path.join(self.models_path, 'latin_model.pickle'))
        except FileNotFoundError as err:
            raise type(err)(missing_models_message)

        self.latin_sub_patterns = latin_sub_patterns # Move to latin_models_cltk

        self.seed = seed
        self.VERBOSE = verbose

        def _randomize_data(train: List[list], seed: int):
            import random
            random.seed(seed)
            random.shuffle(train)
            pos_train_sents = train[:4000]
            lem_train_sents = [[(item[0], item[1]) for item in sent] for sent in train]
            train_sents = lem_train_sents[:4000]
            test_sents = lem_train_sents[4000:5000]

            return pos_train_sents, train_sents, test_sents

        self.pos_train_sents, self.train_sents, self.test_sents = _randomize_data(self.train, self.seed)
        self._define_lemmatizer()
Example #3
    def __init__(self, train, seed=3):
        self.train = train
        self.seed = seed

        rel_path = os.path.join(
            '~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
        path = os.path.expanduser(rel_path)

        # Check for presence of LATIN_OLD_MODEL
        file = 'latin_lemmata_cltk.pickle'

        old_model_path = os.path.join(path, file)
        if os.path.isfile(old_model_path):
            self.LATIN_OLD_MODEL = open_pickle(old_model_path)
        else:
            self.LATIN_OLD_MODEL = {}
            print('The file %s is not available in cltk_data' % file)

        # Check for presence of LATIN_MODEL
        file = 'latin_model.pickle'

        model_path = os.path.join(path, file)
        if os.path.isfile(model_path):
            self.LATIN_MODEL = open_pickle(model_path)
        else:
            self.LATIN_MODEL = {}
            print('The file %s is not available in cltk_data' % file)

        # Check for presence of misc_patterns
        self.latin_sub_patterns = latin_sub_patterns

        # Check for presence of verb_patterns
        self.latin_verb_patterns = latin_verb_patterns

        # Check for presence of latin_pps
        self.latin_pps = latin_pps

        def _randomize_data(train, seed):
            import random
            random.seed(seed)
            random.shuffle(train)
            pos_train_sents = train[:4000]
            lem_train_sents = [[(item[0], item[1]) for item in sent]
                               for sent in train]
            train_sents = lem_train_sents[:4000]
            test_sents = lem_train_sents[4000:5000]

            return pos_train_sents, train_sents, test_sents

        self.pos_train_sents, self.train_sents, self.test_sents = _randomize_data(
            self.train, self.seed)
        self._define_lemmatizer()
Example #4
    def run(self, input_doc: Doc) -> Doc:
        """Compute the embeddings."""
        output_doc = deepcopy(input_doc)
        # For word2vec-style embedding, used for word embeddings
        embeddings_obj = self.algorithm
        for index, word_obj in enumerate(output_doc.words):
            if not self.embedding_length:
                self.embedding_length = embeddings_obj.get_embedding_length()
            word_embedding = embeddings_obj.get_word_vector(
                word=word_obj.string)
            if not isinstance(word_embedding, np.ndarray):
                word_embedding = np.zeros([self.embedding_length])
            word_obj.embedding = word_embedding
            output_doc.words[index] = word_obj

        # For sentence embeddings, uses TF-IDF
        # This checks whether a file of Tf-IDF embeddings is available
        if not self.idf_model:
            # First check if user has hard coded the path as an OS variable
            fp_idf_os_env: Optional[str] = os.environ.get("WORD_IDF_FILE")
            if fp_idf_os_env:
                self.idf_model = open_pickle(path=fp_idf_os_env)
            # Check whether IDF embeddings are available in the CLTK repo
            elif TFIDF_MAP.get(self.language):
                model_path: str = TFIDF_MAP[self.language]
                if not os.path.isdir(model_path):
                    msg = f"TF-IDF model path '{model_path}' not found. Going to try to download it ..."
                    logger.warning(msg)
                    dl_msg = f"This part of the CLTK depends upon models from the CLTK project."
                    model_url = f"https://github.com/cltk/{self.language}_models_cltk"
                    download_prompt(iso_code=self.language,
                                    message=dl_msg,
                                    model_url=model_url)
                self.idf_model = open_pickle(path=f"{model_path}word_idf.pkl")
        # Min and max values are needed while generating sentence embeddings
        if self.idf_model and not self.min_idf:
            tfidf_values: ValuesView = self.idf_model.values()
            tfidf_values_array: np.ndarray = np.array(list(tfidf_values))
            self.min_idf: np.float64 = tfidf_values_array.min()
            self.max_idf: np.float64 = tfidf_values_array.max()
        if self.idf_model:
            for index, sent_obj in enumerate(output_doc.sentences):
                output_doc.sentence_embeddings[index] = get_sent_embeddings(
                    sent=sent_obj,
                    idf_model=self.idf_model,
                    min_idf=self.min_idf,
                    max_idf=self.max_idf,
                    dimensions=self.embedding_length,
                )
        return output_doc
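
The sentence embeddings above rely on get_sent_embeddings, which is not shown in this example. A plausible way to picture what such a function does (an assumption about the approach, not CLTK's actual code) is an IDF-weighted average of the word vectors, with each word's IDF clipped to the [min_idf, max_idf] range computed above:

import numpy as np
from typing import Dict, List


def idf_weighted_sentence_vector(words: List[str],
                                 word_vectors: List[np.ndarray],
                                 idf_model: Dict[str, float],
                                 min_idf: float,
                                 max_idf: float,
                                 dimensions: int) -> np.ndarray:
    """Hypothetical helper: average word vectors weighted by clipped IDF."""
    weighted_sum = np.zeros(dimensions)
    total_weight = 0.0
    for word, vector in zip(words, word_vectors):
        # Treat out-of-vocabulary words as maximally rare (highest IDF).
        idf = float(np.clip(idf_model.get(word, max_idf), min_idf, max_idf))
        weighted_sum += idf * vector
        total_weight += idf
    return weighted_sum / total_weight if total_weight else weighted_sum
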
Example #5
    def __init__(self, train, seed=3):
        self.train = train
        self.seed = seed
        
        rel_path = os.path.join('~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
        path = os.path.expanduser(rel_path)

        # Check for presence of LATIN_OLD_MODEL
        file = 'latin_lemmata_cltk.pickle'      

        old_model_path = os.path.join(path, file)
        if os.path.isfile(old_model_path):
            self.LATIN_OLD_MODEL = open_pickle(old_model_path)
        else:
            self.LATIN_OLD_MODEL = {}
            print('The file %s is not available in cltk_data' % file)  
        
        # Check for presence of LATIN_MODEL
        file = 'latin_model.pickle'      

        model_path = os.path.join(path, file)
        if os.path.isfile(model_path):
            self.LATIN_MODEL = open_pickle(model_path)
        else:
            self.LATIN_MODEL = {}
            print('The file %s is not available in cltk_data' % file)  
        
        # Check for presence of misc_patterns
        self.latin_sub_patterns = latin_sub_patterns

        # Check for presence of verb_patterns
        self.latin_verb_patterns = latin_verb_patterns

        # Check for presence of latin_pps
        self.latin_pps = latin_pps

        def _randomize_data(train, seed):
            import random
            random.seed(seed)
            random.shuffle(train)
            pos_train_sents = train[:4000]
            lem_train_sents = [[(item[0], item[1]) for item in sent] for sent in train]
            train_sents = lem_train_sents[:4000]
            test_sents = lem_train_sents[4000:5000]

            return pos_train_sents, train_sents, test_sents

        self.pos_train_sents, self.train_sents, self.test_sents = _randomize_data(self.train, self.seed)
        self._define_lemmatizer()
Example #6
 def test_open_pickle(self):
     """Test opening pickle. This requires ``greek_models_cltk``
     to have been run in ``setUp()``.
     """
     pickle_path_rel = '~/cltk_data/greek/model/greek_models_cltk/tokenizers/sentence/greek.pickle'  # pylint: disable=line-too-long
     pickle_path = os.path.expanduser(pickle_path_rel)
     a_pickle = open_pickle(pickle_path)
     self.assertTrue(a_pickle)
Example #8
File: pos.py, Project: diyclassics/cltk
    def _load_model(self, name):
        model = self.models.get(name, None)

        if model is None:
            pickle_path = self.available_taggers[name]
            model = open_pickle(pickle_path)
            self.models[name] = model

        return model
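
_load_model is a lazy-loading cache: each tagger pickle is read from disk at most once and then reused. The same idea can be expressed with functools (a sketch, assuming the pickle path string is stable between calls):

from functools import lru_cache

from cltk.utils.file_operations import open_pickle


@lru_cache(maxsize=None)
def load_tagger(pickle_path: str):
    """Load a tagger pickle once; later calls with the same path hit the cache."""
    return open_pickle(pickle_path)
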
Example #9
File: grc.py, Project: todd-cook/cltk
    def __init__(
        self: object, train: List[list] = None, seed: int = 3, verbose: bool = False
    ):
        self.models_path = models_path

        missing_models_message = "GreekBackoffLemmatizer requires the ```grc_models_cltk``` to be in cltk_data. Please load this corpus."

        try:
            self.train = open_pickle(
                os.path.join(self.models_path, "greek_lemmatized_sents.pickle")
            )
            self.GREEK_OLD_MODEL = open_pickle(
                os.path.join(self.models_path, "greek_lemmata_cltk.pickle")
            )
            self.GREEK_MODEL = open_pickle(
                os.path.join(self.models_path, "greek_model.pickle")
            )
        except FileNotFoundError as err:
            raise type(err)(missing_models_message)

        self.greek_sub_patterns = greek_sub_patterns

        self.seed = seed
        self.VERBOSE = verbose

        def _randomize_data(train: List[list], seed: int):
            import random

            random.seed(seed)
            random.shuffle(train)
            train_size = int(0.9 * len(train))
            pos_train_sents = train[:train_size]
            lem_train_sents = [[(item[0], item[1]) for item in sent] for sent in train]
            train_sents = lem_train_sents[:train_size]
            test_sents = lem_train_sents[train_size:]

            return pos_train_sents, train_sents, test_sents

        self.pos_train_sents, self.train_sents, self.test_sents = _randomize_data(
            self.train, self.seed
        )
        self._define_lemmatizer()
Example #10
File: pos.py, Project: Akirato/cltk
 def tag_ngram_123_backoff(self, untagged_string: str):
     """Tag POS with 1-, 2-, 3-gram tagger.
     :type untagged_string: str
     :param untagged_string: An untagged, untokenized string of text.
     :rtype: list of (token, tag) tuples
     """
     untagged_tokens = wordpunct_tokenize(untagged_string)
     pickle_path = self.available_taggers['ngram_123_backoff']
     tagger = open_pickle(pickle_path)
     tagged_text = tagger.tag(untagged_tokens)
     return tagged_text
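
For reference, the underlying tagger.tag() call returns a list of (token, tag) pairs. A short usage sketch (assumes the relevant latin_models_cltk corpus is installed; the tag labels shown are placeholders):

tagger = POSTag('latin')
tagged = tagger.tag_ngram_123_backoff('Gallia est omnis divisa in partes tres')
print(tagged)  # e.g. [('Gallia', 'N-S---FB-'), ('est', 'V3SPIA---'), ...]
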
Example #11
File: pos.py, Project: yash-nisar/cltk
 def tag_unigram(self, untagged_string: str):
     """Tag POS with unigram tagger.
     :type untagged_string: str
     :param untagged_string: An untagged, untokenized string of text.
     :rtype: list of (token, tag) tuples
     """
     untagged_tokens = wordpunct_tokenize(untagged_string)
     pickle_path = self.available_taggers['unigram']
     tagger = open_pickle(pickle_path)
     tagged_text = tagger.tag(untagged_tokens)
     return tagged_text
Example #12
File: pos.py, Project: yash-nisar/cltk
 def tag_ngram_123_backoff(self, untagged_string: str):
     """Tag POS with 1-, 2-, 3-gram tagger.
     :type untagged_string: str
     :param untagged_string: An untagged, untokenized string of text.
     :rtype: list of (token, tag) tuples
     """
     untagged_tokens = wordpunct_tokenize(untagged_string)
     pickle_path = self.available_taggers['ngram_123_backoff']
     tagger = open_pickle(pickle_path)
     tagged_text = tagger.tag(untagged_tokens)
     return tagged_text
Example #13
File: pos.py, Project: Akirato/cltk
 def tag_unigram(self, untagged_string: str):
     """Tag POS with unigram tagger.
     :type untagged_string: str
     :param untagged_string: An untagged, untokenized string of text.
     :rtype: list of (token, tag) tuples
     """
     untagged_tokens = wordpunct_tokenize(untagged_string)
     pickle_path = self.available_taggers['unigram']
     tagger = open_pickle(pickle_path)
     tagged_text = tagger.tag(untagged_tokens)
     return tagged_text
Example #14
    def tokenize_sentences(self, untokenized_string: str):
        """Tokenize sentences by reading trained tokenizer and invoking
        ``PunktSentenceTokenizer()``.
        :type untokenized_string: str
        :param untokenized_string: A string containing one or more sentences.
        :rtype : list of strings
        """
        # load tokenizer
        assert isinstance(untokenized_string, str), \
            'Incoming argument must be a string.'

        if self.language == 'latin':
            self.models_path = self._get_models_path(self.language)
            try:
                self.model = open_pickle(
                    os.path.expanduser(
                        os.path.join(self.models_path, 'latin_punkt.pickle')))
            except FileNotFoundError as err:
                raise type(err)(TokenizeSentence.missing_models_message +
                                self.models_path)
            tokenizer = self.model
            tokenizer._lang_vars = self.lang_vars
        elif self.language == 'greek':  # Workaround for regex tokenizer
            self.sent_end_chars = GreekLanguageVars.sent_end_chars
            self.sent_end_chars_regex = '|'.join(self.sent_end_chars)
            self.pattern = rf'(?<=[{self.sent_end_chars_regex}])\s'
        else:
            tokenizer = open_pickle(self.tokenizer_path)
            tokenizer = self._setup_tokenizer(tokenizer)

        # mk list of tokenized sentences
        if self.language == 'latin':
            return tokenizer.tokenize(untokenized_string)
        elif self.language == 'greek':
            return re.split(self.pattern, untokenized_string)
        else:
            tokenized_sentences = [
                sentence for sentence in tokenizer.sentences_from_text(
                    untokenized_string, realign_boundaries=True)
            ]
            return tokenized_sentences
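
The Greek branch above bypasses Punkt and splits on sentence-final punctuation with a lookbehind regex. A standalone sketch of that workaround, using '.' and ';' as stand-ins for GreekLanguageVars.sent_end_chars:

import re

sent_end_chars = ('.', ';')
sent_end_chars_regex = '|'.join(sent_end_chars)
pattern = rf'(?<=[{sent_end_chars_regex}])\s'
text = 'πρῶτον μὲν λέγω. δεύτερον δὲ ἐρωτῶ;'
print(re.split(pattern, text))
# ['πρῶτον μὲν λέγω.', 'δεύτερον δὲ ἐρωτῶ;']
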
Example #15
File: sentence.py, Project: cltk/cltk
    def __init__(self: object, language: str = 'latin'):
        """
        :param language : language for sentence tokenization
        :type language: str
        """
        self.lang_vars = LatinLanguageVars()
        super().__init__(language='latin', lang_vars=self.lang_vars)
        self.models_path = LatinPunktSentenceTokenizer.models_path

        try:
            self.model = open_pickle(os.path.join(self.models_path, 'latin_punkt.pickle'))
        except FileNotFoundError as err:
            raise type(err)(LatinPunktSentenceTokenizer.missing_models_message)
Example #16
    def tokenize_sentences(self, untokenized_string: str):
        """Tokenize sentences by reading trained tokenizer and invoking
        ``PunktSentenceTokenizer()``.
        :type untokenized_string: str
        :param untokenized_string: A string containing one or more sentences.
        :rtype : list of strings
        """
        # load tokenizer
        assert isinstance(untokenized_string, str), \
            'Incoming argument must be a string.'

        if self.language == 'latin':
            self.models_path = self._get_models_path(self.language)
            try:
                self.model = open_pickle(
                    os.path.expanduser(os.path.join(self.models_path, 'latin_punkt.pickle')))
            except FileNotFoundError as err:
                raise type(err)(TokenizeSentence.missing_models_message + self.models_path)
            tokenizer = self.model
            tokenizer._lang_vars = self.lang_vars
        elif self.language == 'greek':  # Workaround for regex tokenizer
            self.sent_end_chars = GreekLanguageVars.sent_end_chars
            self.sent_end_chars_regex = '|'.join(self.sent_end_chars)
            self.pattern = rf'(?<=[{self.sent_end_chars_regex}])\s'
        else:
            tokenizer = open_pickle(self.tokenizer_path)
            tokenizer = self._setup_tokenizer(tokenizer)

        # mk list of tokenized sentences
        if self.language == 'latin':
            return tokenizer.tokenize(untokenized_string)
        elif self.language == 'greek':
            return re.split(self.pattern, untokenized_string)
        else:
            tokenized_sentences = [sentence for sentence in
                                   tokenizer.sentences_from_text(untokenized_string,
                                                                 realign_boundaries=True)]
            return tokenized_sentences
Example #17
    def __init__(self: object, language: str = 'latin'):
        """
        :param language : language for sentence tokenization
        :type language: str
        """
        self.lang_vars = LatinLanguageVars()
        super().__init__(language='latin', lang_vars=self.lang_vars)
        self.models_path = LatinPunktSentenceTokenizer.models_path

        try:
            self.model = open_pickle(
                os.path.join(self.models_path, 'latin_punkt.pickle'))
        except FileNotFoundError as err:
            raise type(err)(LatinPunktSentenceTokenizer.missing_models_message)
Example #18
File: sentence.py, Project: cltk/cltk
    def __init__(self: object, language: str = 'greek'):
        """
        :param language : language for sentence tokenization
        :type language: str
        """
        super().__init__(language='greek')
        self.models_path = GreekPunktSentenceTokenizer.models_path

        try:
            self.model = open_pickle(os.path.join(os.path.expanduser(self.models_path), 'greek_punkt.pickle'))
        except FileNotFoundError as err:
            raise type(err)(GreekPunktSentenceTokenizer.missing_models_message)

        self.lang_vars = GreekLanguageVars()
Example #19
 def __init__(self, language: str = None, lang_vars: object = None):
     """
     :param language : language for sentence tokenization
     :type language: str
     """
     self.language = language
     self.lang_vars = lang_vars
     super().__init__(language=self.language)
     if self.language:
         self.models_path = self._get_models_path(self.language)
         try:
             self.model = open_pickle(os.path.join(os.path.expanduser(self.models_path),
                                                   f'{self.language}_punkt.pickle'))
         except FileNotFoundError as err:
             raise type(err)(BasePunktSentenceTokenizer.missing_models_message)
Example #20
    def __init__(self: object, language: str = "greek"):
        """
        :param language : language for sentence tokenization
        :type language: str
        """
        super().__init__(language="greek")
        self.models_path = GreekPunktSentenceTokenizer.models_path

        try:
            self.model = open_pickle(
                os.path.join(os.path.expanduser(self.models_path),
                             "greek_punkt.pickle"))
        except FileNotFoundError as err:
            raise type(err)(GreekPunktSentenceTokenizer.missing_models_message)

        self.lang_vars = GreekLanguageVars()
Example #21
File: sentence.py, Project: oudalab/cltk
    def tokenize_sentences(self, untokenized_string):
        """Tokenize sentences by reading trained tokenizer and invoking
        ``PunktSentenceTokenizer()``.
        :type untokenized_string: str
        :param untokenized_string: A string containing one or more sentences.
        :rtype : list of strings
        """
        # load tokenizer
        tokenizer = open_pickle(self.tokenizer_path)
        tokenizer = self._setup_tokenizer(tokenizer)

        # mk list of tokenized sentences
        tokenized_sentences = []
        for sentence in tokenizer.sentences_from_text(untokenized_string, realign_boundaries=True):  # pylint: disable=C0301
            tokenized_sentences.append(sentence)
        return tokenized_sentences
Example #22
 def __init__(self, language: str = None, lang_vars: object = None):
     """
     :param language : language for sentence tokenization
     :type language: str
     """
     self.language = language
     self.lang_vars = lang_vars
     super().__init__(language=self.language)
     if self.language:
         self.models_path = self._get_models_path(self.language)
         try:
             self.model = open_pickle(
                 os.path.join(os.path.expanduser(self.models_path),
                              f'{self.language}_punkt.pickle'))
         except FileNotFoundError as err:
             raise type(err)(
                 BasePunktSentenceTokenizer.missing_models_message)
Example #23
    def __init__(self, seed: int = 3, verbose: bool = False):
        self.models_path = BackoffMHGLemmatizer.models_path

        missing_models_message = "BackoffMHGLemmatizer requires the ```middle_high_german_models_cltk``` " \
                                 "to be in cltk_data. Please load this corpus."
        self.seed = seed
        self.verbose = verbose

        self.token_to_lemmata = []
        self.lemma_to_tokens = []

        try:
            self.token_to_lemmata = open_pickle(
                os.path.join(self.models_path, "token_to_lemma.pickle"))
        except FileNotFoundError as err:
            raise type(err)(missing_models_message)

        self._define_lemmatizer()
Example #24
    def tokenize_sentences(self: object, untokenized_string: str):
        """Tokenize sentences by reading trained tokenizer and invoking
        ``PunktSentenceTokenizer()``.
        :type untokenized_string: str
        :param untokenized_string: A string containing one or more sentences.
        :rtype : list of strings
        """
        # load tokenizer
        assert isinstance(untokenized_string, str), \
            'Incoming argument must be a string.'
        tokenizer = open_pickle(self.tokenizer_path)
        tokenizer = self._setup_tokenizer(tokenizer)

        # mk list of tokenized sentences
        tokenized_sentences = []
        for sentence in tokenizer.sentences_from_text(untokenized_string, realign_boundaries=True):  # pylint: disable=C0301
            tokenized_sentences.append(sentence)
        return tokenized_sentences
Example #25
    def __init__(self, language: str = None, lang_vars: object = None):
        """Constructor.

        :param language : language for sentence tokenization
        :type language: str
        """
        super().__init__(language=language)
        if self.language == "lat":
            self.language_old = "lat"
        self.lang_vars = lang_vars
        if self.language:
            self.models_path = self._get_models_path(self.language)
            try:
                self.model = open_pickle(
                    os.path.join(
                        os.path.expanduser(self.models_path),
                        f"{self.language_old}_punkt.pickle",
                    ))
            except FileNotFoundError as err:
                raise type(err)(PunktSentenceTokenizer.missing_models_message)
Example #26
    def __init__(self: object, language: str = 'latin', strict: bool = False):
        """
        :param language : language for sentence tokenization
        :type language: str
        :param strict : allow for stricter punctuation for sentence tokenization
        :type strict: bool
        """
        self.lang_vars = LatinLanguageVars()
        self.strict = strict
        super().__init__(language='latin', lang_vars=self.lang_vars)
        self.models_path = LatinPunktSentenceTokenizer.models_path

        try:
            self.model = open_pickle(
                os.path.join(self.models_path, 'latin_punkt.pickle'))
        except FileNotFoundError as err:
            raise type(err)(LatinPunktSentenceTokenizer.missing_models_message)

        if self.strict:
            PunktLanguageVars.sent_end_chars = STRICT_PUNCTUATION
        else:
            PunktLanguageVars.sent_end_chars = PUNCTUATION
Example #27
    def __init__(self: object, strict: bool = False):
        """Constructor for ``LatinPunktSentenceTokenizer``.

        :param strict : allow for stricter punctuation for sentence tokenization
        :type strict: bool
        """
        self.lang_vars = LatinLanguageVars()
        self.strict = strict
        super().__init__(language="lat", lang_vars=self.lang_vars)

        fp_sentence_tok_model_dir = "lat/model/lat_models_cltk/tokenizers/sentence/"
        models_path = os.path.join(CLTK_DATA_DIR, fp_sentence_tok_model_dir)
        self.models_path = os.path.join(models_path, "latin_punkt.pickle")

        try:
            self.model = open_pickle(self.models_path)
        except FileNotFoundError as err:
            msg = f"``LatinPunktSentenceTokenizer`` could not find required file ``{self.models_path}``. Download the corpus ``lat_models_cltk``."
            raise FileNotFoundError(msg)

        if self.strict:
            PunktLanguageVars.sent_end_chars = STRICT_PUNCTUATION
        else:
            PunktLanguageVars.sent_end_chars = PUNCTUATION
Example #28
# Latin Lemmatizer (OLD)
# la_corpus_importer = CorpusImporter('latin')
# la_corpus_importer.import_corpus('latin_text_latin_library')
# la_corpus_importer.import_corpus('latin_models_cltk')
# la_lemmatizer = LemmaReplacer('latin')


# Latin Lemmatizer (NEW with backoff)
# Set up training sentences
rel_path = os.path.join('/Users/christiancasey/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
path = os.path.expanduser(rel_path)
# Check for presence of latin_pos_lemmatized_sents
file = 'latin_pos_lemmatized_sents.pickle'
latin_pos_lemmatized_sents_path = os.path.join(path, file)
if os.path.isfile(latin_pos_lemmatized_sents_path):
    latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
else:
    latin_pos_lemmatized_sents = []
    print('The file %s is not available in cltk_data' % file)
la_lemmatizer = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)

# Greek Lemmatizer
grc_corpus_importer = CorpusImporter('greek')
grc_corpus_importer.import_corpus('greek_models_cltk')
grc_lemmatizer = LemmaReplacer('greek')

# Initialize lemmatizers once outside of the loop,
# then select based on language inside the loop -- get_words_from_file()
tagLat = POSTag('latin')
tagGrk = POSTag('greek')
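
The backoff lemmatizer constructed above is typically driven through its lemmatize method on a pre-tokenized list of words (a brief usage sketch, assuming the standard BackoffLatinLemmatizer.lemmatize API; the actual lemmata returned depend on the loaded training data):

tokens = ['arma', 'virumque', 'cano']
print(la_lemmatizer.lemmatize(tokens))  # e.g. [('arma', 'arma'), ...]
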
Example #29
 def getmodel(fname):
     return open_pickle(join(path, fname))
Example #30
from extract_features import parse_tess
from cltk.utils.file_operations import open_pickle
from nltk.tokenize.punkt import PunktLanguageVars, PunktTrainer, PunktSentenceTokenizer

PunktLanguageVars.sent_end_chars = ('.', ';', ';')
PunktLanguageVars.internal_punctuation = (',', '·', ':')

text = parse_tess('tesserae/texts/grc/xenophon.anabasis.tess')
tokenizer = open_pickle('tokenizers/ancient_greek.pickle')
print('Xenophon tokens: ' + str(len(tokenizer.tokenize(text))))
print()

trainer = PunktTrainer(lang_vars=PunktLanguageVars())
trainer.INCLUDE_ALL_COLLOCS = True
trainer.INCLUDE_ABBREV_COLLOCS = True
trainer.train(text, verbose=True)

new_tokenizer = PunktSentenceTokenizer(trainer.get_params())
print('tokenizers equal? ' + str(tokenizer == new_tokenizer))
print('tokenization equal? ' +
      str(tokenizer.tokenize(text) == new_tokenizer.tokenize(text)))

old_tok_out = open('feature_data/old_tok.txt', mode='w')
old_tok_out.write('\n'.join(tokenizer.tokenize(text)))
new_tok_out = open('feature_data/new_tok.txt', mode='w')
new_tok_out.write('\n'.join(new_tokenizer.tokenize(text)))
'''
There seem to be very few abbreviations in the tesserae corpus. This means training the PunktSentenceTokenizer might not yield any improvement.
From paper abstract: "[Punkt sentence tokenization training] is based on the assumption that a large number of ambiguities in the determination of sentence boundaries can be eliminated once abbreviations have been identified."
'''
Example #31
import os
from cltk.utils.file_operations import open_pickle
from nltk.tokenize.punkt import PunktLanguageVars, PunktTrainer, PunktSentenceTokenizer
from progress_bar import print_progress_bar
from extract_features import file_parsers

PunktLanguageVars.sent_end_chars = ('.', ';', ';')
PunktLanguageVars.internal_punctuation = (',', '·', ':')

cltk_params = open_pickle('tokenizers/ancient_greek.pickle')._params
kjohnson_params = open_pickle(
    'feature_data/kjohnson_greek.pickle').get_params()

#Are the attributes from ~/cltk_data/greek/model/greek_models_cltk/tokenizers/sentence/greek.pickle the same as https://github.com/cltk/greek_training_set_sentence_cltk/blob/master/greek.pickle ? Yes they are
print(cltk_params.abbrev_types)
print(cltk_params.abbrev_types == kjohnson_params.abbrev_types)
print()
print(cltk_params.collocations)
print(cltk_params.collocations == kjohnson_params.collocations)
print()
print(cltk_params.sent_starters)
print(cltk_params.sent_starters == kjohnson_params.sent_starters)
print()
print(cltk_params.ortho_context)
print(cltk_params.ortho_context == kjohnson_params.ortho_context)
print()
p = PunktSentenceTokenizer()._params
print('Defaults')
print(p.abbrev_types)
print(p.collocations)
print(p.sent_starters)
Example #32
lemmatizedTextList = []  # holds the versions of the title as we lemmatize them
lemmatizer = LemmaReplacer('latin')
lengthOfDataFile = 0  # number of rows in data file
numberOfFails = 0
numberOfSuccesses = 0
preprocessedTitle = ""  # a temp string where we store the ongoing preprocessing work on a title
successfulHits = []
word_tokenizer = WordTokenizer('latin')

# build standard dictionary/model # courtesy of Patrick Burns
rel_path = os.path.join(
    '~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
path = os.path.expanduser(rel_path)
file = 'latin_lemmata_cltk.pickle'
old_model_path = os.path.join(path, file)
LATIN_OLD_MODEL = open_pickle(old_model_path)

# make standard lemmatizer # as an instance of TrainLemmatizer # courtesy of Patrick Burns
lemmatizer = TrainLemmatizer(model=LATIN_OLD_MODEL, backoff=default)

# import custom dictionary csv as python dictionary
customDictionaryPath = os.path.join(cwd, 'customDictionary.csv')
with open(
        customDictionaryPath,
        'r') as f:  # this should close the file after the end of the with loop
    reader = csv.DictReader(f)
    for row in reader:
        customDictionaryCurrentLength += 1
        if row['lemma'] == "":
            continue  # in case a token has been added to the custom dictionary but no lemma has yet been provided for it
        customDictionary[row['token']] = row['lemma']
Example #33
 def test_open_pickle_fail_missing(self):
     """Test failure to unpickle a file that doesn't exist"""
     bad_file = 'cltk/tests/doesnt_exist.pickle'
     with self.assertRaises(FileNotFoundError):
         open_pickle(bad_file)
Example #34
 def test_open_pickle_fail_corrupt(self):
     """Test failure to open corrupted pickle."""
     bad_file = 'cltk/tests/bad_pickle.pickle'
     with self.assertRaises(EOFError):
         open_pickle(bad_file)
Example #37
import os
from cltk.utils.file_operations import open_pickle
from extract_features import file_parsers
from progress_bar import print_progress_bar

xeno_tokenizer = open_pickle('tokenizers/ancient_greek.pickle')
tess_tokenizer = open_pickle('feature_data/tesserae_greek.pickle')
corpus_dir = 'tesserae' + os.sep + 'texts' + os.sep + 'grc'
file_extension = 'tess'
#Obtain all the files to parse by traversing through the directory
file_names = sorted(list({current_path + os.sep + current_file_name for current_path, current_dir_names, current_file_names in \
os.walk(corpus_dir) for current_file_name in current_file_names if current_file_name.endswith('.' + file_extension)}))

counter = 1
for file_name in [
        'tesserae/texts/grc/achilles_tatius.leucippe_et_clitophon.tess'
]:  #file_names:
    file_text = file_parsers[file_extension](file_name)
    x_tokens = xeno_tokenizer.tokenize(file_text)
    t_tokens = tess_tokenizer.tokenize(file_text)
    if t_tokens != x_tokens:
        xeno_out = open('feature_data/xeno_token_achilles.txt', mode='w')
        xeno_out.write('\n'.join(x_tokens))
        tess_out = open('feature_data/tess_token_achilles.txt', mode='w')
        tess_out.write('\n'.join(t_tokens))
    # print_progress_bar(counter, len(file_names))
    counter += 1
'''
I trained Punkt on the entire tesserae corpus (feature_data/tesserae_greek.pickle). Its performance was actually worse than the tokenizer that was created from training on just Xenophon (tokenizers/ancient_greek.pickle). The tokenizer created from training on just Xenophon does well, except for failing to tokenize sentences where the terminal punctuation is not followed by a space.
'''
Example #38
# The greek.pickle used by CLTK for Ancient Greek deserializes as a PunktTrainer object.
# This script converts it into a PunktSentenceTokenizer.

import os
import pickle
from cltk.utils.file_operations import open_pickle
from nltk.tokenize.punkt import PunktLanguageVars, PunktSentenceTokenizer

lang = 'greek'
file = 'greek.pickle'
PunktLanguageVars.sent_end_chars = ('.', ';', ';')
PunktLanguageVars.internal_punctuation = (',', '·', ':')
rel_path = os.path.join('~/cltk_data', lang,
                        'model/' + lang + '_models_cltk/tokenizers/sentence')
path = os.path.expanduser(rel_path)
tokenizer_path = os.path.join(path, file)

trainer = open_pickle(tokenizer_path)
trainer.INCLUDE_ALL_COLLOCS = True
trainer.INCLUDE_ABBREV_COLLOCS = True
tokenizer = PunktSentenceTokenizer(trainer.get_params())

with open('ancient_greek.pickle', 'wb') as pickle_file:
    pickle_file.write(pickle.dumps(tokenizer))
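
Once written out, the converted tokenizer can be loaded back and used directly (a short usage sketch; the sentence boundaries produced depend on the trained parameters):

with open('ancient_greek.pickle', 'rb') as pickle_file:
    restored_tokenizer = pickle.load(pickle_file)
print(restored_tokenizer.tokenize('πρῶτον μὲν λέγω. δεύτερον δὲ ἐρωτῶ;'))
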
Example #39
        lemmatizer = self._define_lemmatizer()
        return lemmatizer.evaluate(self.test_sents)


if __name__ == "__main__":

    # Set up training sentences
    rel_path = os.path.join('~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
    path = os.path.expanduser(rel_path)

    # Check for presence of latin_pos_lemmatized_sents
    file = 'latin_pos_lemmatized_sents.pickle'      

    latin_pos_lemmatized_sents_path = os.path.join(path, file)
    if os.path.isfile(latin_pos_lemmatized_sents_path):
        latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
    else:
        latin_pos_lemmatized_sents = []
        print('The file %s is not available in cltk_data' % file)  
  

    RUN = 10
    ACCURACIES = []

    for I in range(RUN):
        LEMMATIZER = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)
        ACC = LEMMATIZER.evaluate()
        ACCURACIES.append(ACC)
        print('{:.2%}'.format(ACC))

    print('\nTOTAL (Run %d) times' % RUN)
Example #40
from cltk.utils.file_operations import open_pickle
from nltk.tokenize.punkt import PunktLanguageVars, PunktTrainer, PunktSentenceTokenizer
from extract_features import parse_tess

PunktLanguageVars.sent_end_chars = ('.', ';', ';')
PunktLanguageVars.internal_punctuation = (',', '·', ':')

text = parse_tess('tesserae/texts/grc/xenophon.anabasis.tess')
new_xeno_trainer = PunktTrainer()
# new_xeno_trainer.INCLUDE_ALL_COLLOCS = True
# new_xeno_trainer.INCLUDE_ABBREV_COLLOCS = True
new_xeno_trainer.train(text)
new_xeno_params = new_xeno_trainer.get_params()

tess_xeno_params = open_pickle('tokenizers/ancient_greek.pickle')._params

print(new_xeno_params.abbrev_types)
print(new_xeno_params.abbrev_types == tess_xeno_params.abbrev_types)
print()
print(new_xeno_params.collocations)
print(new_xeno_params.collocations == tess_xeno_params.collocations)
print()
print(new_xeno_params.sent_starters)
print(new_xeno_params.sent_starters == tess_xeno_params.sent_starters)
print()
print(new_xeno_params.ortho_context)
print(new_xeno_params.ortho_context == tess_xeno_params.ortho_context)
print()
'''
I got the internal PunktParameters object from the cltk pickle file that was trained on Xenophon's Anabasis (https://github.com/cltk/greek_training_set_sentence_cltk/blob/master/training_sentences.txt), and I also got the internal PunktParameters object from an PunktTrainer that I created from training on Xenophon's Anabasis from the tesserae corpus (https://github.com/tesserae/tesserae/blob/master/texts/grc/xenophon.anabasis.tess).