Example No. 1
def lemmanade(lines):

    count = 0
    lemons = []

    # initialize cltk tools
    #jvReplace = JVReplacer()
    wordTokenizer = WordTokenizer('latin')
    lemmatizer = LemmaReplacer('latin')

    for verse in lines:

        count = count + 1

        # lowercase
        #verse = jvReplace.replace(verse.lower())

        #tokenize the words
        chunkTok = wordTokenizer.tokenize(verse.lower())
        chunkTok = [
            whiteTok(tok) for tok in chunkTok if whiteTok(tok) is not None
        ]

        #lemmatize the tokens
        lemmata = lemmatizer.lemmatize(chunkTok)

        #collect the lemmatized tokens for this verse
        lemons.append(lemmata)

    return lemons
Example No. 2
 def __init__(self):
     self.sent_tokenizer = SentenceTokenizer()
     self.word_tokenizer = WordTokenizer('greek')
     self.corpus_reader = get_corpus_reader(
         corpus_name='greek_text_perseus', language='greek')
     self.lemmatizer = LemmaReplacer('greek')
     self.tfidf_vectorizer = TfidfVectorizer(input="filename")
Example No. 3
def declineEachWordInList(word_list, error_type_context, logger):
    """
    A declension function which might not be needed anymore.
    """
    error_type = error_type_context + "-declension"
    error_count = 0
    total_list_length = len(word_list)

    words_string = ' '.join(word_list)
    normalized_string = normalizeLatinWordsInNonstandardGlyphs(words_string)
    jv_replaced_string = jv_replace(normalized_string)
    word_list = jv_replaced_string.split()
    lemmatizer = LemmaReplacer('latin')
    try:
        word_list = lemmatizer.lemmatize(word_list)
    except Exception:
        print("Lemmatization error with " + words_string)
        #logger.addToLogger(error_type, "WARNING", "lemmatization failed: " + words_string)
        error_count += 1

    decliner = CollatinusDecliner()
    declined_forms = []
    for word in word_list:
        try:
            declined = decliner.decline(word, flatten=True)
            declined_forms = declined_forms + declined
        except Exception:
            logger.addToLogger(error_type, "WARNING", "Lemma couldn't be declined: " + word)
            error_count += 1

    print('[' + str(error_count) + ' / ' + str(total_list_length) + '] declension errors')
    return declined_forms
Example No. 4
def declineWord(word, error_type_context, logger):
    """
    A declension function which might not be needed anymore.
    """
    error_type = error_type_context + "-declension"
    error_count = 0

    normalized_string = normalizeLatinWordsInNonstandardGlyphs(word)
    jv_replaced_string = jv_replace(normalized_string)
    lemmatizer = LemmaReplacer('latin')
    # lemmatize() returns a list, so word_list holds the lemmatized form(s) of the single input word
    word_list = []
    try:
        word_list = lemmatizer.lemmatize(jv_replaced_string)
    except Exception:
        print("Lemmatization error with " + word)
        #logger.addToLogger(error_type, "WARNING", "lemmatization failed: " + word)
        error_count += 1

    decliner = CollatinusDecliner()
    declined_forms = []
    for word in word_list:
        try:
            declined = decliner.decline(word, flatten=True)
            declined_forms = declined_forms + declined
        except Exception:
            logger.addToLogger(error_type, "WARNING", "Lemma couldn't be declined: " + word)
            error_count += 1

    return declined_forms
Example No. 5
def main():
    input = open('./Gratian1.txt', 'r').read()
    input = re.sub('['+string.punctuation+']', '', input)
    input = input.lower()
    lemmatizer = LemmaReplacer('latin')
    lemmata = lemmatizer.lemmatize(input)
    dictionary_1r = {}
    for lemma in lemmata:
        if lemma in dictionary_1r:
            dictionary_1r[lemma] += 1
        else:
            dictionary_1r[lemma] = 1
    # lemmata = dictionary_1r.keys()
    # for lemma in lemmata:
    #     print("%2d\t%s" % (dictionary_1r[lemma], lemma))
    input = open('./Gratian2.txt', 'r').read()
    input = re.sub('['+string.punctuation+']', '', input)
    input = input.lower()
    lemmata = lemmatizer.lemmatize(input)
    dictionary_2r = {}
    for lemma in lemmata:
        if lemma in dictionary_2r:
            dictionary_2r[lemma] += 1
        else:
            dictionary_2r[lemma] = 1
    lemmata = dictionary_2r.keys()
    for lemma in lemmata:
        if lemma not in dictionary_1r:
            print("%2d\t%s" % (dictionary_2r[lemma], lemma))
Example No. 6
 def test_lemmatizer_inlist_outlemma_greek(self):
     """Test the Greek lemmatizer.
     """
     replacer = LemmaReplacer('greek')
     unlemmatized = ['τὴν', 'διάγνωσιν', 'ἔρχεσθαι']
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=False)
     target = ['τὴν/τὴν', 'διάγνωσιν/διάγνωσις', 'ἔρχεσθαι/ἔρχομαι']
     self.assertEqual(lemmatized, target)
Example No. 7
 def test_lemmatizer_instr_greek(self):
     """Test the Greek lemmatizer.
     """
     replacer = LemmaReplacer('greek')
     unlemmatized = 'τὴν διάγνωσιν ἔρχεσθαι'
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=False)
     target = ['τὴν', 'διάγνωσις', 'ἔρχομαι']
     self.assertEqual(lemmatized, target)
Example No. 8
 def test_lemmatizer_instr_outlemma_outstring_greek(self):
     """Test the Greek lemmatizer.
     """
     replacer = LemmaReplacer('greek')
     unlemmatized = 'τὴν διάγνωσιν ἔρχεσθαι'
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=True)
     target = 'τὴν/τὴν διάγνωσιν/διάγνωσις ἔρχεσθαι/ἔρχομαι'
     self.assertEqual(lemmatized, target)
Example No. 9
 def test_lemmatizer_inlist_latin(self):
     """Test the Latin lemmatizer.
     """
     replacer = LemmaReplacer('latin')
     unlemmatized = ['hominum', 'divomque', 'voluptas']
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=False)
     target = ['homo', 'divus', 'voluptas']
     self.assertEqual(lemmatized, target)
Example No. 10
 def test_lemmatizer_inlist_outlemma_outstring_latin(self):
     """Test the Latin lemmatizer.
     """
     replacer = LemmaReplacer('latin')
     unlemmatized = ['hominum', 'divomque', 'voluptas']
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=True)
     target = 'hominum/homo divomque/divus voluptas/voluptas'
     self.assertEqual(lemmatized, target)
Example No. 11
 def test_lemmatizer_instr_outlemma_latin(self):
     """Test the Latin lemmatizer.
     """
     replacer = LemmaReplacer('latin')
     unlemmatized = 'hominum divomque voluptas'
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=False)
     target = ['hominum/homo', 'divomque/divus', 'voluptas/voluptas']
     self.assertEqual(lemmatized, target)
Example No. 15
def lemmatize():
    req_data = request.get_json()
    if req_data and req_data.get('input_text'):
        input_text = req_data['input_text']
        lemmatizer = LemmaReplacer('greek')
        return jsonify(lemmatizer.lemmatize(input_text))

    return jsonify({})
Example No. 19
 def test_lemmatizer_inlist_outstring_greek(self):
     """Test the Greek lemmatizer.
     """
     replacer = LemmaReplacer('greek')
     unlemmatized = ['τὴν', 'διάγνωσιν', 'ἔρχεσθαι']
     lemmatized = replacer.lemmatize(unlemmatized, return_lemma=False, return_string=True)
     target = 'τὴν διάγνωσις ἔρχομαι'
     self.assertEqual(lemmatized, target)
Example No. 20
 def test_lemmatizer_instr_outstring_latin(self):
     """Test the Latin lemmatizer.
     """
     replacer = LemmaReplacer('latin')
     unlemmatized = 'hominum divomque voluptas'
     lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=True)
     target = 'homo divus voluptas'
     self.assertEqual(lemmatized, target)
Example No. 22
def main():
    corpus_importer = CorpusImporter('latin')
    corpora_list = corpus_importer.list_corpora
    print(corpora_list)
    corpus_importer.import_corpus('latin_models_cltk')
    sentence = 'Aeneadum genetrix, hominum divomque voluptas, alma Venus, caeli subter labentia signa quae mare navigerum, quae terras frugiferentis concelebras, per te quoniam genus omne animantum concipitur visitque exortum lumina solis.'
    sentence = sentence.lower()
    lemmatizer = LemmaReplacer('latin')
    lemmatized_sentence = lemmatizer.lemmatize(sentence)
    print(lemmatized_sentence)
Example No. 23
 def test_lemmatizer_inlist_outstring_greek(self):
     """Test the Greek lemmatizer.
     """
     replacer = LemmaReplacer('greek')
     unlemmatized = ['τὴν', 'διάγνωσιν', 'ἔρχεσθαι']
     lemmatized = replacer.lemmatize(unlemmatized,
                                     return_lemma=False,
                                     return_string=True)
     target = 'τὴν διάγνωσις ἔρχομαι'
     self.assertEqual(lemmatized, target)
Example No. 24
def lemmatizeList(word_list):
    tagger = POSTag('greek')

    lemmatizer = LemmaReplacer('greek')
    lemmWords = lemmatizer.lemmatize(word_list)

    # Remove Stopwords and numbers and lowercases all words.
    lemmWords = [w.lower() for w in lemmWords if not w in STOPS_LIST]
    lemmWords = removeNumbers(lemmWords)

    return lemmWords
Example No. 25
def lemmatizeWord(word):
    """
    CLTK-based lemmatization function to lemmatize a single word.

    Since CLTK lemmatization always returns a list, it will only return the
    first element of that list. If you want the whole list or lemmatize more
    than one word, use lemmatizeAllWordsFromList.
    This function has no error checking in form of try-catch or anything.
    It's possible that lemmatization fails and thus, the returned string is empty.
    """
    lemmatizer = LemmaReplacer('latin')
    result = lemmatizer.lemmatize(word)
    # always returns list
    return result[0]
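A minimal usage sketch for the behaviour described above, assuming the Latin models (latin_models_cltk) are already installed; the expected lemma comes from the Latin test cases elsewhere in this listing:

from cltk.stem.lemma import LemmaReplacer

lemmatizer = LemmaReplacer('latin')
print(lemmatizer.lemmatize('hominum'))  # lemmatize() always returns a list, e.g. ['homo']
print(lemmatizeWord('hominum'))         # the helper above unwraps it: 'homo'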
Example No. 26
    def lemmatize(self, return_string=True, return_raw=False):
        """Transforms words into their lemmata.

        Gives a new version of the text in which every word is lemmatized. All
        verbs are transformed into the first person singular present active,
        all nouns are transformed into the singular masculine nominative, etc.

        Returns:
            :obj:`self.__class__` New version of the text with tokens transformed to their lemmata

        Example:
            >>> text = LatinText('Gallia est omnis divisa in partes tres')
            >>> print(text.lemmatize())
            gallia edo1 omne divido in pars tres
        """ # noqa
        from cltk.stem.lemma import LemmaReplacer
        return self.__class__(
            text=LemmaReplacer(
                self.options['language']
            ).lemmatize(
                self.data.lower(),
                return_string=return_string,
                return_raw=return_raw
            ),
            options=self.options
        )
Example No. 27
def tokenize(text, language="latin"):
    jv_replacer = JVReplacer()
    text = jv_replacer.replace(text.lower())

    t = WordTokenizer(language)
    l = LemmaReplacer(language)

    text_word_tokens = t.tokenize(text)

    # Keep only words longer than three characters
    ## text_word_tokens = [token for token in text_word_tokens if token not in ['.', ',', ':', ';','*']]
    text_word_tokens = [token for token in text_word_tokens if len(token) > 3]

    text_word_tokens = l.lemmatize(text_word_tokens)

    return text_word_tokens
Example No. 28
 def lemmatize(self,
               return_string=True,
               return_raw=False):  # pragma: no cover
     return self.__class__(data=LemmaReplacer(self.language).lemmatize(
         self.data.lower(),
         return_string=return_string,
         return_raw=return_raw),
                           metadata=self.metadata)
Example No. 29
    def __init__(
            self, pathDF, language='english',
            dataType='pickle', dataIndex='multi', colname='text',
            maxValues=2500, pathMeta=False, pathType=False, showLogging=False,
            model_params=(4,5,300)
            ):

        super(CorpusML, self).__init__(
            pathDF, dataType, dataIndex, colname,
            maxValues, pathMeta, pathType
            )

        if showLogging:
            logging.basicConfig(
                format='%(asctime)s : %(levelname)s : %(message)s',
                level=logging.INFO
                )

        self.model = gensim.models.Word2Vec(
            workers=model_params[0],
            min_count=model_params[1],
            size=model_params[2]
            )

        # self.model.random.seed(42)

        self.language = language

        if self.language == 'latin' or self.language == 'greek':
            from cltk.corpus.utils.importer import CorpusImporter
            corpus_importer = CorpusImporter(self.language)
            corpus_importer.import_corpus(
                '{0}_models_cltk'.format(self.language)
                )
            from cltk.stem.lemma import LemmaReplacer
            from cltk.tokenize.word import nltk_tokenize_words as tokenizer
            lemmatizer = LemmaReplacer(self.language)
            if self.language == 'latin':
                from cltk.stem.latin.j_v import JVReplacer
                from cltk.stop.latin.stops import STOPS_LIST as stopwords
                self.jvreplacer = JVReplacer()
            elif self.language == 'greek':
                from cltk.stop.greek.stops import STOPS_LIST as stopwords
        elif self.language in ('english', 'german'):
            import nltk
            nltk.download('stopwords')
            from nltk.stem import WordNetLemmatizer
            from nltk.tokenize import word_tokenize as tokenizer
            from nltk.corpus import stopwords
            stopwords = stopwords.words(self.language)
            lemmatizer = WordNetLemmatizer()
        else:
            raise ValueError(
                'Could not find lemmatizer, tokenizer,\
                 and stopwords for chosen language.')
        self.lemmatizer = lemmatizer
        self.tokenizer = tokenizer
        self.stopwords = stopwords
Example No. 30
def preprocess(doc):
    assert (type(doc) == str)
    word_tokenizer = WordTokenizer('latin')
    doc_word_tokens = word_tokenizer.tokenize(doc)
    doc_word_tokens_no_punt = [
        token.lower() for token in doc_word_tokens
        if token not in ['.', ',', ':', ';']
    ]

    # lemmatization
    corpus_importer = CorpusImporter('latin')
    corpus_importer.import_corpus('latin_models_cltk')
    jv_replacer = JVReplacer()

    lemmatizer = LemmaReplacer('latin')
    lemmata = lemmatizer.lemmatize(" ".join(doc_word_tokens_no_punt))
    cleaned = remove_latin_library_items(" ".join(lemmata))
    return cleaned
Example No. 31
def main():
    jv = JVReplacer()
    more = 0
    lemmatizer = LemmaReplacer('latin')
    word_counts = {}
    lines = open(sys.argv[1])
    for line in lines:
        words = line.split()
        for i in range(0, len(words)):
            words[i] = jv.replace(remove_punctuation(words[i]).lower())
        for word in words:
            #if word in stops_augmented:
            #    continue
            if "&" in word:
                continue
            if len(lemmatizer.lemmatize(word)) == 0:
                more += 1
                continue

            stem = lemmatizer.lemmatize(word)[0]
            if stem not in word_counts:
                word_counts[stem] = 1
            else:
                word_counts[stem] = word_counts[stem] + 1
    words_to_show = 400
    sorted_words = sorted(word_counts.items(), key=operator.itemgetter(1), reverse=True)
    top_words = [word[0] for word in sorted_words][0:words_to_show]
    word_freqs = [word[1] for word in sorted_words][0:words_to_show]

    for i in range(0, words_to_show):
        print(str(i) + " " + top_words[i] + " " + str(word_freqs[i]))
    count = 0
    for i in range(0, words_to_show):
        count += word_freqs[i]
    print(str(count))
    print(str(more))


    s = np.arange(0.0, words_to_show, 1)
    t = word_freqs
    plt.plot(s, t)
    plt.ylabel('# appearances in Elegiae')
    plt.xlabel('rank of word frequency')
    plt.show()
Example No. 32
    def lemmatizeList(self, lines):
        from cltk.corpus.utils.formatter import cltk_normalize

        tagger = POSTag('greek')

        lemmatizer = LemmaReplacer('greek')

        # can help when using certain texts (the docs recommend it, so we do it)
        lines = cltk_normalize(lines)

        # print(lines)
        # exit(0)
        lines = lemmatizer.lemmatize(lines)

        # Remove Stopwords and numbers and lowercases all words.
        lines = [w.lower() for w in lines if not w in STOPS_LIST]
        # lemmWords = removeNumbers(lemmWords)

        return ' '.join(lines)
Example No. 33
 def lemmatizeLat(self,
                  tokenized_words: list,
                  return_raw: bool = False) -> ([str]) or ([str], [str]):
     """
     Lemmatizes given list of words against the cltk perseus corpus. If second parameter
     is set to true -> returns a list with words BUT additionally with derived "source-word" after
     a "/" seperator.
     :param tokenized_words: String list of words to be lemmatized.
     :param return_raw: Boolean, decides if return should contain raw "source word" or not.
     :return: First index position -> List of lemmmatas; Second index position -_> if second parameter was true
     then list of lemmatas with "source_words" attached to each lemmata string BEFORE the lemmatized word.
     """
     lemmatizer = LemmaReplacer('latin')
     lemmata: [str] = lemmatizer.lemmatize(tokenized_words, False)
     if return_raw:
         lemmata_with_source: [str] = lemmatizer.lemmatize(
             tokenized_words, True)
         return lemmata, lemmata_with_source
     else:
         return lemmata
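A short sketch of the two return shapes described in the docstring above, calling LemmaReplacer directly rather than the wrapper method; the expected values are taken from the Latin test cases in this listing:

from cltk.stem.lemma import LemmaReplacer

lemmatizer = LemmaReplacer('latin')
tokens = ['hominum', 'divomque', 'voluptas']
print(lemmatizer.lemmatize(tokens, False))  # ['homo', 'divus', 'voluptas']
print(lemmatizer.lemmatize(tokens, True))   # ['hominum/homo', 'divomque/divus', 'voluptas/voluptas']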
Example No. 34
def get_lemma(input_words, language):
    lang = None

    if language == "Latin":
        lemmatizer = LemmaReplacer("latin")

        # Required for CLTK module
        input_words = latin_lem_replacement(input_words)

    if language == "Greek":
        lemmatizer = LemmaReplacer("greek")

    if type(input_words) == list:
        results = lemmatizer.lemmatize(input_words)
        return results
    else:
        input_words = normalize_word(input_words)
        results = lemmatizer.lemmatize(input_words)
        if len(results) > 0:
            return results[0]
        else:
            return input_words
Example No. 35
def runTest(text):
   '''Test cltk tools for latin'''
   print('Test phrase:')
   print(' -> ' + text)
   print()

#   print('[1/3] Testing JVReplacer')
#   jv = JVReplacer()
#   text = jv.replace(text)
#   print(' -> ' + text)
#   print()

   print('[2/3] Testing WordTokenizer')
   tokenizer = WordTokenizer('latin')
   tok = tokenizer.tokenize(text)
   print(' -> ' + ', '.join(["'{}'".format(t) for t in tok]))
   print()

   print('[3/3] Testing LemmaReplacer')
   lemmatizer = LemmaReplacer('latin')
   lem = lemmatizer.lemmatize(tok)
   print(' -> ' + ', '.join(["'{}'".format(l) for l in lem]))
   print()
Example No. 36
def get_docs(letters):

    docs = []
    count = 0
    for i, entry in enumerate(letters):
        letter, tag = entry
        NO_PUNCT_RE = re.compile(r'[?!\.\'\"<>():;,]')
        replacer = JVReplacer()
        letter = replacer.replace(letter)
        words = re.sub(NO_PUNCT_RE, '', letter).lower().split()

        for i, word in enumerate(words):
            if word.endswith('-'):
                words[i + 1] = '%s%s' % (word.strip('-'), words[i + 1])
        words = [w for w in words if not w.endswith('-')]
        words = [w for w in words if w not in STOPS_LIST]
        words = ' '.join(words)
        lemmatizer = LemmaReplacer('latin')
        words = lemmatizer.lemmatize(words)
        count += len(words)
        doc = TaggedDocument(words, [tag])
        docs.append(doc)
    return docs
Example No. 37
def pre_process(letters):

    pre_processed = []
    for letter in letters:
        NO_PUNCT_RE = re.compile(r'[?!\.\'\"<>():;,]')
        replacer = JVReplacer()
        letter = replacer.replace(letter)
        words = re.sub(NO_PUNCT_RE, '', letter).lower().split()

        for i, word in enumerate(words):
            if word.endswith('-'):
                words[i + 1] = '%s%s' % (word.strip('-'), words[i + 1])
        words = [w for w in words if not w.endswith('-')]
        words = [w for w in words if w not in STOPS_LIST]
        words = ' '.join(words)
        lemmatizer = LemmaReplacer('latin')
        words = lemmatizer.lemmatize(words)
        # very common words that seemed to be confounding the topic model
        words = [
            w for w in words if w not in ['magnus', 'bonus', 'ago', 'valeo']
        ]
        pre_processed.append(words)
    return pre_processed
Example No. 38
    def entities(self, lemmatize=False, unique=False):
        """Returns a list of entities recognized in the text.

        Uses cltk's built in named-entity recognition. Reorganizes cltk's raw
        output from list of tuples to list of strings. Every entity recognized
        is added to the list returned. Unless unique option is set, entities
        which appear multiple times will be returned multiple times in the
        list.

        Args:
            lemmatize (:obj:`bool`, optional) Set True to lemmatize text before searching for entities
            unique (:obj:`bool`, optional) Set True and no entity appears in the return list more than once
        Example:
            >>> text = LatinText('Gallia est omnis divisa in partes tres')
            >>> print(text.entities())
            ['Gallia']
        """ # noqa
        from cltk.stem.lemma import LemmaReplacer
        from cltk.tag import ner
        entity_list = []
        # filtering non-entities
        for result in ner.tag_ner(
            self.options['language'],
            input_text=self.data,
            output_type=list
        ):
            # appending if item flagged as entity in tuple[1]
            try:
                if result[1] == 'Entity':
                    entity_list.append(result[0])
            # do nothing if 'Entity' not specified
            except:
                pass
            # removing duplicate entities if unique option specified
        if unique:
            entity_list = list(set(entity_list))
        # lemmatizing entities if option has been specified
        if lemmatize:
            entity_list = LemmaReplacer(self.options['language']).lemmatize(
                entity_list,
                return_string=False,
                return_raw=False
            )
        return entity_list
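A brief sketch of the two option flags documented above, assuming the same LatinText class and sample sentence as in the docstring:

text = LatinText('Gallia est omnis divisa in partes tres')
print(text.entities())                # ['Gallia']
print(text.entities(unique=True))     # duplicates removed before returning
print(text.entities(lemmatize=True))  # entities are run through LemmaReplacer first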
Example No. 39
 def entities(self, lemmatize=False, unique=False):
     entity_list = []
     # filtering non-entities
     for result in ner.tag_ner(self.language,
                               input_text=self.data,
                               output_type=list):
         # appending if item flagged as entity in tuple[1]
         try:
             if result[1] == 'Entity':
                 entity_list.append(result[0])
         # do nothing if 'Entity' not specified
         except:
             pass
         # removing duplicate entities if unique option specified
     if unique:
         entity_list = list(set(entity_list))
     # lemmatizing entities if option has been specified
     if lemmatize:
         entity_list = LemmaReplacer(self.language).lemmatize(
             entity_list, return_string=False, return_raw=False)
     return entity_list
Example No. 40
def lemmata(text):
    lemmatizer = LemmaReplacer('greek')
    return [
        word for word in set(lemmatizer.lemmatize(text.lower()))
        if not word in STOPS_LIST
    ]
Example No. 41
def flatten_list(word_list):
    flat_list = []
    for word in word_list:
        flat_list.append(word.text)
    return flat_list


def remove_digits(some_string):
    return ''.join([i for i in some_string if not i.isdigit()])


la_corpus_importer = CorpusImporter('latin')
la_corpus_importer.import_corpus('latin_text_latin_library')
la_corpus_importer.import_corpus('latin_models_cltk')
la_lemmatizer = LemmaReplacer('latin')
grc_corpus_importer = CorpusImporter('greek')
grc_corpus_importer.import_corpus('greek_models_cltk')
grc_lemmatizer = LemmaReplacer('greek')


def lemmatize(word_list, copy):
    for word in word_list:
        if copy:
            word.lemmatization = word.text
            return
        if word.language in LATIN_CODES:
            word.lemmatization = \
                remove_digits(la_lemmatizer.lemmatize(word.text)[0])
        elif word.language in GREEK_CODES:
            word.lemmatization = \
Example No. 42
#import codecs


# Import module For XML
from xml.dom.minidom import parse, parseString

# For CLTK
#from cltk.corpus.utils.importer import CorpusImporter
#corpus_importer = CorpusImporter('latin')
#corpus_importer.import_corpus('latin_models_cltk')
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.tag.pos import POSTag


lemmatizer = LemmaReplacer('latin')
tagger = POSTag('latin')
j = JVReplacer()

text = []
#text = ['Gallia', 'est', 'omnis', 'divisa', 'in', 'partes', 'tres']
with open('/home/ilbuonme/siti/paolo.monella/ursus/lemma/recycleBin/textForOrig-myCLTK.txt', 'r') as f:
    for x in f.readlines():
        for w in x.split(' '):
            text.append(w)

for t in text:
    if t:
        # Note: the tagger likes 'divisa', while the lemmatizer likes 'diuisa'
        lemmaList = lemmatizer.lemmatize(t.lower())
        posList   = tagger.tag_tnt(j.replace(t.lower()))
Example No. 43
def lemmata(text):
    lemmatizer = LemmaReplacer('greek')
    return [word for word in set(lemmatizer.lemmatize(text.lower())) if not word in STOPS_LIST]
Example No. 44
# - report to DC list/wiki



# Import modules

# For XML
from xml.dom.minidom import parse, parseString
import codecs
# For CLTK
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.tag.pos import POSTag

# Initialize CLTK
lemmatizer = LemmaReplacer('latin')
tagger = POSTag('latin')
j = JVReplacer()

# Parse XML

xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/casanatensis.xml')
#xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/shorter_casanatensis.xml')
wordElementList = xmldoc.getElementsByTagName('w')

for w in wordElementList:
        form = w.attributes['ana'].value
        print(form)
        # Parse the inflected word
        try:
            lemmaList = lemmatizer.lemmatize(form.lower())
Example No. 45
def gen_docs(corpus, lemmatize, rm_stops):
    """Open and process files from a corpus. Return a list of sentences for an author. Each sentence
    is itself a list of tokenized words.
    """

    assert corpus in ['phi5', 'tlg']

    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        word_tokenizer = WordTokenizer('latin')
        if rm_stops:
            stops = latin_stops
        else:
            stops = None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        word_tokenizer = WordTokenizer('greek')

        if rm_stops:
            stops = latin_stops
        else:
            stops = None

    if lemmatize:
        lemmatizer = LemmaReplacer(language)

    sent_tokenizer = TokenizeSentence(language)

    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # light first-pass cleanup, before sentence tokenization (which relies on punctuation)
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        # doc_sentences = []
        for sentence in sent_tokens:
            # a second cleanup at sentence-level, to rm all punctuation
            sentence = text_cleaner(sentence, rm_punctuation=True, rm_periods=True)
            sentence = word_tokenizer.tokenize(sentence)
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]
            if language == 'latin':
                sentence = [w[1:] if w.startswith('-') else w for w in sentence]

            if stops:
                sentence = [w for w in sentence if w not in stops]

            sentence = [w for w in sentence if len(w) > 1]  # rm short words

            if sentence:
                sentence = sentence

            if lemmatize:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence
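A hedged usage sketch for the generator above, assuming the PHI5 corpus and the Latin models are already installed:

# iterate over cleaned, lemmatized, stopword-filtered Latin sentences from PHI5
for sentence in gen_docs('phi5', lemmatize=True, rm_stops=True):
    print(sentence)  # each item is a list of lowercase tokens
    break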
Example No. 46
 def getLemma(self):
   lemmatizer = LemmaReplacer('latin')
   return lemmatizer.lemmatize(self.text)