def test_tokenize_arabic_words(self):
    """Tokenize a battery of vocalized Arabic sentences and compare against
    hand-checked token lists (punctuation split off as separate tokens)."""
    word_tokenizer = WordTokenizer('arabic')
    tests = ['اللُّغَةُ الْعَرَبِيَّةُ جَمِيلَةٌ.',
             'انما الْمُؤْمِنُونَ اخوه فاصلحوا بَيْنَ اخويكم',
             'الْعَجُزُ عَنِ الْإِدْرَاكِ إِدْرَاكٌ، وَالْبَحْثَ فِي ذاتِ اللَّه اشراك.',
             'اللَّهُمُّ اُسْتُرْ عُيُوبَنَا وَأَحْسَنَ خَوَاتِيمَنَا الْكَاتِبِ: نَبِيلُ جلهوم',
             'الرَّأْي قَبْلَ شَجَاعَة الشّجعَانِ',
             'فَأَنْزَلْنَا مِنْ السَّمَاء مَاء فَأَسْقَيْنَاكُمُوهُ',
             'سُئِلَ بَعْضُ الْكُتَّابِ عَنِ الْخَطّ، مَتَى يَسْتَحِقُّ أَنْ يُوصَفَ بِالْجَوْدَةِ ؟']
    results = []
    # tokenize each sentence and collect the per-sentence token lists
    for test in tests:
        result = word_tokenizer.tokenize(test)
        results.append(result)
    target = [['اللُّغَةُ', 'الْعَرَبِيَّةُ', 'جَمِيلَةٌ', '.'],
              ['انما', 'الْمُؤْمِنُونَ', 'اخوه', 'فاصلحوا', 'بَيْنَ', 'اخويكم'],
              ['الْعَجُزُ', 'عَنِ', 'الْإِدْرَاكِ', 'إِدْرَاكٌ', '،', 'وَالْبَحْثَ', 'فِي', 'ذاتِ', 'اللَّه', 'اشراك', '.'],
              ['اللَّهُمُّ', 'اُسْتُرْ', 'عُيُوبَنَا', 'وَأَحْسَنَ', 'خَوَاتِيمَنَا', 'الْكَاتِبِ', ':', 'نَبِيلُ', 'جلهوم'],
              ['الرَّأْي', 'قَبْلَ', 'شَجَاعَة', 'الشّجعَانِ'],
              ['فَأَنْزَلْنَا', 'مِنْ', 'السَّمَاء', 'مَاء', 'فَأَسْقَيْنَاكُمُوهُ'],
              ['سُئِلَ', 'بَعْضُ', 'الْكُتَّابِ', 'عَنِ', 'الْخَطّ', '،', 'مَتَى', 'يَسْتَحِقُّ', 'أَنْ', 'يُوصَفَ', 'بِالْجَوْدَةِ', '؟']]
    self.assertEqual(results, target)
def test_latin_word_tokenizer(self):
    """Check that the Latin tokenizer splits the enclitic -que off a word."""
    tokenizer = WordTokenizer('latin')
    expected = ['atque', 'haec', 'abuter', '-que', 'nihil']
    actual = tokenizer.tokenize('atque haec abuterque nihil')
    self.assertEqual(actual, expected)
def test_tokenize_arabic_words(self):
    """Tokenize vocalized Arabic sentences and compare against hand-checked
    token lists (punctuation split off as separate tokens)."""
    word_tokenizer = WordTokenizer('arabic')
    tests = ['اللُّغَةُ الْعَرَبِيَّةُ جَمِيلَةٌ.',
             'انما الْمُؤْمِنُونَ اخوه فاصلحوا بَيْنَ اخويكم',
             'الْعَجُزُ عَنِ الْإِدْرَاكِ إِدْرَاكٌ، وَالْبَحْثَ فِي ذاتِ اللَّه اشراك.',
             'اللَّهُمُّ اُسْتُرْ عُيُوبَنَا وَأَحْسَنَ خَوَاتِيمَنَا الْكَاتِبِ: نَبِيلُ جلهوم',
             'الرَّأْي قَبْلَ شَجَاعَة الشّجعَانِ',
             'فَأَنْزَلْنَا مِنْ السَّمَاء مَاء فَأَسْقَيْنَاكُمُوهُ',
             'سُئِلَ بَعْضُ الْكُتَّابِ عَنِ الْخَطّ، مَتَى يَسْتَحِقُّ أَنْ يُوصَفَ بِالْجَوْدَةِ ؟']
    results = []
    # tokenize each sentence and collect the per-sentence token lists
    for test in tests:
        result = word_tokenizer.tokenize(test)
        results.append(result)
    target = [['اللُّغَةُ', 'الْعَرَبِيَّةُ', 'جَمِيلَةٌ', '.'],
              ['انما', 'الْمُؤْمِنُونَ', 'اخوه', 'فاصلحوا', 'بَيْنَ', 'اخويكم'],
              ['الْعَجُزُ', 'عَنِ', 'الْإِدْرَاكِ', 'إِدْرَاكٌ', '،', 'وَالْبَحْثَ', 'فِي', 'ذاتِ', 'اللَّه', 'اشراك', '.'],  # pylint: disable=line-too-long
              ['اللَّهُمُّ', 'اُسْتُرْ', 'عُيُوبَنَا', 'وَأَحْسَنَ', 'خَوَاتِيمَنَا', 'الْكَاتِبِ', ':', 'نَبِيلُ', 'جلهوم'],  # pylint: disable=line-too-long
              ['الرَّأْي', 'قَبْلَ', 'شَجَاعَة', 'الشّجعَانِ'],
              ['فَأَنْزَلْنَا', 'مِنْ', 'السَّمَاء', 'مَاء', 'فَأَسْقَيْنَاكُمُوهُ'],
              ['سُئِلَ', 'بَعْضُ', 'الْكُتَّابِ', 'عَنِ', 'الْخَطّ', '،', 'مَتَى', 'يَسْتَحِقُّ', 'أَنْ', 'يُوصَفَ', 'بِالْجَوْدَةِ', '؟']]  # pylint: disable=line-too-long
    self.assertEqual(results, target)
def test_latin_word_tokenizer_base(self):
    """Test Latin-specific word tokenizer.

    Unlike the enclitic-splitting variant, this checks the base tokenizer:
    'virumque', 'tecum', 'Neque' etc. stay whole; punctuation splits off.
    """
    word_tokenizer = WordTokenizer('latin')
    #Test sources:
    # - V. Aen. 1.1
    # - Prop. 2.5.1-2
    # - Ov. Am. 1.8.65-66
    # - Cic. Phillip. 13.14
    # - Plaut. Capt. 937
    # - Lucr. DRN. 5.1351-53
    # - Plaut. Bacch. 837-38
    # - Plaut. Amph. 823
    # - Caes. Bel. 6.29.2
    tests = ['Arma virumque cano, Troiae qui primus ab oris.',
             'Hoc verumst, tota te ferri, Cynthia, Roma, et non ignota vivere nequitia?',
             'Nec te decipiant veteres circum atria cerae. Tolle tuos tecum, pauper amator, avos!',
             'Neque enim, quod quisque potest, id ei licet, nec, si non obstatur, propterea etiam permittitur.',
             'Quid opust verbis? lingua nullast qua negem quidquid roges.',
             'Textile post ferrumst, quia ferro tela paratur, nec ratione alia possunt tam levia gigni insilia ac fusi, radii, scapique sonantes.',  # pylint: disable=line-too-long
             'Dic sodes mihi, bellan videtur specie mulier?',
             'Cenavin ego heri in navi in portu Persico?',
             'quae ripas Ubiorum contingebat in longitudinem pedum ducentorum rescindit']
    results = []
    # tokenize each sentence and collect the per-sentence token lists
    for test in tests:
        result = word_tokenizer.tokenize(test)
        results.append(result)
    target = [['Arma', 'virumque', 'cano', ',', 'Troiae', 'qui', 'primus', 'ab', 'oris', '.'],
              ['Hoc', 'verumst', ',', 'tota', 'te', 'ferri', ',', 'Cynthia', ',', 'Roma', ',', 'et', 'non', 'ignota', 'vivere', 'nequitia', '?'],
              ['Nec', 'te', 'decipiant', 'veteres', 'circum', 'atria', 'cerae.', 'Tolle', 'tuos', 'tecum', ',', 'pauper', 'amator', ',', 'avos', '!'],
              ['Neque', 'enim', ',', 'quod', 'quisque', 'potest', ',', 'id', 'ei', 'licet', ',', 'nec', ',', 'si', 'non', 'obstatur', ',', 'propterea', 'etiam', 'permittitur', '.'],
              ['Quid', 'opust', 'verbis', '?', 'lingua', 'nullast', 'qua', 'negem', 'quidquid', 'roges', '.'],
              ['Textile', 'post', 'ferrumst', ',', 'quia', 'ferro', 'tela', 'paratur', ',', 'nec', 'ratione', 'alia', 'possunt', 'tam', 'levia', 'gigni', 'insilia', 'ac', 'fusi', ',', 'radii', ',', 'scapique', 'sonantes', '.'],
              ['Dic', 'sodes', 'mihi', ',', 'bellan', 'videtur', 'specie', 'mulier', '?'],
              ['Cenavin', 'ego', 'heri', 'in', 'navi', 'in', 'portu', 'Persico', '?'],
              ['quae', 'ripas', 'Ubiorum', 'contingebat', 'in', 'longitudinem', 'pedum', 'ducentorum', 'rescindit']]
    self.assertEqual(results, target)
def lemmanade(lines):
    """Tokenize and lemmatize each verse in *lines*.

    :param lines: iterable of verse strings.
    :return: list with one entry per verse, each the lemmatizer output for
        that verse's filtered tokens.
    """
    lemons = []
    # initialize cltk tools
    #jvReplace = JVReplacer()
    wordTokenizer = WordTokenizer('latin')
    lemmatizer = LemmaReplacer('latin')
    for verse in lines:
        # lowercase
        #verse = jvReplace.replace(verse.lower())
        # tokenize the words
        chunkTok = wordTokenizer.tokenize(verse.lower())
        # keep only tokens that whiteTok accepts (it returns None to reject);
        # call whiteTok once per token instead of twice as before
        whitened = (whiteTok(tok) for tok in chunkTok)
        chunkTok = [tok for tok in whitened if tok is not None]
        # lemmatize the tokens and collect the result for this verse
        lemons.append(lemmatizer.lemmatize(chunkTok))
    # NOTE: the original kept a `count` variable that was incremented but
    # never read; it has been removed.
    return lemons
def test_latin_word_tokenizer(self):
    """Test Latin-specific word tokenizer.

    This variant expects enclitic splitting: 'virumque' -> 'que' + 'virum',
    'verumst' -> 'verum' + 'est', 'tecum' -> 'cum' + 'te', 'nec' -> 'c' + 'ne'.
    """
    word_tokenizer = WordTokenizer('latin')
    #Test sources:
    # - V. Aen. 1.1
    # - Prop. 2.5.1-2
    # - Ov. Am. 1.8.65-66
    # - Cic. Phillip. 13.14
    tests = ['Arma virumque cano, Troiae qui primus ab oris.',
             'Hoc verumst, tota te ferri, Cynthia, Roma, et non ignota vivere nequitia?',
             'Nec te decipiant veteres circum atria cerae. Tolle tuos tecum, pauper amator, avos!',
             'Neque enim, quod quisque potest, id ei licet, nec, si non obstatur, propterea etiam permittitur.']
    results = []
    # tokenize each sentence and collect the per-sentence token lists
    for test in tests:
        result = word_tokenizer.tokenize(test)
        results.append(result)
    target = [['Arma', 'que', 'virum', 'cano', ',', 'Troiae', 'qui', 'primus', 'ab', 'oris.'],
              ['Hoc', 'verum', 'est', ',', 'tota', 'te', 'ferri', ',', 'Cynthia', ',', 'Roma', ',', 'et', 'non', 'ignota', 'vivere', 'nequitia', '?'],
              ['Nec', 'te', 'decipiant', 'veteres', 'circum', 'atria', 'cerae.', 'Tolle', 'tuos', 'cum', 'te', ',', 'pauper', 'amator', ',', 'avos', '!'],
              ['que', 'Ne', 'enim', ',', 'quod', 'quisque', 'potest', ',', 'id', 'ei', 'licet', ',', 'c', 'ne', ',', 'si', 'non', 'obstatur', ',', 'propterea', 'etiam', 'permittitur.']]
    self.assertEqual(results, target)
def test_backoff_latin_lemmatizer_verbose(self):
    """Test backoffLatinLemmatizer

    In verbose mode each result is a 3-tuple: (token, lemma, name of the
    backoff stage that produced the lemma).
    """
    # NOTE(review): `train` is never passed to the lemmatizer below — it
    # appears to be leftover data; confirm whether it should be removed.
    train = [
        [
            ("ceterum", "ceterus"),
            ("antequam", "antequam"),
            ("destinata", "destino"),
            ("componam", "compono"),
        ]
    ]  # pylint: disable=line-too-long
    lemmatizer = BackoffLatinLemmatizer(verbose=True)
    test_str = """Ceterum antequam destinata componam"""
    target = [
        ("ceterum", "ceterum", "<UnigramLemmatizer: CLTK Sentence Training Data>"),
        (
            "antequam",
            "antequam",
            "<UnigramLemmatizer: CLTK Sentence Training Data>",
        ),
        (
            "destinata",
            "destino",
            "<UnigramLemmatizer: CLTK Sentence Training Data>",
        ),
        ("componam", "compono", "<DictLemmatizer: Morpheus Lemmas>"),
    ]  # pylint: disable=line-too-long
    # normalize: lowercase and j/v replacement before tokenizing
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer("latin")
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def wtokenizeLatin(
        self, text: str,
        removeSplitSyllable: bool = False) -> ([str]) or ([str], [str]):
    """
    Uses the latin word tokenizer from cltk to tokenize the words for given
    text. Removes punctuation internally.

    :param text: Text to tokenize.
    :param removeSplitSyllable: true ..> when "big" words are split by the
        cltk tokenizer it adds split syllabi (like "-que") to the return
        array. If this param is set to true these split syllabi are being
        removed otherwise not.
    :return: Tuple with List Comprehension of tokenized words on first index
        position. When paramater removeSplitSyllable was assigned true ...>
        returns on second index position the removed words (also as list
        comprehension).
    """
    text = text.lower()
    wordTokenizer = WordTokenizer("latin")
    tokens: [] = wordTokenizer.tokenize(text)
    if removeSplitSyllable:
        # BUG FIX: the original removed items from `tokens` while iterating
        # over it, which skips the element following each removal (e.g. two
        # consecutive "-que" tokens would leave the second one in place).
        # Partition into kept / removed lists instead.
        removed_words = [word for word in tokens if "-" in word]
        kept_tokens = [word for word in tokens if "-" not in word]
        return (kept_tokens, removed_words)
    return tokens
def __init__(self):
    """Set up the Greek text-processing pipeline components."""
    # sentence and word tokenization for Greek
    self.sent_tokenizer = SentenceTokenizer()
    self.word_tokenizer = WordTokenizer('greek')
    # reader over the Perseus Greek corpus
    self.corpus_reader = get_corpus_reader(
        corpus_name='greek_text_perseus', language='greek')
    # lemmatization for Greek tokens
    self.lemmatizer = LemmaReplacer('greek')
    # vectorizer fed file paths ("filename" input mode), not raw strings
    self.tfidf_vectorizer = TfidfVectorizer(input="filename")
def stemmer_middle_high_german(text_l, rem_umlauts=True, exceptions=exc_dict):
    """Stem a Middle High German text.

    text_l: text in string format
    rem_umlauts: choose whether to remove umlauts from string
    exceptions: hard-coded dictionary for the cases the algorithm fails
    :return: list of stemmed (or passed-through) tokens.
    """
    #Normalize text
    text_l = normalize_middle_high_german(text_l, to_lower_all=False, to_lower_beginning=True)
    #Tokenize text
    word_tokenizer = WordTokenizer("middle_high_german")
    text_l = word_tokenizer.tokenize(text_l)
    text = []
    for word in text_l:
        try:
            # test if word is in the exception dictionary
            text.append(exceptions[word])
        except KeyError:
            # FIX: was a bare `except:`, which also swallowed unrelated
            # errors (e.g. IndexError on an empty token); only a missing
            # dictionary key should fall through to the heuristics below.
            if word[0].isupper():
                # MHG only uses upper case for locations, people, etc. So any
                # word that starts with a capital letter while not being at
                # the start of a sentence will automatically be excluded.
                text.append(word)
            elif word in MHG_STOPS:
                text.append(word)  # Filter stop words
            else:
                text.append(stem_helper(word, rem_umlaut=rem_umlauts))
    return text
def test_backoff_latin_lemmatizer(self):
    """Test backoffLatinLemmatizer

    Non-verbose mode: each result is a (token, lemma) pair.
    """
    # NOTE(review): `train` is never passed to the lemmatizer below — it
    # appears to be leftover data; confirm whether it should be removed.
    train = [
        [
            ("ceterum", "ceterus"),
            ("antequam", "antequam"),
            ("destinata", "destino"),
            ("componam", "compono"),
        ]
    ]  # pylint: disable=line-too-long
    lemmatizer = BackoffLatinLemmatizer()
    test_str = """Ceterum antequam destinata componam"""
    target = [
        ("ceterum", "ceterum"),
        ("antequam", "antequam"),
        ("destinata", "destino"),
        ("componam", "compono"),
    ]  # pylint: disable=line-too-long
    # normalize: lowercase and j/v replacement before tokenizing
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer("latin")
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def test_middle_english_tokenizer(self):
    """Tokenize Middle English verse: whitespace collapsed, punctuation and
    the hyphen in 'þer-fore' split into separate tokens."""
    text = " Fers am I ferd of oure fare;\n Fle we ful fast þer-fore. \n Can Y no cownsel bot care.\n\n"
    target = ['Fers', 'am', 'I', 'ferd', 'of', 'oure', 'fare', ';', 'Fle', 'we', 'ful', 'fast', 'þer', '-', 'fore', '.', 'Can', 'Y', 'no', 'cownsel', 'bot', 'care', '.']
    tokenizer = WordTokenizer('middle_english')
    tokenized = tokenizer.tokenize(text)
    # use assertEqual instead of assertTrue(a == b) so a failure reports
    # the differing lists instead of just "False is not true"
    self.assertEqual(tokenized, target)
def stemmer_middle_high_german(text_l, rem_umlauts = True, exceptions = exc_dict):
    """Stem a Middle High German text.

    text_l: text in string format
    rem_umlauts: choose whether to remove umlauts from string
    exceptions: hard-coded dictionary for the cases the algorithm fails
    :return: list of stemmed (or passed-through) tokens.
    """
    #Normalize text
    text_l = normalize_middle_high_german(text_l, to_lower_all = False, to_lower_beginning = True)
    #Tokenize text
    word_tokenizer = WordTokenizer("middle_high_german")
    text_l = word_tokenizer.tokenize(text_l)
    text = []
    for word in text_l:
        try:
            # test if word is in the exception dictionary
            text.append(exceptions[word])
        except KeyError:
            # FIX: was a bare `except:`, which also swallowed unrelated
            # errors; only a missing dictionary key should fall through.
            if word[0].isupper():
                # MHG only uses upper case for locations, people, etc. So any
                # word that starts with a capital letter while not being at
                # the start of a sentence will automatically be excluded.
                text.append(word)
            elif word in MHG_STOPS:
                text.append(word)  # Filter stop words
            else:
                text.append(stem_helper(word, rem_umlaut = rem_umlauts))
    return text
def test_sanskirt_tokenizer(self):
    """Tokenize a Sanskrit verse; danda marks (। and ॥) become tokens."""
    text = "यद्यप्येते न पश्यन्ति लोभोपहतचेतसः । कुलक्षयकृतं दोषं मित्रद्रोहे च पातकम् ॥"
    target = ['यद्यप्येते', 'न', 'पश्यन्ति', 'लोभोपहतचेतसः', '।', 'कुलक्षयकृतं', 'दोषं', 'मित्रद्रोहे', 'च', 'पातकम्', '॥']
    tokenizer = WordTokenizer('sanskrit')
    tokenized_lines = tokenizer.tokenize(text)
    # use assertEqual instead of assertTrue(a == b) so a failure reports
    # the differing lists instead of just "False is not true"
    self.assertEqual(tokenized_lines, target)
def test_old_norse_word_tokenizer(self):
    """Tokenize two Old Norse sentences; punctuation splits into tokens."""
    text = "Gylfi konungr var maðr vitr ok fjölkunnigr. " \
           "Hann undraðist þat mjök, er ásafólk var svá kunnigt, at allir hlutir gengu at vilja þeira."
    target = ['Gylfi', 'konungr', 'var', 'maðr', 'vitr', 'ok', 'fjölkunnigr', '.', 'Hann', 'undraðist', 'þat', 'mjök', ',', 'er', 'ásafólk', 'var', 'svá', 'kunnigt', ',', 'at', 'allir', 'hlutir', 'gengu', 'at', 'vilja', 'þeira', '.']
    word_tokenizer = WordTokenizer('old_norse')
    result = word_tokenizer.tokenize(text)
    # use assertEqual instead of assertTrue(a == b) so a failure reports
    # the differing lists instead of just "False is not true"
    self.assertEqual(result, target)
def tokenizeLatinWords(string):
    """
    Uses the CLTK Latin Tokenizer for Latin-specific tokenization.
    Accepts string, returns list of tokens.
    """
    print("Tokenizing...")
    return WordTokenizer('latin').tokenize(string)
def __init__(self):
    """Set up Latin lemmatization helpers and counting state."""
    # dictionary-backed Latin lemmatizer
    self.lemmatizer = Lemmata(dictionary='lemmata', language='latin')
    # j/v orthography normalizer applied before lookup
    self.jv = JVReplacer()
    self.word_tokenizer = WordTokenizer('latin')
    # accumulates counts, keyed per token/lemma (filled elsewhere)
    self.count_dictionary = dict()
    # tokens treated as punctuation when counting
    self.punctuation_list = [
        '!', ';', ':', '?', '-', '–', '&', '*', '(', ')', '[', ']', ',',
        '"', '\''
    ]
def test_french_lemmatizer(self):
    """Lemmatize an Old French passage; expected output pairs each token
    with its lemma — punctuation maps to ['PUNK'], unknown tokens to 'None',
    and dictionary hits to a list of candidate lemmas."""
    text = "Li rois pense que par folie, Sire Tristran, vos aie amé ; Mais Dé plevis ma loiauté, Qui sor mon cors mete flaele, S'onques fors cil qui m’ot pucele Out m'amistié encor nul jor !"
    text = str.lower(text)
    tokenizer = WordTokenizer('french')
    lemmatizer = LemmaReplacer()
    tokens = tokenizer.tokenize(text)
    lemmas = lemmatizer.lemmatize(tokens)
    target = [('li', 'li'), ('rois', 'rois'), ('pense', 'pense'), ('que', 'que'), ('par', 'par'), ('folie', 'folie'), (',', ['PUNK']), ('sire', 'sire'), ('tristran', 'None'), (',', ['PUNK']), ('vos', 'vos'), ('aie', ['avoir']), ('amé', 'amer'), (';', ['PUNK']), ('mais', 'mais'), ('dé', 'dé'), ('plevis', 'plevir'), ('ma', 'ma'), ('loiauté', 'loiauté'), (',', ['PUNK']), ('qui', 'qui'), ('sor', 'sor'), ('mon', 'mon'), ('cors', 'cors'), ('mete', 'mete'), ('flaele', 'flaele'), (',', ['PUNK']), ("s'", "s'"), ('onques', 'onques'), ('fors', 'fors'), ('cil', 'cil'), ('qui', 'qui'), ("m'", "m'"), ('ot', 'ot'), ('pucele', 'pucele'), ('out', ['avoir']), ("m'", "m'"), ('amistié', 'amistié'), ('encor', 'encor'), ('nul', 'nul'), ('jor', 'jor'), ('!', ['PUNK'])]
    self.assertEqual(lemmas, target)
def test_akkadian_sign_tokenizer(self):
    """
    Tests sign_tokenizer.
    """
    sign_tokenizer = WordTokenizer('akkadian')
    # a word with a {determinative} prefix and hyphen-joined signs
    tagged_word = ("{gisz}isz-pur-ram", "akkadian")
    expected = [("gisz", "determinative"), ("isz", "akkadian"),
                ("pur", "akkadian"), ("ram", "akkadian")]
    self.assertEqual(sign_tokenizer.tokenize_sign(tagged_word), expected)
def test_middle_high_german_stopwords(self):
    """Test filtering Middle High German stopwords."""
    sentence = "Swer was ze Bêârosche komn, doch hete Gâwân dâ genomn den prîs ze bêder sît al ein wan daz dervor ein ritter schein, bî rôtem wâpen unrekant, des prîs man in die hœhe bant."
    # lowercase before tokenizing so tokens match the stop-word list
    lowered = sentence.lower()
    tokenizer = WordTokenizer('middle_high_german')
    tokens = tokenizer.tokenize(lowered)
    # drop every token present in the MHG stop-word list
    no_stops = [w for w in tokens if w not in MHG_STOPS]
    target_list = ['swer', 'bêârosche', 'komn', ',', 'gâwân', 'genomn', 'prîs', 'bêder', 'sît', 'dervor', 'ritter', 'schein', ',', 'rôtem', 'wâpen', 'unrekant', ',', 'prîs', 'hœhe', 'bant', '.']
    self.assertEqual(no_stops, target_list)
def test_middle_high_german_tokenizer(self):
    """Tokenize Middle High German verse with embedded newlines; punctuation
    becomes separate tokens."""
    text = "Gâwân het êre unde heil,\nieweders volleclîchen teil:\nnu nâht och sînes kampfes zît."
    target = ['Gâwân', 'het', 'êre', 'unde', 'heil', ',', 'ieweders', 'volleclîchen', 'teil', ':', 'nu', 'nâht', 'och', 'sînes', 'kampfes', 'zît', '.']
    tokenizer = WordTokenizer('middle_high_german')
    tokenized_lines = tokenizer.tokenize(text)
    # use assertEqual instead of assertTrue(a == b) so a failure reports
    # the differing lists instead of just "False is not true"
    self.assertEqual(tokenized_lines, target)
def normalize_fr(string):
    """Lowercase *string*, tokenize it as French, and apply the module-level
    ``rules`` (pairs of predicate/transform callables) to each token.

    NOTE(review): a token matching no rule is silently dropped from the
    output, and a token matching several rules is appended once per matching
    rule — confirm both behaviors are intended.
    """
    string = string.lower()
    word_tokenizer = WordTokenizer('french')
    tokens = word_tokenizer.tokenize(string)
    normalized_text = []
    for token in tokens:
        # try every rule; each match contributes one normalized form
        for matches_rule, apply_rule in rules:
            if matches_rule(token):
                normalized = apply_rule(token)
                normalized_text.append(normalized)
    return normalized_text
def read_text():
    """Read in a file from the greek texts directory"""
    # expand ~ in the configured training-text path
    path = os.path.expanduser(train_text_dir + train_text_file)
    with open(path) as handle:
        contents = handle.read()
    # Need to remove non-greek characters
    return WordTokenizer('greek').tokenize(contents)
def test_middle_high_german_tokenize(self):
    """
    Test tokenizing Middle High German

    Also checks that runs of newlines inside the text do not produce
    empty tokens.
    """
    word_tokenizer = WordTokenizer('middle_high_german')
    text = "Mīn ougen wurden liebes alsō vol, \n\n\ndō ich die minneclīchen ērst gesach,\ndaȥ eȥ mir hiute und iemer mē tuot wol."
    tokenized = word_tokenizer.tokenize(text)
    target = ['Mīn', 'ougen', 'wurden', 'liebes', 'alsō', 'vol', ',', 'dō', 'ich', 'die', 'minneclīchen', 'ērst', 'gesach', ',', 'daȥ', 'eȥ', 'mir', 'hiute', 'und', 'iemer', 'mē', 'tuot', 'wol', '.']
    self.assertEqual(tokenized, target)
def test_akkadian_word_tokenizer(self):
    """
    Tests word_tokenizer.
    """
    word_tokenizer = WordTokenizer('akkadian')
    # _e2_-ka is written in Sumerian (underscore-delimited); the rest is Akkadian
    sample_line = 'u2-wa-a-ru at-ta e2-kal2-la-ka _e2_-ka wu-e-er'
    expected = [('u2-wa-a-ru', 'akkadian'),
                ('at-ta', 'akkadian'),
                ('e2-kal2-la-ka', 'akkadian'),
                ('_e2_-ka', 'sumerian'),
                ('wu-e-er', 'akkadian')]
    self.assertEqual(word_tokenizer.tokenize(sample_line), expected)
def test_identity_lemmatizer(self):
    """Test identity_lemmatizer()"""
    lemmatizer = IdentityLemmatizer()
    # the identity lemmatizer maps every token to itself
    expected = [('ceterum', 'ceterum'), ('antequam', 'antequam'),
                ('destinata', 'destinata'), ('componam', 'componam')]
    # normalize: lowercase then j/v replacement, then tokenize
    normalized = JVReplacer().replace('Ceterum antequam destinata componam'.lower())
    tokens = WordTokenizer('latin').tokenize(normalized)
    self.assertEqual(lemmatizer.lemmatize(tokens), expected)
def test_latin_lemmata(self):
    """Test Lemmata class lookup() method

    lookup() returns, per token, a list of (lemma, probability) candidates;
    ambiguous forms like 'destinata' get several equally weighted lemmas.
    """
    lemmatizer = Lemmata(dictionary='lemmata', language='latin')
    test_str = 'Ceterum antequam destinata componam'
    target = [('ceterum', [('ceterus', 1.0)]), ('antequam', [('antequam', 1.0)]), ('destinata', [('destinatus', 0.25), ('destinatum', 0.25), ('destinata', 0.25), ('destino', 0.25)]), ('componam', [('compono', 1.0)])]  # pylint: disable=line-too-long
    # normalize: lowercase and j/v replacement before tokenizing
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lookup(tokens)
    self.assertEqual(lemmas, target)
def __init__(self, text):
    """Store *text*, tokenize it as Old Norse, and initialize the empty
    containers that later analysis passes fill in.

    :param text: the poem/passage to analyze.
    """
    self.text = text
    self.tokenizer = WordTokenizer('old_norse')
    # tokens of the full text, computed eagerly
    self.tokenized_text = self.tokenizer.tokenize(text)
    # the fields below are populated by later processing steps
    self.short_lines = None
    self.first_sounds = []
    self.syllabified = []
    self.transcribed = []
    self.alliterations = []
    self.phonological_features_text = []
    self.n_alliterations = 0
    self.syllabified_phonological_features_text = []
def stopwords_filter(string):
    """Tokenize an Arabic string and drop its stop words."""
    # strip tashkeel because the stop words list contains voweled words
    stripped = araby.strip_tashkeel(string)
    word_tokenizer = WordTokenizer("arabic")
    # filter stop words
    return [token
            for token in word_tokenizer.tokenize(stripped)
            if token not in ARABIC_STOPS]
def test_latin_lemmata(self):
    """Test Lemmata class lookup() method

    lookup() returns, per token, a list of (lemma, probability) candidates;
    ambiguous forms like 'destinata' get several equally weighted lemmas.
    """
    lemmatizer = Lemmata(dictionary = 'lemmata', language = 'latin')
    test_str = 'Ceterum antequam destinata componam'
    target = [('ceterum', [('ceterus', 1.0)]), ('antequam', [('antequam', 1.0)]), ('destinata', [('destinatus', 0.25), ('destinatum', 0.25), ('destinata', 0.25), ('destino', 0.25)]), ('componam', [('compono', 1.0)])]  # pylint: disable=line-too-long
    # normalize: lowercase and j/v replacement before tokenizing
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lookup(tokens)
    self.assertEqual(lemmas, target)
def test_bigram_pos_lemmatizer(self):
    """Train a BigramPOSLemmatizer restricted to the token 'cum' and check
    that only 'cum' is disambiguated (to 'cum2'); all other tokens get None.
    """
    # POS-tagged training sentences: (token, lemma, POS-tag) triples
    train = [[('dixissem', 'dico', 'v')], [('de', 'de', 'r'), ('te', 'tu', 'p'), ('autem', 'autem', 'c'), (',', 'punc', 'u'), ('catilina', 'catilina', 'n'), (',', 'punc', 'u'), ('cum', 'cum2', 'c'), ('quiescunt', 'quiesco', 'v'), (',', 'punc', 'u'), ('probant', 'probo', 'v'), (',', 'punc', 'u'), ('cum', 'cum2', 'c'), ('patiuntur', 'patior', 'v'), (',', 'punc', 'u'), ('decernunt', 'decerno', 'v'), (',', 'punc', 'u'), ('cum', 'cum2', 'c'), ('tacent', 'taceo', 'v'), (',', 'punc', 'u'), ('clamant', 'clamo', 'v'), (',', 'punc', 'u'), ('neque', 'neque', 'c'), ('hi', 'hic', 'p'), ('solum', 'solus', 'd'), ('quorum', 'qui', 'p'), ('tibi', 'tu', 'p'), ('auctoritas', 'auctoritas', 'n'), ('est', 'sum', 'v'), ('uidelicet', 'uidelicet', 'd'), ('cara', 'carus', 'a'), (',', 'punc', 'u'), ('uita', 'uita', 'n'), ('uilissima', 'uilis', 'a'), (',', 'punc', 'u'), ('sed', 'sed', 'c'), ('etiam', 'etiam', 'c'), ('illi', 'ille', 'p'), ('equites', 'eques', 'n'), ('romani', 'romanus', 'a'), (',', 'punc', 'u'), ('honestissimi', 'honestus', 'a'), ('atque', 'atque', 'c'), ('optimi', 'bonus', 'a'), ('uiri', 'uir', 'n'), (',', 'punc', 'u'), ('ceteri', 'ceterus', 'a'), ('-que', '-que', 'c'), ('fortissimi', 'fortis', 'a'), ('ciues', 'ciuis', 'n'), ('qui', 'qui', 'p'), ('circumstant', 'circumsto', 'v'), ('senatum', 'senatus', 'n'), (',', 'punc', 'u'), ('quorum', 'qui', 'p'), ('tu', 'tu', 'p'), ('et', 'et', 'c'), ('frequentiam', 'frequentia', 'n'), ('uidere', 'uideo', 'v'), ('et', 'et', 'c'), ('studia', 'studium', 'n'), ('perspicere', 'perspicio', 'v'), ('et', 'et', 'c'), ('uoces', 'uox', 'n'), ('paulo', 'paulus', 'd'), ('ante', 'ante', 'd'), ('exaudire', 'exaudio', 'v'), ('potuisti', 'possum', 'v'), ('.', 'punc', 'u')]]
    # include=['cum'] restricts lemmatization to that single token
    lemmatizer = BigramPOSLemmatizer(train=train, include=['cum'])
    test_str = """Quod cum esset intellectum et animadversum fecit animo libentissimo populus Romanus"""
    target = [('quod', None), ('cum', 'cum2'), ('esset', None), ('intellectum', None), ('et', None), ('animaduersum', None), ('fecit', None), ('animo', None), ('libentissimo', None), ('populus', None), ('romanus', None)]  # pylint: disable=line-too-long
    # normalize: lowercase and j/v replacement before tokenizing
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def tokenize_txt(text: str) -> str:
    """Return *text* as a single space-joined string of tokens, dispatching
    on the enclosing-scope ``language``."""
    lang = language.lower()
    if lang == 'japanese':
        # It is assumed that japanese is already tokenized when parsing.
        return text
    if lang == 'greek':
        return ' '.join(CLTK_WordTokenizer(language).tokenize(text))
    return ' '.join(word_tokenize(text, language, False))
def test_french_lemmatizer(self):
    """Lemmatize an Old French passage; expected output pairs each token
    with its lemma — punctuation maps to ['PUNK'], unknown tokens to 'None',
    and dictionary hits to a list of candidate lemmas."""
    text = "Li rois pense que par folie, Sire Tristran, vos aie amé ; Mais Dé plevis ma loiauté, Qui sor mon cors mete flaele, S'onques fors cil qui m’ot pucele Out m'amistié encor nul jor !"
    text = str.lower(text)
    tokenizer = WordTokenizer("french")
    lemmatizer = LemmaReplacer()
    tokens = tokenizer.tokenize(text)
    lemmas = lemmatizer.lemmatize(tokens)
    target = [
        ("li", "li"),
        ("rois", "rois"),
        ("pense", "pense"),
        ("que", "que"),
        ("par", "par"),
        ("folie", "folie"),
        (",", ["PUNK"]),
        ("sire", "sire"),
        ("tristran", "None"),
        (",", ["PUNK"]),
        ("vos", "vos"),
        ("aie", ["avoir"]),
        ("amé", "amer"),
        (";", ["PUNK"]),
        ("mais", "mais"),
        ("dé", "dé"),
        ("plevis", "plevir"),
        ("ma", "ma"),
        ("loiauté", "loiauté"),
        (",", ["PUNK"]),
        ("qui", "qui"),
        ("sor", "sor"),
        ("mon", "mon"),
        ("cors", "cors"),
        ("mete", "mete"),
        ("flaele", "flaele"),
        (",", ["PUNK"]),
        ("s'", "s'"),
        ("onques", "onques"),
        ("fors", "fors"),
        ("cil", "cil"),
        ("qui", "qui"),
        ("m'", "m'"),
        ("ot", "ot"),
        ("pucele", "pucele"),
        ("out", ["avoir"]),
        ("m'", "m'"),
        ("amistié", "amistié"),
        ("encor", "encor"),
        ("nul", "nul"),
        ("jor", "jor"),
        ("!", ["PUNK"]),
    ]
    self.assertEqual(lemmas, target)
def test_regex_lemmatizer(self):
    """Test regex_lemmatizer()"""
    # substitution rule: future-tense endings of -are verbs -> 1st sg. in -o
    sub = [('(.)ab(o|is|it|imus|itis|unt)$', r'\1o')]
    lemmatizer = RegexpLemmatizer(sub)
    expected = [('amabimus', 'amo')]
    # normalize: lowercase then j/v replacement, then tokenize
    normalized = JVReplacer().replace('amabimus'.lower())
    tokens = WordTokenizer('latin').tokenize(normalized)
    self.assertEqual(lemmatizer.lemmatize(tokens), expected)
def test_unigram_lemmatizer(self):
    """Test unigram_lemmatizer()

    Trains on one sentence of (token, lemma) pairs and expects the same
    pairs back for the identical input sentence.
    """
    train = [[('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]]  # pylint: disable=line-too-long
    lemmatizer = UnigramLemmatizer(train=train)
    test_str = """Ceterum antequam destinata componam"""
    target = [('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]  # pylint: disable=line-too-long
    # normalize: lowercase and j/v replacement before tokenizing
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def test_model_lemmatizer(self):
    """Test model_lemmatizer()

    TrainLemmatizer looks tokens up in an explicit token->lemma model dict.
    """
    model = {'ceterum': 'ceterus', 'antequam': 'antequam', 'destinata': 'destino', 'componam': 'compono'}  # pylint: disable=line-too-long
    lemmatizer = TrainLemmatizer(model=model)
    test_str = 'Ceterum antequam destinata componam'
    target = [('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]  # pylint: disable=line-too-long
    # normalize: lowercase and j/v replacement before tokenizing
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def test_regex_lemmatizer(self):
    """Test regex_lemmatizer()

    Uses a (pattern, replacement) rule: stems the future ending '-abimus'
    back to the 1st-person-singular '-o' form.
    """
    pattern = [(r'(\w*)abimus', 'o')]
    lemmatizer = RegexpLemmatizer(pattern)
    test_str = 'amabimus'
    target = [('amabimus', 'amo')]
    # normalize: lowercase and j/v replacement before tokenizing
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def test_roman_numeral_lemmatizer_with_default(self):
    """Test roman_numeral_lemmatizer()

    The patterns match well-formed Roman numerals (upper and lower case);
    default="RN" is the lemma assigned to every match.
    """
    rn_patterns = [(r'(?=^[MDCLXVUI]+$)(?=^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|IU|V?I{0,3}|U?I{0,3})$)', 'NUM'), (r'(?=^[mdclxvui]+$)(?=^m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|iu|v?i{0,3}|u?i{0,3})$)', 'NUM')]
    lemmatizer = RomanNumeralLemmatizer(rn_patterns, default="RN")
    test_str = 'i ii'
    target = [('i', 'RN'), ('ii', 'RN')]  # pylint: disable=line-too-long
    # normalize: lowercase and j/v replacement before tokenizing
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def test_backoff_latin_lemmatizer_verbose(self):
    """Test backoffLatinLemmatizer

    Verbose mode returns (token, lemma, backoff-stage-name) triples.
    """
    # NOTE(review): `train` is never passed to the lemmatizer below — it
    # appears to be leftover data; confirm whether it should be removed.
    train = [[('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]]  # pylint: disable=line-too-long
    lemmatizer = BackoffLatinLemmatizer(verbose=True)
    test_str = """Ceterum antequam destinata componam"""
    target = [('ceterum', 'ceterum', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('antequam', 'antequam', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('destinata', 'destino', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('componam', 'compono', '<DictLemmatizer: Morpheus Lemmas>')]  # pylint: disable=line-too-long
    # normalize: lowercase and j/v replacement before tokenizing
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def test_regex_lemmatizer(self):
    """Test regex_lemmatizer()

    Substitution rule: future endings of -are verbs -> 1st sg. in -o.
    """
    sub = [("(.)ab(o|is|it|imus|itis|unt)$", r"\1o")]
    lemmatizer = RegexpLemmatizer(sub)
    test_str = "amabimus"
    target = [("amabimus", "amo")]
    # normalize: lowercase and j/v replacement before tokenizing
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer("latin")
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)
def test_greek_word_tokenizer(self):
    """Test Greek-specific word tokenizer."""
    # (docstring previously said "Latin-specific" — copy/paste error)
    word_tokenizer = WordTokenizer('greek')
    # Test sources:
    # - Thuc. 1.1.1
    test = "Θουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶν Πελοποννησίων καὶ Ἀθηναίων, ὡς ἐπολέμησαν πρὸς ἀλλήλους, ἀρξάμενος εὐθὺς καθισταμένου καὶ ἐλπίσας μέγαν τε ἔσεσθαι καὶ ἀξιολογώτατον τῶν προγεγενημένων, τεκμαιρόμενος ὅτι ἀκμάζοντές τε ᾖσαν ἐς αὐτὸν ἀμφότεροι παρασκευῇ τῇ πάσῃ καὶ τὸ ἄλλο Ἑλληνικὸν ὁρῶν ξυνιστάμενον πρὸς ἑκατέρους, τὸ μὲν εὐθύς, τὸ δὲ καὶ διανοούμενον."
    target = ['Θουκυδίδης', 'Ἀθηναῖος', 'ξυνέγραψε', 'τὸν', 'πόλεμον', 'τῶν', 'Πελοποννησίων', 'καὶ', 'Ἀθηναίων', ',', 'ὡς', 'ἐπολέμησαν', 'πρὸς', 'ἀλλήλους', ',', 'ἀρξάμενος', 'εὐθὺς', 'καθισταμένου', 'καὶ', 'ἐλπίσας', 'μέγαν', 'τε', 'ἔσεσθαι', 'καὶ', 'ἀξιολογώτατον', 'τῶν', 'προγεγενημένων', ',', 'τεκμαιρόμενος', 'ὅτι', 'ἀκμάζοντές', 'τε', 'ᾖσαν', 'ἐς', 'αὐτὸν', 'ἀμφότεροι', 'παρασκευῇ', 'τῇ', 'πάσῃ', 'καὶ', 'τὸ', 'ἄλλο', 'Ἑλληνικὸν', 'ὁρῶν', 'ξυνιστάμενον', 'πρὸς', 'ἑκατέρους', ',', 'τὸ', 'μὲν', 'εὐθύς', ',', 'τὸ', 'δὲ', 'καὶ', 'διανοούμενον', '.']
    result = word_tokenizer.tokenize(test)
    self.assertEqual(result, target)
def test_latin_pp_lemmatizer(self):
    """Test latin_pp_lemmatizer()"""
    # pattern keys into principal-part group 1 of the pps table
    pattern = [(r'(\w*)[a|ie]bimus\b', 1)]
    pps = { 'amo': [1, 'am', 'amare', 'amau', 'amat'] }
    lemmatizer = PPLemmatizer(pattern, pps=pps)
    expected = [('amabimus', 'amo')]
    # normalize: lowercase then j/v replacement, then tokenize
    normalized = JVReplacer().replace('amabimus'.lower())
    tokens = WordTokenizer('latin').tokenize(normalized)
    self.assertEqual(lemmatizer.lemmatize(tokens), expected)
def test_word_tokenizer_french(self):
    """Tokenize Old French; note the elided article "S'a" splits into
    "S'" + "a" and final punctuation becomes its own token."""
    word_tokenizer = WordTokenizer('french')
    tests = ["S'a table te veulz maintenir, Honnestement te dois tenir Et garder les enseignemens Dont cilz vers sont commancemens."]  # pylint: disable=line-too-long
    results = []
    for test in tests:
        result = word_tokenizer.tokenize(test)
        results.append(result)
    target = [["S'", 'a', 'table', 'te', 'veulz', 'maintenir', ',', 'Honnestement', 'te', 'dois', 'tenir', 'Et', 'garder', 'les', 'enseignemens', 'Dont', 'cilz', 'vers', 'sont', 'commancemens', '.']]  # pylint: disable=line-too-long
    self.assertEqual(results, target)
def test_syllabification_old_norse(self):
    """Syllabification

    Tokenizes an Old Norse passage, then syllabifies each non-punctuation
    token with the sonority-sequencing syllabifier.
    """
    s = Syllabifier(language="old_norse", break_geminants=True)
    text = "Gefjun dró frá Gylfa glöð djúpröðul óðla, svá at af rennirauknum rauk, Danmarkar auka. Báru öxn ok " \
           "átta ennitungl, þars gengu fyrir vineyjar víðri valrauf, fjögur höfuð."
    tokenizer = WordTokenizer('old_norse')
    words = tokenizer.tokenize(text)
    # invalid_onsets is a module-level list of disallowed syllable onsets
    s.set_invalid_onsets(invalid_onsets)
    # skip "," and "." tokens; syllabify everything else lowercased
    syllabified_words = [s.syllabify_ssp(word.lower()) for word in words if word not in ",."]
    target = [['gef', 'jun'], ['dró'], ['frá'], ['gyl', 'fa'], ['glöð'], ['djúp', 'rö', 'ðul'], ['óðl', 'a'], ['svá'], ['at'], ['af'], ['ren', 'ni', 'rauk', 'num'], ['rauk'], ['dan', 'mar', 'kar'], ['auk', 'a'], ['bár', 'u'], ['öxn'], ['ok'], ['át', 'ta'], ['en', 'ni', 'tungl'], ['þars'], ['geng', 'u'], ['fy', 'rir'], ['vi', 'ney', 'jar'], ['víðr', 'i'], ['val', 'rauf'], ['fjö', 'gur'], ['hö', 'fuð']]
    self.assertListEqual(syllabified_words, target)
def test_latin_translations(self): """Test Synonym class lookup() function and Lemmata class isolate() method""" #first build the lemma list as in test_latin_lemmata() lemmatizer = Lemmata(dictionary = 'lemmata', language = 'latin') test_str = 'Ceterum antequam destinata componam' jv_replacer = JVReplacer() tokenizer = WordTokenizer('latin') test_str = test_str.lower() test_str = jv_replacer.replace(test_str) tokens = tokenizer.tokenize(test_str) lemmas = lemmatizer.lookup(tokens) #now isolate the list of lemmas lemmas = lemmatizer.isolate(lemmas) translations = Synonyms(dictionary = 'translations', language = 'latin') translations = translations.lookup_synonyms(lemmas) target = [('destino', [('σκοπός', 1.0)]), ('compono', [('συντίθημι', 1.0)])] self.assertEqual(translations, target)
def test_latin_synonyms(self): """Test Synonym class lookup() function and Lemmata class isolate() method""" #first build the lemma list as in test_latin_lemmata() lemmatizer = Lemmata(dictionary = 'lemmata', language = 'latin') test_str = 'Ceterum antequam destinata componam' jv_replacer = JVReplacer() tokenizer = WordTokenizer('latin') test_str = test_str.lower() test_str = jv_replacer.replace(test_str) tokens = tokenizer.tokenize(test_str) lemmas = lemmatizer.lookup(tokens) #now isolate the list of lemmas lemmas = lemmatizer.isolate(lemmas) synonyms = Synonyms(dictionary = 'synonyms', language = 'latin') syns = synonyms.lookup_synonyms(lemmas) target = [('ceterus', [('ceteroqui', 0.5), ('perquiesco', 0.5)]), ('compono', [('struo', 0.5), ('condo', 0.5)])] self.assertEqual(syns, target)
def stem(text):
    """Stem each word of a French text.

    :param text: text to stem; lowercased internally.
    :return: space-separated stemmed words. NOTE: keeps the trailing space
        after the last word, matching the original behavior.
    """
    # make string lower-case
    text = text.lower()
    word_tokenizer = WordTokenizer('french')
    tokenized_text = word_tokenizer.tokenize(text)
    # FIX: the original used bare string literals as "comments" (they are
    # runtime no-op statements, not comments) and built the result with
    # quadratic `+=` concatenation; collect words and join once instead.
    stemmed_words = []
    for word in tokenized_text:
        # remove the simple (noun) endings from the target word
        word, was_stemmed = matchremove_noun_endings(word)
        # if the word didn't match the simple endings, try verb endings
        if not was_stemmed:
            word = matchremove_verb_endings(word)
        stemmed_words.append(word)
    # join with trailing space per word (preserves original output exactly)
    return ''.join(word + ' ' for word in stemmed_words)