def test_parse(self):
    """Check the Italian parser's chunked output and the tagger's accuracy.

    First asserts the exact slash-formatted parse of one sentence. Then
    reads the tagged corpus "corpora/tagged-it-wacky.txt" under PATH (one
    sentence per line, space-separated "word/tag" tokens), re-tags the words
    with it.parse(..., tokenize=False) and requires that more than 92% of
    the tokens count as matches.
    """
    # Assert parsed output with Penn Treebank II tags (slash-formatted).
    # "il gatto nero" is a noun phrase, "sulla stuoia" is a prepositional noun phrase.
    v = it.parser.parse(u"Il gatto nero seduto sulla stuoia.")
    self.assertEqual(
        v,
        u"Il/DT/B-NP/O gatto/NN/I-NP/O nero/JJ/I-NP/O "
        + u"seduto/VB/B-VP/O "
        + u"sulla/IN/B-PP/B-PNP stuoia/NN/B-NP/I-PNP ././O/O"
    )
    # Assert the accuracy of the Italian tagger.
    i, n = 0, 0
    corpus = os.path.join(PATH, "corpora", "tagged-it-wacky.txt")
    # Binary mode yields byte lines on both Python 2 and 3, so the explicit
    # decode() below is valid on either; the with-block closes the file.
    with open(corpus, "rb") as f:
        for sentence in f:
            sentence = sentence.decode("utf-8").strip()
            s1 = [w.split("/") for w in sentence.split(" ")]
            # A single pre-tokenized sentence: a list holding one list of words.
            s2 = [[w for w, pos in s1]]
            s2 = it.parse(s2, tokenize=False)
            s2 = [w.split("/") for w in s2.split(" ")]
            for j, (word, t1) in enumerate(s1):
                t2 = s2[j][1]
                # WaCKy test set tags plural nouns as "NN", pattern.it as "NNS".
                # Some punctuation marks are also tagged differently,
                # but these are not necessarily errors.
                if t1 == t2 or (t1 == "NN" and t2.startswith("NN")) or word in "\":;)-":
                    i += 1
                n += 1
    #print(float(i) / n)
    self.assertTrue(float(i) / n > 0.92)
    print("pattern.it.parser.parse()")
def test_parse(self):
    """Verify pattern.it's parse output format and its tagging accuracy.

    The first assertion pins the slash-formatted parse of a sample sentence.
    The second part walks the tagged corpus "corpora/tagged-it-wacky.txt"
    under PATH line by line, re-tags each sentence's words with
    it.parse(..., tokenize=False), and asserts that the share of matching
    tags is above 0.92.
    """
    # Assert parsed output with Penn Treebank II tags (slash-formatted).
    # "il gatto nero" is a noun phrase, "sulla stuoia" is a prepositional noun phrase.
    v = it.parser.parse(u"Il gatto nero seduto sulla stuoia.")
    self.assertEqual(
        v,
        u"Il/DT/B-NP/O gatto/NN/I-NP/O nero/JJ/I-NP/O "
        + u"seduto/VB/B-VP/O "
        + u"sulla/IN/B-PP/B-PNP stuoia/NN/B-NP/I-PNP ././O/O"
    )
    # Assert the accuracy of the Italian tagger.
    i, n = 0, 0
    corpus_path = os.path.join(PATH, "corpora", "tagged-it-wacky.txt")
    # Read bytes so that decode("utf-8") is valid on Python 2 and Python 3,
    # and let the with-block release the file handle when the loop ends.
    with open(corpus_path, "rb") as corpus:
        for line in corpus:
            sentence = line.decode("utf-8").strip()
            s1 = [w.split("/") for w in sentence.split(" ")]
            # One pre-tokenized sentence, passed as a list of word lists.
            words = [[w for w, pos in s1]]
            s2 = it.parse(words, tokenize=False)
            s2 = [w.split("/") for w in s2.split(" ")]
            for j, (word, t1) in enumerate(s1):
                t2 = s2[j][1]
                # WaCKy test set tags plural nouns as "NN", pattern.it as "NNS".
                # Some punctuation marks are also tagged differently,
                # but these are not necessarily errors.
                if t1 == t2 or (t1 == "NN" and t2 == "NNS") or word in "\":;)-":
                    i += 1
                n += 1
    #print(float(i) / n)
    self.assertTrue(float(i) / n > 0.92)
    # Function-call form: prints the same text on Python 2 and is valid on Python 3.
    print("pattern.it.parser.parse()")
def lemmatize_word(input_word):
    """Return the lemma parse() reports for the first token of input_word.

    The input is passed to parse() unchanged (no decoding is applied), with
    tokenize=False, tag=False, chunk=False and lemmata=True.
    """
    parsed = parse(input_word, tokenize=False, tag=False, chunk=False, lemmata=True)
    # split() gives a nested structure; take the first token of the first sentence.
    first_sentence = parsed.split()[0]
    first_token = first_sentence[0]
    # Index 4 is read as the lemma slot.
    # NOTE(review): presumably the layout is word/tag/chunk/pnp/lemma — confirm
    # against the parse() in scope.
    return first_token[4]
def _getParse(word, language):
    """Parse ``word`` with the pattern parser for ``language``.

    Args:
        word: Text handed to the selected module's ``parse()``.
        language: Language code; "es", "en", "it", "fr" and "de" select the
            matching ``pattern.<code>`` module. Any other value falls back
            to English.

    Returns:
        Whatever ``parse(word)`` of the selected pattern module returns.
    """
    import importlib

    supported = ("es", "en", "it", "fr", "de")
    # Pick the matching constant, not the caller's value, so the module name
    # is always one of the known literals; unknown codes default to English.
    code = next((c for c in supported if language == c), "en")
    # Import only the module that is needed, so a missing or broken module
    # for an unrelated language no longer breaks this call.
    parser = importlib.import_module("pattern." + code)
    return parser.parse(word)