def test_tagged_words(self):
    """The first three tagged words of the Indian corpus must match the known (word, tag) pairs."""
    expected = [('মহিষের', 'NN'), ('সন্তান', 'NN'), (':', 'SYM')]
    observed = indian.tagged_words()[:3]
    self.assertEqual(observed, expected)
def test_tagged_words(self):
    """Verify the leading (word, tag) pairs returned by indian.tagged_words()."""
    head = list(indian.tagged_words()[:3])
    self.assertEqual(head, [('মহিষের', 'NN'), ('সন্তান', 'NN'), (':', 'SYM')])
import nltk
from nltk.corpus import indian

# POS-tagged (word, tag) pairs from the NLTK Indian-languages corpus.
# Somehow tagset='universal' is not working. Need to check it
indian_pos = indian.tagged_words()

# Frequency distribution over the tags alone; words are discarded.
tag_fd = nltk.FreqDist(tag for (word, tag) in indian_pos)

# FreqDist.most_common(5) returns the five most frequent (tag, count)
# pairs directly -- no need to sort the full distribution and slice it,
# as the original most_common()[:5] did. The result is identical
# (heapq.nlargest is documented as equivalent to sorted(..., reverse=True)[:n]).
print(tag_fd.most_common(5))

# Cumulative frequency plot of all tags (opens a matplotlib window).
tag_fd.plot(cumulative=True)
def test_tagged_words(self):
    """Sanity-check the corpus reader: the first three tokens carry the expected POS tags."""
    words_with_tags = indian.tagged_words()
    self.assertEqual(
        words_with_tags[:3],
        [("মহিষের", "NN"), ("সন্তান", "NN"), (":", "SYM")],
    )
# NOTE(review): this chunk begins mid-statement -- the opening tokens are the
# tail of a plt.text(...) call (confusion-matrix annotation) whose start lies
# outside this excerpt. The code is left byte-identical; it cannot be safely
# reformatted or restyled without the missing prefix.
# NOTE(review): the later portion shuffles tagged sentences from
# 'marathi_pos_rad_3NOV17.pos', takes an 80/20 train/test split, and calls
# transformDataset (defined elsewhere) -- presumably a feature extractor;
# verify against its definition.
i, format(cm[i, j], fmt), horizontalalignment="center", color="white" if cm[i, j] > thresh else "black") plt.tight_layout() plt.ylabel('True label') plt.xlabel('Predicted label') marathi_sent = indian.sents('marathi_pos_rad_3NOV17.pos') mpos = indian.tagged_sents('marathi_pos_rad_3NOV17.pos') mp = shuffle(mpos) size = int(len(marathi_sent) * 0.8) tags = [ tag for (word, tag) in indian.tagged_words('marathi_pos_rad_3NOV17.pos') ] print(np.unique(tags)) #print("no. of tags=",len(nltk.FreqDist(tags))) defaultTag = nltk.FreqDist(tags).max() #print(defaultTag) train_sents = mp[:size] #print(len(train_sents)) test_sents = mp[size:] print(marathi_sent[0]) trainFeatures, trainLabels = transformDataset(train_sents) testFeatures, testLabels = transformDataset(test_sents) print("lengths of features")
# NOTE(review): this chunk begins mid-`if` condition (`i] == '':` is the tail
# of a comparison started outside this excerpt) and ends mid-`for` body, so it
# is left byte-identical -- neither a restyle nor reformatting is safe here.
# NOTE(review): visible logic: a fallback that picks the most frequent emission
# tag when no transition/emission match exists, then a __main__ section doing
# an 80/20 split of shuffled tagged words and the start of k-fold (k=5)
# cross-validation slicing -- confirm against the full file before relying on this.
i] == '': #In case there are no matching entries between the transition tags and emission tags, we choose the most frequent emission tag output_li[i] = max(di_emission_probs, key=itemgetter(1))[0] return output_li #tup = fn_train() #dict2_tag_follow_tag_ = tup[0] #dict2_word_tag = tup[1] #dict_word_tag_baseline = tup[2] if __name__ == "__main__": k = 5 #to shuffle sentences mp = indian.tagged_words('marathi_pos_rad_3NOV17.pos') marathi_sent = shuffle(mp) print("length of tagged words=", len(marathi_sent)) size = int(len(marathi_sent) * 0.8) print("size=", size) mtrain1 = marathi_sent[:size] print("len of mtrain=", len(mtrain1)) test = marathi_sent[size:] print("len of mtrain=", len(test)) # without shufle #marathi_sent= indian.tagged_words('marathi_pos_rad_3NOV17.pos') r = len(mtrain1) / k l = len(mtrain1) score = [] for i in range(k): test_set = mtrain1[int(r * i):int(r * i + r)]
def test_tagged_words(self):
    """Ensure indian.tagged_words() starts with the documented (word, tag) pairs."""
    expected_head = [("মহিষের", "NN"), ("সন্তান", "NN"), (":", "SYM")]
    self.assertEqual(indian.tagged_words()[:3], expected_head)
# Plain-text corpora (run the commented nltk.download(...) lines once if missing).
# nltk.download('state_union')
print(nltk.corpus.state_union.words())
# nltk.download('webtext')
print(nltk.corpus.webtext.words())

# Tagged corpora: words, (word, tag) pairs, sentences, paragraphs.
print(brown.words())
print(brown.tagged_words())
print(brown.sents())  # doctest: +ELLIPSIS
print(brown.tagged_sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
review_paras = brown.paras(categories='reviews')
print(review_paras)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
review_tagged_paras = brown.tagged_paras(categories='reviews')
print(review_tagged_paras)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

# Indian-languages corpus.
# nltk.download('indian')
print(indian.words())  # doctest: +SKIP
print(indian.tagged_words())  # doctest: +SKIP

# Universal tagset mapping.
# nltk.download('universal_tagset')
universal_brown = brown.tagged_sents(tagset='universal')
print(universal_brown)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
universal_conll = conll2000.tagged_words(tagset='universal')
print(universal_conll)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

# Chunked corpora: print the first two chunk trees of each.
print(conll2000.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for chunk_tree in conll2000.chunked_sents()[:2]:
    print(chunk_tree)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# nltk.download('conll2002')
print(conll2002.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for chunk_tree in conll2002.chunked_sents()[:2]:
    print(chunk_tree)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

# Sense-tagged corpus.
# nltk.download('semcor')
print(semcor.words())