Exemplo n.º 1
0
 def test_tagged_words(self):
     """The default Indian corpus file starts with the expected (word, tag) pairs."""
     expected = [('মহিষের', 'NN'), ('সন্তান', 'NN'), (':', 'SYM')]
     observed = indian.tagged_words()[:3]
     self.assertEqual(observed, expected)
 def test_tagged_words(self):
     """The corpus reader yields the known first three (word, tag) pairs."""
     self.assertEqual(
         indian.tagged_words()[:3],
         [('মহিষের', 'NN'), ('সন্তান', 'NN'), (':', 'SYM')],
     )
import nltk
from nltk.corpus import indian

# Frequency distribution of POS tags across the default Indian corpus file.
indian_pos = indian.tagged_words()

# NOTE(review): tagset='universal' reportedly fails here — the indian corpus
# may lack a universal-tagset mapping file; confirm before relying on it.
tag_fd = nltk.FreqDist(tag for (word, tag) in indian_pos)
# most_common(5) returns the five highest-frequency tags directly instead of
# materializing the full sorted list and slicing it.
print(tag_fd.most_common(5))

# Cumulative frequency plot of all tags (pops up a matplotlib window).
tag_fd.plot(cumulative=True)
Exemplo n.º 4
0
 def test_tagged_words(self):
     """Sanity-check the leading tagged words of the Indian corpus."""
     head = indian.tagged_words()[:3]
     want = [("মহিষের", "NN"), ("সন্তান", "NN"), (":", "SYM")]
     self.assertEqual(head, want)
Exemplo n.º 5
0
                 i,
                 format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


# Build an 80/20 train/test split of the Marathi POS corpus and featurize both
# halves with transformDataset (defined elsewhere in this file).

# Plain and tagged sentence views of the same corpus file.
marathi_sent = indian.sents('marathi_pos_rad_3NOV17.pos')
mpos = indian.tagged_sents('marathi_pos_rad_3NOV17.pos')
# presumably sklearn.utils.shuffle — randomizes sentence order; verify import
mp = shuffle(mpos)
# 80% cut point; uses len(marathi_sent), which should equal len(mpos) since
# both views read the same file — TODO confirm
size = int(len(marathi_sent) * 0.8)
# Flat list of tags over all tokens, used for the tag inventory and default tag.
tags = [
    tag for (word, tag) in indian.tagged_words('marathi_pos_rad_3NOV17.pos')
]
print(np.unique(tags))
#print("no. of tags=",len(nltk.FreqDist(tags)))
# Most frequent tag in the corpus — the natural fallback/default tag.
defaultTag = nltk.FreqDist(tags).max()

#print(defaultTag)
# Note: the split slices the SHUFFLED tagged sentences (mp), so train/test
# composition changes from run to run unless shuffle is seeded.
train_sents = mp[:size]
#print(len(train_sents))
test_sents = mp[size:]

print(marathi_sent[0])
# transformDataset is expected to return (features, labels) — defined elsewhere.
trainFeatures, trainLabels = transformDataset(train_sents)

testFeatures, testLabels = transformDataset(test_sents)
print("lengths of features")
Exemplo n.º 6
0
                i] == '':  #In case there are no matching entries between the transition tags and emission tags, we choose the most frequent emission tag
            output_li[i] = max(di_emission_probs, key=itemgetter(1))[0]

    return output_li


#tup = fn_train()
#dict2_tag_follow_tag_ = tup[0]
#dict2_word_tag = tup[1]
#dict_word_tag_baseline = tup[2]

if __name__ == "__main__":

    k = 5
    #to shuffle sentences
    mp = indian.tagged_words('marathi_pos_rad_3NOV17.pos')
    marathi_sent = shuffle(mp)
    print("length of tagged words=", len(marathi_sent))
    size = int(len(marathi_sent) * 0.8)
    print("size=", size)
    mtrain1 = marathi_sent[:size]
    print("len of mtrain=", len(mtrain1))
    test = marathi_sent[size:]
    print("len of mtrain=", len(test))
    # without shufle
    #marathi_sent= indian.tagged_words('marathi_pos_rad_3NOV17.pos')
    r = len(mtrain1) / k
    l = len(mtrain1)
    score = []
    for i in range(k):
        test_set = mtrain1[int(r * i):int(r * i + r)]
Exemplo n.º 7
0
 def test_tagged_words(self):
     """indian.tagged_words() begins with the expected Bengali tokens."""
     prefix = [("মহিষের", "NN"), ("সন্তান", "NN"), (":", "SYM")]
     self.assertEqual(indian.tagged_words()[:3], prefix)
Exemplo n.º 8
0
# Demonstration of NLTK corpus readers: plain, POS-tagged, paragraph, and
# chunked views. The commented nltk.download(...) lines show which corpora
# must be fetched once before the corresponding reader call works.
# nltk.download('state_union')
print(nltk.corpus.state_union.words())
# nltk.download('webtext')
print(nltk.corpus.webtext.words())
# tagged corpora
print(brown.words())
print(brown.tagged_words())
print(brown.sents())  # doctest: +ELLIPSIS
print(brown.tagged_sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(brown.paras(
    categories='reviews'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(brown.tagged_paras(
    categories='reviews'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# nltk.download('indian')
print(indian.words())  # doctest: +SKIP
print(indian.tagged_words())  # doctest: +SKIP
# nltk.download('universal_tagset')
# tagset='universal' maps the corpus-specific tags onto the 12-tag universal set.
print(brown.tagged_sents(
    tagset='universal'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(conll2000.tagged_words(
    tagset='universal'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# chunked corpora
print(conll2000.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# chunked_sents() yields nltk.Tree objects; print shows the bracketed parse.
for tree in conll2000.chunked_sents()[:2]:
    print(tree)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# nltk.download('conll2002')
print(conll2002.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2002.chunked_sents()[:2]:
    print(tree)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# nltk.download('semcor')
print(semcor.words())