def get_data():
    """
    Split the Brown and NPS Chat corpora into 4 training sets and 4 test sets.

    Two contiguous splits are produced per corpus: 50/50 and 90/10.

    :return: dict mapping split names ("train_brown50", "test_nps10", ...)
        to lists of tagged sentences/posts.
    """
    # Compute each split once instead of calling train_test_split twice
    # per pair (the original re-split the corpus for every dict entry).
    brown50 = train_test_split(brown.tagged_sents(), 0.5)
    brown90 = train_test_split(brown.tagged_sents(), 0.9)
    # BUG FIX: the "nps50" entries previously used a 0.9 ratio, so they
    # were duplicates of the 90/10 split rather than a 50/50 split.
    nps50 = train_test_split(nps_chat.tagged_posts(), 0.5)
    nps90 = train_test_split(nps_chat.tagged_posts(), 0.9)
    data = {
        "train_brown50": brown50[0],
        "test_brown50": brown50[1],
        "train_brown90": brown90[0],
        "test_brown10": brown90[1],
        "train_nps50": nps50[0],
        "test_nps50": nps50[1],
        "train_nps90": nps90[0],
        "test_nps10": nps90[1],
    }
    return data
def ex2():
    """
    Compare POS taggers on the Brown (news) and NPS Chat corpora.

    For two splits of each corpus, prints the accuracy of a
    DefaultTagger('NN') baseline, then the accuracies of a
    Unigram -> Bigram -> Trigram backoff chain trained on the
    corresponding training split.  Output is identical to the original
    copy-pasted version.
    """
    tagged_brown = brown.tagged_sents(categories='news')
    results_brown = splitting(tagged_brown)
    train_brown1, train_brown2, test_brown1, test_brown2 = results_brown[:4]
    tagged_chat = nps_chat.tagged_posts()
    results_chat = splitting(tagged_chat)
    train_chat1, train_chat2, test_chat1, test_chat2 = results_chat[:4]

    default_tagger = nltk.DefaultTagger('NN')
    # NOTE: the original also called default_tagger.tag(test_...) on each
    # test set and discarded the result; those no-op calls are removed.
    print('Test for brown corpus 1 : {}'.format(
        default_tagger.evaluate(test_brown1)))
    print('Test for brown corpus 2 : {}'.format(
        default_tagger.evaluate(test_brown2)))
    print('Test for chat corpus 1 : {}'.format(
        default_tagger.evaluate(test_chat1)))
    print('Test for chat corpus 2 : {}'.format(
        default_tagger.evaluate(test_chat2)))

    # The same evaluation pipeline was copy-pasted four times; it is now
    # factored into a single helper.
    _ex2_eval_backoff_chain(train_brown1, test_brown1, 'brown 1', default_tagger)
    _ex2_eval_backoff_chain(train_brown2, test_brown2, 'brown 2', default_tagger)
    _ex2_eval_backoff_chain(train_chat1, test_chat1, 'chat 1', default_tagger)
    _ex2_eval_backoff_chain(train_chat2, test_chat2, 'chat 2', default_tagger)


def _ex2_eval_backoff_chain(train_set, test_set, label, default_tagger):
    """Train a Unigram -> Bigram -> Trigram backoff chain on *train_set* and
    print each stage's accuracy on *test_set* (output format unchanged)."""
    t1 = nltk.UnigramTagger(train_set, backoff=default_tagger)
    print(t1.evaluate(test_set))
    t2 = nltk.BigramTagger(train_set, backoff=t1)
    print(t2.evaluate(test_set))
    t3 = nltk.TrigramTagger(train_set, backoff=t2)
    print('Accuracy test {}: '.format(label), t3.evaluate(test_set))
'English: Brown Corpus (simplified)': lambda: brown.tagged_sents(simplify_tags=True), 'English: Brown Corpus (Press, simplified)': lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'], simplify_tags=True), 'English: Brown Corpus (Religion, simplified)': lambda: brown.tagged_sents(categories='religion', simplify_tags=True), 'English: Brown Corpus (Learned, simplified)': lambda: brown.tagged_sents(categories='learned', simplify_tags=True), 'English: Brown Corpus (Science Fiction, simplified)': lambda: brown.tagged_sents(categories='science_fiction', simplify_tags=True), 'English: Brown Corpus (Romance, simplified)': lambda: brown.tagged_sents(categories='romance', simplify_tags=True), 'English: Brown Corpus (Humor, simplified)': lambda: brown.tagged_sents(categories='humor', simplify_tags=True), 'English: NPS Chat Corpus': lambda: nps_chat.tagged_posts(), 'English: NPS Chat Corpus (simplified)': lambda: nps_chat.tagged_posts(simplify_tags=True), 'English: Wall Street Journal Corpus': lambda: treebank.tagged_sents(), 'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(simplify_tags=True), 'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(), 'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(simplify_tags=True), 'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(), 'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(simplify_tags=True), 'Hindi: Indian Languages Corpus':
def create_tagger():
    """Build a bigram POS tagger for chat text.

    Trains a UnigramTagger and then a BigramTagger on the full NPS Chat
    corpus, each backing off to the previous stage, with a
    DefaultTagger('NN') as the final fallback for unseen tokens.
    """
    training_posts = nps_chat.tagged_posts()
    tagger = nltk.DefaultTagger('NN')
    # Stack the n-gram taggers, each backing off to the one below it.
    for tagger_cls in (nltk.UnigramTagger, nltk.BigramTagger):
        tagger = tagger_cls(training_posts, backoff=tagger)
    return tagger
import nltk
from nltk import FreqDist
from nltk.probability import ConditionalFreqDist
from nltk.corpus import brown as brown
from nltk.corpus import nps_chat as chat
from nltk import RegexpTagger
from nltk import UnigramTagger
from nltk import BigramTagger
from nltk import TrigramTagger

sizeB = len(brown.tagged_sents())  # length of size of brown corpus
sizeC = len(chat.tagged_posts())  # length of size of NPS corpus
brownTS = brown.tagged_sents()
brownTW = brown.tagged_words()  # partition sentences into a list with each word containing its tag
chatTP = chat.tagged_posts()  # partition words into a list with each post containing its tag
chatTW = chat.tagged_words()  # partition words into a list with each word containing its tag


def splitSen(c, p):  # function to partition corpus
    # Split the chosen corpus into a head of the first p*len items and the
    # remaining tail.  c selects the corpus ("brown" or "chat"); p is the
    # split fraction in [0, 1].
    if c == "brown":
        t1 = brownTS[:int(sizeB * p)]
        t2 = brownTS[int(sizeB * p):]
        return t1, t2
    if c == "chat":
        t1 = chatTP[:int(sizeC * p)]
        t2 = chatTP[int(sizeC * p):]
        # NOTE(review): no return statement is visible for this branch —
        # the chunk may be truncated here; confirm a `return t1, t2`
        # follows, otherwise splitSen("chat", p) returns None.
'English: Brown Corpus (Press, simplified)': lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'], simplify_tags=True), 'English: Brown Corpus (Religion, simplified)': lambda: brown.tagged_sents(categories='religion', simplify_tags=True), 'English: Brown Corpus (Learned, simplified)': lambda: brown.tagged_sents(categories='learned', simplify_tags=True), 'English: Brown Corpus (Science Fiction, simplified)': lambda: brown.tagged_sents(categories='science_fiction', simplify_tags=True), 'English: Brown Corpus (Romance, simplified)': lambda: brown.tagged_sents(categories='romance', simplify_tags=True), 'English: Brown Corpus (Humor, simplified)': lambda: brown.tagged_sents(categories='humor', simplify_tags=True), 'English: NPS Chat Corpus': lambda: nps_chat.tagged_posts(), 'English: NPS Chat Corpus (simplified)': lambda: nps_chat.tagged_posts(simplify_tags=True), 'English: Wall Street Journal Corpus': lambda: treebank.tagged_sents(), 'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(simplify_tags=True), 'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(), 'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(simplify_tags=True), 'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(), 'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(simplify_tags=True), 'Hindi: Indian Languages Corpus':
# Fragment of a corpus-selection table: display label -> zero-argument
# loader (lambdas defer the expensive corpus reads until selection).
# NOTE(review): tagset='simple' requires a "simple" tagset mapping to be
# installed for each corpus; current NLTK ships 'universal' instead —
# confirm the mapping exists before relying on these entries.
lambda: brown.tagged_sents(tagset='simple'),
'English: Brown Corpus (Press, simplified)': lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'], tagset='simple'),
'English: Brown Corpus (Religion, simplified)': lambda: brown.tagged_sents(categories='religion', tagset='simple'),
'English: Brown Corpus (Learned, simplified)': lambda: brown.tagged_sents(categories='learned', tagset='simple'),
'English: Brown Corpus (Science Fiction, simplified)': lambda: brown.tagged_sents(categories='science_fiction', tagset='simple'),
'English: Brown Corpus (Romance, simplified)': lambda: brown.tagged_sents(categories='romance', tagset='simple'),
'English: Brown Corpus (Humor, simplified)': lambda: brown.tagged_sents(categories='humor', tagset='simple'),
'English: NPS Chat Corpus': lambda: nps_chat.tagged_posts(),
'English: NPS Chat Corpus (simplified)': lambda: nps_chat.tagged_posts(tagset='simple'),
'English: Wall Street Journal Corpus': lambda: treebank.tagged_sents(),
'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(tagset='simple'),
'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(),
'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(tagset='simple'),
'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(),
'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(tagset='simple'),
'Hindi: Indian Languages Corpus':
# Fragment of a corpus-selection table: display label -> zero-argument
# loader; lambdas defer the (expensive) tagged-corpus reads until a
# corpus is actually selected.  "(simplified)" entries map tags onto the
# universal POS tagset.
'English: Brown Corpus (Press, simplified)': lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'], tagset='universal'),
'English: Brown Corpus (Religion, simplified)': lambda: brown.tagged_sents(categories='religion', tagset='universal'),
'English: Brown Corpus (Learned, simplified)': lambda: brown.tagged_sents(categories='learned', tagset='universal'),
'English: Brown Corpus (Science Fiction, simplified)': lambda: brown.tagged_sents(categories='science_fiction', tagset='universal'),
'English: Brown Corpus (Romance, simplified)': lambda: brown.tagged_sents(categories='romance', tagset='universal'),
'English: Brown Corpus (Humor, simplified)': lambda: brown.tagged_sents(categories='humor', tagset='universal'),
'English: NPS Chat Corpus': lambda: nps_chat.tagged_posts(),
'English: NPS Chat Corpus (simplified)': lambda: nps_chat.tagged_posts(tagset='universal'),
'English: Wall Street Journal Corpus': lambda: treebank.tagged_sents(),
'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(tagset='universal'),
'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(),
'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(tagset='universal'),
'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(),
'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(tagset='universal'),
'Hindi: Indian Languages Corpus':
# Fragment of a corpus-selection table: display label -> zero-argument
# loader; lambdas defer the (expensive) tagged-corpus reads until a
# corpus is actually selected.  "(simplified)" entries map tags onto the
# universal POS tagset.
"English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(
    categories="religion", tagset="universal"
),
"English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(
    categories="learned", tagset="universal"
),
"English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
    categories="science_fiction", tagset="universal"
),
"English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(
    categories="romance", tagset="universal"
),
"English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(
    categories="humor", tagset="universal"
),
"English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
"English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(
    tagset="universal"
),
"English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
"English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(
    tagset="universal"
),
"Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
"Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(
    tagset="universal"
),
"Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
"Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(
    tagset="universal"
),
# LookupTagger setup from NLTK Chapter 5 # For Brown Corpus fdist_brown = nltk.FreqDist( brown.words()[:int((len(brown.words()) - 1))]) # slicing to vary the size of the dataset cfdist_brown = nltk.ConditionalFreqDist(brown.tagged_words()) top_words_brown = fdist_brown.most_common(200) most_likely_tags_brown = dict( (word, cfdist_brown[word].max()) for (word, _) in top_words_brown) default_tagger_brown = UnigramTagger(model=most_likely_tags_brown) splits = [[90, 10], [50, 50]] correct_brown = brown.tagged_sents()[:int(( len(brown.tagged_sents()) - 1))] # slicing to vary the size of the dataset correct_chat = chat.tagged_posts()[:int((len(chat.tagged_posts()) - 1))] patterns = [ (r'.*ing$', 'VBG'), # gerunds (r'.*ed$', 'VBD'), # simple past (r'.*es$', 'VBZ'), # 3rd singular present (r'.*ould$', 'MD'), # modals (r'.*\'s$', 'NN$'), # possessive nouns (r'.*s$', 'NNS'), # plural nouns (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers (r'.*', 'NN') # nouns (default) ] for split in splits: test_brown, train_brown = train_test_split(correct_brown, test_size=split[1] / 100,
# Corpus shown by default when the demo starts.
_DEFAULT = "English: Brown Corpus (Humor, simplified)"

# Display label -> zero-argument loader; the lambdas defer the (expensive)
# tagged-corpus reads until the user actually selects an entry.
# NOTE(review): tagset="simple" requires a "simple" tagset mapping to be
# available to NLTK; recent NLTK versions ship "universal" instead —
# confirm the mapping exists in this environment.
_CORPORA = {
    "Catalan: CESS-CAT Corpus (simplified)":
        lambda: cess_cat.tagged_sents(tagset="simple"),
    "English: Brown Corpus": lambda: brown.tagged_sents(),
    "English: Brown Corpus (simplified)":
        lambda: brown.tagged_sents(tagset="simple"),
    "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents(
        categories=["news", "editorial", "reviews"], tagset="simple"
    ),
    "English: Brown Corpus (Religion, simplified)":
        lambda: brown.tagged_sents(categories="religion", tagset="simple"),
    "English: Brown Corpus (Learned, simplified)":
        lambda: brown.tagged_sents(categories="learned", tagset="simple"),
    "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
        categories="science_fiction", tagset="simple"
    ),
    "English: Brown Corpus (Romance, simplified)":
        lambda: brown.tagged_sents(categories="romance", tagset="simple"),
    "English: Brown Corpus (Humor, simplified)":
        lambda: brown.tagged_sents(categories="humor", tagset="simple"),
    "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
    "English: NPS Chat Corpus (simplified)":
        lambda: nps_chat.tagged_posts(tagset="simple"),
    "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
    "English: Wall Street Journal Corpus (simplified)":
        lambda: treebank.tagged_sents(tagset="simple"),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
    "Chinese: Sinica Corpus (simplified)":
        lambda: sinica_treebank.tagged_sents(tagset="simple"),
    "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
    "Dutch: Alpino Corpus (simplified)":
        lambda: alpino.tagged_sents(tagset="simple"),
    "Hindi: Indian Languages Corpus":
        lambda: indian.tagged_sents(files="hindi.pos"),
    "Hindi: Indian Languages Corpus (simplified)":
        lambda: indian.tagged_sents(files="hindi.pos", tagset="simple"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
    "Portuguese: Floresta Corpus (Portugal, simplified)":
        lambda: floresta.tagged_sents(tagset="simple"),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
    "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)":
        lambda: mac_morpho.tagged_sents(tagset="simple"),
    "Spanish: CESS-ESP Corpus (simplified)":
        lambda: cess_esp.tagged_sents(tagset="simple"),
}
import nltk
import sklearn
from nltk.corpus import brown
from nltk.corpus import nps_chat as chat
from nltk.tag import DefaultTagger, RegexpTagger, UnigramTagger, BigramTagger
from sklearn.model_selection import train_test_split

# a)
# Evaluate a DefaultTagger("NN") baseline on Brown sentences and NPS Chat
# posts under 90/10 and 50/50 train/test splits.  shuffle=False keeps the
# splits as contiguous slices of each corpus.
splits = [[90, 10], [50, 50]]
correct_brown = brown.tagged_sents()
correct_chat = chat.tagged_posts()
default_tagger = DefaultTagger("NN")
for split in splits:  # TODO: turn into a function for use in b)
    # BUG FIX: sklearn's train_test_split returns (train, test), but the
    # original unpacked into (test, train), so the "test" accuracy was
    # actually measured on the larger training portion and vice versa.
    train_brown, test_brown = train_test_split(correct_brown,
                                               test_size=split[1] / 100,
                                               shuffle=False)
    train_chat, test_chat = train_test_split(correct_chat,
                                             test_size=split[1] / 100,
                                             shuffle=False)
    # A DefaultTagger has no training step; the original's discarded
    # default_tagger.tag(train_...) calls were no-ops and are removed.
    print(
        f"The DefaultTagger accuracy for the Brown Corpus is {default_tagger.evaluate(test_brown)} using a {split[0]}/{split[1]} split."
    )
    print(
        f"The DefaultTagger accuracy for the NPS Chat Corpus is {default_tagger.evaluate(test_chat)} using a {split[0]}/{split[1]} split.\n"
    )
#50/50 is better because the tagger doesn't "learn", so when the test data is increased (from 10%)
from nltk.corpus import brown, nps_chat
import nltk

# Initialize all training and test data
tokens_brown = brown.sents()
tokens_nps_chat = nps_chat.posts()
tagged_sents_brown = brown.tagged_sents()
tagged_posts_nps_chat = nps_chat.tagged_posts()

# Cut points for contiguous 90/10 and 50/50 splits of each corpus.
size_brown_09 = int(len(tagged_sents_brown) * 0.9)
size_brown_05 = int(len(tagged_sents_brown) * 0.5)
size_nps_chat_09 = int(len(tagged_posts_nps_chat) * 0.9)
size_nps_chat_05 = int(len(tagged_posts_nps_chat) * 0.5)

# Head of the corpus is used for training, tail for testing.
train_sents_brown_09 = tagged_sents_brown[:size_brown_09]
test_sents_brown_09 = tagged_sents_brown[size_brown_09:]
train_sents_brown_05 = tagged_sents_brown[:size_brown_05]
test_sents_brown_05 = tagged_sents_brown[size_brown_05:]
train_posts_nps_chat_09 = tagged_posts_nps_chat[:size_nps_chat_09]
test_posts_nps_chat_09 = tagged_posts_nps_chat[size_nps_chat_09:]
train_posts_nps_chat_05 = tagged_posts_nps_chat[:size_nps_chat_05]
test_posts_nps_chat_05 = tagged_posts_nps_chat[size_nps_chat_05:]

# Task a)
print("Task a)")
tags_brown = [tag for word, tag in brown.tagged_words()]
tags_nps_chat = [tag for word, tag in nps_chat.tagged_words()]
# Find most common tags
max_brown = nltk.FreqDist(tags_brown).max()  # NN
max_nps_chat = nltk.FreqDist(tags_nps_chat).max()  # UH
10-19-40s_686posts.xml 10-19-adults_706posts.xml 10-24-40s_706posts.xml 10-26-teens_706posts.xml 11-06-adults_706posts.xml 11-08-20s_705posts.xml 11-08-40s_706posts.xml 11-08-adults_705posts.xml 11-08-teens_706posts.xml 11-09-20s_706posts.xml 11-09-40s_706posts.xml 11-09-adults_706posts.xml 11-09-teens_706posts.xml ''' # putting all tagged posts from the nps_chat corpus into one list nps_chat_tagged = list() for fileid in nps_chat.fileids(): print fileid for post in nps_chat.tagged_posts(fileid): nps_chat_tagged.append(post) print str(len(nps_chat_tagged)) print nps_chat_tagged[0] # tags can be retrieved in the same way as the Brown corpus
# Fragment of a corpus-selection table: display label -> zero-argument
# loader (lambdas defer the expensive corpus reads until selection).
# NOTE(review): tagset='simple' requires a "simple" tagset mapping to be
# installed for each corpus; current NLTK ships 'universal' — confirm.
'English: Brown Corpus (simplified)': lambda: brown.tagged_sents(tagset='simple'),
'English: Brown Corpus (Press, simplified)': lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'], tagset='simple'),
'English: Brown Corpus (Religion, simplified)': lambda: brown.tagged_sents(categories='religion', tagset='simple'),
'English: Brown Corpus (Learned, simplified)': lambda: brown.tagged_sents(categories='learned', tagset='simple'),
'English: Brown Corpus (Science Fiction, simplified)': lambda: brown.tagged_sents(categories='science_fiction', tagset='simple'),
'English: Brown Corpus (Romance, simplified)': lambda: brown.tagged_sents(categories='romance', tagset='simple'),
'English: Brown Corpus (Humor, simplified)': lambda: brown.tagged_sents(categories='humor', tagset='simple'),
'English: NPS Chat Corpus': lambda: nps_chat.tagged_posts(),
'English: NPS Chat Corpus (simplified)': lambda: nps_chat.tagged_posts(tagset='simple'),
'English: Wall Street Journal Corpus': lambda: treebank.tagged_sents(),
'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(tagset='simple'),
'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(),
'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(tagset='simple'),
'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(),
'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(tagset='simple'),
'Hindi: Indian Languages Corpus':
import nltk
from nltk.corpus import treebank
from nltk.corpus import brown
from nltk.corpus import nps_chat
from nltk.corpus import conll2000
from nltk.corpus import ConllCorpusReader

# Load corpora mapped onto the universal POS tagset; list() materializes
# the lazy corpus views.
brown_fiction = list(
    brown.tagged_sents(categories='fiction', tagset='universal'))
brown_reviews = list(
    brown.tagged_sents(categories='reviews', tagset='universal'))
conll = list(conll2000.tagged_sents(tagset='universal'))
tree = list(treebank.tagged_sents(tagset='universal'))

# Local CoNLL-format Twitter corpus with its own tagset, mapped to
# universal on read.
columntypes = ['words', 'pos']
twitter_corpus = ConllCorpusReader("resources/", "twitter.conll",
                                   columntypes, tagset='en-tweet')
twitter = list(twitter_corpus.tagged_sents(tagset='universal'))

nps_raw = nps_chat.tagged_posts(tagset='universal')
# Drop tokens whose word form is empty.  The nested comprehension replaces
# the original's explicit append loop (same result, idiomatic form).
nps = [[tok for tok in post if tok[0]] for post in nps_raw]
# Fragment of a corpus-selection table: display label -> zero-argument
# loader; lambdas defer the (expensive) tagged-corpus reads until a
# corpus is actually selected.  "(simplified)" entries map tags onto the
# universal POS tagset.
'English: Brown Corpus (simplified)': lambda: brown.tagged_sents(tagset='universal'),
'English: Brown Corpus (Press, simplified)': lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'], tagset='universal'),
'English: Brown Corpus (Religion, simplified)': lambda: brown.tagged_sents(categories='religion', tagset='universal'),
'English: Brown Corpus (Learned, simplified)': lambda: brown.tagged_sents(categories='learned', tagset='universal'),
'English: Brown Corpus (Science Fiction, simplified)': lambda: brown.tagged_sents(categories='science_fiction', tagset='universal'),
'English: Brown Corpus (Romance, simplified)': lambda: brown.tagged_sents(categories='romance', tagset='universal'),
'English: Brown Corpus (Humor, simplified)': lambda: brown.tagged_sents(categories='humor', tagset='universal'),
'English: NPS Chat Corpus': lambda: nps_chat.tagged_posts(),
'English: NPS Chat Corpus (simplified)': lambda: nps_chat.tagged_posts(tagset='universal'),
'English: Wall Street Journal Corpus': lambda: treebank.tagged_sents(),
'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(tagset='universal'),
'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(),
'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(tagset='universal'),
'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(),
'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(tagset='universal'),
'Hindi: Indian Languages Corpus':
import nltk
from nltk.corpus import brown
from nltk import UnigramTagger
from nltk.corpus import nps_chat
from nltk import FreqDist, ConditionalFreqDist

# Materialize tagged sentences/posts and compute 90% and 50% cut points
# for contiguous train/test splits.
brown_corpus_sents = [sent for sent in brown.tagged_sents()]
brown_spl_90 = int(90 * len(brown_corpus_sents) / 100)
brown_spl_50 = int(50 * len(brown_corpus_sents) / 100)
nps_chat_corpus_posts = [sent for sent in nps_chat.tagged_posts()]
nps_spl_90 = int(90 * len(nps_chat_corpus_posts) / 100)
nps_spl_50 = int(50 * len(nps_chat_corpus_posts) / 100)
train_brown_50 = brown_corpus_sents[:brown_spl_50]
test_brown_50 = brown_corpus_sents[brown_spl_50:]
train_nps_50 = nps_chat_corpus_posts[:nps_spl_50]
test_nps_50 = nps_chat_corpus_posts[nps_spl_50:]
train_brown_90 = brown_corpus_sents[:brown_spl_90]
test_brown_10 = brown_corpus_sents[brown_spl_90:]
train_nps_90 = nps_chat_corpus_posts[:nps_spl_90]
test_nps_10 = nps_chat_corpus_posts[nps_spl_90:]


def get_lookup_tagger_accuracy(test_set, lookup_tagger_basis, corpus):
    # Build a lookup (most-likely-tag) tagger from the 200 most frequent
    # items in *lookup_tagger_basis*, with tag statistics from *corpus*.
    # NOTE(review): this assumes lookup_tagger_basis contains TAGGED
    # sentences, so each `word` below is a (token, tag) pair and `word[0]`
    # extracts the token string — confirm callers pass tagged data, since
    # with plain token lists `word[0]` would be the first character.
    words = [word for sent in lookup_tagger_basis for word in sent]
    fd = FreqDist(words)
    cfd = ConditionalFreqDist(corpus.tagged_words())
    most_freq_words = fd.most_common(200)
    likely_tags = dict(
        (word[0], cfd[word[0]].max()) for (word, _) in most_freq_words)
    # model= builds the lookup tagger directly from the dict (no training).
    baseline_tagger = UnigramTagger(model=likely_tags)
    # NOTE(review): function body appears truncated here (no return or
    # evaluation is visible despite the name) — confirm against the full file.
# Fragment of a corpus-selection table: display label -> zero-argument
# loader; lambdas defer the (expensive) tagged-corpus reads until a
# corpus is actually selected.  "(simplified)" entries map tags onto the
# universal POS tagset.
"English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents(categories=["news", "editorial", "reviews"], tagset="universal"),
"English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(categories="religion", tagset="universal"),
"English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(categories="learned", tagset="universal"),
"English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(categories="science_fiction", tagset="universal"),
"English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(categories="romance", tagset="universal"),
"English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(categories="humor", tagset="universal"),
"English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
"English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(tagset="universal"),
"English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
"English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(tagset="universal"),
"Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
"Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(tagset="universal"),
"Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
"Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(tagset="universal"),
"Hindi: Indian Languages Corpus":
]

# Parallel corpus views, all indexed in the same order:
# [brown, nps_chat, conll2000, treebank].  CONST_tagset is defined
# elsewhere in the file (outside this chunk).
corp_words_tagged = [
    brown.tagged_words(tagset=CONST_tagset),
    nps_chat.tagged_words(tagset=CONST_tagset),
    conll2000.tagged_words(tagset=CONST_tagset),
    treebank.tagged_words(tagset=CONST_tagset)
]
corp_words_untagged = [
    brown.words(),
    nps_chat.words(),
    conll2000.words(),
    treebank.words()
]
# Sentence-level views; note nps_chat uses posts rather than sentences.
corp_sents_tagged = [
    brown.tagged_sents(tagset=CONST_tagset),
    nps_chat.tagged_posts(tagset=CONST_tagset),
    conll2000.tagged_sents(tagset=CONST_tagset),
    treebank.tagged_sents(tagset=CONST_tagset)
]
corp_sents_untagged = [
    brown.sents(),
    nps_chat.posts(),
    conll2000.sents(),
    treebank.sents()
]

# language tool spell checker
lt_check = language_check.LanguageTool('en-US')
# pyenchant spell checker
# pe_check = enchant.Dict('en_US')