def tag_eptmk(show_stats=True):
    """Tag the Epistemonikos corpus sentences with the pickled POS tagger.

    The sentences from ``epistemonikos.corpus_load().sents()`` go through
    three passes: the tagger unpickled from ``tagger.pickle`` (via
    ``batch_tag``), then ``dict_tagger`` with ``EPTMK_DICT``, then
    ``wordnet_tagger``.  ``none_tag=None`` is passed to every helper.

    Args:
        show_stats: When truthy, print a label ('BACKOFF:', '+ DICT:',
            '+ WNET:') and call ``pprint_tag_stats`` after each pass.

    Returns:
        The output of the final ``wordnet_tagger`` pass, or ``None`` when
        ``tagger.pickle`` does not exist (a hint is printed in that case).
    """
    eptmk_untagged = epistemonikos.corpus_load().sents()

    # Guard clause: without the pickled tagger there is nothing to run.
    if not os.path.isfile('tagger.pickle'):
        print("\ntagger.pickle file doesn't exist! Use save_tagger() function"
              " to create it.\n")
        return None

    def report(label, tagged):
        # Single place for the per-pass statistics output.
        if show_stats:
            print(label)
            pprint_tag_stats(tagged, none_tag=None)

    # NOTE: unpickling can execute arbitrary code; only load a tagger.pickle
    # that was produced locally (the hint above points at save_tagger()).
    with open('tagger.pickle', 'rb') as tagger_file:
        trained_tagger = cPickle.load(tagger_file)
    # Announce success only once the load has actually succeeded.
    print('\nLoaded and unpickled POS tagger.\n')

    eptmk_tagged = trained_tagger.batch_tag(eptmk_untagged)
    report('BACKOFF:', eptmk_tagged)

    eptmk_tagged = dict_tagger(eptmk_tagged, EPTMK_DICT, none_tag=None)
    report('+ DICT:', eptmk_tagged)

    eptmk_tagged = wordnet_tagger(eptmk_tagged, none_tag=None)
    report('+ WNET:', eptmk_tagged)

    return eptmk_tagged
def tag_eptmk(show_stats=True):
    """Tag the Epistemonikos corpus sentences with the pickled POS tagger.

    The sentences from ``epistemonikos.corpus_load().sents()`` go through
    three passes: the tagger unpickled from ``tagger.pickle`` (via
    ``batch_tag``), then ``dict_tagger`` with ``EPTMK_DICT``, then
    ``wordnet_tagger``.  ``none_tag=None`` is passed to every helper.

    Args:
        show_stats: When truthy, print a label ('BACKOFF:', '+ DICT:',
            '+ WNET:') and call ``pprint_tag_stats`` after each pass.

    Returns:
        The output of the final ``wordnet_tagger`` pass, or ``None`` when
        ``tagger.pickle`` does not exist (a hint is printed in that case).
    """
    eptmk_untagged = epistemonikos.corpus_load().sents()

    # Guard clause: without the pickled tagger there is nothing to run.
    if not os.path.isfile('tagger.pickle'):
        print("\ntagger.pickle file doesn't exist! Use save_tagger() function"
              " to create it.\n")
        return None

    def report(label, tagged):
        # Single place for the per-pass statistics output.
        if show_stats:
            print(label)
            pprint_tag_stats(tagged, none_tag=None)

    # NOTE: unpickling can execute arbitrary code; only load a tagger.pickle
    # that was produced locally (the hint above points at save_tagger()).
    with open('tagger.pickle', 'rb') as tagger_file:
        trained_tagger = cPickle.load(tagger_file)
    # Announce success only once the load has actually succeeded.
    print('\nLoaded and unpickled POS tagger.\n')

    eptmk_tagged = trained_tagger.batch_tag(eptmk_untagged)
    report('BACKOFF:', eptmk_tagged)

    eptmk_tagged = dict_tagger(eptmk_tagged, EPTMK_DICT, none_tag=None)
    report('+ DICT:', eptmk_tagged)

    eptmk_tagged = wordnet_tagger(eptmk_tagged, none_tag=None)
    report('+ WNET:', eptmk_tagged)

    return eptmk_tagged
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Word-frequency report for the Epistemonikos corpus.
#
# Prints the 50 most frequent lower-cased alphabetic words, waits for ENTER,
# then prints the 50 most frequent words that are not in NLTK's English
# stopword list.  Each printed entry is a (count, word) tuple.

from collections import Counter, defaultdict  # noqa: F401 -- defaultdict kept; code outside this section may use it

from corpora import epistemonikos
from nltk.corpus import stopwords

# raw_input exists only on Python 2; fall back to input on Python 3.
try:
    _wait_for_user = raw_input
except NameError:
    _wait_for_user = input


def _rank_by_frequency(tokens):
    """Return ``(count, token)`` pairs sorted from most to least frequent.

    Whole tuples are compared, so tokens with equal counts appear in
    reverse token order.
    """
    counts = Counter(tokens)
    return sorted(((n, tok) for tok, n in counts.items()), reverse=True)


eptmk = epistemonikos.corpus_load().words()
words = [w.lower() for w in eptmk if w.isalpha()]

word_freq = _rank_by_frequency(words)
for entry in word_freq[:50]:
    print(entry)

_wait_for_user('press ENTER')

stoplist = stopwords.words('english')
# Build the lookup set once: list membership would be a linear scan per word.
_stopset = frozenset(stoplist)

nonfunc_freq = _rank_by_frequency(w for w in words if w not in _stopset)
for entry in nonfunc_freq[:50]:
    print(entry)