def __init__(self):
    '''Load the saved Brill classifier if present, otherwise train and save one.

    If the path held in ``file`` exists, ``self.classifier`` is unpickled
    from it and nothing else is initialised (``self.bayes`` is not set on
    that path).  Otherwise a NaiveBayesTagger is created, a Brill tagger is
    trained on the first 80% of the simplified-tag Brown sentences
    (at most 10 rules), and the result is pickled to ``file``.
    '''
    # TODO: Fix bug where it loads tagger from calling module dir
    # NOTE(review): ``file`` is not defined in this method; presumably a
    # module-level path (the message below suggests "pos_tagger.pickle").
    if exists(file):
        # NOTE(review): unpickling can execute arbitrary code; only load
        # classifier files from a trusted location.
        with open(file, 'rb') as saved:
            self.classifier = load(saved)
        print('Successfully loaded saved classifier')
        return

    self.bayes = NaiveBayesTagger()
    boundary = int(len(brown.tagged_sents()) * 0.8)
    train = brown.tagged_sents(simplify_tags=True)[:boundary]
    brill_trainer = FastBrillTaggerTrainer(initial_tagger=self.bayes,
                                           templates=templates,
                                           trace=3,
                                           deterministic=True)
    self.classifier = brill_trainer.train(train, max_rules=10)

    print('Saving Taggers to file: "pos_tagger.pickle"')
    # The context manager closes the file even when dump() raises.
    with open(file, 'wb') as out:
        dump(self.classifier, out, 1)
def main():
    """Evaluate unigram taggers, then affix taggers with default and tuned cutoffs.

    Part one trains on Brown 'news' minus its first 100 sentences and tests
    on those 100.  Part two splits 'news' 80/10/10 into train, development
    and test, replaces ``nltk.AffixTagger._train`` / ``.H`` with the
    module's ``_train`` / ``_H``, asks ``optimize_parameter()`` for a cutoff
    and compares the two affix taggers on the test part.
    """
    # run Simple unigram tagger
    news = brown.tagged_sents(categories='news')
    training, testing = news[100:], news[:100]
    default_nn = nltk.DefaultTagger('NN')
    reference_tagger = nltk.UnigramTagger(training, backoff=default_nn)
    simple_tagger = SimpleUnigramTagger(training, backoff=default_nn)
    simple_score = 100.0 * simple_tagger.evaluate(testing)
    print('Simple Unigram tagger accuracy: %4.1f%%' % simple_score)
    reference_score = 100.0 * reference_tagger.evaluate(testing)
    print('Unigram tagger accuracy: %4.1f%%' % reference_score)

    # run affix tagger with entropy
    news = brown.tagged_sents(categories='news')
    train_end = int(0.8 * len(news))
    training = news[:train_end]
    remainder = news[train_end:]
    halfway = int(0.5 * len(remainder))
    development = remainder[:halfway]  # built but not used in this function
    testing = remainder[halfway:]

    # The baseline tagger is trained BEFORE the class is patched below.
    baseline = nltk.AffixTagger(training,
                                backoff=nltk.DefaultTagger('NN'),
                                cutoff=2)
    nltk.AffixTagger._train = _train
    nltk.AffixTagger.H = _H

    optcutoff = optimize_parameter()
    print("the optimal cutoff param is: %d " % optcutoff)
    tuned = nltk.AffixTagger(training,
                             backoff=nltk.DefaultTagger('NN'),
                             cutoff=optcutoff)
    baseline_score = 100.0 * baseline.evaluate(testing)
    print('Unigram tagger accuracy: %4.1f%%' % baseline_score)
    tuned_score = 100.0 * tuned.evaluate(testing)
    print('Unigram tagger accuracy with entropy: %4.1f%%' % tuned_score)
def demo(train_size=100, test_size=100, java_home="/usr/local/jdk1.5.0/", mallet_home="/usr/local/mallet-0.4"):
    """Train a Mallet CRF tagger on Brown 'news' and score it on 'editorial'.

    The first ``train_size`` news sentences are used for training and the
    first ``test_size`` editorial sentences for testing, with every tag cut
    to its first two characters.  Prints the accuracy and one tagged sample
    sentence, deletes the model file and returns the trained tagger.
    """
    import textwrap
    from nltk.corpus import brown

    def fd(sentence, index):
        # Very simple feature detector: word, two-character suffix, length.
        token = sentence[index]
        return {"word": token, "suffix": token[-2:], "len": len(token)}

    # Let nltk know where java & mallet are.
    nltk.internals.config_java(java_home)
    nltk.classify.mallet.config_mallet(mallet_home)

    def strip(corpus):
        # Simplify the tagset a little: just the first 2 chars of each tag.
        shortened = []
        for sent in corpus:
            shortened.append([(w, t[:2]) for (w, t) in sent])
        return shortened

    # Get the training & test corpus.
    brown_train = strip(brown.tagged_sents(categories="news")[:train_size])
    brown_test = strip(brown.tagged_sents(categories="editorial")[:test_size])

    crf = MalletCRF.train(fd, brown_train, transduction_type="VITERBI")  # '/tmp/crf-model',
    sample_words = [w for (w, t) in brown_test[5]]
    sample_output = crf.tag(sample_words)
    acc = nltk.tag.accuracy(crf, brown_test)

    print("\nAccuracy: %.1f%%" % (acc * 100))
    print("Sample output:")
    rendered = " ".join("%s/%s" % pair for pair in sample_output)
    print(textwrap.fill(rendered, initial_indent=" ", subsequent_indent=" ") + "\n")

    # Clean up
    print("%s %s" % ("Clean-up: deleting", crf.filename))
    os.remove(crf.filename)
    return crf
def demo(train_size=100, test_size=100, java_home=None, mallet_home=None):
    """Train a Mallet CRF tagger on Brown 'news' and score it on 'editorial'.

    Uses the first ``train_size`` news sentences for training and the first
    ``test_size`` editorial sentences for testing, with tags reduced to two
    characters.  Prints accuracy and a sample tagging, removes the model
    file and returns the trained tagger.
    """
    import textwrap
    from nltk.corpus import brown

    def fd(sentence, index):
        # Very simple feature detector.
        word = sentence[index]
        features = {}
        features['word'] = word
        features['suffix'] = word[-2:]
        features['len'] = len(word)
        return features

    def strip(corpus):
        # Keep only the first 2 characters of every tag.
        return [[(w, t[:2]) for (w, t) in sent] for sent in corpus]

    # Let nltk know where java & mallet are.
    nltk.internals.config_java(java_home)
    nltk.classify.mallet.config_mallet(mallet_home)

    # Get the training & test corpus.
    news_sents = brown.tagged_sents(categories='news')
    brown_train = strip(news_sents[:train_size])
    editorial_sents = brown.tagged_sents(categories='editorial')
    brown_test = strip(editorial_sents[:test_size])

    crf = MalletCRF.train(fd, brown_train,  # '/tmp/crf-model',
                          transduction_type='VITERBI')
    sixth_sentence = [w for (w, t) in brown_test[5]]
    sample_output = crf.tag(sixth_sentence)
    acc = nltk.tag.accuracy(crf, brown_test)

    print('\nAccuracy: %.1f%%' % (acc*100))
    print('Sample output:')
    tagged_text = ' '.join('%s/%s' % w for w in sample_output)
    wrapped = textwrap.fill(tagged_text,
                            initial_indent=' ',
                            subsequent_indent=' ')
    print(wrapped+'\n')

    # Clean up
    print('Clean-up: deleting', crf.filename)
    os.remove(crf.filename)

    return crf
def test_sentences(categories=[]):
    """returns a test sentence set: [[(word, tag), ..], [(word, tag), ..], ..]

    For each requested Brown category the last
    ``int(TEST_PROPORTION * total)`` simplified-tag sentences are collected.

    categories -- Brown category names; an empty list (the default) selects
                  every category.  The default list is never mutated.
    """
    if not categories:
        categories = brown.categories()  # use all of the brown categories
    sents = []
    for category in categories:
        tagged = brown.tagged_sents(categories=category, simplify_tags=True)
        total = len(tagged)
        count = int(TEST_PROPORTION * total)
        # use the last `count` sentences for test.  Slice from an explicit
        # start index: the earlier ``[-count:-1]`` dropped the final
        # sentence and, when count was 0, returned nearly the whole
        # category instead of nothing.
        sents += tagged[max(0, total - count):]
    return sents
def training_sentences(use=1.0, categories=[]):
    """returns a training sentence set: [[(word, tag), ..], [(word, tag), ..], ..]

    For each requested Brown category the first
    ``int((1 - TEST_PROPORTION) * use * total) - 1`` simplified-tag
    sentences are collected (never fewer than zero).

    use        -- multiplier applied to the training share of each category.
    categories -- Brown category names; an empty list (the default) selects
                  every category.  The default list is never mutated.
    """
    if not categories:
        categories = brown.categories()  # use all of the brown categories
    sents = []
    for category in categories:
        tagged = brown.tagged_sents(categories=category, simplify_tags=True)
        total = len(tagged)
        # Clamp at zero: a negative bound (use=0 or an empty category) would
        # otherwise be read by the slice as "everything but the last".
        limit = max(0, int((1 - TEST_PROPORTION) * use * total) - 1)
        # use the first `limit` sentences for training
        sents += tagged[:limit]
    return sents
def exercise2():
    """Train a trigram tagger on Brown 'news' and print its score on 'news' and 'lore'."""
    print('')
    print("Exercise 2:")

    news_sents = bn.tagged_sents(categories='news')
    lore_sents = bn.tagged_sents(categories='lore')
    tagger = nltk.TrigramTagger(news_sents)

    # Evaluate on the training category first, then on the unseen one.
    news_score = tagger.evaluate(news_sents)
    lore_score = tagger.evaluate(lore_sents)

    print("Evaluation of the trigram tagger on 'News': %f " % news_score)
    print("Evaluation of the trigram tagger on 'Lore': %f " % lore_score)
    print('')
def precisionRecall():
    """Print a confusion matrix of tagger ``t2`` against the gold 'editorial' tags."""

    def tag_list(tagged_sents):
        # Flatten tagged sentences into one list of tags.
        tags = []
        for sent in tagged_sents:
            for (_word, tag) in sent:
                tags.append(tag)
        return tags

    def apply_tagger(tagger, corpus):
        # Re-tag every sentence from its bare words.
        retagged = []
        for sent in corpus:
            retagged.append(tagger.tag(nltk.tag.untag(sent)))
        return retagged

    gold = tag_list(brown.tagged_sents(categories='editorial'))
    predicted_sents = apply_tagger(t2, brown.tagged_sents(categories='editorial'))
    test = tag_list(predicted_sents)

    cm = nltk.ConfusionMatrix(gold, test)
    print(cm.pp(sort_by_count=True, show_percents=True, truncate=9))
def evaluate(self):
    '''Score the classifier on treebank, conll2000 and held-out brown data.

    Returns a ``(treebank, conll2000, brown)`` tuple of percentages: the
    first 100 sentences of treebank and conll2000, and the final 20% of
    the brown sentences.
    '''
    def percent(sentences):
        return (100 * self.classifier.evaluate(sentences))

    treebank_result = percent(treebank.tagged_sents()[:100])
    conll2000_result = percent(conll2000.tagged_sents()[:100])

    held_out_start = int(len(brown.tagged_sents()) * 0.8)
    brown_result = percent(brown.tagged_sents()[held_out_start:])

    return (treebank_result, conll2000_result, brown_result)
def testSet():
    """Walk through three ways of splitting Brown data into train and test sets.

    Each split overwrites the previous ``train_set`` / ``test_set`` and the
    function returns nothing.
    """
    # 1) Random split: shuffle the 'news' sentences, first 10% is the test set.
    shuffled = list(brown.tagged_sents(categories='news'))
    random.shuffle(shuffled)
    cut = int(len(shuffled) * 0.1)
    train_set = shuffled[cut:]
    test_set = shuffled[:cut]

    # 2) Split by file: the first 10% of the 'news' file ids is the test set.
    news_files = brown.fileids(categories='news')
    cut = int(len(news_files) * 0.1)
    train_set = brown.tagged_sents(news_files[cut:])
    test_set = brown.tagged_sents(news_files[:cut])

    # 3) Split by genre: train on 'news', test on 'fiction'.
    train_set = brown.tagged_sents(categories='news')
    test_set = brown.tagged_sents(categories='fiction')
def get_tagged_tokens(self, corpus=TAGGED, testing=False):
    """Tokenize, segment, and tag all the files in a directory.

    With ``testing`` set, the tagger is built from the Brown 'news'
    sentences only; otherwise from all Brown sentences.  Returns the result
    of ``tag_token_spans`` on the tokenized corpus.
    """
    # A smaller training corpus keeps test runs from taking years.
    if testing:
        training_sents = brown.tagged_sents(categories='news')
    else:
        training_sents = brown.tagged_sents()
    tagger = build_trainer(training_sents)

    return tag_token_spans(self.tokenize_corpus(corpus), tagger)
def exercise1():
    """Train a unigram tagger on Brown 'news', score it, and tag one 'lore' sentence."""
    print('')
    print("Exercise 1:")

    news_sents = bn.tagged_sents(categories='news')
    lore_sents = bn.tagged_sents(categories='lore')
    tagger = nltk.UnigramTagger(news_sents)

    news_score = tagger.evaluate(news_sents)
    lore_score = tagger.evaluate(lore_sents)
    print("Evaluation of the unigram tagger on 'News': %f " % news_score)
    print("Evaluation of the unigram tagger on 'Lore': %f " % lore_score)

    # Tag the sentence at index 200 before printing the heading.
    sample = bn.sents(categories='lore')[200]
    tagged_sample = tagger.tag(sample)
    print("Tagged words for 200th sentence of 'Brown' corpus of category 'Lore' is: ")
    print(tagged_sample)
    print('')
def __init__(self):
    """Train ``self.bayes`` on the first 80% of the simplified-tag Brown sentences.

    One training example is produced per word: the features returned by
    ``self.featextract`` for that position, paired with the word's tag.
    """
    cutoff = int(len(brown.tagged_sents()) * 0.8)
    labelled = []
    for tagged in brown.tagged_sents(simplify_tags=True)[:cutoff]:
        words = untag(tagged)
        # Tags already seen in this sentence; the same list object is
        # handed to featextract on every call and grows as we go.
        history = []
        for position, (_word, tag) in enumerate(tagged):
            features = self.featextract(words, position, history)
            labelled.append((features, tag))
            history.append(tag)
    self.bayes = naivebayes.NaiveBayesClassifier.train(labelled)
def ch05_11_train_test_affix_tagger():
    """Build an AffixTagger from each news word's most frequent tag and print its 'editorial' score."""
    from nltk.corpus import brown

    news_vocab = nltk.FreqDist(brown.words(categories="news"))
    tags_by_word = nltk.ConditionalFreqDist(brown.tagged_words(categories="news"))

    # Map every word seen in 'news' to its likeliest tag.
    likeliest_tag = {}
    for word in news_vocab.keys():
        likeliest_tag[word] = tags_by_word[word].max()

    affix_tagger = nltk.AffixTagger(model=likeliest_tag)
    score = affix_tagger.evaluate(brown.tagged_sents(categories="editorial"))
    print(score)
def exploreTaggedCorpora():
    """Explore the tagged Brown corpus.

    Tabulates the tags that follow "often" in 'learned', prints every
    verb-TO-verb word triple in the whole corpus, and lists the 'news'
    words that occur with more than three different tags.
    """
    # Words following "often"; the sorted result is not kept.
    learned_words = brown.words(categories="learned")
    sorted(set(b for (a, b) in nltk.ibigrams(learned_words) if a == "often"))

    # Tags of the words following "often".
    learned_tagged = brown.tagged_words(categories="learned", simplify_tags=True)
    following_tags = []
    for (first, second) in nltk.ibigrams(learned_tagged):
        if first[0] == "often":
            following_tags.append(second[1])
    nltk.FreqDist(following_tags).tabulate()

    def process(sentence):
        # Print "<verb> to <verb>" style triples.
        for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
            if not t1.startswith("V"):
                continue
            if t2 == "TO" and t3.startswith("V"):
                print(" ".join((w1, w2, w3)))

    for tagged_sent in brown.tagged_sents():
        process(tagged_sent)

    # Words that are highly ambiguous as to their tag.
    news_tagged = brown.tagged_words(categories="news", simplify_tags=True)
    data = nltk.ConditionalFreqDist((word.lower(), tag) for (word, tag) in news_tagged)
    for word in data.conditions():
        if len(data[word]) <= 3:
            continue
        tags = data[word].keys()
        print("%s %s" % (word, " ".join(tags)))
def getTaggerAndTestSetInSimplifiedMode(taggerName):
    """Return ``(tagger, test_sentences)`` for the named tagger, simplified tagset.

    All taggers are trained on Brown 'news' minus its first 100 sentences;
    those 100 sentences are the test set.  Recognised names are
    "DefaultTagger", "RegExpTagger", "AffixTagger", "UnigramTagger" and
    "BigramTagger"; any other name yields ``None``.
    """
    news = brown.tagged_sents(categories='news', simplify_tags=True)
    training = news[100:]
    testing = news[:100]

    patterns = [
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),   # articles
        (r'.*able$', 'JJ'),                # adjectives
        (r'.*ness$', 'NN'),                # nouns formed from adjectives
        (r'.*ly$', 'RB'),                  # adverbs
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # past tense verbs
        (r'.*', 'NN'),                     # nouns (default)
    ]

    # Backoff chain: bigram -> unigram -> affix -> regexp -> default.
    default_tagger = nltk.DefaultTagger('NN')
    regexp_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)
    affix_tagger = nltk.AffixTagger(training, backoff=regexp_tagger)
    unigram_tagger = nltk.UnigramTagger(training, backoff=affix_tagger)
    bigram_tagger = nltk.NgramTagger(2, training, backoff=unigram_tagger)

    available = (
        ("DefaultTagger", default_tagger),
        ("RegExpTagger", regexp_tagger),
        ("AffixTagger", affix_tagger),
        ("UnigramTagger", unigram_tagger),
        ("BigramTagger", bigram_tagger),
    )
    for name, tagger in available:
        if taggerName == name:
            return tagger, testing
def main():
    """Report ``evaluate2`` results for each tagger in the backoff chain.

    Installs the module's ``evaluate2`` on ``nltk.TaggerI``, trains
    regexp/affix/unigram/bigram taggers on Brown 'news' minus its first 100
    sentences, and prints the two values returned by ``evaluate2`` on those
    100 sentences for each tagger.
    """
    nltk.TaggerI.evaluate2 = evaluate2

    news = brown.tagged_sents(categories='news')
    training, testing = news[100:], news[:100]

    patterns = [
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),   # articles
        (r'.*able$', 'JJ'),                # adjectives
        (r'.*ness$', 'NN'),                # nouns formed from adjectives
        (r'.*ly$', 'RB'),                  # adverbs
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # past tense verbs
        (r'.*', 'UNKNOWN'),                # unknown (default)
    ]
    regexp_tagger = nltk.RegexpTagger(patterns, backoff=None)
    at2 = nltk.AffixTagger(training, backoff=regexp_tagger)
    ut3 = nltk.UnigramTagger(training, backoff=at2)
    ct2 = nltk.NgramTagger(2, training, backoff=ut3)

    reports = (
        (regexp_tagger,
         "evaluate2 regExp(default unknown) = accoracy unkown words: %f ,accuracy known words: "),
        (at2,
         "evaluate2 affix(regExp(default unknown)) = accoracy unkown words: %f ,accuracy known words: "),
        (ut3,
         "evaluate2 unigram(affix(regExp(default unknown))) = accoracy unkown words: %f ,accuracy known words: "),
        (ct2,
         "evaluate2 bigram(unigram(affix(regExp(default unknown)))) = accoracy unkown words: %f ,accuracy known words: "),
    )
    for tagger, template in reports:
        e = tagger.evaluate2(testing)
        # The formatted label and the second value are space-separated,
        # exactly as a two-item print statement would emit them.
        print("%s %s" % (template % e[0], e[1]))
def posTagging(self, s):
    """
    POS-tag one tokenized segment.
    input: ['i','love','you']
    output: [('i', 'PRON'), ('love', 'VERB'), ('you', 'PRON')]
    """
    news_sents = brown.tagged_sents(
        tagset='universal', categories='news')

    month = [u'january', u'february', u'march', u'april', u'may',
             u'june', u'july', u'august', u'september', u'october',
             u'november', u'december']
    np_words = [w.lower() for w in names.words()] + month
    # Every name and month is tagged 'NP'.
    np_tags = dict.fromkeys(np_words, 'NP')

    # Backoff chain, innermost first:
    # default -> NP lookup -> unigram -> bigram -> trigram -> regexp.
    tagger = nltk.DefaultTagger('NN')
    tagger = nltk.UnigramTagger(model=np_tags, backoff=tagger)
    tagger = nltk.UnigramTagger(news_sents, backoff=tagger)
    tagger = nltk.BigramTagger(news_sents, backoff=tagger)
    tagger = nltk.TrigramTagger(news_sents, backoff=tagger)

    patterns = [(r'\bi\b', 'PRON')]
    tagger = nltk.RegexpTagger(patterns, backoff=tagger)

    return self.encodeutf8(tagger.tag(s))
def update_category_by_pos():
    """Train a decision-tree POS classifier on suffix features and print its accuracy.

    Features are the last one, two and three characters of the word plus
    the previous word.  The first 10% of the Brown 'news' feature sets is
    the test set, the rest the training set.
    """
    from nltk.corpus import brown
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.tag import untag
    from nltk import DecisionTreeClassifier

    def pos_features(sentence, i):
        token = sentence[i]
        if i == 0:
            previous = '<start>'
        else:
            previous = sentence[i - 1]
        return {
            'suffix(1)': token[-1:],
            'suffix(2)': token[-2:],
            'suffix(3)': token[-3:],
            'prev-word': previous,
        }

    print(pos_features(brown.sents()[0], 8))

    featuresets = []
    for tagged_sent in brown.tagged_sents(categories='news'):
        words = untag(tagged_sent)
        featuresets.extend(
            (pos_features(words, i), tag)
            for i, (_word, tag) in enumerate(tagged_sent)
        )

    size = int(len(featuresets) * 0.1)
    test_set = featuresets[:size]
    train_set = featuresets[size:]

    # A naive Bayes run (NaiveBayesClassifier.train(train_set)) was used
    # here earlier; the output label below still says 'NaiveBay'.
    classifier = DecisionTreeClassifier.train(train_set)
    print('NaiveBay %f' % classify.accuracy(classifier, test_set))
def __init__(self):
    """Build the fast POS taggers and the semi-CFG merge table.

    Sets ``self.unigram_tagger``, ``self.bigram_tagger`` (both trained on
    Brown 'news', backing off to a regexp tagger) and ``self.cfg``.
    """
    # This is our fast Part of Speech tagger
    news_sents = brown.tagged_sents(categories=['news'])
    patterns = [
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
        (r'(-|:|;)$', ':'),
        (r'\'*$', 'MD'),
        (r'(The|the|A|a|An|an)$', 'AT'),
        (r'.*able$', 'JJ'),
        (r'^[A-Z].*$', 'NNP'),
        (r'.*ness$', 'NN'),
        (r'.*ly$', 'RB'),
        (r'.*s$', 'NNS'),
        (r'.*ing$', 'VBG'),
        (r'.*ed$', 'VBD'),
        (r'.*', 'NN'),
    ]
    fallback = nltk.RegexpTagger(patterns)
    self.unigram_tagger = nltk.UnigramTagger(news_sents, backoff=fallback)
    self.bigram_tagger = nltk.BigramTagger(news_sents,
                                           backoff=self.unigram_tagger)

    # This is our semi-CFG; Extend it according to your own needs
    self.cfg = {
        "NNP+NNP": "NNP",
        "CD+CD": "CD",
        "NN+NN": "NNI",
        "NNI+NN": "NNI",
        "JJ+JJ": "JJ",
        "JJ+NN": "NNI",
        "VBN+NNS": "NNP",
    }

    # Writes each stop word back into its own slot; the values are unchanged.
    for position, stop_word in enumerate(STOP_WORDS):
        STOP_WORDS[position] = stop_word
def read_datas(self):
    """Split the tagged Brown sentences 90/10 into training and test sets.

    Returns a ``(train_set, test_set)`` tuple: the first 90% of
    ``brown.tagged_sents()`` and the remaining sentences.
    """
    # The untagged ``brown.sents()`` view built here previously was never
    # used, so it is no longer created.
    tagged_sentences = brown.tagged_sents()
    size = int(len(tagged_sentences) * 0.9)
    train_set = tagged_sentences[:size]
    test_set = tagged_sentences[size:]
    return (train_set, test_set)
def verbs(): wsj = nltk.corpus.treebank.tagged_words(simplify_tags=True) # word_tag_fd = nltk.FreqDist(wsj) # print [word + "/" + tag for (word, tag) in word_tag_fd if tag.startswith('V')] cfd1 = nltk.ConditionalFreqDist(wsj) print cfd1['yield'].keys() print cfd1['cut'].keys() print [w for w in cfd1.conditions() if 'VD' in cfd1[w] and 'VN' in cfd1[w]] idx1 = wsj.index(('kicked', 'VD')) print wsj[idx1-4:idx1+1] idx2 = wsj.index(('kicked', 'VN')) print wsj[idx2-4:idx2+1] def findtags(tag_prefix, tagged_text): cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text if tag.startswith(tag_prefix)) return dict((tag, cfd[tag].keys()[:5]) for tag in cfd.conditions()) def process(sentence): for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence): if (t1.startswith('V') and t2 == 'TO' and t3.startswith('V')) : print w1, w2, w3 if __name__ == "__main__": tagdict = findtags('NN', nltk.corpus.brown.tagged_words(categories='news')) for tag in sorted(tagdict): print tag, tagdict[tag] for tagged_sent in brown.tagged_sents(): process(tagged_sent)
def create_tagger():
    """Train a tagger from the Brown Corpus.

    This should not be called very often; only in the event that the tagger
    pickle wasn't found.  Returns a trigram tagger that backs off to bigram,
    unigram and finally regexp taggers.
    """
    print("Building tagger...")
    train_sents = brown.tagged_sents()

    # These regexes were lifted from the NLTK book tagger chapter.
    patterns = [
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),   # articles
        (r'.*able$', 'JJ'),                # adjectives
        (r'.*ness$', 'NN'),                # nouns formed from adjectives
        (r'.*ly$', 'RB'),                  # adverbs
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # past tense verbs
        (r'.*', 'NN'),                     # nouns (default)
    ]
    tagger = nltk.RegexpTagger(patterns)
    print("got t0")

    # Stack the n-gram taggers, each backing off to the previous one.
    stages = (
        (nltk.UnigramTagger, "got t1"),
        (nltk.BigramTagger, "got t2"),
        (nltk.TrigramTagger, "Built tagger!"),
    )
    for tagger_class, progress in stages:
        tagger = tagger_class(train_sents, backoff=tagger)
        print(progress)
    return tagger
def __init__(self):
    """Load (or rebuild and cache) the tagger and chunker, then the entity indexes.

    The tagger and the NP chunker are unpickled from 'nerdb_tagger.pkl' and
    'nerdb_chunker.pkl'; when a file cannot be read (IOError) the model is
    retrained and pickled back to that file.  Sets ``self.chunker``,
    ``self.people``, ``self.movies`` and ``self.entity_types``.

    Every file is opened in a ``with`` block so its handle is closed
    promptly, including when loading or dumping raises.
    """
    def read_index(path):
        # One entry per line, split once on the first space.
        with open(path) as index_file:
            return [line.strip().split(" ", 1) for line in index_file]

    # NOTE(review): unpickling can execute arbitrary code; the .pkl files
    # must come from a trusted location.
    try:
        with open('nerdb_tagger.pkl') as cached:
            tagger = cPickle.load(cached)
    except IOError:
        print('failed to load nerdb_tagger, recreating...')
        train_sents = conll2000.tagged_sents() + brown.tagged_sents()
        # Backoff chain: trigram -> bigram -> unigram -> default 'NN'.
        tagger = nltk.DefaultTagger('NN')
        tagger = nltk.UnigramTagger(train_sents, backoff=tagger)
        tagger = nltk.BigramTagger(train_sents, backoff=tagger)
        tagger = nltk.TrigramTagger(train_sents, backoff=tagger)
        with open('nerdb_tagger.pkl', 'w') as cached:
            cPickle.dump(tagger, cached)
        print('done')

    try:
        with open('nerdb_chunker.pkl') as cached:
            chunker = cPickle.load(cached)
    except IOError:
        print('failed to load nerdb_chunker, recreating...')
        train_sents = conll2000.chunked_sents()
        chunker = ConsecutiveNPChunker(tagger, train_sents)
        with open('nerdb_chunker.pkl', 'w') as cached:
            cPickle.dump(chunker, cached)
        print('done')
    self.chunker = chunker

    self.people = read_index('actors_index.txt')
    self.people += read_index('actresses_index.txt')
    self.movies = read_index('title_index.txt')
    self.entity_types = {'PERSON' : self.people, 'MOVIE' : self.movies}
def auto_tag(company):
    """
    tag a given text using brown corpus and unigram tagger

    The review file for ``company`` is read, tokenized and lower-cased, and
    tagged with a unigram tagger trained on the Brown 'news' sentences
    (universal tagset).

    :param company: company whose reviews are tagged
    :return: a list of tagged words; an empty list when no review file
             exists for the company (a message is printed in that case)
    """
    brown_tagged_sents = brown.tagged_sents(categories = 'news', tagset='universal')

    # first deal with unique cases such as General Motors => GM
    if company == 'General Motors':
        company = 'GM'
    elif company == 'Ford Motor Company':
        company = 'Ford'

    # NOTE(review): capitalize() lower-cases everything after the first
    # letter, so 'GM' becomes 'Gm' in the file name -- confirm that matches
    # the files on disk.
    review_path = ('/Users/vickyzhang/Documents/Python/chart/comp/review/'
                   + company.capitalize() + '_review.txt')

    # open the review of a company, and print error message if company
    # review doesn't exist
    try:
        with open(review_path) as review_file:
            text = review_file.read()
    except FileNotFoundError:
        print('The system doesn\'t have a review for the company you entered. Please enter another company.')
        # Stop here: continuing would fail on the unassigned ``text``.
        return []

    # normalize (tokenize and lowercase-ize) each word in the string
    text_token = nltk.word_tokenize(text)
    text_normal = [w.lower() for w in text_token]

    # build unigram tagger based on brown corpus, and use it to tag the
    # normalized text
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
    text_tagged = unigram_tagger.tag(text_normal)
    return text_tagged
def get_medium_tagged_sentence_tuples(MIN=3, MAX=4):
    '''
    THIS FUNCTION IS NEVER USED
    Produces ~36 POS tags

    Returns the Brown sentences whose length is in (MIN, MAX] and whose last
    word is '.', without that last token and with every tag passed through
    ``simplify``.
    '''
    selected = []
    for sent in brown.tagged_sents():
        if not (MIN < len(sent) <= MAX):
            continue
        if sent[-1][0] != '.':
            continue
        body = sent[:-1]  # drop the closing full stop
        selected.append([(wrd, simplify(tag)) for wrd, tag in body])
    return selected
def __init__(self):
    """Load (or rebuild and cache) the tagger and chunker, then the lookup data.

    The tagger and the NP chunker are unpickled from "nerdb_tagger.pkl" and
    "nerdb_chunker.pkl"; when a file cannot be read (IOError) the model is
    retrained and pickled back to that file.  Sets ``self.chunker``,
    ``self.people``, ``self.movies``, ``self.entity_types`` and
    ``self.numbers``.

    Every file is opened in a ``with`` block so its handle is closed
    promptly, including when loading or dumping raises.
    """
    def read_index(path):
        # One entry per line, split once on the first space.
        with open(path) as index_file:
            return [line.strip().split(" ", 1) for line in index_file]

    # NOTE(review): unpickling can execute arbitrary code; the .pkl files
    # must come from a trusted location.
    try:
        with open("nerdb_tagger.pkl") as cached:
            tagger = cPickle.load(cached)
    except IOError:
        print("failed to load nerdb_tagger, recreating...")
        train_sents = conll2000.tagged_sents() + brown.tagged_sents()
        # Backoff chain: trigram -> bigram -> unigram -> default "NN".
        tagger = nltk.DefaultTagger("NN")
        tagger = nltk.UnigramTagger(train_sents, backoff=tagger)
        tagger = nltk.BigramTagger(train_sents, backoff=tagger)
        tagger = nltk.TrigramTagger(train_sents, backoff=tagger)
        with open("nerdb_tagger.pkl", "w") as cached:
            cPickle.dump(tagger, cached)
        print("done")

    try:
        with open("nerdb_chunker.pkl") as cached:
            chunker = cPickle.load(cached)
    except IOError:
        print("failed to load nerdb_chunker, recreating...")
        train_sents = conll2000.chunked_sents()
        chunker = ConsecutiveNPChunker(tagger, train_sents)
        with open("nerdb_chunker.pkl", "w") as cached:
            cPickle.dump(chunker, cached)
        print("done")
    self.chunker = chunker

    self.people = read_index("actors_index.txt")
    self.people += read_index("actresses_index.txt")
    self.movies = read_index("title_index.txt")
    self.entity_types = {"PERSON": self.people, "MOVIE": self.movies}

    # NOTE(review): eval() executes whatever numbers.txt contains; keep the
    # file trusted, or switch to ast.literal_eval if it only holds a literal.
    with open("numbers.txt") as numbers_file:
        self.numbers = eval(numbers_file.read())
def get_pos_tagger(self):
    """Return a POS tagger with quantifier overrides on top of n-gram taggers.

    The returned regexp tagger tags a/an as 'ex_quant' and every/all as
    'univ_quant', and backs off to trigram, bigram and unigram taggers
    trained on Brown 'news', then to a generic regexp tagger.
    """
    from nltk.corpus import brown

    fallback_patterns = [
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),   # articles
        (r'.*able$', 'JJ'),                # adjectives
        (r'.*ness$', 'NN'),                # nouns formed from adjectives
        (r'.*ly$', 'RB'),                  # adverbs
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # past tense verbs
        (r'.*', 'NN'),                     # nouns (default)
    ]
    tagger = RegexpTagger(fallback_patterns)

    news_sents = brown.tagged_sents(categories='news')
    for tagger_class in (UnigramTagger, BigramTagger, TrigramTagger):
        tagger = tagger_class(news_sents, backoff=tagger)

    # Override particular words
    quantifier_patterns = [
        (r'(A|a|An|an)$', 'ex_quant'),
        (r'(Every|every|All|all)$', 'univ_quant'),
    ]
    return RegexpTagger(quantifier_patterns, backoff=tagger)
def _setSelectedPOSTags(self):
    """Populate ``self.selective_pos`` with frequent ADJ/ADV/CNJ words.

    A previously saved distribution is loaded from 'selective_pos.bin' when
    available.  Otherwise every lower-cased Brown word tagged ADJ, ADV or
    CNJ (simplified tagset) is counted, the words seen more than 4 times
    are recorded per tag, and the result is saved back to that file.
    """
    # Assigned before the cache check so the attribute exists on both
    # paths; it used to be set only when the cache was missing.
    self.selected_tags = ["ADJ", "ADV", "CNJ"]

    buff = self._loadData('selective_pos.bin')
    if buff:
        self.selective_pos = buff
        return

    # First get all (word, tag) in corpuses
    sentences = brown.tagged_sents(simplify_tags=True)
    self.selective_pos = ConditionalFreqDist()
    temp_dist = ConditionalFreqDist()
    for sentence in sentences:
        for (word, tag) in sentence:
            if tag in self.selected_tags:
                temp_dist[tag].inc(str(word).lower())

    # Now, get the words with frequency > 4
    for category in temp_dist.conditions():
        fredist = temp_dist[category]
        for key in fredist.keys():
            if fredist[key] > 4:
                # Each kept word is recorded once, not with its count.
                self.selective_pos[category].inc(key)

    self._saveData('selective_pos.bin', self.selective_pos)
def __init__(self):
    """Initialization method of :class:`TopicExtractor` class.

    Sets ``self.bigram_tagger`` (trained on Brown 'news', backing off to a
    unigram and then a regexp tagger) and the ``self.cfg`` merge table.
    """
    # This is our fast Part of Speech tagger
    #############################################################################
    news_sents = brown.tagged_sents(categories='news')
    patterns = [
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
        (r'(-|:|;)$', ':'),
        (r'\'*$', 'MD'),
        (r'(The|the|A|a|An|an)$', 'AT'),
        (r'.*able$', 'JJ'),
        (r'^[A-Z].*$', 'NNP'),
        (r'.*ness$', 'NN'),
        (r'.*ly$', 'RB'),
        (r'.*s$', 'NNS'),
        (r'.*ing$', 'VBG'),
        (r'.*ed$', 'VBD'),
        (r'.*', 'NN'),
    ]
    fallback = nltk.RegexpTagger(patterns)
    unigram = nltk.UnigramTagger(news_sents, backoff=fallback)
    self.bigram_tagger = nltk.BigramTagger(news_sents, backoff=unigram)
    #############################################################################

    # This is our semi-CFG; Extend it according to your own needs
    #############################################################################
    self.cfg = {
        "NNP+NNP": "NNP",
        "NN+NN": "NNI",
        "NNI+NN": "NNI",
        "JJ+JJ": "JJ",
        "JJ+NN": "NNI",
    }
def main():
    """Train a logistic-regression tagger on Brown and decode "test.txt" with Viterbi.

    Fills the module-level ``rare_words``, ``feature_dict`` and ``tag_dict``
    while building the training matrices, fits the model, then prints the
    ``viterbi`` result for every sentence returned by ``load_test``.
    """
    brown_sentences = brown.tagged_sents(tagset='universal')

    # Separate every tagged sentence into parallel word and tag lists.
    train_sentences = []
    train_tags = []
    for tagged in brown_sentences:
        train_sentences.append([word for (word, _pos) in tagged])
        train_tags.append([pos for (_word, pos) in tagged])

    # Count word occurrences; words seen fewer than 5 times are rare.
    vocabulary_dict = {}
    for words in train_sentences:
        for word in words:
            vocabulary_dict[word] = vocabulary_dict.get(word, 0) + 1
    for (word, count) in vocabulary_dict.items():
        if count < 5:
            rare_words.add(word)

    # One feature list per sentence; the tag before the first word is '<S>'.
    training_features = []
    for sentence_index, sentence in enumerate(train_sentences):
        sentence_features = []
        for word_index, word in enumerate(sentence):
            if word_index == 0:
                prevtag = '<S>'
            else:
                prevtag = train_tags[sentence_index][word_index - 1]
            sentence_features.append(
                get_features(word_index, sentence, prevtag, rare_words))
        training_features.append(sentence_features)
    # The list carries one extra, empty entry after the last sentence.
    training_features.append([])

    training_features, non_rare_features = remove_rare_features(
        training_features, 5)

    # Number the surviving features and the tags.
    for position, feature in enumerate(non_rare_features):
        feature_dict[feature] = position

    tagset = set()
    for tags in train_tags:
        tagset.update(tags)
    for position, tag in enumerate(tagset):
        tag_dict[tag] = position

    Y_train = build_Y(train_tags)
    X_train = build_X(training_features)
    model = LogisticRegression(class_weight='balanced',
                               solver='saga',
                               multi_class='multinomial')
    model.fit(X_train, Y_train)

    # Decode the test sentences one at a time.
    for sentence in load_test("test.txt"):
        Y_pred, Y_start = get_predictions([sentence], model)
        print(viterbi(Y_start, Y_pred))
fd.tabulate()
# The bare string below records the table printed above; its note says that
# (in this corpus) the tags most often following "often" are verbs, with no
# nouns.
""" often 后面最高频率的词性是动词,没有名词(该语料库中) VERB ADV ADP ADJ . PRT 37 8 7 6 4 2 """


# Use POS tags to find three-word phrases
def process(sentence):
    # Print every word triple tagged V*, TO, V*.
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
        if t1.startswith('V') and t2 == 'TO' and t3.startswith('V'):
            print(w1, w2, w3)


for tagged_sent in brown.tagged_sents():
    process(tagged_sent)

# Look at words that are highly ambiguous as to their tag;
# seeing each of these words in context helps clarify the distinction
# between the tags.
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
data = nltk.ConditionalFreqDist(
    (word.lower(), tag) for (word, tag) in brown_news_tagged)
for word in data.conditions():
    # Only words seen with more than three different tags.
    if len(data[word]) > 3:
        tags = data[word].keys()
        print(word, ' '.join(tags))

# Open the POS concordance tool
nltk.app.concordance()
CORPUS_LOADED_EVENT = "<<CL_EVENT>>" SEARCH_TERMINATED_EVENT = "<<ST_EVENT>>" SEARCH_ERROR_EVENT = "<<SE_EVENT>>" ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>" POLL_INTERVAL = 50 # NB All corpora must be specified in a lambda expression so as not to be # loaded when the module is imported. _DEFAULT = "English: Brown Corpus (Humor, simplified)" _CORPORA = { "Catalan: CESS-CAT Corpus (simplified)": lambda: cess_cat.tagged_sents( tagset="universal" ), "English: Brown Corpus": lambda: brown.tagged_sents(), "English: Brown Corpus (simplified)": lambda: brown.tagged_sents( tagset="universal" ), "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents( categories=["news", "editorial", "reviews"], tagset="universal" ), "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents( categories="religion", tagset="universal" ), "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents( categories="learned", tagset="universal" ), "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents( categories="science_fiction", tagset="universal" ),
# Ch5 分类和标注词汇 # 词性标注(parts-of-speech tagging,POS tagging):简称标注。 # 将词汇按照它们的词性(parts-of-speech,POS)进行分类并对它们进行标注 # 词性:也称为词类或者词汇范畴。 # 用于特定任务标记的集合被称为一个标记集。 import nltk import pylab from nltk import word_tokenize from nltk.corpus import brown brown_words = brown.words(categories='news') brown_tagged_words = brown.tagged_words(categories='news') brown_sents = brown.sents(categories='news') brown_tagged_sents = brown.tagged_sents(categories='news') # Sec 5.1 使用词性标注器 text = word_tokenize("And now for something completely different") nltk.pos_tag(text) nltk.help.upenn_tagset('CC') nltk.help.upenn_tagset('RB') nltk.help.upenn_tagset('IN') nltk.help.upenn_tagset('NN') nltk.help.upenn_tagset('JJ') nltk.corpus.brown.readme() print(nltk.corpus.gutenberg.readme()) # 处理同形同音异义词,系统正确标注了 # 前面的refUSE是动词,后面的REFuse是名词 # 前面的permit是动词,后面的permit是名字 text = word_tokenize("They refuse to permit us to obtain the refuse permit")
import nltk
from nltk.corpus import brown
from pprint import pprint
import pylab

# Tagged and untagged sentences of the Brown 'religion' category.
brown_tagged_sents = brown.tagged_sents(categories='religion')
brown_sents = brown.sents(categories='religion')

# Create default tagger
# First print the most frequent tag in the category.
tags = [tag for (word, tag) in brown.tagged_words(categories='religion')]
print(nltk.FreqDist(tags).max())
raw = 'The more I think about language, the more it amazes me that people ever understand each other at all'
tokens = nltk.word_tokenize(raw)
# Tags every token 'NN'.
default_tagger = nltk.DefaultTagger('NN')
pprint(default_tagger.tag(tokens))

# Evaluate performance
print(default_tagger.evaluate(brown_tagged_sents))

# Regular Expression Tagger
# (pattern, tag) pairs, listed from specific suffixes down to the catch-all.
patterns = [
    (r'.*ing$', 'VBG'),                 # gerunds
    (r'.*ed$', 'VBD'),                  # simple past
    (r'.*es$', 'VBZ'),                  # 3rd singular present
    (r'.*ould$', 'MD'),                 # modals
    (r'.*\'s$', 'NN$'),                 # possessive nouns
    (r'.*s$', 'NNS'),                   # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),   # cardinal numbers
    (r'.*', 'NN')                       # nouns (default)
]
import nltk from nltk.corpus import brown data = brown.tagged_sents(categories=["adventure"], tagset="universal") #This example to see how NaiveBayes with only one feature works. (The only feature is word by itself) from NaiveBayes import NaiveBayesModel model = NaiveBayesModel() model.train(data) first_sent = data[0] first_sent_words = [w for w, l in first_sent] prediction = model.predict(first_sent_words) print(prediction) # This example to see how Naive Bayes with different features works # So it will have a feature extractor from NaiveBayesUpdate import NaiveBayesModel_v2 import collections model = NaiveBayesModel_v2() def feature_extractor(word): feature_set = {} feature_set["word"] = word return feature_set formated_data = [] for sent in data:
import nltk
# Fetch the tag mapping and the corpus before the corpus reader is imported.
nltk.download('universal_tagset')
nltk.download('brown')
import numpy as np
import copy
import pandas as pd
import math
from tqdm import tqdm
from nltk.corpus import brown as corpus

tagged_words = []  # word with tag
all_tags = []  # tags sequence
my_set = {"START", "END"}  # every word seen, plus the two boundary markers
for sent in corpus.tagged_sents(tagset='universal'):  # get tagged sentences
    # Each sentence is bracketed by START ... END markers in both lists.
    tagged_words.append(("START", "START"))
    all_tags.append("START")
    for (word, tag) in sent:
        all_tags.append(tag)
        # Note the order: pairs are stored as (tag, word).
        tagged_words.append((tag, word))
        my_set.add(word)
    tagged_words.append(("END", "END"))
    all_tags.append("END")
# print(*map(' '.join,nltk.bigrams(all_tags)),sep=' ,')

# Tag inventory, with the START/END markers as first and last entries.
utagset = [
    'START', 'VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP', 'CONJ', 'DET',
    'NUM', 'PRT', 'X', '.', 'END'
]
from nltk.corpus import brown
import nltk
import pickle
import time
import json
import os

# Collect the tagged sentences of every Brown category into one list.
corpus = []
for genre in brown.categories():
    corpus += brown.tagged_sents(categories=genre)

# Every distinct tag that occurs in the corpus.
tag = set()
for sen in corpus:
    for wordset in sen:
        tag.add(wordset[1])

# One tag per line; sorted so the file is reproducible between runs
# (set iteration order is arbitrary).
with open("brown_taglist.txt", "w") as f:
    for each in sorted(tag):
        f.write(each + '\n')

## create word_seg.txt and label_seg.txt
# Raw strings are required here: in a normal literal '\U' starts a unicode
# escape (a SyntaxError on Python 3) and '\a' is the bell character.
with open(r'C:\Users\a\Desktop\data\word_seg.txt', 'w', encoding='utf-8') as temp1, \
        open(r'C:\Users\a\Desktop\data\label_seg.txt', 'w', encoding='utf-8') as temp2:
    for sen in corpus:
        # Words and tags go to parallel files, one sentence per line,
        # each item followed by a single space.
        for wordset in sen:
            temp1.write(wordset[0] + ' ')
            temp2.write(wordset[1] + ' ')
        temp1.write('\n')
        temp2.write('\n')
Implementation of a bigram HMM tagger i. Training phase: Compute the transition and emission probabilities of a bigram HMM tagger directly on the training set using maximum likelihood estimation. ii. Implement the Viterbi algorithm corresponding to the bigram HMM model in a way you can tag any test sentence. iii. Run the algorithm from c)ii) on the test set. Compute the error rate and compare it to the results from b)ii). ''' import math from nltk.corpus import brown from tags_and_words import TAGS, TAG2INDEX, WORDS, WORDS2INDEX from tags_and_words import START, STOP data = brown.tagged_sents(categories="news") train = data[:int(0.9 * len(data))] test = data[int(0.9 * len(data)):] def safe_log(x): if x == 0: return -float("inf") else: return math.log(x) def add_start_and_stop(sent): return [(START, u"START")] + sent + [(STOP, u"STOP")]
cfd = ConditionalFreqDist()

# Get the English stop-word list.
stopwords_list = stopwords.words('english')
# Set copy for O(1) membership tests inside the corpus loop.
_stopword_set = set(stopwords_list)

# Lower-cased Brown tags treated as nouns.
_NOUN_TAGS = frozenset([
    'nn', 'nns', 'nn$', 'nn-tl', 'nn+bez', 'nn+hvz', 'nns$', 'np', 'np$',
    'np+bez', 'nps', 'nps$', 'nr', 'np-tl', 'nrs', 'nr$'
])


# Return True when the tag belongs to the noun family.
def is_noun(tag):
    """Return True if *tag* (case-insensitive) is one of the noun tags."""
    return tag.lower() in _NOUN_TAGS


...

# For every non-stop-word noun, count the non-stop-word nouns that occur in
# the window sentence[index + 1:index + 5], i.e. the next four tokens.
for sentence in brown.tagged_sents():
    for (index, tagtuple) in enumerate(sentence):
        (token, tag) = tagtuple
        token = token.lower()
        if token not in _stopword_set and is_noun(tag):
            window = sentence[index + 1:index + 5]
            for (window_token, window_tag) in window:
                window_token = window_token.lower()
                if window_token not in _stopword_set and is_noun(window_tag):
                    # FreqDist.inc() does not exist in NLTK 3, where FreqDist
                    # is a Counter subclass; increment by item assignment.
                    cfd[token][window_token] += 1

# Done: print the most frequent associated noun for a few words.
print(cfd['left'].max())
print(cfd['life'].max())
print(cfd['man'].max())
print(cfd['woman'].max())
print(cfd['boy'].max())
tags = generate_unique_tags() characters = generate_unique_characters() words, word_embedding = generate_words_embedding(word_embedding_path) # confusion matrix initialization confusion_matrix_train = torch.zeros(tags.id - 1, tags.id - 1, dtype=torch.int32, device=device) confusion_matrix_test = torch.zeros(tags.id - 1, tags.id - 1, dtype=torch.int32, device=device) # Partitioning the dataset corpus = np.array(brown.tagged_sents(tagset='universal')) kf = KFold(n_splits=5, shuffle=True) kf.get_n_splits(corpus) # KFOLD Starts for train_index, test_index in kf.split(corpus): # dataset parsing train_corpus = corpus[train_index] test_corpus = corpus[test_index] word_sequences_train = [[word for (word, tag) in sent] for sent in train_corpus] tag_sequences_train = [[ tags.fetch(tag, 'val') for (word, tag) in sent ] for sent in train_corpus] word_sequences_test = [[word for (word, tag) in sent]
def performance(cfd, wordlist):
    """Score a lookup tagger built from *wordlist* on Brown 'news'.

    Each word in *wordlist* is mapped to ``cfd[word].max()``; that mapping
    is used as the model of a ``UnigramTagger`` which backs off to a
    ``DefaultTagger('NN')``.  Returns the value of the tagger's
    ``evaluate`` call on the tagged 'news' sentences.
    """
    likely_tags = {entry: cfd[entry].max() for entry in wordlist}
    fallback = nltk.DefaultTagger('NN')
    lookup_tagger = nltk.UnigramTagger(model=likely_tags, backoff=fallback)
    gold_sents = brown.tagged_sents(categories='news')
    return lookup_tagger.evaluate(gold_sents)
CORPUS_LOADED_EVENT = "<<CL_EVENT>>" SEARCH_TERMINATED_EVENT = "<<ST_EVENT>>" SEARCH_ERROR_EVENT = "<<SE_EVENT>>" ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>" POLL_INTERVAL = 50 # NB All corpora must be specified in a lambda expression so as not to be # loaded when the module is imported. _DEFAULT = "English: Brown Corpus (Humor, simplified)" _CORPORA = { "Catalan: CESS-CAT Corpus (simplified)": lambda: cess_cat.tagged_sents(tagset="universal"), "English: Brown Corpus": lambda: brown.tagged_sents(), "English: Brown Corpus (simplified)": lambda: brown.tagged_sents(tagset="universal"), "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents(categories=["news", "editorial", "reviews"], tagset="universal"), "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(categories="religion", tagset="universal"), "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(categories="learned", tagset="universal"), "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(categories="science_fiction", tagset="universal"), "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(categories="romance", tagset="universal"), "English: Brown Corpus (Humor, simplified)":
if n=='All': return output_list else: return random.sample(output_list, min(n, len(output_list))) def getReverseDict(inDict): outDict = nltk.defaultdict(list) for key in inDict.keys(): outDict[inDict[key]].append(key) return outDict ## for entry in inDict[key]: ## outDict[entry]=key # get brown_tagged, sentence form brown_tagged_s = brown.tagged_sents() # word form: ##brown_tagged_w = brown.tagged_words() # get all tags used in brown tagging b_tags = sorted(set(entry[1] for entry in brown.tagged_words())) # build a freq dist the long way (but I alread have tags...) tag_count = nltk.defaultdict(int) handlecrapstr('for entry in b_tags: tag_count[entry]') for sent in brown_tagged: for (w, t) in sent: tag_count[t] += 1 # the short way... tag_fd = nltk.FreqDist(entry[1] for sent in brown_tagged for entry in sent)
class NLP():
    """Rule-based sentence interpreter on top of a Brown 'news' unigram tagger.

    ``run`` tokenizes and tags a sentence, asks each ``match_*`` method in
    turn whether it recognises the sentence, and calls the ``process_*``
    handler returned by the first one that does.
    """

    # NOTE(review): the grammar load is disabled, so generate() will raise
    # AttributeError on self._grammar until this line is restored.
    #_grammar = nltk.data.load('file:' + PATH_GRAMMAR_FILE)
    # Trained once, when the class body is executed.
    _unigram_tagger = nltk.UnigramTagger(brown.tagged_sents(categories='news'))
    _question_tags = ["WDT", "WP$", "WPO", "WPS", "WQL", "WRB"]

    def __init__(self):
        # Matchers are tried in this order by run().
        self._input_types = [
            self.match_questions, self.match_commands, self.match_descrptions,
            self.match_explanations
        ]

    @staticmethod
    def _first_tag(tagged_tokens):
        """Return the tag of the first token, or None for an empty sentence."""
        return tagged_tokens[0][1] if tagged_tokens else None

    def generate(self, msg_type):
        """Return one randomly chosen sentence generated from ``msg_type``."""
        answers = list(generate(self._grammar, start=Nonterminal(msg_type)))
        return TreebankWordDetokenizer().detokenize(secrets.choice(answers))

    def run(self, sentence):
        """Tag *sentence* and dispatch it to the first matching handler.

        Returns the handler's result, or False when no matcher accepts the
        sentence (including when the sentence has no tokens).
        """
        tokens = list(nltk.word_tokenize(sentence))
        tagged_tokens = self._unigram_tagger.tag(tokens)
        print(tagged_tokens)
        for pattern in self._input_types:
            handler = pattern(tagged_tokens)
            if handler:
                return handler(tagged_tokens)
        return False

    def match_questions(self, tagged_tokens):
        """Return the question handler if the first tag is a question tag."""
        if self._first_tag(tagged_tokens) in self._question_tags:
            return self.process_questions
        return False

    def match_commands(self, tagged_tokens):
        """Return the command handler if the sentence starts with VB/VBD."""
        if self._first_tag(tagged_tokens) in ("VB", "VBD"):
            return self.process_commands
        return False

    def match_descrptions(self, tagged_tokens):
        """Return the description handler if the first tag is PPSS/EX/CD."""
        if self._first_tag(tagged_tokens) in ("PPSS", "EX", "CD"):
            return self.process_descrptions
        return False

    def match_explanations(self, tagged_tokens):
        """Explanations are not recognised yet; always returns False."""
        return False

    def process_questions(self, tagged_tokens):
        """Handle a "Where ..." question using the fourth token as the item.

        Returns None for any other question word, or when the sentence has
        fewer than four tokens (previously an IndexError).
        """
        question = tagged_tokens[0][0]
        #object = list(filter(lambda x: x[1] == "NN", tagged_tokens))
        if question == "Where" and len(tagged_tokens) > 3:
            return robot.find_location(tagged_tokens[3][0])
        return None

    def process_commands(self, tagged_tokens):
        """Build a ``('cmd', Command(action, target))`` tuple.

        The action is the first word; the target is the last token tagged
        NN, or None when there is none.
        """
        action = tagged_tokens[0][0].encode('ascii', 'ignore')
        target = None
        for word, tag in tagged_tokens:
            if tag == "NN":
                target = word.encode('ascii', 'ignore')
        return ('cmd', Command(action, target))

    def process_descrptions(self, tagged_tokens):
        """Handle statements starting with "I", "There", or a number.

        NOTE(review): the positional indexes below assume fixed sentence
        shapes; shorter sentences raise IndexError - confirm with callers
        before relaxing.
        """
        subject = tagged_tokens[0]
        if subject[0] == "I":
            tagged_verb = tagged_tokens[1][1]
            if tagged_verb == "BEM":
                # "I am leaving" vs "I am <name>"
                if tagged_tokens[2][0] == "leaving":
                    return robot.delete_agent()
                return robot.new_interlocutor(tagged_tokens[2][0])
            elif tagged_tokens[1][0] == "leave":
                return robot.delete_agent()
            elif "CD" in [x[1] for x in tagged_tokens]:
                step = [x[0] for x in tagged_tokens if x[1] == "CD"]
                robot.modify_task_plan(step[0])
        elif subject[0] == "There":
            item = tagged_tokens[3][0]
            attribute = tagged_tokens[4][0]
            location = tagged_tokens[6][0]
            return robot.add_item(item, attribute, location)
        elif subject[1] == "CD":
            if "done" in [x[0] for x in tagged_tokens]:
                return StepCompleted(subject[0])

    def process_explanations(self, tagged_tokens):
        """Placeholder; explanations are not processed."""
        pass
from nltk.corpus import brown
from HMM import HMM

# Fit the project HMM on the 'adventure' category (universal tagset).
tagged_corpus = brown.tagged_sents(categories=["adventure"], tagset="universal")
model = HMM()
model.fit(tagged_corpus)

# Smoke test on a single hand-written sentence.
x = ["The", "man", "certainly", "didn't", "want", "to", "wait"]
pred = model.predict(x)
print(pred)
####### -> Seem work very well :O :O :O

####### Evaluate the performance of the TriHMM
eval_sents = brown.tagged_sents(categories=["romance"], tagset="universal")
x = [[w for w, t in sent] for sent in eval_sents]
y_actual = [[t for w, t in sent] for sent in eval_sents]
y_predict = model.predict_many(x)

# Token-level counts: `correct` matches out of `total` gold tags.
correct = 0
total = 0
for sent_idx, gold_tags in enumerate(y_actual):
    predicted_tags = y_predict[sent_idx]
    for tok_idx, gold in enumerate(gold_tags):
        # The original `if` was missing its trailing colon (SyntaxError) and
        # used `iter` as a loop variable, shadowing the builtin.
        if gold == predicted_tags[tok_idx]:
            correct += 1
        total += 1
import random
from nltk.corpus import brown
import nltk


def _suffix_featuresets(tagged_sents):
    """Flatten tagged sentences into (featureset, tag) pairs.

    NaiveBayesClassifier.train and nltk.classify.accuracy iterate over
    (featureset, label) pairs; a raw tagged sentence is a list of
    (word, tag) tuples and fails to unpack.  A minimal suffix-based
    featureset is used here as a baseline.
    """
    return [
        ({
            'suffix(1)': word[-1:],
            'suffix(2)': word[-2:],
            'suffix(3)': word[-3:],
        }, tag)
        for sent in tagged_sents
        for (word, tag) in sent
    ]


def _train_and_report(train_sents, test_sents):
    """Train on *train_sents*, print accuracy on *test_sents*, return the classifier."""
    model = nltk.NaiveBayesClassifier.train(_suffix_featuresets(train_sents))
    print(nltk.classify.accuracy(model, _suffix_featuresets(test_sents)))
    return model


# Split 1: shuffle the sentences, hold out 10% for testing.
tagged_sents = list(brown.tagged_sents(categories='news'))
random.shuffle(tagged_sents)
size = int(len(tagged_sents) * 0.1)
train_set, test_set = tagged_sents[size:], tagged_sents[:size]
classifier = _train_and_report(train_set, test_set)

############################################
# Split 2: hold out whole files, so train and test never share a document.
file_ids = brown.fileids(categories='news')
size = int(len(file_ids) * 0.1)
train_set = brown.tagged_sents(file_ids[size:])
test_set = brown.tagged_sents(file_ids[:size])
classifier = _train_and_report(train_set, test_set)

############################################
# Split 3: train on one category, test on a different one.
train_set = brown.tagged_sents(categories='news')
test_set = brown.tagged_sents(categories='fiction')
classifier = _train_and_report(train_set, test_set)
# show_most_informative_features prints its table itself and returns None,
# so wrapping it in print() only added a stray "None" line.
classifier.show_most_informative_features(5)
features = { 'Suffix(1)': sentence[i][-1:], 'Suffix(2)': sentence[i][-2:], 'Suffix(3)': sentence[i][-3:] } # 单词前面的词作为特征,如果单词为头一个单词就设置前面一个单词为<START> if i == 0: features['prev-word'] = '<START>' else: features['prev-word'] = sentence[i - 1] return features # (brown.sents()[0],8) == 'investigation' type(pos_features(brown.sents()[0], 8)) tagged_sents = brown.tagged_sents(categories='news') featuresets = [] for tagged_sent in tagged_sents: untagged_sent = nltk.tag.untag(tagged_sent) for i, (word, tag) in enumerate(tagged_sent): featuresets.append((pos_features(untagged_sent, i), tag)) # 特征集合中的元素必须是tuple的形式 size = int(len(featuresets) * 0.1) train_set, test_set = featuresets[size:], featuresets[:size] classifier = nltk.NaiveBayesClassifier.train(train_set) nltk.classify.accuracy(classifier, test_set) # 1.6. 序列分类(通过上下文的标签提高分类的精度) # 为了获取相关分类任务之间的依赖关系,可以使用联合分类器模型 # 连续分类或者贪婪序列分类的序列分类器策略
count = count + 1 # ///////////////////////////////////////////////////////////////////////////////////////////////// # //////////////////////////////////////////////MAIN////////////////////////////////////////////// nltk.download('brown') nltk.download('punkt') nltk.download('universal_tagset') one_hot_labels = None vocab_v0 = {} wordVocab = {} tag_mapping = {} Pre.parse_Questions(brown) Pre.labelWordEncoder() brown_tagsents = dllist(brown.tagged_sents(tagset='universal')) xTrain, yTrain = Pre.dataSegmentation(brown_tagsents) ############################################ MODEL ######################################################################## # Data Partition # Padding Vectorized Data Set x_train = tf.keras.preprocessing.sequence.pad_sequences(xTrain[:50000], padding='post') y_train = tf.keras.preprocessing.sequence.pad_sequences(yTrain[:50000], padding='post') x_validation = x_train[40000:] y_validation = y_train[40000:] x_train = x_train[10000:40000] y_train = y_train[10000:40000]
def answers():
    """Run the HMM tagging exercise end to end and publish results as globals.

    Loads the Brown 'news' sentences with the universal tagset, splits them
    into train/test, trains an ``HMM``, runs several sanity checks (warnings
    go to stderr), measures accuracy on the test set, and prints the answers
    returned by ``answer_question4b``, ``answer_question5`` and
    ``answer_question6``.  Every name in the ``global`` statement below is
    assigned at module level; ``answer6`` is not in that list and so stays
    local to this function.
    """
    global tagged_sentences_universal, test_data_universal, \
        train_data_universal, model, test_size, train_size, ttags, \
        correct, incorrect, accuracy, \
        good_tags, bad_tags, answer4b, answer5

    # Load the Brown corpus with the Universal tag set.
    tagged_sentences_universal = brown.tagged_sents(categories='news',
                                                    tagset='universal')

    # Divide corpus into train and test data.
    # The last `test_size` sentences are the test set, the rest is training.
    test_size = 500
    train_size = len(tagged_sentences_universal) - test_size  # fixme

    test_data_universal = tagged_sentences_universal[-test_size:]  # fixme
    train_data_universal = tagged_sentences_universal[:train_size]  # fixme

    # Guard: hash the words of the first and last sentence of each split and
    # compare against a fixed digest; a mismatch only prints a warning.
    if hashlib.md5(''.join(
            map(
                lambda x: x[0], train_data_universal[0] +
                train_data_universal[-1] + test_data_universal[0] +
                test_data_universal[-1])
    ).encode('utf-8')).hexdigest() != '164179b8e679e96b2d7ff7d360b75735':
        print(
            '!!!test/train split (%s/%s) incorrect, most of your answers will be wrong hereafter!!!'
            % (len(train_data_universal), len(test_data_universal)),
            file=sys.stderr)

    # Create instance of HMM class and initialise the training and test sets.
    model = HMM(train_data_universal, test_data_universal)

    # Train the HMM.
    model.train()

    # Some preliminary sanity checks
    # Use these as a model for other checks
    # Each check accepts only an exact `float` that is <= 0.0.
    e_sample = model.elprob('VERB', 'is')
    if not (type(e_sample) == float and e_sample <= 0.0):
        print('elprob value (%s) must be a log probability' % e_sample,
              file=sys.stderr)

    t_sample = model.tlprob('VERB', 'VERB')
    if not (type(t_sample) == float and t_sample <= 0.0):
        print('tlprob value (%s) must be a log probability' % t_sample,
              file=sys.stderr)

    if not (type(model.states) == list and \
            len(model.states) > 0 and \
            type(model.states[0]) == str):
        print('model.states value (%s) must be a non-empty list of strings' %
              model.states,
              file=sys.stderr)

    print('states: %s\n' % model.states)

    ######
    # Try the model, and test its accuracy [won't do anything useful
    # until you've filled in the tag method
    ######
    s = 'the cat in the hat came back'.split()
    model.initialise(s[0])
    ttags = model.tag(s)  # fixme
    print("Tagged a trial sentence:\n %s" % list(zip(s, ttags)))

    # Inspect the stored values for state 'VERB' at step 5 of the trial
    # sentence tagged just above.
    v_sample = model.get_viterbi_value('VERB', 5)
    if not (type(v_sample) == float and 0.0 <= v_sample):
        print('viterbi value (%s) must be a cost' % v_sample, file=sys.stderr)

    b_sample = model.get_backpointer_value('VERB', 5)
    if not (type(b_sample) == str and b_sample in model.states):
        print('backpointer value (%s) must be a state name' % b_sample,
              file=sys.stderr)

    # check the model's accuracy (% correct) using the test set
    correct = 0
    incorrect = 0
    # Up to ten (sentence, predicted tags) pairs with at least one wrong tag.
    incorrent_sent = []

    for sentence in test_data_universal:
        # Words are lower-cased before tagging.
        s = [word.lower() for (word, tag) in sentence]
        model.initialise(s[0])
        tags = model.tag(s)

        inc = False  # set when this sentence contains a mis-tagged token
        for ((word, gold), tag) in zip(sentence, tags):
            if tag == gold:
                correct = correct + 1  # fix me
            else:
                incorrect = incorrect + 1  # fix me
                inc = True

        if inc and len(incorrent_sent) < 10:
            incorrent_sent.append((sentence, tags))

    print('\nFirst 10 incorrect sentences are:')
    for sent, tags in incorrent_sent:
        print("Tagged test sentence:")
        print(sent)
        print("\nTags produced:")
        print(tags)
        print('\n\n')

    # Token-level accuracy over the whole test set.
    accuracy = correct / (correct + incorrect)  # fix me
    print('Tagging accuracy for test set of %s sentences: %.4f' %
          (test_size, accuracy))

    # Print answers for 4b, 5 and 6
    bad_tags, good_tags, answer4b = answer_question4b()
    print('\nA tagged-by-your-model version of a sentence:')
    print(bad_tags)
    print('The tagged version of this sentence from the corpus:')
    print(good_tags)
    print('\nDiscussion of the difference:')
    print(answer4b[:280])  # only the first 280 characters are shown
    answer5 = answer_question5()
    print('\nFor Q5:')
    print(answer5[:500])  # only the first 500 characters are shown
    answer6 = answer_question6()
    print('\nFor Q6:')
    print(answer6[:500])
from itertools import chain from collections import Counter, defaultdict from pomegranate import State, HiddenMarkovModel, DiscreteDistribution from nltk import pos_tag, word_tokenize from nltk.corpus import brown from sklearn.model_selection import train_test_split from nltk.corpus import brown import nltk nltk.download('brown') # Define corpus corpus = brown.tagged_sents() training_vocab = list(set([word for sent in corpus for word, tag in sent])) def pair_counts(X, Y): """Return a dictionary keyed to each unique value in the first sequence list that counts the number of occurrences of the corresponding value from the second sequences list. """ words = [ii for i in X for ii in i if type(i) != str] tags = [ii for i in Y for ii in i if type(i) != str] pair_count = {tag: {} for tag in set(tags)} for tag, word in zip(tags, words):
try: (trainsection, testsection, method) = ('news', 'editorial', 'default') opts, args = getopt.getopt(sys.argv[1:], "hi:o:m:", ["help", "train=", "test=", "method="]) except getopt.GetoptError: usage(sys.argv) for o, a in opts: if o in ('-h', '--help'): usage([sys.argv[0]]) sys.exit(0) if o in ('-i', '--train'): trainsection = a if o in ('-o', '--test'): testsection = a if o in ('-m', '--method'): method = a train_tagged_sents = brown.tagged_sents(categories=trainsection) test_tagged_sents = brown.tagged_sents(categories=testsection) train_tagged_words = brown.tagged_words(categories=trainsection) test_tagged_words = brown.tagged_words(categories=testsection) train_words = brown.words(categories=trainsection) print_to_file("\n\nmethod = " + method + "\n") default_tag = default_tag(train_tagged_sents) default_tagger = nltk.DefaultTagger(default_tag) if method in ['unigram', 'bigram', 'trigram']: tu = nltk.UnigramTagger(train_tagged_sents, backoff=default_tagger) tb = nltk.BigramTagger(train_tagged_sents, backoff=tu) tt = nltk.TrigramTagger(train_tagged_sents, backoff=tb) fd = nltk.FreqDist(train_words)
train_sets = [] for tagged_sent in train_sents: untagged_sent = nltk.untag(tagged_sent) history = [] for i, (word, tag) in enumerate(tagged_sent): train_sets.append((pos_feature_tag(untagged_sent, i, history), tag)) history.append(tag) self.classifier = nltk.classify.NaiveBayesClassifier.train(train_sets) def tag(self, sentence): history = [] feature_set = [] taggers = [] for i, word in enumerate(sentence): feature_set = pos_feature_tag(sentence, i, history) tag = self.classifier.classify(feature_set) history.append(tag) taggers.append(tag) return zip(sentence, taggers) if __name__ == '__main__': sentences = brown.tagged_sents(categories="news") # tagger=ConsecutivePosTagger(sentences) # print([(word,tag)for word,tag in tagger.tag(["i","am","a","gir","who","are","so","beautiful"])]) size = int(len(sentences) * 0.1) train_set, test_set = sentences[size:], sentences[:size] tagger = ConsecutivePosTagger(train_set) print(tagger.evaluate(test_set))
import nltk
#from nltk.book import *
from nltk.corpus import treebank
from nltk.corpus import brown
from nltk import word_tokenize
from nltk import hmm

#nltk.help.upenn_tagset("NN*")

# Treebank file listing and the tagged words of one file.
files = treebank.fileids()
#print(files)
t = treebank.tagged_words("wsj_0003.mrg")
#for p in t:
#print(p)

#race1 = nltk.tag.str2tuple('race/NN')
#race2 = nltk.tag.str2tuple('race/VB')
#print(race1)
#print(brown.tagged_words().count(race1))
#print(brown.tagged_words().count(race2))

# Unigram tagger trained on the first 5000 Brown 'news' sentences.
unigram_training = brown.tagged_sents(categories='news')[:5000]
unitag = nltk.tag.UnigramTagger(unigram_training)
print(unitag)

s = "The secretariat is expected to race tomorrow."
s_tok = word_tokenize(s)
tt = unitag.tag(s_tok)
print(tt)

# Supervised HMM tagger trained on the same slice, applied to the same tokens.
hmm_trainer = nltk.hmm.HiddenMarkovModelTrainer()
hmm_training = brown.tagged_sents(categories="news")[:5000]
hmmTagger = hmm_trainer.train_supervised(hmm_training)
tt2 = hmmTagger.tag(s_tok)
print(tt2)
BOUNDARY = r'\b' CORPUS_LOADED_EVENT = '<<CL_EVENT>>' SEARCH_TERMINATED_EVENT = '<<ST_EVENT>>' SEARCH_ERROR_EVENT = '<<SE_EVENT>>' ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>' # NB All corpora must be specified in a lambda expression so as not to be # loaded when the module is imported. _DEFAULT = 'English: Brown Corpus (Humor, simplified)' _CORPORA = { 'Catalan: CESS-CAT Corpus (simplified)': lambda: cess_cat.tagged_sents(simplify_tags=True), 'English: Brown Corpus': lambda: brown.tagged_sents(), 'English: Brown Corpus (simplified)': lambda: brown.tagged_sents(simplify_tags=True), 'English: Brown Corpus (Press, simplified)': lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'], simplify_tags=True), 'English: Brown Corpus (Religion, simplified)': lambda: brown.tagged_sents(categories='religion', simplify_tags=True), 'English: Brown Corpus (Learned, simplified)': lambda: brown.tagged_sents(categories='learned', simplify_tags=True), 'English: Brown Corpus (Science Fiction, simplified)': lambda: brown.tagged_sents(categories='science_fiction', simplify_tags=True), 'English: Brown Corpus (Romance, simplified)': lambda: brown.tagged_sents(categories='romance', simplify_tags=True), 'English: Brown Corpus (Humor, simplified)':
def main(): brown_sentences = brown.tagged_sents(tagset='universal') train_sentences = [] train_tags = [] # sepearating sentences and their labels for sentence in brown_sentences: s = [] t = [] for pair in sentence: s.append(pair[0]) t.append(pair[1]) train_sentences.append(s) train_tags.append(t) # finding word count word_count = find_word_count(train_sentences) # finding rare words rare_words = find_rare_words(word_count, 5) # adding features for each word training_features = [] for idx in range(len(train_sentences)): features = [] sentence = train_sentences[idx] for i in range(len(sentence)): prevtag = '<S>' if i == 0 else train_tags[idx][i - 1] features.append(get_features(i, sentence, prevtag, rare_words)) training_features.append(features) # overwriting training features with values after removing rare features training_features, non_rare_features = remove_rare_features( training_features, 5) # creating feature dictionary counter = 0 for feature in non_rare_features: feature_dict[feature] = counter counter = counter + 1 # creating tag dictionary tag_counter = 0 for sent_tags in train_tags: for tag in sent_tags: if tag not in tag_dict: tag_dict[tag] = tag_counter tag_counter = tag_counter + 1 X_train = build_X(training_features) Y_train = build_Y(train_tags) print "X_train Y_train built" ''' # if we want to save model then use this code filename = 'lr_model.sav' lr = pickle.load(open(filename, 'rb')) if lr == None: lr = LogisticRegression(class_weight='balanced', solver='saga', multi_class='multinomial', verbose=2) lr.fit(X_train, Y_train) print "Model fit" # save the model to disk pickle.dump(lr, open(filename, 'wb')) ''' lr = LogisticRegression(class_weight='balanced', solver='saga', multi_class='multinomial', verbose=2) lr.fit(X_train, Y_train) print "Model fit" test_data = load_test('test.txt') # tag prediction for sentence in test_data: Y_pred, Y_start = get_predictions([sentence], lr) tags = viterbi(Y_start, Y_pred) print "sentence =", sentence print 
"tags=", tags print "\n"
import nltk from nltk.corpus import brown brown_train = brown.tagged_sents(categories='news') regexp_tagger = nltk.RegexpTagger( [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'(-|:|;)$', ':'), (r'\'*$', 'MD'), (r'(The|the|A|a|An|an)$', 'AT'), (r'.*able$', 'JJ'), (r'^[A-Z].*$', 'NNP'), (r'.*ness$', 'NN'), (r'.*ly$', 'RB'), (r'.*s$', 'NNS'), (r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*', 'NN') ]) unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger) bigram_tagger = nltk.BigramTagger(brown_train, backoff=unigram_tagger) cfg = { "NNP+NNP": "NNP", "NN+NN": "NNI", "NNI+NN": "NNI", "JJ+JJ": "JJ", "JJ+NN": "NNI" } class NPExtractor(object):
# In[86]:

# Show the feature names
one_hot_multi.classes_

# In[43]:

# Load libraries
from nltk.corpus import brown
from nltk.tag import BigramTagger, TrigramTagger, UnigramTagger

# Get some text from the Brown Corpus, already split into tagged sentences
sentences = brown.tagged_sents(categories='news')

# Use the first 4000 sentences for training and the remaining 623 for testing
split_at = 4000
train, test = sentences[:split_at], sentences[split_at:]

# Build a backoff chain: trigram -> bigram -> unigram
unigram = UnigramTagger(train)
bigram = BigramTagger(train, backoff=unigram)
trigram = TrigramTagger(train, backoff=bigram)

# Show accuracy
trigram.evaluate(test)

# > <b>6.9 Encoding text as a bag of words
def init_nltk():
    """Create the module-level ``tokenizer`` and ``tagger``.

    ``tokenizer`` splits text into runs of word characters or runs of
    non-word, non-space characters; ``tagger`` is a ``UnigramTagger``
    trained on ``brown.tagged_sents()``.
    """
    global tokenizer, tagger
    token_pattern = r'\w+|[^\w\s]+'
    tokenizer = tokenize.RegexpTokenizer(token_pattern)
    training_sents = brown.tagged_sents()
    tagger = UnigramTagger(training_sents)
import nltk
from nltk.corpus import brown
import numpy as np
from collections import Counter
from collections import defaultdict
from math import log
import time

stime = time.time()  # start time of the run

# Sentences have different lengths, so the array must be an object array:
# without dtype=object NumPy >= 1.24 raises ValueError for ragged input.
sentences = np.array(list(brown.tagged_sents()), dtype=object)
words = brown.tagged_words()
# Parallel tuples: every token, and the tag of every token.
tokens, taged = zip(*words)

#
# firstdict = {}
# firstSum = len(sentences)
# for i in sentences:
#     x,y = i[0]
#     if y not in firstdict.keys():
#         firstdict[y] = 1
#     else:
#         firstdict[y] += 1
#
# for i in firstdict.keys():
#     firstdict[i] = firstdict[i]/firstSum

# total word count
total = len(words)

# preping corpus data: occurrences of each distinct token
wordcount = Counter(tokens)