# Assumed imports for this snippet:
import nltk
from nltk.corpus import brown


def getTrainedTagger():
    # NOTE: the original used simplify_tags=True (NLTK 2.x API); the full
    # Brown tagset is used here so the tags match the backoff taggers below.
    train = brown.tagged_sents()
    newTrain = []
    for sen in train:
        newSen = [(word.lower(), tag) for word, tag in sen]
        newTrain.append(newSen)
    nn_tagger = nltk.DefaultTagger('NN')
    regexp_tagger = nltk.RegexpTagger(
        [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
         (r'(The|the|A|a|An|an)$', 'AT'),   # articles
         (r'.*able$', 'JJ'),                # adjectives
         (r'.*ness$', 'NN'),                # nouns formed from adjectives
         (r'.*ly$', 'RB'),                  # adverbs
         (r'.*s$', 'NNS'),                  # plural nouns
         (r'.*ing$', 'VBG'),                # gerunds
         (r'.*ed$', 'VBD'),                 # past tense verbs
         (r'.*', 'NN')],                    # nouns (default)
        backoff=nn_tagger)
    at2 = nltk.AffixTagger(newTrain, backoff=regexp_tagger)
    ut3 = nltk.UnigramTagger(newTrain, backoff=at2)
    ct2 = nltk.NgramTagger(2, newTrain, backoff=ut3)
    return ct2
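# --- Usage sketch (not from the original source) ---
# Assumes getTrainedTagger() above is in scope. Tokens are lowercased because
# the tagger was trained on lowercased text; the sentence is illustrative.
import nltk

tagger = getTrainedTagger()
tokens = [w.lower() for w in nltk.word_tokenize("The cats were chasing 2 mice quickly.")]
print(tagger.tag(tokens))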
def __init__(self):
    # This is our fast Part of Speech tagger.
    # (Assumes: import nltk; from nltk.corpus import brown; STOP_WORDS defined.)
    brown_train = brown.tagged_sents(categories=['news'])
    regexp_tagger = nltk.RegexpTagger(
        [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
         (r'(-|:|;)$', ':'),
         (r'\'*$', 'MD'),
         (r'(The|the|A|a|An|an)$', 'AT'),
         (r'.*able$', 'JJ'),
         (r'^[A-Z].*$', 'NNP'),
         (r'.*ness$', 'NN'),
         (r'.*ly$', 'RB'),
         (r'.*s$', 'NNS'),
         (r'.*ing$', 'VBG'),
         (r'.*ed$', 'VBD'),
         (r'.*', 'NN')])
    self.unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger)
    self.bigram_tagger = nltk.BigramTagger(brown_train, backoff=self.unigram_tagger)

    # This is our semi-CFG; extend it according to your own needs.
    cfg = {}
    cfg["NNP+NNP"] = "NNP"
    cfg["CD+CD"] = "CD"
    cfg["NN+NN"] = "NNI"
    cfg["NNI+NN"] = "NNI"
    cfg["JJ+JJ"] = "JJ"
    cfg["JJ+NN"] = "NNI"
    cfg["VBN+NNS"] = "NNP"
    self.cfg = cfg

    # NOTE: this loop is a no-op as written (each word is assigned back to
    # itself); it presumably intended some normalization of STOP_WORDS.
    for i, word in enumerate(STOP_WORDS):
        STOP_WORDS[i] = word
def test_tag():
    """Train a unigram tagger and test it.

    (Assumes: import nltk; brown_tagged_sents and brown_sents come from
    nltk.corpus.brown, e.g. brown.tagged_sents() and brown.sents().)
    """
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
    tags = unigram_tagger.tag(brown_sents[2007])
    # Evaluate tagging accuracy (note: this evaluates on the training data).
    print(unigram_tagger.evaluate(brown_tagged_sents), tags)
# Assumed imports for this snippet:
import collections
import nltk


def _build_big_vocab(filename, tagset='universal'):
    # WORD IDS AND FREQUENCIES
    # ========================
    # Long list of word sequences separated by <eos>
    data = _read_words(filename)
    # Tally unique words in data, e.g. {'<unk>': 4794, 'the': 4529, '<eos>': 3761}
    counter_words = collections.Counter(data)
    # Ordered list of (WORD, TALLY) 2-tuples;
    # x[0] is a tie-breaker for when the -x[1] values are equal
    count_pairs = sorted(counter_words.items(), key=lambda x: (-x[1], x[0]))
    # Tuple of words sorted from most frequent to least frequent
    words, _ = list(zip(*count_pairs))
    # Assign a unique integer ID to each word
    word_to_id = dict(zip(words, range(len(words))))

    # POS TAGS
    # ========================
    # Tagged long list, in the requested tagset
    ptb_sents = nltk.corpus.treebank.tagged_sents(tagset=tagset)
    uni_tag = nltk.UnigramTagger(ptb_sents)
    tagged = uni_tag.tag(data)
    # Tally POS tags, e.g. {'NOUN': 20321, 'DET': 4529, None: 3761}
    counter_pos = collections.Counter([x[1] for x in tagged])
    word_to_id_freq_pos = {}
    for k, v in word_to_id.items():
        pos = uni_tag.tag([k])[0][1]
        if pos is None:
            pos = 'UNK'
        word_to_id_freq_pos[k] = (v, counter_words[k], pos)
    # Return a dict mapping each unique word to (ID, frequency, POS tag)
    return word_to_id_freq_pos
def __init__(self):
    # (Assumes: import nltk; from nltk.corpus import mac_morpho.)
    tsents = mac_morpho.tagged_sents()
    tsents = [[(w.lower(), t) for (w, t) in sent] for sent in tsents if sent]
    tagger0 = nltk.DefaultTagger('N')
    # The first 100 sentences are held out (presumably for testing).
    tagger1 = nltk.UnigramTagger(tsents[100:], backoff=tagger0)
    self.tagger = nltk.BigramTagger(tsents[100:], backoff=tagger1)
def tag_it(train, test, regex_pattern, print_errors=False):
    """Use the tagger-hierarchy approach shown in the lecture.

    I tried some variations and different orders (e.g. regex at the
    beginning), but the order below gave me the best results.

    :param train: tagged training sentences
    :param test: tagged test sentences
    :param regex_pattern: pattern list for the RegexpTagger
    :param print_errors: if True, print wrongly classified values
    """
    default_tagger = nltk.DefaultTagger('NOUN')
    regex_tagger = nltk.tag.RegexpTagger(regex_pattern, backoff=default_tagger)
    unigram_tagger = nltk.UnigramTagger(train, backoff=regex_tagger)
    bigram_tagger = nltk.BigramTagger(train, backoff=unigram_tagger)
    trigram_tagger = nltk.TrigramTagger(train, backoff=bigram_tagger)
    print(trigram_tagger.evaluate(test))

    # Print wrongly classified values (assumes test is the last 10% of the
    # nps_chat posts, in order).
    if print_errors:
        sents = nps_chat.posts()
        untagged = trigram_tagger.tag_sents(sents[((len(sents) * 9) // 10):])
        cfd = nltk.ConditionalFreqDist(
            (word, tag)
            for idx1, sent in enumerate(test)
            for idx2, (word, tag) in enumerate(sent)
            if tag != untagged[idx1][idx2][1])
        for k, v in cfd.items():
            for key, item in v.items():
                print(k, key, item)
def ngramTagger(train_sents, n=2, defaultTag='NN'):
    t0 = nltk.DefaultTagger(defaultTag)
    if n <= 0:
        return t0
    elif n == 1:
        t1 = nltk.UnigramTagger(train_sents, backoff=t0, verbose=True)
        return t1
    elif n == 2:
        t1 = nltk.UnigramTagger(train_sents, backoff=t0, verbose=True)
        t2 = nltk.BigramTagger(train_sents, backoff=t1, verbose=True)
        return t2
    else:
        t1 = nltk.UnigramTagger(train_sents, backoff=t0, verbose=True)
        t2 = nltk.BigramTagger(train_sents, backoff=t1, verbose=True)
        t3 = nltk.TrigramTagger(train_sents, backoff=t2, verbose=True)
        return t3
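# --- Usage sketch (not from the original source) ---
# Assumes ngramTagger() above is in scope; Brown 'news' and the 90/10 split
# are illustrative choices.
import nltk
from nltk.corpus import brown

tagged = brown.tagged_sents(categories='news')
cut = int(len(tagged) * 0.9)
bigram = ngramTagger(tagged[:cut], n=2)
print(bigram.evaluate(tagged[cut:]))  # accuracy on the held-out 10%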
def test_POS_tag_tokenize_words_simple_test(self):
    training_sents = brown.tagged_sents()
    patterns = [  # for regexp tagger
        (r'.*ing$', 'VBG'),
        (r'.*ed$', 'VBD'),
        (r'.*es$', 'VBZ'),
        (r'.*ould$', 'MD'),
        (r'.*\'s$', 'POS'),
        (r'.*s$', 'NNS'),
        (r'(The|the|A|a|An|an)$', 'AT'),
        (r'.*able$', 'JJ'),
        (r'.*ly$', 'RB'),
        (r'.*', 'NN')]
    default_tagger = nltk.DefaultTagger('NN')
    regexp_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)
    unigram_tagger = nltk.UnigramTagger(training_sents, backoff=regexp_tagger)
    bigram_tagger = nltk.BigramTagger(training_sents, backoff=unigram_tagger)
    trigram_tagger = nltk.TrigramTagger(training_sents, backoff=bigram_tagger)
    final_tagger = trigram_tagger
    self.assertEqual(
        [[('who', 'WPS'), ('are', 'BER'), ('your', 'PP$'), ('friend', 'NN'),
          ("'s", 'POS'), ('here', 'RB'), ('?', '.')]],
        POS_tag_tokenized_phrases(
            [['who', 'are', 'your', 'friend', "'s", 'here', '?']],
            final_tagger))
def create_tagger():
    """Train a tagger from the Brown Corpus. This should not be called very
    often; only in the event that the tagger pickle wasn't found."""
    print("Building tagger...")
    train_sents = brown.tagged_sents()
    # These regexes were lifted from the NLTK book tagger chapter.
    t0 = nltk.RegexpTagger(
        [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
         (r'(The|the|A|a|An|an)$', 'AT'),   # articles
         (r'.*able$', 'JJ'),                # adjectives
         (r'.*ness$', 'NN'),                # nouns formed from adjectives
         (r'.*ly$', 'RB'),                  # adverbs
         (r'.*s$', 'NNS'),                  # plural nouns
         (r'.*ing$', 'VBG'),                # gerunds
         (r'.*ed$', 'VBD'),                 # past tense verbs
         (r'.*', 'NN')])                    # nouns (default)
    print("got t0")
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    print("got t1")
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    print("got t2")
    t3 = nltk.TrigramTagger(train_sents, backoff=t2)
    print("Built tagger!")
    return t3
def get_trained_unigram_tagger():
    with open(ROOT_DIR + '/src/assets/training_Sets/stationsExtractionTrainingSet.json') as f:
        train_data_input = json.load(f)
    train_data = [[(element["pos"], element["classification"]) for element in sentence]
                  for sentence in train_data_input]
    return nltk.UnigramTagger(train_data)
def nltk_tagger(brown_words, brown_tags, brown_dev_words):
    training = []
    for brown_sentence, tag_sentence in zip(brown_words, brown_tags):
        words = brown_sentence.split(' ')
        tags = tag_sentence.split(' ')
        sentence_tags = [(word, tag) for word, tag in zip(words, tags)]
        # Strip the two boundary symbols from each end of the sentence.
        sentence_tags.pop(0)
        sentence_tags.pop(0)
        sentence_tags.pop()
        sentence_tags.pop()
        training.append(sentence_tags)

    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(training, backoff=t0)
    t2 = nltk.BigramTagger(training, backoff=t1)
    t3 = nltk.TrigramTagger(training, backoff=t2)

    # Tag the development sentences and format each as 'word/tag ... word/tag\n'.
    tagged = []
    for sentence in brown_dev_words:
        tgd_stc = t3.tag(sentence)
        pairs = [word + '/' + tg for word, tg in tgd_stc]
        tagged.append(' '.join(pairs) + '\n')
    return tagged
def brill_tagger(tagged_sentences):
    wordings = [
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
        (r'(The|the|A|a|An|an)$', 'AT'),
        (r'.*able$', 'JJ'),
        (r'.*ness$', 'NN'),
        (r'.*ly$', 'NN'),
        (r'.*ing$', 'VBG'),
        (r'.*ed$', 'VBD'),
        (r'.*ould$', 'MD'),
        (r'.*ment$', 'NN'),
        (r'.*ful$', 'JJ'),
        (r'.*ious$', 'JJ'),
        (r'.*ble$', 'JJ'),
        (r'.*ic$', 'JJ'),
        (r'.*ive$', 'JJ'),
        (r'.*est$', 'JJ'),
    ]
    # This part of the code is adapted from
    # http://stackoverflow.com/questions/14802442/how-to-use-a-regex-backoff-tagger-in-python-nltk-to-override-nns
    # Here we use the unigram and regex taggers as backoffs for the Brill tagger.
    regex_tagger = nltk.tag.RegexpTagger(wordings)
    unigram_tagger = nltk.UnigramTagger(tagged_sentences, backoff=regex_tagger)
    templates = nltk.tag.brill.brill24()
    brill_trainer = nltk.tag.brill_trainer.BrillTaggerTrainer(
        unigram_tagger, templates)
    brill_tagger = brill_trainer.train(tagged_sentences)
    return brill_tagger
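# --- Usage sketch (not from the original source) ---
# Assumes brill_tagger() above is in scope; treebank and the 90/10 split are
# illustrative choices.
from nltk.corpus import treebank

sents = treebank.tagged_sents()
cut = int(len(sents) * 0.9)
tagger = brill_tagger(sents[:cut])
print(tagger.evaluate(sents[cut:]))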
def word_tagger(self):
    default_tagger = nltk.DefaultTagger('NN')
    unigram_tagger = nltk.UnigramTagger(self.training_sents, backoff=default_tagger)
    bigram_tagger = nltk.BigramTagger(self.training_sents, backoff=unigram_tagger)
    self.text = bigram_tagger.tag(self.text)
def dump(config):
    """Builds a POS tagger (or loads a cached one) and returns it.

    Args:
        config: an instance of TaggerConfiguration
    """
    tagger_dir = config.tagger_dir
    tagger_name = os.path.join(tagger_dir, "tagger.pkl")
    os.makedirs(tagger_dir, exist_ok=True)
    if not os.path.isfile(tagger_name):
        brown_tagged_sents = brown.tagged_sents(tagset='universal')
        size = int(len(brown_tagged_sents) * 0.9)
        train_sents = brown_tagged_sents[:size]
        test_sents = brown_tagged_sents[size:]
        t0 = nltk.DefaultTagger('X')
        t1 = nltk.UnigramTagger(train_sents, backoff=t0)
        t2 = nltk.BigramTagger(train_sents, backoff=t1)
        t3 = nltk.TrigramTagger(train_sents, backoff=t2)
        # Keep whichever tagger scores best on the held-out 10%.
        scores = [[t1.evaluate(test_sents), t1],
                  [t2.evaluate(test_sents), t2],
                  [t3.evaluate(test_sents), t3]]
        best_score, best_tagger = max(scores, key=lambda x: x[0])
        print("Finished building POS tagger {0:.2f}%".format(best_score * 100))
        with open(tagger_name, 'wb') as f:
            pkl.dump(best_tagger, f)
    with open(tagger_name, 'rb') as f:
        return pkl.load(f)
def __init__(self, train_sents):
    train_data = [[(t, c) for w, t, c in sent] for sent in train_sents]
    self.tagger = nltk.UnigramTagger(train_data)
    self.tagger = nltk.tag.BigramTagger(train_data, backoff=self.tagger)
    self.tagger = nltk.tag.TrigramTagger(train_data, backoff=self.tagger)
    # Note: this reports accuracy on the training data itself.
    print(self.tagger.evaluate(train_data))
def tagging_system(text, name):
    bts = brown.tagged_sents(categories="news", tagset="universal")
    # Train an HMM tagger on all of bts and use it to evaluate the output of
    # the unigram tagger.
    hmmTagr = hmm.HiddenMarkovModelTagger.train(bts)
    uTagr = nltk.UnigramTagger(bts)
    tsent = nltk.word_tokenize(text)
    tagged_sent = [uTagr.tag(tsent)]
    hmm_tagged_sent = hmmTagr.tag(tsent)
    # Fill any None tag with the HMM tagger's prediction.
    for i in range(len(tagged_sent[0])):
        if tagged_sent[0][i][1] is None:
            tagged_sent[0][i] = hmm_tagged_sent[i]
    # Print the accuracy (agreement with the HMM tagger, used here as a
    # stand-in gold standard) and the tagged text.
    if name == 'my_test.txt' or name == 'my_test1.txt':
        print("-------Below is the accuracy analysis of my tagging system on text : {} ------".format(name))
        print("the accuracy of {} is :{}".format(name, hmmTagr.evaluate(tagged_sent)))
        print("some mistaken tags and what they should be based on the gold standard: ")
        for i in range(len(tagged_sent[0])):
            if not tagged_sent[0][i][1] == hmm_tagged_sent[i][1]:
                print("{} should be {}".format(tagged_sent[0][i], hmm_tagged_sent[i]))
    print("------Below is the outcome of my tagging system on text: {}------".format(name))
    print(tagged_sent[0])
def __init__(self):
    """Initialization method of :class:`TopicExtractor` class."""
    # This is our fast Part of Speech tagger
    #############################################################################
    brown_train = brown.tagged_sents(categories='news')
    regexp_tagger = nltk.RegexpTagger(
        [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
         (r'(-|:|;)$', ':'),
         (r'\'*$', 'MD'),
         (r'(The|the|A|a|An|an)$', 'AT'),
         (r'.*able$', 'JJ'),
         (r'^[A-Z].*$', 'NNP'),
         (r'.*ness$', 'NN'),
         (r'.*ly$', 'RB'),
         (r'.*s$', 'NNS'),
         (r'.*ing$', 'VBG'),
         (r'.*ed$', 'VBD'),
         (r'.*', 'NN')])
    unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger)
    self.bigram_tagger = nltk.BigramTagger(brown_train, backoff=unigram_tagger)
    #############################################################################

    # This is our semi-CFG; extend it according to your own needs
    #############################################################################
    self.cfg = {}
    self.cfg["NNP+NNP"] = "NNP"
    self.cfg["NN+NN"] = "NNI"
    self.cfg["NNI+NN"] = "NNI"
    self.cfg["JJ+JJ"] = "JJ"
    self.cfg["JJ+NN"] = "NNI"
# Assumed imports for this snippet ('nltkbrown' was undefined in the original;
# it is presumably the Brown corpus under an alias, since the parameter name
# 'brown' shadows it):
import nltk
from nltk.corpus import brown as nltkbrown


def nltk_tagger(brown):
    tagged = []
    training = nltkbrown.tagged_sents(tagset='universal')
    # Create default, bigram and trigram taggers, chained via backoff.
    default_tagger = nltk.DefaultTagger('NOUN')
    bigram_tagger = nltk.BigramTagger(training, backoff=default_tagger)
    trigram_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)

    # Tag sentences
    tagged_sentence = []
    for sentence in brown:
        tags = trigram_tagger.tag(sentence)
        tagged_sentence.append(tags)

    # Strip boundary tokens and format each pair as 'word/tag'.
    for sentence in tagged_sentence:
        sentence = sentence[2:-1]
        temp = []
        for tup in sentence:
            wordtag = tup[0] + '/' + tup[1]
            temp.append(wordtag)
        tagged.append(temp)
    return tagged
def _build_tagger():
    global tagger
    file = Path(tagger_path)
    if tagger is not None:
        return
    if file.is_file():
        tagger = object_io.read_object(tagger_path)
    else:
        print('{} - Building train data...'.format(datetime.now()))
        dataset = nltk.corpus.floresta.tagged_sents() + \
            nltk.corpus.mac_morpho.tagged_sents()
        traindata = [[(w, _simplify_tag(t)) for (w, t) in sent]
                     for sent in dataset]
        print('{} - Training POS tagging model...'.format(datetime.now()))
        tagger = nltk.NgramTagger(
            4, traindata,
            backoff=nltk.TrigramTagger(
                traindata,
                backoff=nltk.BigramTagger(
                    traindata,
                    backoff=nltk.UnigramTagger(
                        traindata,
                        backoff=nltk.DefaultTagger('NOUN')))))
        print('{} - Saving tagger object...'.format(datetime.now()))
        object_io.save_object(tagger, tagger_path)
def main():
    brown_tagged_sents = brown.tagged_sents(categories='news')
    brown_sents = brown.sents(categories='news')
    train_size = int(len(brown_tagged_sents) * 0.9)
    train_sents = brown_tagged_sents[:train_size]
    test_sents = brown_tagged_sents[train_size:]
    # A single held-out sentence (despite the plural name).
    unseen_sents = brown_sents[train_size + 117]

    # unigram only
    unigram_tagger = nltk.UnigramTagger(train_sents, verbose=True)
    evaluate_tagger(unigram_tagger, test_sents, unseen_sents)

    # previous only
    previous_tagger = PreviousTagTagger(train_sents, verbose=True)
    evaluate_tagger(previous_tagger, test_sents, unseen_sents)

    # default tagger
    t0 = nltk.DefaultTagger('NN')

    # backoff 2
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    evaluate_tagger(t2, test_sents, unseen_sents)

    # backoff 3
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    t3 = nltk.TrigramTagger(train_sents, backoff=t2)
    evaluate_tagger(t3, test_sents, unseen_sents)

    # backoff previous 2
    t1 = PreviousTagTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    evaluate_tagger(t2, test_sents, unseen_sents)

    # backoff previous 3
    t1 = PreviousTagTagger(train_sents, backoff=t0)
    t2 = nltk.UnigramTagger(train_sents, backoff=t1)
    t3 = nltk.BigramTagger(train_sents, backoff=t2)
    evaluate_tagger(t3, test_sents, unseen_sents)

    # backoff previous 4
    t1 = PreviousTagTagger(train_sents, backoff=t0)
    t2 = nltk.UnigramTagger(train_sents, backoff=t1)
    t3 = nltk.BigramTagger(train_sents, backoff=t2)
    t4 = nltk.TrigramTagger(train_sents, backoff=t3)
    evaluate_tagger(t4, test_sents, unseen_sents)
def train_and_test_tagger():
    from nltk.corpus import brown
    brown_tagged_sents = brown.tagged_sents(categories="news")
    size = int(len(brown_tagged_sents) * 0.9)
    train_sents = brown_tagged_sents[:size]
    test_sents = brown_tagged_sents[size:]
    unigram_tagger = nltk.UnigramTagger(train_sents)
    print(unigram_tagger.evaluate(test_sents))
def bitagger_train(train_sents, backoff=False):
    if backoff:
        t0 = nltk.DefaultTagger('NN')
        t1 = nltk.UnigramTagger(train_sents, backoff=t0)
        t2 = nltk.BigramTagger(train_sents, backoff=t1)
    else:
        t2 = nltk.BigramTagger(train_sents)
    return t2
def get_pos_tagger(training, tagger='Perceptron'):
    training = [[(w.lower(), simplify_tag(t)) for (w, t) in sent]
                for sent in training if sent]
    if tagger == 'Perceptron':
        tagger = nltk.tag.PerceptronTagger(load=False)
        tagger.train(training)
    else:
        tagger0 = nltk.DefaultTagger('n')
        if tagger == 'Unigram':
            tagger = nltk.UnigramTagger(training, backoff=tagger0)
        elif tagger == 'Bigram':
            tagger1 = nltk.UnigramTagger(training, backoff=tagger0)
            tagger = nltk.BigramTagger(training, backoff=tagger1)
    return tagger
def ngramTagger(train_sents, n=0, defaultTag='NN'):
    t0 = nltk.DefaultTagger(defaultTag)
    if n <= 0:
        return t0
    elif n == 1:
        t1 = nltk.UnigramTagger(train_sents, backoff=t0)
        return t1
    elif n == 2:
        t1 = nltk.UnigramTagger(train_sents, backoff=t0)
        t2 = nltk.BigramTagger(train_sents, backoff=t1)
        return t2
    else:
        t1 = nltk.UnigramTagger(train_sents, backoff=t0)
        t2 = nltk.BigramTagger(train_sents, backoff=t1)
        t3 = nltk.TrigramTagger(train_sents, backoff=t2)
        return t3
def train_tagger(train_sents):
    """Train and return a tagger using train_sents.

    The default tag is the most frequent tag in the training data.
    """
    tags = [t for sent in train_sents for (w, t) in sent]
    t0 = nltk.DefaultTagger(nltk.FreqDist(tags).max())
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    return t2
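# --- Usage sketch (not from the original source) ---
# Assumes train_tagger() above is in scope; Brown 'news' and the 90/10 split
# are illustrative choices.
from nltk.corpus import brown

sents = brown.tagged_sents(categories='news')
cut = int(len(sents) * 0.9)
tagger = train_tagger(sents[:cut])
print(tagger.evaluate(sents[cut:]))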
def __init__(self, train_sents, testdata):
    train_data = [[(t, c) for w, t, c in sent] for sent in train_sents]
    test_data = [[(t, c) for w, t, c in sent] for sent in testdata]
    # Alternative: nltk.NaiveBayesClassifier.train(train_data)
    self.tagger = nltk.UnigramTagger(train_data)
    self.tagger = nltk.tag.BigramTagger(train_data, backoff=self.tagger)
    self.tagger = nltk.tag.TrigramTagger(train_data, backoff=self.tagger)
    print(self.tagger.evaluate(test_data))
def trainTagger():
    fd = nltk.FreqDist(brown.words(categories='news'))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    most_freq_words = fd.most_common(15000)
    likely_tags = dict((word, cfd[word].max()) for (word, _) in most_freq_words)
    unigram_tagger = nltk.UnigramTagger(model=likely_tags)
    return unigram_tagger
def performance(cfd, wordlist):
    lt = dict((word, cfd[word].max()) for word in wordlist)
    baseline_tagger = nltk.UnigramTagger(model=lt,
                                         backoff=nltk.DefaultTagger('NN'))
    return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))
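# --- Usage sketch (not from the original source) ---
# The classic lookup-tagger experiment from the NLTK book: measure
# performance() as the lookup vocabulary grows. Brown 'news' is assumed.
import nltk
from nltk.corpus import brown

words_by_freq = [w for (w, _) in
                 nltk.FreqDist(brown.words(categories='news')).most_common()]
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
for size in [2 ** exp for exp in range(10, 15)]:
    print(size, performance(cfd, words_by_freq[:size]))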
def create_trainer(self):
    # (Assumes: import nltk; from pickle import dump.)
    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(self.train_sentences, backoff=t0)
    t2 = nltk.BigramTagger(self.train_sentences, backoff=t1)
    t3 = nltk.TrigramTagger(self.train_sentences, backoff=t2)
    with open('t.pkl', 'wb') as output:
        dump(t3, output, -1)
def tokenize():
    # (Assumes: import nltk; from nltk.corpus import floresta; simplify_tag
    # defined elsewhere. Note: despite its name, this returns a tagger.)
    tsents = floresta.tagged_sents()
    tsents = [[(w.lower(), simplify_tag(t)) for (w, t) in sent]
              for sent in tsents if sent]
    train = tsents[100:]
    test = tsents[:100]  # held out; unused here
    tagger0 = nltk.DefaultTagger('n')
    tagger1 = nltk.UnigramTagger(train, backoff=tagger0)
    return tagger1
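# --- Usage sketch (not from the original source) ---
# Assumes tokenize() above is in scope (together with its simplify_tag
# helper); the Portuguese sentence is illustrative and must be lowercased.
tagger = tokenize()
print(tagger.tag('o gato preto dorme na casa'.split()))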