def parse_token_pos(essay_object):
    """Train TnT POS taggers on NLTK corpora and tag an essay's tokens.

    Trains one TnT tagger on the Penn Treebank sample and a second on the
    Brown corpus; the Brown tagger supplies a fallback tag whenever the
    treebank tagger returns 'Unk'.

    :param essay_object: object exposing ``essay_str`` and ``tokens``.
    :return: list of ``[token, pos]`` pairs, one per token of the essay.
    """
    # Train POS taggers from NLTK's corpora.
    from nltk.corpus import treebank
    train_sents = treebank.tagged_sents()[:3000]
    test_sents = treebank.tagged_sents()[3000:]
    train_brown = nltk.corpus.brown.tagged_sents()[0:5000]
    test_brown = nltk.corpus.brown.tagged_sents()[5000:]
    tnt_tagger = nltk.tag.tnt.TnT()
    tnt_tagger.train(train_sents)
    t_tagger_brown = nltk.tag.tnt.TnT()
    t_tagger_brown.train(train_brown)
    print("训练pos模型完成")
    print("当前文章为{}".format(essay_object.essay_str))
    tokenTags = tnt_tagger.tag(essay_object.tokens)  # pos of each token
    bTags = t_tagger_brown.tag(essay_object.tokens)  # Brown fallback tags
    essay_token_attribute = []
    for i, tuple_token_pos in enumerate(tokenTags):
        list_token_pos = list(tuple_token_pos)
        if list_token_pos[1] == 'Unk':
            # BUG FIX: fall back to the Brown tag of the SAME token; the
            # original always read bTags[0][1] (the first token's tag).
            list_token_pos[1] = bTags[i][1]
        if list_token_pos[1] == 'Unk':
            # Heuristic: words ending in 'ed' are likely past-tense verbs.
            if list_token_pos[0][-2:] == 'ed':
                list_token_pos[1] = 'VBD'
        essay_token_attribute.append(list_token_pos)
    return essay_token_attribute
def load_data(self, percentage):
    """Load treebank WSJ data, using section 00 for training and 01 for testing.

    Only the first ``percentage`` fraction of the training files is kept.
    Returns tagged sentences, tagged words, and untagged sentences for both
    splits.
    """
    print("Started Loading the Data")
    all_fileids = treebank.fileids()
    # Section 00 -> training files, section 01 -> testing files.
    train_ids = [f for f in all_fileids if "wsj_00" in str(f)]
    test_ids = [f for f in all_fileids if "wsj_01" in str(f)]
    # Keep only the requested fraction of the training files.
    cutoff = int(percentage * len(train_ids))
    train_ids = train_ids[:cutoff]
    tagged_training_data = treebank.tagged_sents(fileids=train_ids)
    tagged_testing_data = treebank.tagged_sents(fileids=test_ids)
    tagged_training_words = treebank.tagged_words(fileids=train_ids)
    tagged_testing_words = treebank.tagged_words(fileids=test_ids)
    # Plain sentences with the tags stripped, for callers needing raw text.
    untagged_training_data = [untag(s) for s in tagged_training_data]
    untagged_testing_data = [untag(s) for s in tagged_testing_data]
    print("Data Loaded Successfully. Stats are")
    print("Training Data Sentences: ", len(tagged_training_data))
    print("Testing Data Sentences: ", len(tagged_testing_data))
    return (tagged_training_data, tagged_testing_data, tagged_training_words,
            tagged_testing_words, untagged_training_data, untagged_testing_data)
def trainPOS_Tagger():
    """Train a TnT POS tagger on the treebank sample and pickle it to disk.

    BUG FIX: pickle requires a binary file handle; the original opened the
    file in text mode ('w'), which fails under Python 3. A context manager
    now guarantees the file is closed even on error.
    """
    train_data = treebank.tagged_sents()[:3000]
    test_data = treebank.tagged_sents()[3000:]
    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)
    tnt_pos_tagger.evaluate(test_data)
    with open('tnt_treebank_pos_tagger.pickle', 'wb') as f:
        pickle.dump(tnt_pos_tagger, f)
def main():
    """Train and evaluate an HMM tagger on the treebank 3000-sentence split.

    BUG FIX: Python 2 print statements replaced with print() calls so the
    function is valid on Python 3.
    """
    from nltk.corpus import treebank
    from main import TAGS
    train_data = treebank.tagged_sents()[:3000]
    test_data = treebank.tagged_sents()[3000:]
    hmm = hmm_tagger(TAGS)
    print('start train')
    hmm.train(train_data)
    print('start test')
    word_accuracy, sentence_accuracy = hmm.evaluate(test_data)
    print("Word accuracy = {0}% | Sentence accuracy = {1}%".format(
        word_accuracy * 100, sentence_accuracy * 100))
def demo(corpus, num_sents):
    """
    Loads a few sentences from the Brown corpus or the Wall Street Journal
    corpus, trains them, tests the tagger's accuracy and tags an unseen
    sentence.

    @type corpus: C{str}
    @param corpus: Name of the corpus to load, either C{brown} or C{treebank}.
    @type num_sents: C{int}
    @param num_sents: Number of sentences to load from a corpus. Use a small
    number, as training might take a while.

    BUG FIXES: Python 2 print statements replaced with print() calls; the
    unknown-corpus branch now returns instead of falling through to a
    NameError on the undefined ``tagged_sents``.
    """
    if corpus.lower() == "brown":
        from nltk.corpus import brown
        tagged_sents = brown.tagged_sents()[:num_sents]
    elif corpus.lower() == "treebank":
        from nltk.corpus import treebank
        tagged_sents = treebank.tagged_sents()[:num_sents]
    else:
        print("Please load either the 'brown' or the 'treebank' corpus.")
        return
    # Hold out 10% of the sentences for evaluation.
    size = int(len(tagged_sents) * 0.1)
    train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
    maxent_tagger = MaxentPosTagger()
    maxent_tagger.train(train_sents)
    print("tagger accuracy (test %i sentences, after training %i):" %
          (size, (num_sents - size)), maxent_tagger.evaluate(test_sents))
    print("\n\n")
    print("classify unseen sentence: ",
          maxent_tagger.tag(["This", "is", "so", "slow", "!"]))
    print("\n\n")
    print("show the 10 most informative features:")
    print(maxent_tagger.classifier.show_most_informative_features(10))
def demo3():
    """10-fold comparison of TnT taggers trained on treebank vs. brown samples.

    BUG FIXES: the printed result labels were swapped (``t`` is trained on
    treebank but its stats were printed under "brown", and vice versa), and
    the brown known-word accuracy divided by the *treebank* known fraction
    (``tp_kn``) instead of ``sp_kn``. Unused ``tp_un``/``sp_un`` locals were
    removed.
    """
    from nltk.corpus import treebank, brown
    d = list(treebank.tagged_sents())[:1000]
    e = list(brown.tagged_sents())[:1000]
    d10 = int(len(d) * 0.1)
    e10 = int(len(e) * 0.1)
    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0
    for i in range(10):
        t = TnT(N=1000, C=False)
        s = TnT(N=1000, C=False)
        dtest = d[(i * d10):((i + 1) * d10)]
        etest = e[(i * e10):((i + 1) * e10)]
        dtrain = d[:(i * d10)] + d[((i + 1) * d10):]
        etrain = e[:(i * e10)] + e[((i + 1) * e10):]
        t.train(dtrain)
        s.train(etrain)
        tacc = t.evaluate(dtest)
        tp_kn = t.known / (t.known + t.unknown)
        tknown += tp_kn
        t.unknown = 0
        t.known = 0
        sacc = s.evaluate(etest)
        sp_kn = s.known / (s.known + s.unknown)
        sknown += sp_kn
        s.unknown = 0
        s.known = 0
        tknacc += tacc / tp_kn
        sknacc += sacc / sp_kn
        tallacc += tacc
        sallacc += sacc
    # 10 * (sum of 10 fold fractions) == mean expressed as a percentage.
    print("treebank: acc over words known:", 10 * tknacc)
    print("        : overall accuracy:", 10 * tallacc)
    print("        : words known:", 10 * tknown)
    print("brown: acc over words known:", 10 * sknacc)
    print("     : overall accuracy:", 10 * sallacc)
    print("     : words known:", 10 * sknown)
def get_accuracy(self, sentences=None):
    """Print the tagger's accuracy on the given sentences (or the treebank
    held-out split when none are supplied).

    BUG FIXES: the mutable default argument ``sentences=[]`` is replaced
    with the ``None`` sentinel (an explicitly passed empty list still falls
    back to the treebank split, matching the original behavior), and the
    Python 2 print statement is replaced with a print() call.
    """
    if not sentences:
        test_sents = treebank.tagged_sents()[6000:]
    else:
        test_sents = sentences
    print(self._tagger.evaluate(test_sents))
def tag_matching(sequences):
    """Count, for each POS-tag bigram in ``sequences``, how often it occurs
    in the treebank, returning (sequence, count, errors) triples.

    BUG FIX: the per-sequence reset wrote ``count, erros = 0, 0`` (typo),
    so the IndexError counter was never reset between sequences.
    """
    treebank_sentences = treebank.tagged_sents()
    # treebank_sentences = brown.tagged_sents()
    best = (0, None)  # best count/sequence seen so far
    count = 0
    errors = 0
    resultset = []
    for seq in sequences:
        for sent in treebank_sentences:
            for i, word in enumerate(sent):
                if sent[i][1] == seq[0]:
                    try:
                        if sent[i + 1][1] == seq[1]:
                            count += 1
                    except IndexError:
                        # seq[0] matched the final token of the sentence.
                        errors += 1
        if count > best[0]:
            best = (count, seq)
        resultset.append((seq, count, errors))
        count, errors = 0, 0
    return resultset
def getData(corpus="brown", categories=""):
    """Return universal-tagset tagged sentences from the requested corpus.

    Supports brown (optionally filtered by categories), treebank, nps_chat
    (tagged words regrouped into posts), and conll2000; any other name
    falls back to the full Brown corpus.
    """
    if corpus == "brown":
        if categories != "":
            return brown.tagged_sents(tagset='universal', categories=categories)
        return brown.tagged_sents(tagset='universal')
    if corpus == "treebank":
        return treebank.tagged_sents(tagset='universal')
    if corpus == "nps_chat":
        # Dialogue dataset: the tagged words are flat, so slice them back
        # into per-post sentences using each post's length.
        posts = nps_chat.posts()
        words = nps_chat.tagged_words(tagset='universal')
        data = []
        cursor = 0
        for post in posts:
            data.append(words[cursor:cursor + len(post)])
            cursor += len(post)
        return data
    if corpus == "conll2000":
        return conll2000.tagged_sents(tagset='universal')
    return brown.tagged_sents(tagset='universal')
def train_tagger():
    """
    Trains a classifier-based POS tagger on the treebank corpus, prints its
    held-out accuracy, and pickles it to MODEL_PATH.

    BUG FIX: the test split originally started at the hard-coded index 3000
    while the training split ran to 90% of the corpus (~3522 sentences), so
    train and test overlapped and the reported accuracy was inflated. The
    test split now starts where the training split ends.
    """
    print("Training POS tagger...")
    # https://github.com/japerk/nltk3-cookbook/blob/master/chapter4.py
    tagged_sentences = treebank.tagged_sents()
    size = int(len(tagged_sentences) * 0.9)
    train_sents = tagged_sentences[:size]
    test_sents = tagged_sentences[size:]
    default = DefaultTagger("NN")
    tagger = ClassifierBasedPOSTagger(
        train=train_sents, backoff=default, cutoff_prob=0.3
    )
    print(tagger.evaluate(test_sents))
    # Save the model to a pickle file (binary mode).
    file_name = MODEL_PATH + "tag_model.pkl"
    with open(file_name, "wb") as fout:
        pickle.dump(tagger, fout)
    print("model written to: " + file_name)
    print("")
    return tagger
def train_pos_tagger(self, path):
    """Train a DecisionTree POS tagger on 75% of treebank, print its test
    accuracy, pickle the fitted pipeline to ``path``, and store it on self.

    BUG FIX: the test-set transformation called a module-level
    ``transform_to_dataset`` (a NameError) instead of the instance method.
    """
    tagged_sents = treebank.tagged_sents()
    train_size = int(.75 * len(tagged_sents))
    training_sents = tagged_sents[:train_size]
    test_sents = tagged_sents[train_size:]
    X, y = self.transform_to_dataset(training_sents)
    clf = Pipeline([('vectorizer', DictVectorizer(sparse=False)),
                    ('classifier', DecisionTreeClassifier(criterion="entropy"))])
    print('Training started')
    clf.fit(X, y)
    print('Training finished')
    X_test, y_test = self.transform_to_dataset(test_sents)
    print('Accuracy: {}'.format(clf.score(X_test, y_test)))
    # Persist the fitted model; the context manager closes the handle.
    with open(path, 'wb') as model_pkl:
        pickle.dump(clf, model_pkl)
    self.classifier = clf
def sequence_matching(input):
    """Find a treebank sentence whose POS-tag sequence covers ``input``
    (a tagged sequence) in order, updating the matching parse tree in place
    via UpdateTree and returning it; returns None when nothing matches.

    NOTE(review): ``input`` shadows the builtin; each element is assumed to
    be a (word, tag) pair since only element [1] is read — confirm.
    """
    sents = treebank.tagged_sents()
    parses = treebank.parsed_sents()
    for s in range(len(sents)):
        # Look through every sentence in treebank for a sequence match.
        sent = sents[s]
        pars = parses[s]
        k = 0  # k tracks how far into the sentence has been matched
        matches = []  # log match positions to help build the tree later
        for i in range(len(input)):
            match = False  # flag to cut down on time if a word doesn't match
            for j in range(k, len(sent)):
                # Scan from the last match position onward.
                if sent[j][1] == input[i][1]:  # labels (pos) match
                    k = j
                    UpdateTree(pars, j, input[i][1])
                    match = True
                    if i == len(input) - 1:
                        # Made it through the entire input: sent is a match;
                        # pars has words replaced wherever there was a match.
                        return pars
                    break
            if match == False:
                # Whole sentence scanned without matching this word; move on.
                print("Sentence does not match")
                break
    # No sentence matched the input sequence.
    return None
def traintest_bigram_trigram_tagger(self):
    """Train bigram and trigram taggers on the treebank 3000-sentence split
    and print their held-out accuracies.

    BUG FIX: Python 2 print statements replaced with print() calls.
    """
    from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
    from nltk.corpus import treebank
    test_sents = treebank.tagged_sents()[3000:]
    train_sents = treebank.tagged_sents()[:3000]
    print('trainging bigramTagger')
    bitagger = BigramTagger(train_sents)
    print('evaluation bitagger')
    print(bitagger.evaluate(test_sents))
    print('trainging trigram Tagger')
    tritagger = TrigramTagger(train_sents)
    print('evaluation bitagger')
    print(tritagger.evaluate(test_sents))
    print('tagging')
def demo2():
    """Compare TnT taggers with capitalization handling off (C=False) and on
    (C=True), printing per-block accuracy and known/unknown word fractions
    over ten 100-sentence evaluation blocks."""
    from nltk.corpus import treebank
    corpus = list(treebank.tagged_sents())
    plain = TnT(N=1000, C=False)
    capd = TnT(N=1000, C=True)
    # Both taggers train on everything past the first 1100 sentences.
    plain.train(corpus[(11) * 100:])
    capd.train(corpus[(11) * 100:])
    for block in range(10):
        chunk = corpus[block * 100:(block + 1) * 100]
        acc = plain.evaluate(chunk)
        seen = float(plain.known + plain.unknown)
        frac_unk = float(plain.unknown) / seen
        frac_kn = float(plain.known) / seen
        plain.unknown = 0
        plain.known = 0
        print('Capitalization off:')
        print('Accuracy:', acc)
        print('Percentage known:', frac_kn)
        print('Percentage unknown:', frac_unk)
        print('Accuracy over known words:', (acc / frac_kn))
        acc = capd.evaluate(chunk)
        seen = float(capd.known + capd.unknown)
        frac_unk = float(capd.unknown) / seen
        frac_kn = float(capd.known) / seen
        capd.unknown = 0
        capd.known = 0
        print('Capitalization on:')
        print('Accuracy:', acc)
        print('Percentage known:', frac_kn)
        print('Percentage unknown:', frac_unk)
        print('Accuracy over known words:', (acc / frac_kn))
def __init__(self, do_markovify=True):
    """Collect POS-tagged sentences from the Brown and Treebank corpora and
    optionally fit a second-order Markov chain over them.

    BUG FIX: the original *appended* ``list(treebank.tagged_sents())`` —
    adding the whole corpus as a single malformed "sentence" element —
    instead of extending with its sentences. (A large block of
    commented-out alternative corpora was removed.)
    """
    print("tagging the datasets and markovifying them ... please wait!")
    self.tagged_sents = list(brown.tagged_sents())
    self.tagged_sents.extend(list(treebank.tagged_sents()))
    if do_markovify:
        # State size 2: each transition conditions on the two previous items.
        self.model = markovify.Chain(self.tagged_sents, 2)
def create_input_dataset():
    """Build feature datapoints from WSJ tagged sentences: a 5-word window
    (padded with '*') plus the two previous gold tags per token.

    Each datapoint is also written as a JSON line to data.json. Returns the
    datapoint list and the parallel list of gold tags.

    BUG FIXES: Python 2 print statements replaced with print() calls; the
    JSON file is now managed with a context manager so it is closed even if
    an error occurs mid-write.
    """
    print('Loading input')
    input_data = []
    tags = []
    sents = wsj.sents()
    counter = 0
    with open('data.json', 'w') as json_file:
        for i, sentence in enumerate(wsj.tagged_sents()[:no_of_sentences]):
            prev = None       # previous (word, tag)
            prev_prev = None  # (word, tag) before that
            for j, word in enumerate(sentence):
                datapoint = {}
                len_sentence = len(sentence)
                # Window order matches the original: w-1, w-2, w, w+1, w+2.
                temp = []
                temp.append(sents[i][j - 1] if j > 0 else '*')
                temp.append(sents[i][j - 2] if j > 1 else '*')
                temp.append(sents[i][j])
                temp.append(sents[i][j + 1] if j < len_sentence - 1 else '*')
                temp.append(sents[i][j + 2] if j < len_sentence - 2 else '*')
                datapoint['wn'] = temp
                datapoint['index'] = j
                datapoint['i'] = counter
                counter += 1
                datapoint['t_minus_one'] = '*' if prev is None else prev[1]
                datapoint['t_minus_two'] = '*' if prev_prev is None else prev_prev[1]
                prev_prev = prev
                prev = word
                datapoint['tag'] = word[1]
                json_file.write(json.dumps(datapoint))
                json_file.write('\n')
                input_data.append(datapoint)
                tags.append(word[1])
    print('Done')
    return input_data, tags
def extractTransitions(tagged_sents=treebank.tagged_sents(tagset='universal')):
    """Accumulate tag-transition counts into the global table T and
    tag-emission counts into the global table L.

    NOTE(review): the default argument is evaluated once at import time,
    which triggers a corpus load on module import — confirm that is
    intended. ``lasttag = 0`` serves as the sentence-start marker key.
    """
    for s in tagged_sents:
        lasttag = 0
        for token, tag in s:
            T[lasttag][tag] += 1  # transition: lasttag -> tag
            L[tag][token] += 1    # emission: tag -> token
            lasttag = tag
def demo(corpus, num_sents):
    """Load ``num_sents`` tagged sentences from treebank or brown, train a
    MaxentPosTagger on 90% of them, and print accuracy, a sample tagging,
    and the most informative features.

    BUG FIXES: Python 2 print statements replaced with print() calls; the
    unknown-corpus branch now returns instead of falling through to a
    NameError on the undefined ``tagged_sents``.
    """
    if corpus.lower() == "treebank":
        from nltk.corpus import treebank
        tagged_sents = treebank.tagged_sents()[:num_sents]
    elif corpus.lower() == "brown":
        from nltk.corpus import brown
        tagged_sents = brown.tagged_sents()[:num_sents]
    else:
        print("Please load either the 'brown' or the 'treebank' corpus.")
        return
    # Hold out 10% of the sentences for evaluation.
    size = int(len(tagged_sents) * 0.1)
    train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
    maxent_tagger = MaxentPosTagger()
    maxent_tagger.train(train_sents)
    print("tagger accuracy (test %i sentences, after training %i):" %
          (size, (num_sents - size)), maxent_tagger.evaluate(test_sents))
    print("\n\n")
    print("classify unseen sentence: ",
          maxent_tagger.tag(["This", "is", "so", "slow", "!"]))
    print("\n\n")
    print("show the 10 most informative features:")
    print(maxent_tagger.classifier.show_most_informative_features(10))
def demo2():
    """Compare TnT taggers with capitalization handling off (C=False) and on
    (C=True), printing per-block accuracy and known/unknown word fractions
    over ten 100-sentence evaluation blocks."""
    from nltk.corpus import treebank
    corpus = list(treebank.tagged_sents())
    plain = TnT(N=1000, C=False)
    capd = TnT(N=1000, C=True)
    # Both taggers train on everything past the first 1100 sentences.
    plain.train(corpus[(11) * 100:])
    capd.train(corpus[(11) * 100:])
    for block in range(10):
        chunk = corpus[block * 100:(block + 1) * 100]
        acc = plain.accuracy(chunk)
        seen = plain.known + plain.unknown
        frac_unk = plain.unknown / seen
        frac_kn = plain.known / seen
        plain.unknown = 0
        plain.known = 0
        print("Capitalization off:")
        print("Accuracy:", acc)
        print("Percentage known:", frac_kn)
        print("Percentage unknown:", frac_unk)
        print("Accuracy over known words:", (acc / frac_kn))
        acc = capd.accuracy(chunk)
        seen = capd.known + capd.unknown
        frac_unk = capd.unknown / seen
        frac_kn = capd.known / seen
        capd.unknown = 0
        capd.known = 0
        print("Capitalization on:")
        print("Accuracy:", acc)
        print("Percentage known:", frac_kn)
        print("Percentage unknown:", frac_unk)
        print("Accuracy over known words:", (acc / frac_kn))
def demo3():
    """10-fold comparison of TnT taggers trained on treebank vs. brown samples.

    BUG FIXES: the printed result labels were swapped (``t`` is trained on
    treebank but its stats were printed under "brown", and vice versa), and
    the brown known-word accuracy divided by the *treebank* known fraction
    (``tp_kn``) instead of ``sp_kn``. Unused ``tp_un``/``sp_un`` locals were
    removed.
    """
    from nltk.corpus import brown, treebank
    d = list(treebank.tagged_sents())[:1000]
    e = list(brown.tagged_sents())[:1000]
    d10 = int(len(d) * 0.1)
    e10 = int(len(e) * 0.1)
    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0
    for i in range(10):
        t = TnT(N=1000, C=False)
        s = TnT(N=1000, C=False)
        dtest = d[(i * d10):((i + 1) * d10)]
        etest = e[(i * e10):((i + 1) * e10)]
        dtrain = d[:(i * d10)] + d[((i + 1) * d10):]
        etrain = e[:(i * e10)] + e[((i + 1) * e10):]
        t.train(dtrain)
        s.train(etrain)
        tacc = t.accuracy(dtest)
        tp_kn = t.known / (t.known + t.unknown)
        tknown += tp_kn
        t.unknown = 0
        t.known = 0
        sacc = s.accuracy(etest)
        sp_kn = s.known / (s.known + s.unknown)
        sknown += sp_kn
        s.unknown = 0
        s.known = 0
        tknacc += tacc / tp_kn
        sknacc += sacc / sp_kn
        tallacc += tacc
        sallacc += sacc
    # 10 * (sum of 10 fold fractions) == mean expressed as a percentage.
    print("treebank: acc over words known:", 10 * tknacc)
    print("        : overall accuracy:", 10 * tallacc)
    print("        : words known:", 10 * tknown)
    print("brown: acc over words known:", 10 * sknacc)
    print("     : overall accuracy:", 10 * sallacc)
    print("     : words known:", 10 * sknown)
def make_sentences():
    """Convert treebank tagged sentences into (word-index, tag-index) pairs
    using the embedding vocabulary and tag list; prints the mean sentence
    length and returns sentences longer than 4 tokens.

    BUG FIX: the bracket-replacement dict listed the key "-RSB-" twice (for
    both "[" and "]"), so the first entry was silently overwritten and
    "-LSB-" was never mapped; "-LSB-" -> "[" is now correct.
    """
    dictionary = [k.strip() for k in open("./embeddings/words.lst")]
    ind_lookup = {word: (ind + 1) for ind, word in enumerate(dictionary)}
    taglst = [k.strip() for k in open("data/tags.lst")]
    tag_lookup = {word: (ind + 1) for ind, word in enumerate(taglst)}
    bracket_rep = {
        "-LRB-": "(", "-RRB-": ")",
        "-LSB-": "[", "-RSB-": "]",
        "-LCB-": "{", "-RCB-": "}"}
    sentences = list(treebank.tagged_sents())
    for i, sent in enumerate(sentences):
        # Lowercase and drop empty elements (-NONE- tags).
        sent = [(item.lower(), tag) for (item, tag) in sent if tag != '-NONE-']
        # Replace Penn bracket tokens with the literal brackets.
        sent = [(bracket_rep.get(item, item), tag) for (item, tag) in sent]
        # Collapse anything starting with a digit to the '0' token.
        sent = [(u'0', tag) if item[0].isdigit() else (item, tag)
                for (item, tag) in sent]
        # Out-of-vocabulary words map to the UNKNOWN token.
        sent = [(u"UNKNOWN", tag) if item not in ind_lookup else (item, tag)
                for (item, tag) in sent]
        # 1 indexed!!!
        sent = [(ind_lookup[item], tag_lookup[tag]) for (item, tag) in sent]
        sentences[i] = sent
    sentences = [i for i in sentences if len(i) > 4]
    print(sum(map(len, sentences)) / float(len(sentences)))
    return sentences
def __init__(self, dname='treebank'):
    """Load a universal-tagset tagged corpus ('treebank' or 'brown') and
    build word/tag index mappings plus numeric sentence/tag lists.

    :param dname: corpus name; anything else leaves ``data`` as None.
    """
    super().__init__()
    data = None
    # Selecting the dataset.
    # NOTE(review): when a corpus is not installed, accessing it usually
    # raises LookupError rather than yielding length 0 — confirm this
    # download guard actually triggers.
    if dname == 'treebank':
        if len(treebank.words()) == 0:
            nltk.download('treebank')
        data = treebank.tagged_sents(tagset='universal')
    elif dname == 'brown':
        if len(brown.words()) == 0:
            nltk.download('brown')
        data = brown.tagged_sents(tagset='universal')
    self.data = data
    vocab, tags = self._build_vocab()
    # Longest sentence length, used for padding downstream.
    max_sent_len = max(map(len, data))
    self.max_sent_len = max_sent_len
    # Unknown words map to index 0 via the defaultdict.
    self.word_to_idx = defaultdict(lambda: 0, {word: idx for idx, word in enumerate(vocab)})
    self.idx_to_word = {idx: word for word, idx in self.word_to_idx.items()}
    self.tag_to_idx = {tag: idx for idx, tag in enumerate(tags)}
    self.idx_to_tag = {idx: tag for tag, idx in self.tag_to_idx.items()}
    self.sen_list, self.tag_list = self._convert_to_num()
def _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data):
    """Split tagged data into training / baseline / gold / testing portions.

    ``train`` is the proportion of data used in training; the rest (up to
    ``num_sents``) is reserved for testing. When ``separate_baseline_data``
    is set, a third of the training data is carved off for the baseline.
    """
    if tagged_data is None:
        print("Loading tagged data from treebank... ")
        tagged_data = treebank.tagged_sents()
    if num_sents is None or len(tagged_data) <= num_sents:
        num_sents = len(tagged_data)
    if randomize:
        # Seed from the corpus size so the shuffle is reproducible.
        random.seed(len(tagged_data))
        random.shuffle(tagged_data)
    cutoff = int(num_sents * train)
    training_data = tagged_data[:cutoff]
    gold_data = tagged_data[cutoff:num_sents]
    # Strip the tags from the gold sentences to obtain the test input.
    testing_data = [[pair[0] for pair in sent] for sent in gold_data]
    if separate_baseline_data:
        bl_cutoff = len(training_data) // 3
        baseline_data = training_data[:bl_cutoff]
        training_data = training_data[bl_cutoff:]
    else:
        baseline_data = training_data
    (trainseqs, traintokens) = corpus_size(training_data)
    (testseqs, testtokens) = corpus_size(testing_data)
    (bltrainseqs, bltraintokens) = corpus_size(baseline_data)
    print("Read testing data ({0:d} sents/{1:d} wds)".format(testseqs, testtokens))
    print("Read training data ({0:d} sents/{1:d} wds)".format(trainseqs, traintokens))
    print("Read baseline data ({0:d} sents/{1:d} wds) {2:s}".format(
        bltrainseqs, bltraintokens,
        "" if separate_baseline_data else "[reused the training set]"))
    return (training_data, baseline_data, gold_data, testing_data)
def benchmark_aptagger():
    '''
    Benchmark the aptagger vs the Penn Treebank sample in nltk.

    BUG FIXES: ``itertools.izip`` does not exist on Python 3 — the builtin
    ``zip`` (lazy on Py3, a list on Py2) is used instead; the unused loop
    counter ``k`` was removed.
    '''
    from nltk.corpus import treebank
    # we want to remove "-NONE-" tags since these appear to be garbage
    text = []
    tags = []
    for sentence in treebank.tagged_sents():
        text.append([ele[0] for ele in sentence if ele[1] != '-NONE-'])
        tags.extend([ele[1] for ele in sentence if ele[1] != '-NONE-'])
    t1 = time.time()
    predicted = tagger.tag_sents(text)
    t2 = time.time()
    ncorrect = sum(
        bool(t == p[1])
        for t, p in zip(tags, chain.from_iterable(predicted)))
    print("For Penn Treebank sample in NLTK:")
    print("Took %s seconds to POS tag %s tokens (%s tokens/sec)" %
          (t2 - t1, len(tags), int(len(tags) / (t2 - t1))))
    print("Accuracy: %s" % (float(ncorrect) / len(tags)))
def benchmark_aptagger():
    '''
    Benchmark the aptagger vs the Penn Treebank sample in nltk.

    BUG FIXES: ``itertools.izip`` does not exist on Python 3 — the builtin
    ``zip`` (lazy on Py3, a list on Py2) is used instead; the unused loop
    counter ``k`` was removed.
    '''
    from nltk.corpus import treebank
    # we want to remove "-NONE-" tags since these appear to be garbage
    text = []
    tags = []
    for sentence in treebank.tagged_sents():
        text.append([ele[0] for ele in sentence if ele[1] != '-NONE-'])
        tags.extend([ele[1] for ele in sentence if ele[1] != '-NONE-'])
    t1 = time.time()
    predicted = tagger.tag_sents(text)
    t2 = time.time()
    ncorrect = sum(bool(t == p[1])
                   for t, p in zip(tags, chain.from_iterable(predicted)))
    print("For Penn Treebank sample in NLTK:")
    print("Took %s seconds to POS tag %s tokens (%s tokens/sec)" % (
        t2 - t1, len(tags), int(len(tags) / (t2 - t1))))
    print("Accuracy: %s" % (float(ncorrect) / len(tags)))
def get_pos_tagger():
    """Build a trigram tagger on the full treebank sample, backing off
    through bigram and unigram taggers down to a constant 'NN' tagger."""
    train_sents = treebank.tagged_sents()
    fallback = nltk.DefaultTagger("NN")
    unigram = nltk.UnigramTagger(train_sents, backoff=fallback)
    bigram = nltk.BigramTagger(train_sents, backoff=unigram)
    return nltk.TrigramTagger(train_sents, backoff=bigram)
def main():
    """Train unigram/bigram/trigram backoff taggers on treebank and pickle
    each to its own .bin file.

    BUG FIX: pickle output files must be opened in binary mode ('wb'); the
    original used text mode ('w') — which fails under Python 3 — and never
    closed the handles.
    """
    # Regexp fallback for words the n-gram taggers have not seen.
    regexp_tagger = nltk.RegexpTagger(
        [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
         (r'(The|the|A|a|An|an)$', 'AT'),   # articles
         (r'.*able$', 'JJ'),                # adjectives
         (r'.*ness$', 'NN'),                # nouns formed from adjectives
         (r'.*ly$', 'RB'),                  # adverbs
         (r'.*s$', 'NNS'),                  # plural nouns
         (r'.*ing$', 'VBG'),                # gerunds
         (r'.*ed$', 'VBD'),                 # past tense verbs
         (r'.*', 'NN')                      # nouns (default)
         ])
    training_data = treebank.tagged_sents()
    unigram_tagger = nltk.UnigramTagger(training_data, backoff=regexp_tagger)
    bigram_tagger = nltk.BigramTagger(training_data, backoff=unigram_tagger)
    trigram_tagger = nltk.TrigramTagger(training_data, backoff=bigram_tagger)
    for fname, tagger in (("unigram_tagger.bin", unigram_tagger),
                          ("bigram_tagger.bin", bigram_tagger),
                          ("trigram_tagger.bin", trigram_tagger)):
        with open(fname, "wb") as fh:
            pickle.Pickler(fh).dump(tagger)
def train_pos_tagger(self, path):
    """Train a CRF POS tagger on 80% of treebank, pickle it to ``path``,
    and store it on ``self.classifier``.

    Idiom fix: the manual open/close pair is replaced by a context manager
    so the pickle file is closed even if an error occurs.
    """
    nltk.download('treebank')  # just to make sure the corpus is present
    tagged_sentences = treebank.tagged_sents()
    train_size = int(.80 * len(tagged_sentences))
    training_sentences = tagged_sentences[:train_size]
    X_train, y_train = self.transform_to_dataset(training_sentences)
    model = CRF()
    print('Training started...')
    model.fit(X_train, y_train)
    print('Training finished.')
    # Save classifier to file.
    with open(path, 'wb') as model_pkl:
        pickle.dump(model, model_pkl)
    print("POSTagger saved.")
    self.classifier = model
def demo(corpus, num_sents):
    """
    Loads a few sentences from the Brown corpus or the Wall Street Journal
    corpus, trains them, tests the tagger's accuracy and tags an unseen
    sentence.

    @type corpus: C{str}
    @param corpus: Name of the corpus to load, either C{brown} or C{treebank}.
    @type num_sents: C{int}
    @param num_sents: Number of sentences to load from a corpus. Use a small
    number, as training might take a while.

    BUG FIXES: Python 2 print statements replaced with print() calls; the
    unknown-corpus branch now returns instead of falling through to a
    NameError on the undefined ``tagged_sents``.
    """
    if corpus.lower() == "brown":
        from nltk.corpus import brown
        tagged_sents = brown.tagged_sents()[:num_sents]
    elif corpus.lower() == "treebank":
        from nltk.corpus import treebank
        tagged_sents = treebank.tagged_sents()[:num_sents]
    else:
        print("Please load either the 'brown' or the 'treebank' corpus.")
        return
    # Hold out 10% of the sentences for evaluation.
    size = int(len(tagged_sents) * 0.1)
    train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
    maxent_tagger = MaxentPosTagger()
    maxent_tagger.train(train_sents)
    print("tagger accuracy (test %i sentences, after training %i):" %
          (size, (num_sents - size)), maxent_tagger.evaluate(test_sents))
    print("\n\n")
    print("classify unseen sentence: ",
          maxent_tagger.tag(["This", "is", "so", "slow", "!"]))
    print("\n\n")
    print("show the 10 most informative features:")
    print(maxent_tagger.classifier.show_most_informative_features(10))
def _train_tagger(self):
    """Build a trigram→bigram→unigram→regexp→'NN' backoff POS tagger trained
    on the treebank sample and store it on ``self.final_tagger``.

    BUG FIX: the pattern list contained (r'.*s$', 'NNS') twice; the second
    copy was unreachable (the first identical regex always matches first)
    and has been removed.
    """
    training_sents = treebank.tagged_sents()
    patterns = [  # for regexp tagger
        (r'^[\.|\?|!]$', '.'),
        (r'^,$', ','),
        (r'^\'$', '\'\''),
        (r'^\"$', '\"'),
        (r'^\($', '('),
        (r'^\)$', ')'),
        (r'^[=|/]$', 'SYM'),
        (r'.*ing$', 'VBG'),
        (r'.*ed$', 'VBD'),
        (r'.*es$', 'VBZ'),
        (r'.*ould$', 'MD'),
        (r'.*\'s$', 'POS'),
        (r'.*s$', 'NNS'),
        (r'(The|the|A|a|An|an)$', 'AT'),
        (r'.*able$', 'JJ'),
        (r'.*ly$', 'RB'),
        (r'^[0-9][0-9]*$', 'CD'),
        (r'^[0-9]([0-9]*[-|.|,|/][0-9]*)*$', 'CD'),
        (r'^([0-9]*\.[0-9]*)*$', 'CD'),
        (r'^[^a-zA-Z]*$', ':'),
        (r'[A-Z].*', 'NNP'),
        (r'.*', 'NN')  # default
    ]
    default_tagger = nltk.DefaultTagger('NN')
    regexp_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)
    unigram_tagger = nltk.UnigramTagger(training_sents, backoff=regexp_tagger)
    bigram_tagger = nltk.BigramTagger(training_sents, backoff=unigram_tagger)
    trigram_tagger = nltk.TrigramTagger(training_sents, backoff=bigram_tagger)
    self.final_tagger = trigram_tagger
def ie_preprocess(document):
    """Tokenize ``document`` into sentences/words and print the taggings
    produced by several pre-built taggers (t0..t3, an HMM trained here, and
    nltk.pos_tag); returns the pos_tag result.

    BUG FIX: Python 2 print statements replaced with print() calls.
    NOTE(review): ``trigram_tagger`` is trained but never used — confirm
    whether it was meant to replace one of t0..t3.
    """
    print(document)
    sentences = nltk.sent_tokenize(document)
    trigram_tagger = nltk.TrigramTagger(brown_a, cutoff=0)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    print("\nDefault tagger")
    x = [t0.tag(sent) for sent in sentences]
    print(x)
    print("\nUnigram tagger")
    x = [t1.tag(sent) for sent in sentences]
    print(x)
    print("\nBigram tagger")
    x = [t2.tag(sent) for sent in sentences]
    print(x)
    print("\nTrigram tagger")
    x = [t3.tag(sent) for sent in sentences]
    print(x)
    print("\n")
    # Train a supervised HMM tagger on the treebank sample and apply it.
    trainer = hmm.HiddenMarkovModelTrainer()
    train_data = treebank.tagged_sents()[:3000]
    tagger = trainer.train_supervised(train_data)
    print(tagger)
    print("\nHMM tagger")
    x = [tagger.tag(sent) for sent in sentences]
    print(x)
    print("\nPOS Tag")
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    print(sentences)
    return sentences
def split_sents(self, train=0.95, total=3500, document_class=TaggedSentence):
    """Split up to ``total`` tagged sentences into train/test portions and
    wrap each sentence in ``document_class``. Returns (train_map, test_map)
    as lazy map objects."""
    sents = tagged_corpus.tagged_sents()[:total]
    # A None total means "use everything we sliced".
    total = len(sents) if total is None else total
    boundary = int(round(train * total))
    end = boundary + int(round(total - train * total))
    train_part = sents[0:boundary]
    test_part = sents[boundary:end]
    return (map(document_class, train_part), map(document_class, test_part))
def demo(corpus, num_sents):
    """
    Loads a few sentences from the Brown corpus, the Wall Street Journal
    corpus, Floresta, or a CINTIL CoNLL file, trains a MaxentPosTagger on
    90% of them, evaluates it, and pickles the trained tagger to test.pkl.

    @type corpus: C{str}
    @param corpus: corpus name: brown, treebank, floresta, or cintil.
    @type num_sents: C{int}
    @param num_sents: Number of sentences to load. Use a small number, as
    training might take a while.

    BUG FIXES: Python 2 print statements replaced with print() calls; the
    unknown-corpus branch now returns instead of falling through to a
    NameError; the pickle file is managed with a context manager. (A dead
    triple-quoted block of old demo prints was removed.)
    """
    if corpus.lower() == "brown":
        from nltk.corpus import brown
        tagged_sents = brown.tagged_sents()[:num_sents]
    elif corpus.lower() == "treebank":
        from nltk.corpus import treebank
        tagged_sents = treebank.tagged_sents()[:num_sents]
    elif corpus.lower() == "floresta":
        from nltk.corpus import floresta
        tagged_sents = floresta.tagged_sents()[:num_sents]
    elif corpus.lower() == "cintil":
        print("Loading CINTIL")
        column_types = ['words', 'pos', 'ignore']
        cintil = ConllCorpusReader(
            '/home/dsbatista/extract-publico-relationships/pos-tagger',
            'cintil-fixed-reduced.conll', column_types)
        tagged_sents = cintil.tagged_sents()[:num_sents]
    else:
        print("Please load either the 'brown' or the 'treebank' corpus.")
        return
    # Hold out 10% of the sentences for evaluation.
    size = int(len(tagged_sents) * 0.1)
    train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
    maxent_tagger = MaxentPosTagger()
    maxent_tagger.train(train_sents)
    maxent_tagger.evaluate(test_sents)
    with open('test.pkl', "wb") as fModel:
        pickle.dump(maxent_tagger, fModel, 1)
def get_tagger():
    """Load a pickled POS tagger from ``tagger_fn``; on any failure train a
    classifier-based tagger on treebank, cache it, and return it.

    BUG FIXES: pickles must be read/written in binary mode ('rb'/'wb') —
    text mode fails under Python 3 — and the bare ``except:`` (which would
    even swallow KeyboardInterrupt) is narrowed to ``except Exception``.
    """
    try:
        with open(tagger_fn, "rb") as tagger_file:
            tagger = pickle.load(tagger_file)
    except Exception:
        # Cache miss or unreadable pickle: retrain and re-cache.
        tagger = ClassifierBasedPOSTagger(train=treebank.tagged_sents())
        with open(tagger_fn, "wb") as tagger_file:
            pickle.dump(tagger, tagger_file)
    return tagger
def HMM():
    """Print a sample treebank training sentence; the HMM tagging of the
    example sentences below is still unimplemented (see ToDo)."""
    train_data = treebank.tagged_sents()[:3000]
    print(train_data[0])
    # Example sentences awaiting an HMM tagger implementation.
    example_one = "Today is a good day ."
    example_two = "Joe met Joanne in Delhi ."
    example_three = "Chicago is the birthplace of Ginny"
    ################################
    ToDo = "Use HMM"
def tag_words(self, words, sents):
    """Train a unigram tagger on treebank, tag the first sentence of
    ``sents``, and print the tagger's self-evaluation on that output.

    BUG FIX: Python 2 print statement replaced with a print() call.
    NOTE(review): evaluating the tagger on its own output is circular, and
    ``evaluate`` expects a list of tagged *sentences* while ``test_sents``
    is a single tagged sentence — probably the commented-out treebank
    held-out split was intended; confirm.
    """
    train_sents = treebank.tagged_sents()
    tagger = UnigramTagger(train_sents)
    test_sents = tagger.tag(sents[0])
    # test_sents = treebank.tagged_sents()[3000:]
    print(tagger.evaluate(test_sents))
def run(self):
    """Worker entry point: train the app's TnT POS tagger on a 2000-sentence
    treebank sample and attach it to the root widget.

    BUG FIX: Python 2 print statements replaced with print() calls.
    """
    app = App.get_running_app()
    print('start training TnT pos tagger')
    train_sents = treebank.tagged_sents()[:2000]
    unk = DefaultTagger('NN')  # fallback for unknown words
    app.root.tnt_tagger = tnt.TnT(unk=unk, Trained=True)
    app.root.tnt_tagger.train(train_sents)
    print('end training TnT pos tagger')
def create_dataset():
    """Build feature datapoints from WSJ tagged sentences: a 5-word window
    (padded with '*' at sentence edges) plus the two previous gold tags for
    each token. Returns (datapoints, gold_tags) as parallel lists."""
    #print 'Loading dataset'
    dataset = []
    tags = []
    sents = wsj.sents()
    for i,sentence in enumerate(wsj.tagged_sents()[:no_of_sentences]):
        prev = None       # previous (word, tag)
        prev_prev = None  # (word, tag) before that
        for j,word in enumerate(sentence):
            datapoint = {}
            temp = []
            len_sentence = len(sentence)
            # Window built in the order w-1, w-2, w, w+1, w+2.
            if(j > 0):
                temp.append(sents[i][j-1])
            else:
                temp.append('*')
            if(j > 1):
                temp.append(sents[i][j-2])
            else:
                temp.append('*')
            temp.append(sents[i][j])
            if(j < len_sentence-1):
                temp.append(sents[i][j+1])
            else:
                temp.append('*')
            if(j < len_sentence-2):
                temp.append(sents[i][j+2])
            else:
                temp.append('*')
            #what is WN ?
            datapoint['wn'] = temp
            datapoint['index'] = j
            # '*' marks missing history at the sentence start.
            if(prev == None):
                datapoint['t_minus_one'] = '*'
            else:
                datapoint['t_minus_one'] = prev[1]
            if(prev_prev == None):
                datapoint['t_minus_two'] = '*'
            else:
                datapoint['t_minus_two'] = prev_prev[1]
            prev_prev = prev
            prev = word
            # print datapoint,word[1]
            dataset.append(datapoint)
            tags.append(word[1])
    #print 'Done'
    return dataset, tags
def train_parser(self):
    """Train a Brill tagger on top of a unigram/bigram/trigram backoff base
    tagger, pickle it to ``self.pickle_path``, and return it."""
    training = treebank.tagged_sents()[:3000]
    base = self.backoff_tagger(
        training,
        [UnigramTagger, BigramTagger, TrigramTagger],
        backoff=DefaultTagger("NN"),
    )
    base.evaluate(training)
    brill_tagger = self.train_brill_tagger(base, training)
    pickle.dump(brill_tagger, open(self.pickle_path, "wb"))
    return brill_tagger
def test_pos_template(self):
    """A Brill tagger with a single Pos([-1]) template reproduces the
    tagging from https://github.com/nltk/nltk/issues/769."""
    training = treebank.tagged_sents()[:1000]
    base = UnigramTagger(training)
    trainer = brill_trainer.BrillTaggerTrainer(
        base, [brill.Template(brill.Pos([-1]))])
    tagged = trainer.train(training).tag('This is a foo bar sentence'.split())
    self.assertEqual(
        tagged,
        [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'),
         ('foo', None), ('bar', 'NN'), ('sentence', None)])
def make_backoff_tagger():
    """Build a backoff POS tagger over the full Penn Treebank.

    Combines a UnigramTagger, BigramTagger and TrigramTagger trained on
    all treebank tagged sentences, with a DefaultTagger that returns 'NN'
    as the final fallback for words no level can tag.

    :returns: A backoff POS tagger.
    """
    return backoff_tagger(treebank.tagged_sents(), [UnigramTagger, BigramTagger, TrigramTagger], backoff=DefaultTagger('NN'))
def train_pos_tagger():
    """Train a Trigram→Bigram→Unigram backoff POS tagger on Penn Treebank
    and return it.

    NOTE(review): the original called tagged_sents(simplify_tags=True), a
    pre-NLTK-3 keyword that modern NLTK rejects; tagset='universal' is
    NLTK 3's replacement for the simplified tagset.  The default tag is
    switched from 'NN' to 'NOUN' so the fallback stays inside the
    universal tagset — confirm downstream consumers expect universal tags.
    """
    train_sents = treebank.tagged_sents(tagset='universal')
    tagger = nltk.TrigramTagger(train_sents, backoff=
                nltk.BigramTagger(train_sents, backoff=
                    nltk.UnigramTagger(train_sents, backoff=
                        nltk.DefaultTagger("NOUN"))))
    return tagger
def train_tagger(tagger_name):
    """Train and return a POS tagger on the first 5000 treebank sentences.

    A `tagger_name` of "TnT" or "tagger" selects a TnT tagger; any other
    value builds a Unigram→Bigram→Trigram chain backing off to
    DefaultTagger('NN').
    """
    corpus = treebank.tagged_sents()[:5000]
    if tagger_name in ("TnT", "tagger"):
        model = tnt.TnT()
        model.train(corpus)
        return model
    # Build the backoff chain from the inside out.
    chain = DefaultTagger('NN')
    chain = TrigramTagger(corpus, backoff=chain)
    chain = BigramTagger(corpus, backoff=chain)
    return UnigramTagger(corpus, backoff=chain)
def main():
    """End-to-end POS-tagging pipeline: load tagged sentences, split and
    encode them, train a Keras classifier, score it on the test split,
    and save the trained model to './model'.
    """
    # """
    # ++++++++++++++++++++++++++++++++++++++++++
    # DATA PREPROCESSING
    # """
    #########
    # EITHER: Penn Treebank tagged sentences
    sentences = treebank.tagged_sents()
    # OR
    # sentences = parsebrown()  # have to dl brown corpus ("brown-universal.txt") and change path in parsebrown function
    #########
    # 60/30/10 train/test/validation split of the tagged sentences.
    # trnstc, tststc, valstc = ttvsplit(sentences[0:50000], .6, .3, .1)
    trnstc, tststc, valstc = ttvsplit(sentences, .6, .3, .1)
    # Convert each split into feature dicts (x) and labels (y).
    xtrn, ytrn = str2dct(trnstc)
    xtst, ytst = str2dct(tststc)
    xval, yval = str2dct(valstc)
    # Vectorize features, label-encode then one-hot-encode the targets.
    dict_encoder, xtrn, xtst, xval = dct2arr(xtrn, xtst, xval)
    label_encoder, ytrn, ytst, yval = catenc(ytrn, ytst, yval)
    ytrn, ytst, yval = ohenc(ytrn, ytst, yval)
    # # print(xtrn[0])
    # treebank (61014, 44232)
    # brown (860100, 188)
    # # print(ytrn[0])
    # treebank (61014, 46)
    # brown (860100, 9)
    # # """
    # # ++++++++++++++++++++++++++++++++++++++++++
    # # MODEL
    # # """
    # Keras classifier hyper-parameters; input/output sizes come from the
    # encoded data shapes.
    model_params = {
        'build_fn': build_model,
        'input_dim': xtrn.shape[1],
        'hidden_neurons': 512,
        'output_dim': ytrn.shape[1],
        'epochs': 3,
        'batch_size': 1024,
        'verbose': 1,
        'validation_data': (xval, yval),
        'shuffle': True
    }
    m = KerasClassifier(**model_params)
    hist = m.fit(xtrn, ytrn)
    score = m.score(xtst, ytst)
    print("score")
    print(score)
    # Persist the underlying Keras model for later reuse.
    m.model.save('model')
def traintest_uni_bi_tri_tagger(self):
    """Train a Unigram/Bigram/Trigram backoff tagger on treebank sentences,
    evaluate it on a CoNLL-2000 slice, then tag a sample sentence.

    Fix: converted Python 2 print statements to Python 3 print() calls.

    NOTE(review): training uses treebank.tagged_sents()[3000:] — everything
    AFTER the first 3000 sentences; confirm that slice is the intended split.
    """
    from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
    from nltk.corpus import conll2000, treebank
    test_sents = conll2000.tagged_sents()[8000:]
    train_sents = treebank.tagged_sents()[3000:]
    print('trainging trigramter with backoff')
    backoff = DefaultTagger('NN')
    tagger = self.backoff_tagger(train_sents,
                                 [UnigramTagger, BigramTagger, TrigramTagger],
                                 backoff=backoff)
    print('evaluation trigram with backoff')
    print(tagger.evaluate(test_sents))
    print('tagging')
    print(tagger.tag(word_tokenize("This is a test. This should be faster than nothing. How can I rent a car in the next twelve hours? ")))
def process(data):
    """Run every tweet held by `data` through the cleaning pipeline and
    POS-tag filter, keeping only non-empty results.

    Fixes: converted the Python 2 `print count` to print(), and the
    treebank corpus is now materialized once instead of three times
    (the original re-read it for each tagger level).
    """
    processed_tweets = []
    # Train the Affix→Unigram→Bigram backoff chain on one shared corpus.
    train_sents = treebank.tagged_sents()
    t0 = AffixTagger(train=train_sents)
    t1 = UnigramTagger(train=train_sents, backoff=t0)
    t2 = BigramTagger(train=train_sents, backoff=t1)
    count = 0
    for tweet in data.get_tweets():
        count += 1
        print(count)  # progress indicator
        tweet = remove_hashtags(tweet)
        tweet = remove_user_tags(tweet)
        tweet = remove_html_entities(tweet)
        tweet = remove_punctuation_deep(tweet)
        tweet = tokenize_and_remove_stopwords(tweet)
        tweet = remove_apostrophes(tweet)
        tweet = remove_multiple_spaces(tweet)
        tweet = translate_slang(tweet)
        tweet = pos_tag_filter(tweet, data, t2)
        if not is_empty(tweet):
            processed_tweets.append(tweet)
    data.set_tweets(processed_tweets)
def create_dataset():
    """Build a feature dataset from the first `no_of_sentences` WSJ sentences.

    Each token yields a feature dict:
      "wn"          : [prev, prev-prev, current, next, next-next] words,
                      with "*" padding beyond sentence boundaries
      "index"       : token position within its sentence
      "t_minus_one" : previous token's tag ("*" at sentence start)
      "t_minus_two" : tag two tokens back ("*" near sentence start)

    Fixes over the original: Python 3 print() calls, `is None` instead of
    `== None`, and the loop-invariant sentence length hoisted.

    :return: (list of feature dicts, parallel list of gold tags)
    """
    print("Loading dataset")
    dataset = []
    tags = []
    sents = wsj.sents()
    for i, sentence in enumerate(wsj.tagged_sents()[:no_of_sentences]):
        prev = None
        prev_prev = None
        len_sentence = len(sentence)  # invariant per sentence
        for j, word in enumerate(sentence):
            datapoint = {}
            # Word window around position j; "*" outside the sentence.
            temp = []
            temp.append(sents[i][j - 1] if j > 0 else "*")
            temp.append(sents[i][j - 2] if j > 1 else "*")
            temp.append(sents[i][j])
            temp.append(sents[i][j + 1] if j < len_sentence - 1 else "*")
            temp.append(sents[i][j + 2] if j < len_sentence - 2 else "*")
            datapoint["wn"] = temp
            datapoint["index"] = j
            # Tags of the two preceding tokens ("*" before sentence start).
            datapoint["t_minus_one"] = "*" if prev is None else prev[1]
            datapoint["t_minus_two"] = "*" if prev_prev is None else prev_prev[1]
            prev_prev = prev
            prev = word
            dataset.append(datapoint)
            tags.append(word[1])
    print("Done")
    return dataset, tags
def from_treebank(klass):
    """Build a word→tag frequency distribution from Treebank plus Brown.

    Counts (lowercased word, tag) pairs from both corpora, adds open-class
    lexicon entries, and manually boosts the modal reading of 'can'.

    Fix: the original iterated `treebank_brown.tagged_sents()`, but
    `treebank_brown` is never imported or defined — the `brown` corpus
    imported (and otherwise unused) on the first line was clearly intended.
    """
    from nltk.corpus import brown, treebank
    probdist = klass()
    for sent in treebank.tagged_sents():
        for word, tag in sent:
            probdist.inc(word.lower(), tag)
    for sent in brown.tagged_sents():
        for word, tag in sent:
            probdist.inc(word.lower(), tag)
    # Lexicon entries are open-class (not restricted to closed-class tags).
    for word, tag in get_lexicon():
        probdist.inc(word, tag, closed_class=False)
    # Heavily weight 'can' as a verb (10 extra counts).
    for _ in range(10):
        probdist.inc('can', 'VB')
    return probdist
def store_pos_tag_dicts():
    """Pickle two word→tags lookup dicts: Penn Treebank tags and
    universal tags (Treebank + Brown), with state names forced to
    proper-noun tags."""
    # Penn tagset: map each lowercased word to the tuple of tags seen.
    penn_dict = defaultdict(tuple)
    for sent in treebank.tagged_sents():
        for word, tag in sent:
            key = word.lower()
            if tag not in penn_dict[key]:
                penn_dict[key] += (tag,)
    # Universal tagset: same mapping, built from both corpora.
    univ_dict = defaultdict(tuple)
    universal_corpora = [treebank.tagged_sents(tagset='universal'),
                         brown.tagged_sents(tagset='universal')]
    for corpus in universal_corpora:
        for sent in corpus:
            for word, tag in sent:
                key = word.lower()
                if tag not in univ_dict[key]:
                    univ_dict[key] += (tag,)
    # State names are always taggable as proper nouns.
    for name in states.values():
        penn_dict[name.lower()] += ('NNP', )
        univ_dict[name.lower()] += ('NOUN', )
    with open('{}/data/pos_dicts.pickle'.format(mod_path), 'wb') as file:
        pickle.dump((penn_dict, univ_dict), file, protocol=2)
def evaluate(self):
    """Score the classifier on slices of treebank, conll2000 and brown.

    :return: (treebank %, conll2000 %, brown %) accuracy tuple.
    """
    scores = []
    brown_split = int(len(brown.tagged_sents()) * 0.8)
    test_sets = (
        treebank.tagged_sents()[:100],
        conll2000.tagged_sents()[:100],
        brown.tagged_sents()[brown_split:],  # last 20% of brown
    )
    for test_set in test_sets:
        scores.append(100 * self.classifier.evaluate(test_set))
    return tuple(scores)
def train_pos_tagger():
    """Train a Trigram→Bigram→Unigram backoff POS tagger on Penn Treebank
    and return it.

    NOTE(review): the original called tagged_sents(simplify_tags=True), a
    pre-NLTK-3 keyword that modern NLTK rejects; tagset='universal' is
    NLTK 3's replacement for the simplified tagset.  The default tag is
    switched from 'NN' to 'NOUN' so the fallback stays inside the
    universal tagset — confirm downstream consumers expect universal tags.
    """
    train_sents = treebank.tagged_sents(tagset='universal')
    tagger = nltk.TrigramTagger(train_sents, backoff=nltk.BigramTagger(
        train_sents, backoff=nltk.UnigramTagger(
            train_sents, backoff=nltk.DefaultTagger("NOUN"))))
    return tagger
def create_dataset():
    """Build a feature dataset from the first 10 WSJ sentences.

    Each token yields a feature dict:
      'wn'          : [current, prev, prev-prev, next, next-next] words —
                      note the CURRENT word comes first in this variant —
                      with '*' padding beyond sentence boundaries
      'index'       : token position within its sentence
      't_minus_one' : previous token's tag ('*' at sentence start)
      't_minus_two' : tag two tokens back ('*' near sentence start)

    Fixes over the original: Python 3 print() calls, `is None` instead of
    `== None`, and the loop-invariant sentence length hoisted.

    :return: (list of feature dicts, parallel list of gold tags)
    """
    print('Loading dataset')
    dataset = []
    tags = []
    sents = wsj.sents()
    for i, sentence in enumerate(wsj.tagged_sents()[:10]):
        prev = None
        prev_prev = None
        len_sentence = len(sentence)  # invariant per sentence
        for j, word in enumerate(sentence):
            datapoint = {}
            # Window: current word first, then the surrounding context.
            temp = []
            temp.append(sents[i][j])
            temp.append(sents[i][j - 1] if j > 0 else '*')
            temp.append(sents[i][j - 2] if j > 1 else '*')
            temp.append(sents[i][j + 1] if j < len_sentence - 1 else '*')
            temp.append(sents[i][j + 2] if j < len_sentence - 2 else '*')
            datapoint['wn'] = temp
            datapoint['index'] = j
            # Tags of the two preceding tokens ('*' before sentence start).
            datapoint['t_minus_one'] = '*' if prev is None else prev[1]
            datapoint['t_minus_two'] = '*' if prev_prev is None else prev_prev[1]
            prev_prev = prev
            prev = word
            dataset.append(datapoint)
            tags.append(word[1])
    print('Done')
    return dataset, tags
def LemmatizeSents(self, sents):
    """Lemmatize each sentence string using POS tags from a treebank-trained
    backoff tagger.

    :param sents: iterable of sentence strings
    :return: list of lemmatized sentence strings

    Fixes over the original: a single WordNetLemmatizer is constructed once
    (the original built a fresh one for EVERY word), and the quadratic
    `words = words + [wd]` accumulation is replaced by a list build + join.
    """
    tagger = tagging(treebank.tagged_sents(),
                     [UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=None)
    lemmatizer = WordNetLemmatizer()  # hoisted out of the loops
    newSents = []
    for sent in sents:
        taggedSent = tagger.tag(word_tokenize(sent))
        # Map each tagger tag to a WordNet POS before lemmatizing.
        words = [lemmatizer.lemmatize(wd, self.tagMap(tg))
                 for (wd, tg) in taggedSent]
        newSents.append(' '.join(words))
    return newSents
def __init__(self, train_set='treebank'): ''' Constructor ''' # Before building a new tagger check if one has already been pickled if (os.path.exists(os.getcwd() + '/' + _pickle_file)): input = open(_pickle_file, 'rb') self._tagger = load(input) input.close() input = open(_test_sents_pickle_file, 'rb') self._test_sents = load(input) input.close() # Primitives necessary for training the Brill tagger. # Taken from cookbook else: if train_set == 'treebank': tagged_sents = list(treebank.tagged_sents()) else: tagged_sents = list(brown.tagged_sents()) random.shuffle(tagged_sents) split_index = int(round(0.8 * len(tagged_sents))) train_sents = tagged_sents[:split_index] self._test_sents = tagged_sents[split_index:] default_tagger = DefaultTagger('NN') tagger_classes = [UnigramTagger, BigramTagger, TrigramTagger] initial_tagger = backoff_tagger(train_sents, tagger_classes, backoff=default_tagger) sym_bounds = [(1,1), (2,2), (1,2), (1,3)] asym_bounds = [(-1, -1), (1,1)] templates = [ brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, *sym_bounds), brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, *sym_bounds), brill.ProximateTokensTemplate(brill.ProximateTagsRule, *asym_bounds), brill.ProximateTokensTemplate(brill.ProximateWordsRule, *asym_bounds)] # Train the tagger trainer = brill.FastBrillTaggerTrainer(initial_tagger, templates, deterministic=True) self._tagger = trainer.train(train_sents) #Pickle the trained tagger if not os.path.exists(os.getcwd() + '/pickles/'): os.mkdir(os.getcwd() + '/pickles/') output = open(_pickle_file, 'wb') dump(self._tagger, output, -1) output.close() output = open(_test_sents_pickle_file, 'wb') dump(self._test_sents, output, -1) output.close()
def test_Phrase():
    """Chunk adverbial phrases (ADVP) out of every treebank file and print them.

    Fixes over the original: Python 3 print() call, the RegexpParser is
    built once instead of once per sentence, and the `fileld` typo local
    is renamed.
    """
    import nltk
    from nltk.corpus import treebank
    fileids = treebank.fileids()
    grammar = r"""
    ADVP:{<RB>(<CC>*<RB>*|<JJ>*)}
    {}
    """
    chunker = nltk.RegexpParser(grammar)  # hoisted: grammar never changes
    for fileid in fileids:
        sents = treebank.tagged_sents(fileid)
        for sent in sents:
            tree_Gram = chunker.parse(sent)
            for subtree in tree_Gram.subtrees():
                if subtree.label() == "ADVP":
                    print(subtree)
def _load_penntreebank():
    """Lazily load the Penn Treebank tagged sentences and words.

    Populates the module-level caches on first call (downloading the
    corpus if necessary); subsequent calls reuse them.

    :return: (tagged sentences, flat list of tagged words)
    """
    global penntree_tagged_words, penntree_tagged_sents
    cached = (penntree_tagged_words is not None
              and penntree_tagged_sents is not None)
    if cached:
        timestamp_msg('Using already loaded sequences ...')
    else:
        nltk.download('treebank')
        from nltk.corpus import treebank
        # Keep both the sentence-level view and a flattened word list.
        penntree_tagged_sents = treebank.tagged_sents()
        penntree_tagged_words = [w for s in penntree_tagged_sents for w in s]
    return penntree_tagged_sents, penntree_tagged_words
def tag_penn(words):
    """
    Tokenizes text by using a Penn Treebank tagged sentence and word tokenizer.

    Parameters
    ----------
    words: A list of strings.

    Returns
    -------
    A list of tuples of (str, str)
    """
    treebank_tagger = UnigramTagger(treebank.tagged_sents())
    return treebank_tagger.tag(words)