def load_data(self, percentage):
    print("Started Loading the Data")
    # Get the complete data
    data_set = treebank.fileids()
    # Partition the data into train and test sets
    training_data_fileIds = [file for file in data_set if "wsj_00" in str(file)]
    testing_data_fileIds = [file for file in data_set if "wsj_01" in str(file)]
    # What fraction of the files should be used for training?
    index = int(percentage * len(training_data_fileIds))
    training_data_fileIds = training_data_fileIds[:index]
    tagged_training_data = treebank.tagged_sents(fileids=training_data_fileIds)
    tagged_testing_data = treebank.tagged_sents(fileids=testing_data_fileIds)
    tagged_training_words = treebank.tagged_words(fileids=training_data_fileIds)
    tagged_testing_words = treebank.tagged_words(fileids=testing_data_fileIds)
    # Untag the data for other uses
    untagged_training_data = [untag(item) for item in tagged_training_data]
    untagged_testing_data = [untag(item) for item in tagged_testing_data]
    print("Data Loaded Successfully. Stats are")
    print("Training Data Sentences: ", len(tagged_training_data))
    print("Testing Data Sentences: ", len(tagged_testing_data))
    return (tagged_training_data, tagged_testing_data,
            tagged_training_words, tagged_testing_words,
            untagged_training_data, untagged_testing_data)
def chunker(parsedData):
    """
    Extract the grammar rules from the input parsed text and assign each
    rule the probability of it occurring in the parsed text.
    """
    tags_words = treebank.tagged_words()
    # This is the list where all the rules will be stored, for
    # construction of the PCFG
    rules = []
    # Extract the syntactic rules from the training data
    for sent in parsedData:
        for production in sent.productions():
            rules.append(production)
    # Add the lexical rules
    for word, tag in tags_words:
        # For each tagged word, create a tree containing that lexical
        # rule, so its production can be added to the list of rules
        t = Tree.fromstring("(" + tag + " " + word + ")")
        for production in t.productions():
            rules.append(production)
    # All of the syntactic and lexical rules have now been extracted
    # from the training data; induce the PCFG from them
    rules_prob = nltk.grammar.induce_pcfg(Nonterminal('S'), rules)
    return rules_prob
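# A minimal usage sketch for chunker() above, assuming nltk, Tree,
# Nonterminal and treebank are imported as the function body requires;
# induce_pcfg returns an nltk PCFG whose production probabilities are
# estimated from the collected rule counts.
grammar = chunker(treebank.parsed_sents()[:200])
print(grammar.productions()[:5])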
def process_basic(self):
    word_list = list(treebank.tagged_words())
    transaction_dict = {}  # (tag, tag) transition counts
    emmision_dict = {}     # (word, tag) emission counts
    pos_tag_dict = {}      # tag occurrence counts
    for i in range(1, len(word_list), 2):
        word_tuples = (word_list[i - 1], word_list[i])
        tag1, tag2 = word_list[i - 1][1], word_list[i][1]
        pos_tag_dict[tag1] = pos_tag_dict.get(tag1, 0) + 1
        pos_tag_dict[tag2] = pos_tag_dict.get(tag2, 0) + 1
        transaction_dict[(tag1, tag2)] = transaction_dict.get((tag1, tag2), 0) + 1
        emmision_dict[word_tuples[0]] = emmision_dict.get(word_tuples[0], 0) + 1
        emmision_dict[word_tuples[1]] = emmision_dict.get(word_tuples[1], 0) + 1
    transaction_state_arr = self.get_transition_probabilty(
        pos_tag_dict, transaction_dict)
    emmision_state_arr = self.get_emmision_probability(
        pos_tag_dict, emmision_dict)
    for each in emmision_state_arr[::-1]:
        print("\t\t each", each)
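# Hedged sketch of the two helper methods called above (the real
# implementations live on the same class in the original source): plain
# maximum-likelihood estimates, P(t2 | t1) = count(t1, t2) / count(t1)
# for transitions and P(word | tag) = count(word, tag) / count(tag) for
# emissions.
def transition_probability(pos_tag_dict, transaction_dict):
    return [((t1, t2), count / pos_tag_dict[t1])
            for (t1, t2), count in transaction_dict.items()]

def emission_probability(pos_tag_dict, emmision_dict):
    return [((word, tag), count / pos_tag_dict[tag])
            for (word, tag), count in emmision_dict.items()]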
def demo():
    """
    A demonstration of the porter stemmer on a sample from the Penn
    Treebank corpus.
    """
    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()
    orig = []
    stemmed = []
    for item in treebank.fileids()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = " ".join(stemmed)
    results = re.sub(r"(.{,70})\s", r"\1\n", results + " ").rstrip()

    # Convert the original to a string, and word wrap it.
    original = " ".join(orig)
    original = re.sub(r"(.{,70})\s", r"\1\n", original + " ").rstrip()

    # Print the results.
    print("-Original-".center(70).replace(" ", "*").replace("-", " "))
    print(original)
    print("-Results-".center(70).replace(" ", "*").replace("-", " "))
    print(results)
    print("*" * 70)
def test_sentences(grammar):
    for t in test:
        print("Processing: " + str(t))
        reference = list(treebank.tagged_words(t))
        tokens = list(treebank.words(t))
        print("fixing grammar.....")
        # Check whether the grammar covers all words in the sentence,
        # adding them to the grammar if necessary
        fixed_grammar = get_fixed_grammer(grammar, tokens)
        print("fixed grammar")
        print("Building Parser....")
        parser = ViterbiParser(fixed_grammar)
        print("Parsing...")
        # Get the list of all possible trees; the most likely tree is at index 0
        start = time.time()
        parses = parser.parse_all(tokens)
        print("Time")
        print(time.time() - start)
        # Get the POS tags from the parse tree
        leafs = parses[0].pos()
        # Calculate the tagging accuracy of the parser's output
        correct_tags = 0.0
        for i in range(len(leafs)):
            if leafs[i] == reference[i]:
                correct_tags += 1.0
        print(str(correct_tags / len(leafs)))
def createPOSList():
    pos_map = dict()
    for word, tag in treebank.tagged_words():
        if word not in pos_map:
            pos_map[word] = [posmap[tag]]
        else:
            # record each distinct mapped tag for the word
            pos_map[word] = list(set([posmap[tag]] + pos_map[word]))
    return pos_map
def preprocess_corpora():
    # tagset='universal' is the NLTK 3 replacement for simplify_tags=True
    brown_words = brown.tagged_words(tagset='universal')
    treebank_words = treebank.tagged_words(tagset='universal')
    '''
    # this takes forever.
    bwog_corpus = nltk.corpus.PlaintextCorpusReader('../bwog-corpus-txt', '.*\.txt')
    bwog_sents = bwog_corpus.sents(bwog_corpus.fileids())
    bwog_words = []
    for s_i in xrange(0, len(bwog_sents)/100000):
        # TODO: skip punctuation
        simp_tagged_sent = [(word, simp_tag(tag)) for word, tag in nltk.pos_tag(bwog_sents[s_i])]
        bwog_words.extend(simp_tagged_sent)
    '''
    all_tagged_words = brown_words + treebank_words  # + bwog_words
    all_sents = brown.sents() + treebank.sents()  # + bwog_sents
    compute_concordance(all_tagged_words)
def convert_format(self):
    sentences = []
    words = pkl.load(open("LM_corpura//%s//%s" % (cfg['lm_corpus'],
                                                  cfg['corpus__dict_file']), 'rb'))
    word_dict = dict([(word, key) for key, word in enumerate(words, 1)])
    # word_dict = common.get_word_dict(self.conf['index2word_path'])
    if self.name == "pos_tagging":
        tags = set([tag for word, tag in treebank.tagged_words()])
        tag_index = {tag: idx for idx, tag in enumerate(tags, 1)}
        for sentence in treebank.tagged_sents():
            sent_words = [(word_dict[common.tokenize(w)], tag_index[t])
                          if common.tokenize(w) in word_dict else (0, tag_index[t])
                          for w, t in sentence]
            sentences.append(sent_words)
    return sentences
def get_words():
    """
    Returns a list of unique words from the NLTK treebank sample.
    """
    import nltk
    nltk.download("treebank")
    from nltk.corpus import treebank

    word_ls = []
    for item in treebank.fileids():
        for (word, tag) in treebank.tagged_words(item):
            # normalize every word to lowercase
            word_ls.append(word.lower())
    word_ls = list(set(word_ls))
    return word_ls
def pos_tag(sentence, verbose=False):
    from nltk import tokenize
    from nltk.corpus import treebank

    # Build a word -> tag lookup from the treebank sample
    treebankDict = {}
    for (word, tag) in treebank.tagged_words():
        treebankDict[word] = tag

    words = tokenize.WhitespaceTokenizer().tokenize(sentence)
    tagged_words = []
    for word in words:
        try:
            tag = {'a': 'ex_quant',
                   'an': 'ex_quant',
                   'every': 'univ_quant'}[word.lower()]
        except KeyError:
            try:
                tag = treebankDict[word]
            except KeyError:
                raise KeyError("'%s' is not in the Part-of-Speech lookup" % word)
        tagged_words.append((word, tag))
    return tagged_words
def demo():
    from nltk.corpus import treebank
    # from nltk.probability import LidstoneProbDist
    # from nltk.probability import WittenBellProbDist
    from nltk.probability import SimpleGoodTuringProbDist
    from nltk.model import NgramModel

    estimator = lambda fdist, bins: SimpleGoodTuringProbDist(fdist, len(fdist) + 1)
    # estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2)

    tag_corpus = []
    for (word, tag) in treebank.tagged_words():
        tag_corpus.append(tag)

    lm = NgramModel(2, tag_corpus, estimator)
    print lm
    lm1 = NgramModel(1, tag_corpus, estimator)
    print lm1
    print tag_corpus[:20]

    for sent in ["NN", "DT", "VBZ", "JJ", "RB"]:
        print lm1.entropy(sent)
    print lm.entropy("DT NN")
def main():
    tr = treebank.raw()
    tg = treebank.tagged_words()
    tags = [tag[1] for tag in treebank.tagged_words(tagset='universal')]

    df = pd.read_csv('imdb_master.csv', encoding='ISO-8859-1')
    df = pd.DataFrame(df)
    df2 = df.loc[1:, ['type', 'review', 'label', 'file']]

    rev_lab = df2[['review', 'label']]
    rev_lab = low(rev_lab)
    rev_lab = tok(rev_lab)
    rev_lab = call_clean(rev_lab)
    rev_lab_non_stop = restop(rev_lab)
    rev_lab_non_stop_c = call(rev_lab_non_stop)
    lem_rev_lab = lemtize(rev_lab_non_stop_c)
    stem_rev_lab = stemma(lem_rev_lab)

    fd = nltk.FreqDist(tags)
    Comm = fd.most_common()
    tagged_revs = pos_t(stem_rev_lab)
    print(tagged_revs.head())
    fit_vec(stem_rev_lab)
from nltk.probability import FreqDist
from nltk.corpus import treebank

fd = FreqDist()
for word, tag in treebank.tagged_words():
    fd[tag] += 1

tags = sorted(fd.items(), key=lambda item: item[0])
for tag, freq in tags:
    print('{0}\t\t\t{1}'.format(tag, freq))
from nltk.corpus import treebank

file = input()
print(treebank.tagged_words(file)[0])
# case 11: word contains a digit, current word, word contains a capital
# letter, and previous word
x_train = []
y_train = []
list1 = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
         'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
list2 = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
pre_token = 'first'
for token, pos in treebank.tagged_words():
    # for token, pos in treebank.tagged_words()[:80000]:
    x = []
    y_train.append(pos)
    x.append(pre_token + '-' + token)
    x.append(token)
    pre_token = token
    for item in list1:
        if item in token:
            x.append('capital_yes')
        else:
            x.append('capital_no')
    for i in list2:
        if i in token:
            x.append('digit_yes')
        else:
            x.append('digit_no')
    x_train.append(x)
with open('treebank_words.txt', 'w') as f:
    for word in treebank.words():
        f.write('%s\n' % word)

tc_tags = []
for t in tc['tagged_words']:
    tc_tags.append(t[1])
with open('tc_tags.txt', 'w') as f:
    for tag in tc_tags:
        f.write('%s\n' % tag)

treebank_tags = []
for t in treebank.tagged_words():
    treebank_tags.append(t[1])
with open('treebank_tags.txt', 'w') as f:
    for tag in treebank_tags:
        f.write('%s\n' % tag)

with open('tc_sent_lengths.txt', 'w') as f:
    for sent in tc['sents']:
        f.write('%s\n' % len(sent))

with open('treebank_sent_lengths.txt', 'w') as f:
    for sent in treebank.sents():
        f.write('%s\n' % len(sent))

# tc_tags_series = pd.Series(tc_tags)
def __init__(self):
    self.wordToTags = defaultdict(set)
    convertedTaggedWords = [(w, nltk.tag.mapping.map_tag('en-ptb', 'universal', t))
                            for w, t in treebank.tagged_words()]
    for word, tag in convertedTaggedWords:
        self.wordToTags[word].add(tag)

    productions = list()
    S = nltk.Nonterminal('S')
    for tree in treebank.parsed_sents():
        productions += tree.productions()

    # create the grammar
    pcfg = nltk.induce_pcfg(S, productions)
    self.viterb = ViterbiParser(pcfg)
    self.mostRecentTree = None

    # POS tags (Penn Treebank tagset; RB/RBR/RBS are the adverb tags)
    self.validPosTags = {
        "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS",
        "MD", "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "PRP$",
        "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBZ", "VBP",
        "VBD", "VBG", "WDT", "WP", "WP$", "WRB", ".", ",", ":", "(", ")",
    }

    # chunk tags
    self.validChunkTags = {"NP", "PP", "VP", "ADVP", "ADJP", "SBAR",
                           "PRT", "INTJ", "PNP"}

    # IOB tags
    self.validIOBTags = {"I-", "O-", "B-"}

    # relation tags
    self.relationTags = {"SBJ", "OBJ", "PRD", "TMP", "CLR", "LOC",
                         "DIR", "EXT", "PRP"}

    # anchor tags
    self.anchorTags = {"A1", "P1"}
from nltk.corpus import brown
from nltk.corpus import treebank
import json
import re

brown_tagged_words = brown.tagged_words(simplify_tags=True)
print "brown tags retrieved"
treebank_tagged_words = treebank.tagged_words(simplify_tags=True)
print "treebank tags retrieved"

all_tagged_words = brown_tagged_words + treebank_tagged_words
all_tagged_words = [(pair[0].lower(), pair[1]) for pair in all_tagged_words]
print "all_tags retrieved"

vocab = {}
for char in ".abcdefghijklmnopqrstuvwxyz":
    vocab[char] = {}

for i, (current_word, current_tag) in enumerate(sorted(all_tagged_words)):
    if current_word[0] in "abcdefghijklmnopqrstuvwxyz":
        char = current_word[0]
    else:
        char = '.'
    if current_word in vocab[char]:
        try:
            g = [pos[0] for pos in vocab[char][current_word]].index(current_tag)
            vocab[char][current_word][g] = (current_tag,
                                            vocab[char][current_word][g][1] + 1)
        except ValueError:
            vocab[char][current_word].append((current_tag, 1))
    else:
        vocab[char][current_word] = [(current_tag, 1)]
print reader.paras()
print reader.tagged_paras()

# TaggedCorpus uses a default tokenizer, but we can customize it
from nltk.tokenize import SpaceTokenizer
reader = TaggedCorpusReader(root, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print reader.words()

# Customizing TaggedCorpus's sentence tokenizer
from nltk.tokenize import LineTokenizer
reader = TaggedCorpusReader(root, r'.*\.pos', sent_tokenizer=LineTokenizer())
print reader.words()

# Customizing TaggedCorpus's paragraph block reader
# Customizing TaggedCorpus's tag separator - Pg 57

# To map a corpus's tags to the universal tagset, the corpus reader must be
# initialized with a known tagset name. Then you can pass tagset="universal"
# to the method.
reader = TaggedCorpusReader(root, r'.*\.pos', tagset='en-brown')
reader.tagged_words(tagset="universal")

# Example:
from nltk.corpus import treebank
treebank.tagged_words()
treebank.tagged_words(tagset="universal")
# If we try to map with an unknown mapping or tagset, every word will be
# tagged with "UNK"
treebank.tagged_words(tagset="brown")
# tokenize words
word_tokenizer = TreebankWordTokenizer()
word_list = [word_tokenizer.tokenize(sent) for sent in article_sent]

# train POS tagger
# evaluate accuracy
test_sents = treebank.tagged_sents()[3000:]
test_chunks = treebank_chunk.chunked_sents()[3000:]
conll_test = conll2000.chunked_sents('test.txt')

train_new_tagger = False
if train_new_tagger:
    train_sents = treebank.tagged_sents()[:3000]

    # create a dictionary of the most frequent words from the treebank
    print("creating dictionary from treebank")
    model = word_tag_model(treebank.words(), treebank.tagged_words())

    # keep the default tagger for chaining purposes
    print("Training tagger")
    backoff = DefaultTagger('NN')
    nt = NamesTagger(backoff=backoff)
    # taggers = [UnigramTagger, BigramTagger, TrigramTagger]
    # trained_taggers = backoff_tagger(train_sents, taggers, backoff=nt)

    # Regexp - best to treat numbers?
    regexp_tagger = RegexpTagger(patterns, backoff=nt)
    treebank_tagger = UnigramTagger(model=model, backoff=regexp_tagger)
    # skipping affix
    # skipping brill
    return tag

# Return the POS of a rule (used for list sorting)
def get_key(rule):
    return rule.split()[1]

if __name__ == '__main__':
    # Get allowed words
    allowed_words_file = open('../../allowed_words.txt', 'r')
    allowed_words = allowed_words_file.read().split('\n')

    # Tagged words from corpora
    treebank_tagged_words = list(set(treebank.tagged_words()))
    conll2000_tagged_words = list(set(conll2000.tagged_words()))
    brown_tagged_words = list(set(brown.tagged_words()))
    nps_tagged_words = list(set(nps_chat.tagged_words()))

    vocab_rules = []
    unvocabbed_words = []

    # Find tags that occur with allowed words, falling back through the
    # corpora in turn
    for word in allowed_words:
        curr_tags = get_tags_linear(word, treebank_tagged_words)
        if not curr_tags:
            curr_tags = get_tags_linear(word, conll2000_tagged_words)
        if not curr_tags:
            curr_tags = get_tags_linear(word, brown_tagged_words)
        if not curr_tags:
            curr_tags = get_tags_linear(word, nps_tagged_words)
        if not curr_tags:
            unvocabbed_words.append(word)
lm.entropy(test_words)

# <markdowncell>
# ### Counting
#
# For example, how many words in a corpus are not in WordNet?

# <codecell>
from nltk.corpus import wordnet
from nltk.probability import ConditionalFreqDist
cfd = ConditionalFreqDist(
    (pos, len(wordnet.synsets(word)) > 0)
    for word, pos in treebank.tagged_words()
)
cfd.tabulate()

# <markdowncell>
# ### Missing functionality
#
# #### Head word identification
#
# NLTK has no functionality to identify the head words of phrases. In this
# noun phrase, 'man' is the head word, but it is not straightforward to
# identify it.

# <codecell>
from nltk.corpus import treebank

tb_tagged_sents = treebank.tagged_sents()
tb_sents = treebank.sents()

patterns = [
    (r'.*ing$', 'VBG'),                # gerunds
    (r'.*ed$', 'VBD'),                 # simple past
    (r'.*es$', 'VBZ'),                 # 3rd singular present
    (r'.*ould$', 'MD'),                # modals
    (r'.*\'s$', 'NN$'),                # possessive nouns
    (r'.*s$', 'NNS'),                  # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')                      # nouns (default)
]

tb_tags = [tag for (word, tag) in treebank.tagged_words()]
tb_tag = nltk.FreqDist(tb_tags)

# b) i)
print(tb_tag.most_common())
print(len(tb_sents))

# uni
n = int(len(tb_tagged_sents) * 0.1)
uni_train_sents = tb_tagged_sents[n:]
uni_test_sents = tb_tagged_sents[:n]
unigram_tagger = nltk.UnigramTagger(uni_train_sents)
uni_accuracy = unigram_tagger.evaluate(uni_test_sents)
print("Unigram Accuracy: ", uni_accuracy)

# bi (see the sketch below)
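# A sketch of the bigram step announced above, reusing the same train/test
# split; BigramTagger backs off to the unigram tagger for contexts it has
# not seen.
bigram_tagger = nltk.BigramTagger(uni_train_sents, backoff=unigram_tagger)
bi_accuracy = bigram_tagger.evaluate(uni_test_sents)
print("Bigram Accuracy: ", bi_accuracy)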
"""句法分析-形式语言与自动机""" import nltk from nltk import FreqDist, Nonterminal, nonterminals, Production from nltk.corpus import treebank, sinica_treebank from nltk.grammar import toy_pcfg2 print(str(nltk.corpus.treebank).replace('\\\\', '/')) out = treebank.fileids() print(out) print(treebank.words('wsj_0007.mrg')) print(treebank.tagged_words('wsj_0007.mrg')) print(treebank.parsed_sents('wsj_0007.mrg')[2]) # 语法树 # treebank_chunk.chunked_sents()[1].draw() # out = treebank_chunk.chunked_sents()[1].leaves() # out = treebank_chunk.chunked_sents()[1].pos() # out = treebank_chunk.chunked_sents()[1].productions() # print(out) fd = FreqDist() fd.items() print(sinica_treebank.sents()) print(sinica_treebank.parsed_sents()[27]) """上下文无关文法(Context-free Grammar, CFG) 参考wiki 自动机理论 https://zh.wikipedia.org/zh-cn/%E8%87%AA%E5%8B%95%E6%A9%9F%E7%90%86%E8%AB%96 在计算机科学中,若一个形式文法 G = (V, Σ, P, S) 的产生式规则都取如下的形式:A -> α,则谓之。其中 A∈V ,α∈(V∪Σ)* 。 上下文无关文法取名为“上下文无关”的原因就是因为字符 A 总可以被字符串 α 自由替换,而无需考虑字符 A 出现的上下文。 一个CFG由以下部分组成: 非终结符的有限集合(N) 终结符的有限集合(T) 开始符号(S)
import nltk
import nltk.corpus

print(str(nltk.corpus.treebank).replace('\\\\', '/'))
print(nltk.corpus.treebank.fileids())

from nltk.corpus import treebank
print(treebank.words('wsj_0007.mrg'))
print(treebank.tagged_words('wsj_0007.mrg'))
def _get_data():
    return _split_tagged_words(treebank.tagged_words())
tags = []
distinct_tags = []
# dictionary mapping POS tag name to numeric id
tag_index = {}
# id of the corresponding tag
tag_id = []
# represent POS tags in y_train/y_test with integers
y_train_in_integers = []
y_test_in_integers = []

# define window size
window_size = 3

print('Phase 1: Dividing data into training set and testing set...')
train = treebank.tagged_words()[:90677]
test = treebank.tagged_words()[90677:]
print('Phase 1: END\n')

X_train, y_train = chop_words_into_windows(window_size, train)
X_test, y_test = chop_words_into_windows(window_size, test)
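# Hedged sketch of the helper used above -- the real chop_words_into_windows
# is defined elsewhere in the original script. This illustrative version
# assumes an odd window_size: it centers a window of word forms on each
# token (padding at the edges) and uses the center token's tag as the label.
def chop_words_into_windows_sketch(window_size, tagged):
    half = window_size // 2
    words = [w for w, t in tagged]
    X, y = [], []
    for i, (w, t) in enumerate(tagged):
        window = words[max(0, i - half):i + half + 1]
        window = ['<PAD>'] * max(0, half - i) + window   # pad on the left
        window = window + ['<PAD>'] * (window_size - len(window))  # pad right
        X.append(window)
        y.append(t)
    return X, y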
import nltk
from nltk.grammar import Nonterminal
from nltk.corpus import treebank

training_set = treebank.parsed_sents()
print(training_set[1])

# extract the productions for all annotated training sentences
treebank_productions = list(
    set(production
        for sent in training_set
        for production in sent.productions())
)
treebank_productions[0:10]

# add a lexical production for each (word, POS tag) pair
for word, tag in treebank.tagged_words():
    t = nltk.Tree.fromstring("(" + tag + " " + word + ")")
    for production in t.productions():
        treebank_productions.append(production)

# build the PCFG-based grammar
treebank_grammar = nltk.grammar.induce_pcfg(
    Nonterminal('S'),
    treebank_productions
)

# build the parser
viterbi_parser = nltk.ViterbiParser(treebank_grammar)

# get sample sentence tokens
tokens = nltk.word_tokenize(sentence)
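# Hedged continuation: parse the tokens with the Viterbi parser built above
# ('sentence' is assumed to be defined earlier in the original script).
# Note the parse only succeeds if every token is covered by a lexical rule
# in the induced grammar.
for tree in viterbi_parser.parse(tokens):
    print(tree)
    break  # ViterbiParser yields the most probable tree first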
# The most frequent nouns usually provide information on the subject of a
# text. Below, the most frequent nouns of an already-tagged text from the
# *Treebank* corpus are determined. Let's see whether we can infer the
# text's subject from them.

# In[20]:

from nltk.corpus import treebank
from nltk import FreqDist
from nltk import bigrams

print("\nTreebank sentences: ", treebank.sents(fileids="wsj_0003.mrg"))

# In[21]:

tagged0003 = treebank.tagged_words(tagset="universal", fileids="wsj_0003.mrg")
print("File tagged0003: ", tagged0003)

# In[22]:

fdist = FreqDist(a[0].lower() for a in tagged0003 if a[1] == "NOUN")
# fdist.tabulate(20)
print(fdist.most_common(20))
freqNouns = [w[0] for w in fdist.most_common(20)]
fdist.plot(20)

# Next, the adjectives immediately before the most frequent nouns are
# determined (see the sketch below). What can be concluded from them?
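# In[23]:

# A sketch of the announced step, reusing the `bigrams` import from above:
# collect the adjectives that immediately precede one of the frequent nouns.
adjBeforeNouns = set((a[0].lower(), b[0].lower())
                     for a, b in bigrams(tagged0003)
                     if a[1] == "ADJ" and b[1] == "NOUN"
                     and b[0].lower() in freqNouns)
print(adjBeforeNouns)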
# print brown.sents(categories=['news', 'editorial', 'reviews'])

print('POS TAGS:')
print('WORDS:')
print(brown.words()[:5])
print(brown.tagged_words()[:5])
print('SENTS:')
print([s[:5] for s in brown.sents()[:5]])
print([s[:5] for s in brown.tagged_sents()[:5]])
print()

# CHUNKED
# The CoNLL 2000 Corpus includes phrasal chunks
# The CoNLL 2002 Corpus includes named entity chunks
print('CHUNKING & NER:')
print(conll2000.fileids())
print(conll2000.sents()[0])
print(conll2000.chunked_sents()[0])
print(conll2002.sents()[0])
print(conll2002.chunked_sents()[0])
print()

# PARSED
# 10% sample of the Penn Treebank
print('PENN TREEBANK:')
print(treebank.fileids()[:5])
print(treebank.words('wsj_0001.mrg')[:10])
print(treebank.tagged_words('wsj_0001.mrg')[:10])
print(treebank.sents('wsj_0001.mrg')[0])
print(treebank.parsed_sents('wsj_0001.mrg')[0])
print()
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())

print(names.fileids())
print(len(names.words('male.txt')))

reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged", r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(), tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())
print("\n")
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))
print(treebank.tagged_words())
from nltk import UnigramTagger
from nltk.corpus import treebank
from tag_util import word_tag_model

model = word_tag_model(treebank.words(), treebank.tagged_words())
tagger = UnigramTagger(model=model)
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))
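# For reference, a hedged sketch of what tag_util.word_tag_model does (the
# real helper comes with the NLTK Cookbook recipes): map each of the most
# frequent words to its single most likely tag, in the dict form that
# UnigramTagger(model=...) expects.
from nltk.probability import FreqDist, ConditionalFreqDist

def word_tag_model_sketch(words, tagged_words, limit=200):
    fd = FreqDist(words)
    cfd = ConditionalFreqDist(tagged_words)
    return dict((word, cfd[word].max()) for word, count in fd.most_common(limit))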
from nltk.corpus import brown, treebank

def search_by_tag_sequence(tag_seq, tagged_tokens):
    output_list = list()
    list_of_one_match = list()
    i = 0
    for pair in tagged_tokens:
        if i == len(tag_seq):
            output_list.append(list_of_one_match)
            list_of_one_match = []
            i = 0
        elif pair[1] == tag_seq[i]:
            list_of_one_match.append(pair)
            i += 1
        else:
            list_of_one_match = []
            i = 0
    print(output_list[:10])

# map each word to the list of tags it appears with
word_to_tags = dict()
list_of_tuples = brown.tagged_words(tagset='universal')
for el in list_of_tuples:
    if el[0] not in word_to_tags.keys():
        word_to_tags[el[0]] = [el[1]]
    else:
        if el[1] not in word_to_tags[el[0]]:
            word_to_tags[el[0]].append(el[1])

sorted_x = sorted(word_to_tags.items(), key=lambda kv: len(kv[1]), reverse=True)
for i in range(10):
    print(sorted_x[i])

search_by_tag_sequence(["NNP", "NNP", "NNP"], treebank.tagged_words())
import nltk
# from nltk.book import *
from nltk.corpus import treebank
from nltk.corpus import brown
from nltk import word_tokenize
from nltk.tag import hmm

# nltk.help.upenn_tagset("NN*")
files = treebank.fileids()
# print(files)
t = treebank.tagged_words("wsj_0003.mrg")
# for p in t:
#     print(p)

# race1 = nltk.tag.str2tuple('race/NN')
# race2 = nltk.tag.str2tuple('race/VB')
# print(race1)
# print(brown.tagged_words().count(race1))
# print(brown.tagged_words().count(race2))

unitag = nltk.tag.UnigramTagger(brown.tagged_sents(categories='news')[:5000])
print(unitag)

s = "The secretariat is expected to race tomorrow."
s_tok = word_tokenize(s)
tt = unitag.tag(s_tok)
print(tt)

hmmTagger = hmm.HiddenMarkovModelTrainer().train_supervised(
    brown.tagged_sents(categories="news")[:5000])
tt2 = hmmTagger.tag(s_tok)
print(tt2)
    lam1 = lam1 / total
    lam2 = lam2 / total
    lam3 = lam3 / total
    return lam1, lam2, lam3


# OOV
# Use letter information to predict tags
from nltk.corpus import treebank
from collections import Counter
import random

tagged_words = treebank.tagged_words()
tagged_sentences = treebank.tagged_sents()

cut_off = int(len(tagged_sentences) * 0.8)
train = tagged_sentences[:cut_off]
test = tagged_sentences[cut_off:]
train_words = tagged_words[:80000]
test_words = tagged_words[80000:]


def unigram_tagger(train, test):
    word_to_all_pos = {}
    for word, pos in train:
        if word not in word_to_all_pos:
            word_to_all_pos[word] = [pos]
        else:
            word_to_all_pos[word].append(pos)
# Other tagged corpora also come with the tagged_words method.
# Note that the chat corpus is tagged with Penn Treebank POS tags.
nltk.corpus.nps_chat.tagged_words()[:50]

# In this class, we will mostly use the Penn Treebank tag set, as it is the
# most widely used. The Treebank has the tagged_words and tagged_sents
# methods, as well as the words method that we used before to get the tokens.
from nltk.corpus import treebank
treebank_tokens = treebank.words()
treebank_tokens[:50]

treebank_tagged_words = treebank.tagged_words()[:50]
len(treebank.tagged_words())
treebank_tagged_words[:50]  # maybe an error here?

treebank_tagged = treebank.tagged_sents()[:2]
len(treebank.tagged_sents())
treebank_tagged[:2]

# The NLTK has almost 4,000 sentences of tagged data from Penn Treebank,
# while the actual Treebank has much more. This will limit the accuracy of
# the parsers that we can define in lab, but also make the running times
# short enough for labs.

# Let's look at the frequencies of the tags in this portion of Penn
# Treebank. To do that, we use the NLTK Frequency Distribution for all the
# tags from the (word, tag) pairs in the Treebank (a sketch follows below).
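# A minimal sketch of the announced frequency count, assuming nltk is
# imported as in the lines above:
tag_fd = nltk.FreqDist(tag for (word, tag) in treebank.tagged_words())
print(tag_fd.most_common(10))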
import nltk
from nltk.corpus import treebank

treebank_tagged = treebank.tagged_words(tagset='universal')
tagpairs = nltk.bigrams(treebank_tagged)
preceders_noun = [x[1] for (x, y) in tagpairs if y[1] == 'NOUN']
freqdist = nltk.FreqDist(preceders_noun)
print([tag for (tag, _) in freqdist.most_common()])
import nltk
from nltk.corpus import brown
from nltk.corpus import treebank

print(brown.tagged_sents()[:2])
print(brown.tagged_words()[:50])
wordtag = brown.tagged_words()[0]

brown_humor_tagged = brown.tagged_words(categories='humor', tagset='universal')
print(brown_humor_tagged[:50])

a = nltk.corpus.nps_chat.tagged_words()[:50]
print(a)

treebank_tokens = treebank.words()
print("treebank_tokens ", treebank_tokens)
treebank_tagged_words = treebank.tagged_words()[:50]
print("tree tagged", treebank_tagged_words[:50])
treebank_tagged = treebank.tagged_sents()[:2]
print(treebank_tagged[:2])

tag_fd = nltk.FreqDist(tag for (word, tag) in treebank_tagged_words)
for tag, freq in tag_fd.most_common():
    print(tag, freq)
#!/usr/bin/env python3
from nltk import FreqDist
# import treebank
from nltk.corpus import treebank
# import NE chunker
from nltk import chunk

tempList = ["DATE", "TIME", "GPE", "FACILITY", "LOCATION", "MONEY",
            "PERSON", "ORGANIZATION", "PERCENT"]

data = treebank.tagged_words()  # load treebank data
chunkd_data = chunk.ne_chunk(data)  # chunk the data
chunkd_trees = chunkd_data.subtrees(
    filter=lambda t: t.label() in tempList)  # select subtrees which are NEs

word_fd = FreqDist(
    [' '.join(word for word, pos in tree.leaves()) for tree in chunkd_trees])

print("Three most common named entities are: ")
print(', '.join(word for word, freq in word_fd.most_common(3)))
# Three most common named entities are:
# U.S., New York, Japanese
__author__ = 'rumesh'

import nltk
from nltk.corpus import brown

# print(brown.tagged_sents()[:2])
# print(brown.tagged_words()[:50])
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
# print(brown_news_tagged[:50])
# print(nltk.corpus.nps_chat.tagged_words()[:50])

from nltk.corpus import treebank
# print(treebank.tagged_words()[:50])
# print(len(treebank.tagged_words()))
# print(treebank.tagged_sents()[:2])
# print(len(treebank.tagged_sents()))


def findtags(tag_prefix, tagged_text):
    cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text
                                   if tag.startswith(tag_prefix))
    return dict((tag, list(cfd[tag].keys())[:20]) for tag in cfd.conditions())


tagdict = findtags('NN', treebank.tagged_words())
for tag in sorted(tagdict):
    print(tag, tagdict[tag])