def pcfg_demo():
    """
    A demonstration showing how C{WeightedGrammar}s can be created and
    used.

    Prints one production of ``toy_pcfg1``, the grammar ``toy_pcfg2``,
    two word-coverage checks, a PCFG induced from treebank trees, and the
    parses of one treebank sentence under that induced grammar.

    Every ``print`` call takes a single pre-formatted argument, so the
    output is the same under Python 2 and Python 3.
    """
    from nltk.corpus import treebank
    from nltk import treetransforms
    from nltk import induce_pcfg
    from nltk.parse import pchart

    pcfg_prods = toy_pcfg1.productions()

    pcfg_prod = pcfg_prods[2]
    # '%r' formats with repr(), which is what the old backtick syntax did.
    print('A PCFG production: %r' % (pcfg_prod,))
    print(' pcfg_prod.lhs() => %r' % (pcfg_prod.lhs(),))
    print(' pcfg_prod.rhs() => %r' % (pcfg_prod.rhs(),))
    print(' pcfg_prod.prob() => %r' % (pcfg_prod.prob(),))
    print('')

    grammar = toy_pcfg2
    print('A PCFG grammar: %r' % (grammar,))
    print(' grammar.start() => %r' % (grammar.start(),))
    # .replace(...) line-wraps the production list under the label.
    wrapped = repr(grammar.productions()).replace(',', ',\n' + ' ' * 26)
    print(' grammar.productions() => ' + wrapped)
    print('')

    print('Coverage of input words by a grammar:')
    print(grammar.covers(['a', 'boy']))
    print(grammar.covers(['a', 'girl']))

    # Extract productions from every tree of the first two treebank items
    # and induce the PCFG.
    # NOTE(review): `treebank.items`, `grammar.covers` and
    # `parser.nbest_parse` are legacy NLTK names -- confirm against the
    # installed NLTK version.
    print("Induce PCFG grammar from treebank data:")

    productions = []
    for item in treebank.items[:2]:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)

            productions += tree.productions()

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    print(grammar)
    print('')

    print("Parse sentence using induced grammar:")

    parser = pchart.InsideChartParser(grammar)
    parser.trace(3)

    # doesn't work as tokens are different:
    # sent = treebank.tokenized('wsj_0001.mrg')[0]

    sent = treebank.parsed_sents('wsj_0001.mrg')[0].leaves()
    print(sent)
    for parse in parser.nbest_parse(sent):
        print(parse)
def pcfg_demo():
    """
    A demonstration showing how C{WeightedGrammar}s can be created and
    used.

    Prints one production of ``toy_pcfg1``, the grammar ``toy_pcfg2``,
    two word-coverage checks, a PCFG induced from treebank trees, and the
    parses of one treebank sentence under that induced grammar.
    """
    from nltk.corpus import treebank
    from nltk import treetransforms
    from nltk import induce_pcfg
    from nltk.parse import pchart

    pcfg_prods = toy_pcfg1.productions()

    pcfg_prod = pcfg_prods[2]
    print('A PCFG production:', repr(pcfg_prod))
    print(' pcfg_prod.lhs() =>', repr(pcfg_prod.lhs()))
    print(' pcfg_prod.rhs() =>', repr(pcfg_prod.rhs()))
    print(' pcfg_prod.prob() =>', repr(pcfg_prod.prob()))
    print()

    grammar = toy_pcfg2
    print('A PCFG grammar:', repr(grammar))
    print(' grammar.start() =>', repr(grammar.start()))
    print(' grammar.productions() =>', end=' ')
    # .replace(...) line-wraps the production list under the label.
    print(repr(grammar.productions()).replace(',', ',\n' + ' ' * 26))
    print()

    print('Coverage of input words by a grammar:')
    # NOTE(review): `covers` is a legacy grammar method; newer NLTK
    # releases appear to offer `check_coverage` (which raises instead of
    # returning a bool) -- confirm against the installed version.
    print(grammar.covers(['a', 'boy']))
    print(grammar.covers(['a', 'girl']))

    # Extract productions from every tree of the first two treebank files
    # and induce the PCFG.
    print("Induce PCFG grammar from treebank data:")

    productions = []
    # `fileids()` is the corpus-reader accessor for the file list; the old
    # `items` attribute is not available on Python-3 capable NLTK.
    for item in treebank.fileids()[:2]:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)

            productions += tree.productions()

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    print(grammar)
    print()

    print("Parse sentence using induced grammar:")

    parser = pchart.InsideChartParser(grammar)
    parser.trace(3)

    # doesn't work as tokens are different:
    # sent = treebank.tokenized('wsj_0001.mrg')[0]

    sent = treebank.parsed_sents('wsj_0001.mrg')[0].leaves()
    print(sent)
    # `parse` yields the parses; `nbest_parse` was removed from the parser
    # interface in NLTK 3.
    for parse in parser.parse(sent):
        print(parse)
def grammar_development_with_treebank():
    """
    Print the first tree of 'wsj_0001.mrg', then every treebank subtree
    accepted by ``_grammar_filter``.

    Each ``print`` call takes one pre-formatted argument so the output is
    identical under Python 2 and Python 3.
    """
    from nltk.corpus import treebank

    t = treebank.parsed_sents("wsj_0001.mrg")[0]
    print(t)

    matches = [
        subtree
        for tree in treebank.parsed_sents()
        for subtree in tree.subtrees(_grammar_filter)
    ]
    # '%s' renders the list with str(), as the two-item print statement did.
    print("identify verbs for SV in VP -> SV S %s" % (matches,))
def pcfg_demo():
    """
    A demonstration showing how a ``PCFG`` can be created and used.

    Collects the productions of the first three trees of the first
    treebank file, prints them, induces a PCFG rooted at ``S`` from them,
    and prints the parses of the file's first sentence under that grammar.
    """
    from nltk.corpus import treebank
    from nltk import treetransforms
    from nltk import induce_pcfg
    from nltk import Nonterminal
    from nltk.parse import pchart

    # extract productions from three trees and induce the PCFG
    print("Induce PCFG grammar from treebank data:")

    productions = []
    item = treebank.fileids()[0]
    for tree in treebank.parsed_sents(item)[:3]:
        # perform optional tree transformations, e.g.:
        tree.collapse_unary(collapsePOS=False)
        tree.chomsky_normal_form(horzMarkov=2)

        productions += tree.productions()

    print(productions)
    print()

    # The parser below needs a grammar; without this induction step the
    # name `grammar` is undefined and the demo stops with a NameError.
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)

    print("Parse sentence using induced grammar:")

    parser = pchart.InsideChartParser(grammar)
    parser.trace(3)

    # doesn't work as tokens are different:
    # sent = treebank.tokenized('wsj_0001.mrg')[0]

    sent = treebank.parsed_sents(item)[0].leaves()
    print(sent)
    for parse in parser.parse(sent):
        print(parse)
def learn_treebank(files=None, markov_order=None):
    """
    Learn a PCFG from the Penn Treebank, and return it.

    When ``files`` is None the trees come from ``treebank.parsed_sents()``
    with no argument (NLTK's 10% sample of the Penn Treebank). Otherwise
    ``files`` is handed to ``treebank.parsed_sents``; for example
    'wsj-02-21.mrg' learns from the entire training section of Treebank.

    ``markov_order`` is forwarded unchanged to ``learn_trees``.
    """
    bank = (
        treebank.parsed_sents()
        if files is None
        else treebank.parsed_sents(files)
    )
    return learn_trees(bank, collapse=True, markov_order=markov_order)
def grammarDevelopmen():
    """
    Print the first tree of 'wsj_0001.mrg' and then every treebank
    subtree that is a VP with an S among its direct subtree children.

    ``print`` is called with a single argument everywhere so the output is
    the same under Python 2 and Python 3.
    """
    print("page 315 8.6 Grammar Developmen")
    print("=============== Treebanks and Grammars ===============")
    from nltk.corpus import treebank

    t = treebank.parsed_sents('wsj_0001.mrg')[0]
    print(t)

    # Named so it does not shadow the builtin `filter`.
    def is_vp_with_s_child(tree):
        child_nodes = [child.node for child in tree
                       if isinstance(child, nltk.Tree)]
        return (tree.node == 'VP') and ('S' in child_nodes)

    print([subtree for tree in treebank.parsed_sents()
           for subtree in tree.subtrees(is_vp_with_s_child)])
def sentences():
    """
    Yield ``(tree, leaves)`` pairs for every parsed sentence of every
    treebank file.

    Each tree is transformed in place before being yielded: first
    ``chomsky_normal_form(horzMarkov=1)``, then
    ``collapse_unary(collapsePOS=True)``.
    """
    for fileid in treebank.fileids():
        for tree in treebank.parsed_sents(fileid):
            tree.chomsky_normal_form(horzMarkov=1)
            tree.collapse_unary(collapsePOS=True)
            words = tree.leaves()
            yield (tree, words)
def gen_corpus(path, threshold):
    """
    Build the boundary and sentence corpora from the treebank sample and
    write them to ``path``.

    src: http://www.nltk.org/_modules/nltk/tree.html
    corpora from wsj_0001.mrg to wsj_0199.mrg
    e.g.: t = treebank.parsed_sents('wsj_0001.mrg')[0]
    to visualize a tree: t.draw()

    A tree contributes only when ``_flatten_tree`` returns a truthy value
    for it. Its words are joined, passed through ``PUNC_TRANS``,
    lower-cased, space-collapsed and have every digit replaced by 'x'.

    :param path: save to path
    :param threshold: minimum length of a sentence to keep
    :return: none
    """
    boundaries = []
    sentences = []
    for tree in treebank.parsed_sents(treebank.fileids()):
        flat = _flatten_tree(tree, threshold)
        if not flat:
            continue
        boundaries.append(flat)
        text = ' '.join(tree.leaves()).translate(PUNC_TRANS).lower()
        text = re.sub(r' +', ' ', text)
        # replace digit(s) as 'x'(s)
        text = re.sub(r'\d', 'x', text)
        sentences.append(text.strip())
    _check_length_match(boundaries, sentences)

    outputs = (
        ("/boundaries.txt", '1', boundaries),
        ("/sentences.txt", ' ', sentences),
    )
    for suffix, separator, items in outputs:
        with open(path + suffix, 'w') as f:
            f.write(separator.join(items))
def CKY_parser():
    '''
    Given the PCFG, use the built-in Viterbi parser to print the most
    probable parse of every sentence in 'wsj_1964.mrg'.

    A sentence is skipped (and counted) when the coverage check or the
    parse raises an error; the number of skipped sentences is printed at
    the end.
    '''
    PCFG_grammar = make_PCFG_grammar()

    # Use the ViterbiParser with the induced PCFG rules
    parser = ViterbiParser(PCFG_grammar)

    # Sample sentence parse
    sentences = treebank.parsed_sents('wsj_1964.mrg')

    skipped_sentences = 0

    # Print the full parse of each sentence
    for sentence in sentences:
        tokens = sentence.leaves()
        try:
            PCFG_grammar.check_coverage(tokens)
            for parse in parser.parse(tokens):
                print(parse)
        except Exception:
            # Best-effort: count the failure and move on. `Exception`
            # rather than a bare `except` so Ctrl-C and SystemExit still
            # stop the run.
            skipped_sentences += 1

    print("Total skipped sentences:", skipped_sentences)
def ex6(symbol='S', display=5):
    """
    PCFG: Probabilistic CFGs

    Generate the probability distribution of the expansions of a given
    symbol over all treebank productions.

    For a condensed display, expansions seen fewer than ``display`` times
    are left out of the result, although the probabilities are computed
    against the count of all productions of the symbol.

    :param symbol: left-hand-side symbol whose expansions are counted
    :param display: minimum number of occurrences for an expansion to be
        included in the returned mapping
    :return: dict mapping each kept right-hand side to
        ``occurrences / total productions of symbol``
    """
    from collections import Counter

    # One pass over the corpus; Counter replaces a list.count() call per
    # unique right-hand side, which was quadratic.
    rhs_counts = Counter(
        p.rhs()
        for tree in treebank.parsed_sents()
        for p in tree.productions()
        if p.lhs().symbol() == symbol
    )
    sym_count = sum(rhs_counts.values())

    return {
        rhs: count / sym_count
        for rhs, count in rhs_counts.items()
        if count >= display
    }
def sequence_matching(input):
    """
    Search the treebank for a sentence whose POS tags contain the tags of
    ``input`` in order, and return its parse tree.

    ``input`` is indexed as a sequence of pairs whose second element is
    compared with the tag of each tagged treebank word. For every match
    ``UpdateTree(pars, j, tag)`` is called on the sentence's parse. The
    search for the next input item starts at the position of the previous
    match.

    Returns the parse of the first sentence that matches all of ``input``,
    or None when no sentence does. Prints "Sentence does not match" for
    each sentence that is abandoned.
    """
    sents = treebank.tagged_sents()
    parses = treebank.parsed_sents()
    last_index = len(input) - 1

    for s in range(len(sents)):
        sent = sents[s]
        pars = parses[s]
        k = 0  # position of the latest match; the next search starts here
        for i, item in enumerate(input):
            for j in range(k, len(sent)):
                if sent[j][1] != item[1]:
                    continue
                # labels (pos) match
                k = j
                UpdateTree(pars, j, item[1])
                if i == last_index:
                    # whole input consumed, so this sentence is a match
                    return pars
                break
            else:
                # no word of the sentence matched this input item
                print("Sentence does not match")
                break

    return None
def test_GrammarParser():
    """
    Run a regexp chunk grammar over the trees of the first eleven
    treebank files and print every resulting subtree labelled "VP".

    NOTE(review): the grammar passed to the parser only defines NP
    chunks, while the loop looks for "VP" labels; `grammar_VP` is defined
    but never used. Confirm which grammar was intended.
    """
    import nltk
    from nltk.corpus import treebank

    # NOTE(review): line breaks inside the grammar strings were
    # reconstructed (one rule per line) -- verify against the original.
    grammar = r"""NP: {<DT>*(<NN>|<NNP>|<NNS>)+} # Chunk everything
    }<VBD|IN>+{ # Chink sequences of VBD and IN
    """
    grammar_VP = r"""VP: {<VBZ><VP>}
    """

    # Build the parser once instead of once per tree.
    chunker = nltk.RegexpParser(grammar)

    # Original loop stopped after index 10, i.e. the first eleven files.
    for fileid in treebank.fileids()[:11]:
        for tree in treebank.parsed_sents(fileid):
            tree_Gram = chunker.parse(tree)
            for subtree in tree_Gram.subtrees():
                if subtree.label() == "VP":
                    print(subtree)
def test():
    """Do some tree drawing tests."""

    def print_tree(n, tree, sentence=None, ansi=True, **xargs):
        """Print the header, the bracketed tree and its drawing."""
        words = sentence or tree.leaves()
        print()
        print('{0}: "{1}"'.format(n, ' '.join(words)))
        print(tree)
        print()
        drawtree = TreePrettyPrinter(tree, sentence)
        try:
            rendered = drawtree.text(unicodelines=ansi, ansi=ansi, **xargs)
            print(rendered)
        except (UnicodeDecodeError, UnicodeEncodeError):
            # fall back to plain ASCII when the output cannot be encoded
            print(drawtree.text(unicodelines=False, ansi=False, **xargs))

    from nltk.corpus import treebank

    for n in (0, 1440, 1591, 2771, 2170):
        tree = treebank.parsed_sents()[n]
        print_tree(n, tree, nodedist=2, maxwidth=8)
    print()
    print('ASCII version:')
    # `tree` is still the last tree drawn by the loop above
    print(TreePrettyPrinter(tree).text(nodedist=2))

    bracketed = (
        '(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) '
        '(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) '
        '(vg 10) (inf (verb 11)))))) (punct 12))'
    )
    tree = Tree.fromstring(bracketed, read_leaf=int)
    sentence = ('Ze had met haar moeder kunnen gaan winkelen ,'
                ' zwemmen of terrassen .'.split())
    print_tree('Discontinuous tree', tree, sentence, nodedist=2)
def main(transform_func = None, n = 10):
    """
    Parse the last ``n`` treebank sentences with the Stanford parser and
    print bracket precision and recall against the gold treebank parses.

    :param transform_func: optional callable applied to every word of the
        test sentences before parsing
    :param n: number of sentences, taken from the end of the corpus
    """
    parser = StanfordParser(
        path_to_jar="/cs/fs/home/hxiao/code/stanford-parser-full-2015-01-30/stanford-parser.jar",
        path_to_models_jar="/cs/fs/home/hxiao/code/stanford-parser-full-2015-01-30/stanford-parser-3.5.1-models.jar",
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    )

    test_sents = treebank.sents()[-n:]

    print("len(test_sents) = %d" % (len(test_sents)))

    if transform_func and callable(transform_func):
        print("transforming it using  %s" % (transform_func,))
        test_sents = [[transform_func(w) for w in s]
                      for s in test_sents]  # transform it
        # NOTE(review): this debug print is assumed to belong to the
        # transform branch; the original layout was ambiguous.
        print(test_sents[:10])

    print("predicting")
    pred_parses = parser.parse_sents(test_sents)

    # Gold parses must be the same last-n slice as the test sentences;
    # taking them from the start of the corpus pairs every prediction
    # with an unrelated sentence.
    gold_parses = treebank.parsed_sents()[-n:]

    print("evaluating")

    correct_n = gold_n = predicted_n = 0.0

    for gparse, pparse in zip(gold_parses, pred_parses):
        cn, gn, pn = precision_and_recall_stat(get_nodes_with_range(gparse),
                                               get_nodes_with_range(pparse))
        correct_n += cn
        gold_n += gn
        predicted_n += pn

    # correct / predicted is precision (the old label said "Prediction").
    print("Precision: %f, Recall: %f" % (correct_n / predicted_n,
                                         correct_n / gold_n))
def pcfg(train_idx=None, smoothing=None):
    """
    Learn a PCFG from the first ``train_idx`` treebank files, pickle it
    to 'grammar.pkl' and return it.

    :param train_idx: number of leading treebank files to train on;
        defaults to three quarters of the available files
    :param smoothing: None to use ``learn_pcfg``, or 'L1' to use
        ``smoothing_pcfg``
    :return: the learned grammar
    :raises ValueError: if ``smoothing`` is neither None nor 'L1'
    """
    if smoothing is not None and smoothing != 'L1':
        # Fail before the expensive corpus pass; previously an unknown
        # value left `grammar` unbound and crashed at the pickle step.
        raise ValueError("unknown smoothing: %r" % (smoothing,))

    if train_idx is None:
        train_idx = (len(treebank.fileids()) * 3) // 4

    productions = []
    for item in treebank.fileids()[0:train_idx]:
        for tree in treebank.parsed_sents(item):
            # Remove unary production rule
            tree.collapse_unary(collapsePOS=False)
            # Convert into chomsky normal form i.e., A->(B,C,D) into
            # A->(B,E) E->(C,D)
            tree.chomsky_normal_form(horzMarkov=2)
            productions += tree.productions()

    S = Nonterminal('S')
    if smoothing is None:
        grammar = learn_pcfg(S, productions)
    else:
        grammar = smoothing_pcfg(S, productions)

    with open('grammar.pkl', 'wb') as f:
        pickle.dump(grammar, f)

    return grammar
def train_grammar(unknown_words=[], nb_reduced_production=6000):
    """
    Induce a PCFG rooted at 'S' from the trees of the files listed in
    ``train``.

    Only the ``nb_reduced_production`` most common productions are kept
    (each repeated as often as it occurred); productions whose first
    right-hand-side element is a string are then added back, together
    with extra productions for ``unknown_words``.

    NOTE(review): the nesting of the ``for u in unknown_words`` loop was
    reconstructed from a flattened source -- here it runs once per lexical
    production; confirm against the original file.
    """
    productions = []
    for item in train:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)  # Remove branches A-B-C into A-B+C
            tree.chomsky_normal_form(horzMarkov=2)  # Remove A->(B,C,D) into A->B,C+D->D
            #tree_prods = tree.productions()
            productions += tree.productions()

    # Keep the most frequent productions, expanded back to their counts.
    counter = collections.Counter(productions)
    n_comms = [item for item, count in counter.most_common(nb_reduced_production) for i in range(count)]

    # Adding unknown words and terminal rules back into the reduced productions set
    unknown_words_prods = []
    for p in productions:
        if isinstance(p._rhs[0], str):
            unknown_words_prods.append(p)
            for u in unknown_words:
                rhs = [u]
                lhs = p._lhs
                new_prod = Production(lhs, rhs)
                unknown_words_prods.append(new_prod)

    n_comms += unknown_words_prods

    S = Nonterminal('S')
    grammar = induce_pcfg(S, n_comms)
    return grammar
def convert_wsj(file_obj):
    """
    Feed every parsed treebank sentence to a ``TreebankConverter`` and
    write the converter's output to ``file_obj``.

    A progress message is written to stderr first.
    """
    from nltk.corpus import treebank

    sys.stderr.write("Converting Penn Treebank sampler...\n")
    converter = TreebankConverter()
    for parsed in treebank.parsed_sents():
        converter.add_sentence(parsed)
    converter.write(file_obj)
def test():
    """
    Build a tree for the first sentence of 'wsj_0003.mrg' with the saved
    model and draw it next to the (preprocessed) gold tree.

    Each lower-cased word is embedded and labelled by ``LeafNet``; the
    resulting ``[word, label]`` entries are merged by ``calculate_score``
    until one entry remains, which is converted to a bracketed string and
    parsed with ``Tree.fromstring``.
    """
    model = torch.load('./ckpt/model0.pt')
    leafmodel = LeafNet()
    x = treebank.sents('wsj_0003.mrg')[0]
    y = treebank.parsed_sents('wsj_0003.mrg')[0]
    preprocess(y)

    # embed_x holds the embedding vector of each word of x;
    # x_list holds the matching [word, predicted label] pairs
    embed_x = []
    x_list = []
    for idx in range(len(x)):
        x[idx] = x[idx].lower()
        token = x[idx]
        vector = torch.Tensor(get_embed(token))
        embed_x.append(vector)
        label = torch.argmax(leafmodel(vector)).item()
        x_list.append([token, label])

    # merge entries until a single one is left, accumulating the score
    xscore = 0.0
    while len(x_list) != 1:
        x_list, embed_x, tscore = calculate_score(x_list, embed_x, model)
        xscore = xscore + tscore

    # turn the nested-list repr into a bracketed tree string
    bracketed = str(x_list)
    for old, new in (('[', '('), (']', ')'), ('\'', ''), (',', '')):
        bracketed = bracketed.replace(old, new)
    x_list = bracketed
    x_list_tree = Tree.fromstring(x_list)
    draw_trees(x_list_tree)
    draw_trees(y)
def PCFG_Section():
    """
    Demonstrate PCFG handling: print one production of a toy PCFG, induce
    a PCFG from the first two treebank files, then parse the first
    sentence of 'wsj_0001.mrg' with it and print the probability of the
    first parse returned.
    """
    toy_pcfg1 = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
    """)

    pcfg_prods = toy_pcfg1.productions()

    pcfg_prod = pcfg_prods[2]
    print('A PCFG production:', pcfg_prod)
    print('pcfg_prod.lhs() =>', pcfg_prod.lhs())
    print('pcfg_prod.rhs() =>', pcfg_prod.rhs())
    print('pcfg_prod.prob() =>', pcfg_prod.prob())

    # extract productions from the trees of two files and induce the PCFG
    print("Induce PCFG grammar from treebank data:")

    productions = []
    for item in treebank.fileids()[:2]:
        for tree in treebank.parsed_sents(item):
            # optional tree transformations could be applied here, e.g.
            # tree.collapse_unary(collapsePOS=False) or
            # tree.chomsky_normal_form(horzMarkov=2)
            productions += tree.productions()

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)

    ### Parsing section below ###

    print("\nParse sentence using induced grammar:")

    parser = pchart.InsideChartParser(grammar)
    parser.trace(1)

    # A plain treebank Tree carries no probability, so calling prob() on
    # it fails; parse its words and report the probability of a parse.
    tokens = treebank.parsed_sents('wsj_0001.mrg')[0].leaves()
    best = next(iter(parser.parse(tokens)), None)
    if best is None:
        print("No parse found")
    else:
        print(best.prob())
def train():
    """
    Estimate a PCFG from the treebank sample and return its lookup tables.

    Trees are binarised in place, rule probabilities are relative
    frequencies per left-hand side (lexical rules are lower-cased), and
    the result is passed through ``katz_smooth``.

    :return: dict with keys 'vocab', 'left_rules', 'right_rules',
        'unary_rules', 'rules_to_prob' and 'terminal_nonterms'
    """
    print("Collecting sub-corpus from Penn Treebank (nltk.corpus)")

    # prepare parsing trees, extracted from treebank
    tbank_trees = []
    for sent in treebank.parsed_sents():
        sent.chomsky_normal_form()
        tbank_trees.append(sent)

    # build vocabulary list, extracted from treebank
    vocab_size = 10000  # keep the 10000 most frequent words
    words = [wrd.lower() for wrd in treebank.words()]
    # Count the lower-cased words: lexical rules below are lower-cased
    # too, so the vocabulary has to use the same casing.
    vocab = [wrd for wrd, freq in Counter(words).most_common(vocab_size)]

    rules_to_prob = defaultdict(int)
    nonterm_occurrence = defaultdict(int)

    # count rules and their left-hand sides
    for sent in tbank_trees:
        for production in sent.productions():
            rhs = production.rhs()
            if len(rhs) == 1 and not isinstance(rhs[0], Nonterminal):
                production = Production(production.lhs(), [rhs[0].lower()])
            nonterm_occurrence[production.lhs()] += 1
            rules_to_prob[production] += 1

    # turn counts into relative frequencies per left-hand side
    for rule in rules_to_prob:
        rules_to_prob[rule] /= nonterm_occurrence[rule.lhs()]

    # use Katz smoothing
    rules_to_prob, vocab = katz_smooth(rules_to_prob, vocab)
    rules = list(rules_to_prob.keys())

    left_rules = defaultdict(set)
    right_rules = defaultdict(set)
    unary_rules = defaultdict(set)

    # index rules by their first / second / only right-hand-side symbol
    for rule in rules:
        rhs = rule.rhs()
        if len(rhs) > 1:
            left_rules[rhs[0]].add(rule)
            right_rules[rhs[1]].add(rule)
        else:
            unary_rules[rhs[0]].add(rule)

    # number of lexical (string-producing) rules per left-hand side
    terminal_nonterms = defaultdict(int)
    for rule in rules_to_prob:
        if len(rule.rhs()) == 1 and isinstance(rule.rhs()[0], str):
            terminal_nonterms[rule.lhs()] += 1

    pcfg_parser = {
        'vocab': vocab,
        'left_rules': left_rules,
        'right_rules': right_rules,
        'unary_rules': unary_rules,
        'rules_to_prob': rules_to_prob,
        'terminal_nonterms': terminal_nonterms
    }
    return pcfg_parser
def add_words(self, file_ids):
    """
    Add the words of every parsed sentence in ``file_ids`` to
    ``self.dict``.

    Each sentence's words come from ``Corpus._filter_words`` and are
    wrapped in a leading and trailing '<eos>' marker before being added.
    """
    for file_id in file_ids:
        for sen_tree in ptb.parsed_sents(file_id):
            padded = ['<eos>'] + Corpus._filter_words(sen_tree) + ['<eos>']
            for token in padded:
                self.dict.add(token)
def _induce_grammar(self):
    """
    Collect into ``self.productions`` the productions of every treebank
    tree, after transforming each tree in place with ``collapse_unary``
    and ``chomsky_normal_form``.
    """
    self.productions = []
    all_trees = treebank.parsed_sents(treebank.fileids())
    for parsed in all_trees:
        # perform optional tree transformations, e.g.:
        # remove branches A-B-C into A-B+C
        parsed.collapse_unary(collapsePOS=False)
        # remove A->(B,C,D) into A->B,C+D->D
        parsed.chomsky_normal_form(horzMarkov=2)
        self.productions += parsed.productions()
def nltk_parse(s):
    """
    Tokenize, POS-tag and NE-chunk the string ``s``, printing each stage,
    then draw the first tree of 'wsj_0001.mrg'.
    """
    tokens = nltk.word_tokenize(s)
    print(tokens)

    tagged = nltk.pos_tag(tokens)
    print(tagged[0:6])  # only the first six tagged tokens

    entities = nltk.chunk.ne_chunk(tagged)
    print(entities)

    first_tree = treebank.parsed_sents('wsj_0001.mrg')[0]
    first_tree.draw()
def read_data():
    """
    Build POS-tagging training arrays from the treebank sample.

    For each word position of each sentence one row is produced: the ids
    of the words up to and including that position, most recent first,
    zero-padded to the longest sentence length, cut to ``seq_length``
    (100) entries and then reversed. The matching target is
    ``one_hot(tag, unique_labels)``.

    :return: ``(X_data, Y_data, unique_labels, words, word2id, id2word)``
    """
    treebank_tagged_sents = [
        tree.pos()
        for pf in treebank.fileids()
        for tree in treebank.parsed_sents(pf)
    ]
    words_list = [[word for word, _ in sent] for sent in treebank_tagged_sents]
    labels = [[label for _, label in sent] for sent in treebank_tagged_sents]

    max_words = max((len(sent) for sent in words_list), default=0)
    print("Max. Words:", max_words)

    seq_length = 100
    print("Seq. Length:", seq_length)

    words = list({word for sent in words_list for word in sent})
    print("Number of Words:", len(words))

    unique_labels = list({label for sent in labels for label in sent})
    print("Number of Unique Labels:", len(unique_labels))

    # id 0 is left free for padding
    word2id = {word: idx for idx, word in enumerate(words, start=1)}
    id2word = {idx: word for idx, word in enumerate(words, start=1)}

    X_data = []
    Y_data = []
    for sent_words, sent_labels in zip(words_list, labels):
        for j in range(len(sent_words)):
            # words j, j-1, ..., 0 followed by zero padding
            history = [word2id[w] for w in sent_words[j::-1]]
            row = history + [0] * (max_words - len(history))
            row = row[:seq_length]
            row.reverse()
            X_data.append(row)
            Y_data.append(one_hot(sent_labels[j], unique_labels))

    X_data = np.array(X_data, dtype=np.int32)
    Y_data = np.array(Y_data, dtype=np.float32)
    print(X_data.shape)
    print(Y_data.shape)

    return X_data, Y_data, unique_labels, words, word2id, id2word
def read_wsj_from_treebank(self, index):
    """
    Load one WSJ file of the treebank sample into this object.

    :param index: file number; it is zero-padded to four digits, so 1
        selects 'wsj_0001.mrg' and 42 selects 'wsj_0042.mrg'
    :return: the parsed sentences of the selected file (also stored on
        the instance, along with its plain sentences)
    """
    from nltk.corpus import treebank

    self.__reset()
    # zfill keeps single-digit indices identical to the old
    # 'wsj_000' + str(index) form while also handling larger indices.
    self.__input_text = 'wsj_' + str(index).zfill(4) + '.mrg'
    self.__sents = treebank.sents(self.__input_text)
    self.__tagged_sents = treebank.parsed_sents(self.__input_text)

    if self.__verbose:
        self.__print_all()

    return self.__tagged_sents
def get_processed_data():
    """
    Split the parsed treebank sentences 80/20 with ``train_test_split``
    and return both parts after ``convert_to_base_category``.

    :return: ``(train_bank, test_bank)``
    """
    train_part, test_part = train_test_split(
        treebank.parsed_sents(), test_size=0.2
    )
    train_bank = convert_to_base_category(list(train_part))
    test_bank = convert_to_base_category(list(test_part))
    return train_bank, test_bank
def extract_simple_productions(n):
    """
    Return the simplified productions of the first ``n`` treebank trees.

    Every production is passed through ``simple_rule``; results whose
    string form contains "EMPTY" are dropped.

    :param n: number of leading parsed sentences to read
    :return: list of simplified rules, in corpus order
    """
    rules = []
    for t in treebank.parsed_sents()[:n]:
        # extend in place; re-concatenating the whole list per tree was
        # quadratic in the number of productions
        rules.extend(t.productions())

    simplified = (simple_rule(r) for r in rules)
    return [r for r in simplified if "EMPTY" not in str(r)]
def TreebankNoTraces():
    """
    Return the treebank trees rooted at "S", cleaned and binarised.

    Each kept tree is modified in place: ``RemoveFunctionTags``,
    ``RemoveTraces``, ``collapse_unary`` (POS and root included) and
    ``chomsky_normal_form``, in that order.
    """
    cleaned = []
    for tree in treebank.parsed_sents():
        if tree.label() == "S":
            RemoveFunctionTags(tree)
            RemoveTraces(tree)
            tree.collapse_unary(collapsePOS=True, collapseRoot=True)
            tree.chomsky_normal_form()
            cleaned.append(tree)
    return cleaned
def getTrees(source, size):
    '''Load the trees from source, return first SIZE trees.

    Only the source 'treebank' is known; any other value gives an empty
    list.
    '''
    if source != 'treebank':
        return list()

    from nltk.corpus import treebank
    return treebank.parsed_sents()[:size]
def getTrees(source,size):
    '''Load the trees from source, return first SIZE trees.

    Only the source 'treebank' is known; any other value gives an empty
    list.
    '''
    if source != 'treebank':
        return list()

    from nltk.corpus import treebank
    return treebank.parsed_sents()[:size]
def learn_treebank(trees=None):
    """
    Learn a PCFG from the Penn Treebank, and return it.

    By default, this learns from NLTK's 10% sample of the Penn Treebank
    (``treebank.parsed_sents()``). You can also pass a set of trees, which
    is then used instead.
    """
    bank = treebank.parsed_sents() if trees is None else trees
    return learn_trees(bank, collapse=True)
def main(phrase_level, sanitize):
    """
    Print subtrees labelled ``phrase_level`` from files wsj_0001.mrg to
    wsj_0199.mrg, each on one line (``pformat`` with a 100000 margin).

    When ``sanitize`` equals True, ``sanitize_tree`` is applied to the
    subtree before it is printed.

    NOTE(review): the position of ``break`` was reconstructed from a
    flattened source -- as written here only the first matching subtree
    of each sentence is printed; confirm against the original file.
    """
    for n in range(1, 200):
        # file ids are zero-padded to four digits
        tree_file = "wsj_{}.mrg".format(str(n).zfill(4))
        sentences = treebank.parsed_sents(tree_file)
        for s in sentences:
            for subtree in s.subtrees(lambda t: t.label() == phrase_level):
                if sanitize == True:
                    sanitize_tree(subtree)
                print(subtree.pformat(100000))
                break
def parse_treebank(parser: ViterbiParser, sentences):
    """
    Parse treebank sentences with ``parser`` and print every parse.

    ``sentences[:3]`` is passed to ``treebank.parsed_sents`` (presumably a
    list of file ids -- TODO confirm). The parser's trace level is set
    to 1.

    NOTE(review): ``start_time`` is taken once before the loop, so the
    printed time is cumulative rather than per sentence. The placement of
    the timing print (once per sentence, after its parses) was
    reconstructed from a flattened source -- confirm.
    """
    start_time = time.time()
    parser.trace(trace=1)
    for sentence in treebank.parsed_sents(sentences[:3]):
        tokens = sentence.leaves()
        for tree in parser.parse(tokens):
            print(tree)
        print(
            f"Time elapsed for sentence of length {len(tokens)}: {time.time() - start_time}"
        )
def TreebankNoTraces():
    """
    Return the treebank trees rooted at "S", cleaned and binarised.

    Each kept tree is modified in place: ``RemoveFunctionTags``,
    ``RemoveTraces``, ``collapse_unary`` (POS and root included) and
    ``chomsky_normal_form``, in that order.
    """
    cleaned = []
    for tree in treebank.parsed_sents():
        if tree.label() == "S":
            RemoveFunctionTags(tree)
            RemoveTraces(tree)
            tree.collapse_unary(collapsePOS=True, collapseRoot=True)
            tree.chomsky_normal_form()
            cleaned.append(tree)
    return cleaned
def main():
    """
    Tokenize, POS-tag and NE-chunk a fixed three-sentence sample, printing
    each stage, then load the first tree of 'wsj_0001.mrg'.
    """
    sentence = """I saw a man with a telescope. ... Colorless green ideas sleep furiously. ... The horse raced past the barn fell."""

    tokens = nltk.word_tokenize(sentence)
    print(tokens)

    tagged = nltk.pos_tag(tokens)
    print(tagged[0:6])  # only the first six tagged tokens

    entities = nltk.chunk.ne_chunk(tagged)
    print(entities)

    # loaded but not used further in this function
    t = treebank.parsed_sents('wsj_0001.mrg')[0]
def write_example_tree(features, f):
    """
    Write to ``f`` the tree that an example came from, with the example's
    phrase label highlighted as ``***label***``.

    ``features`` supplies '_filename', '_sentence_id' and '_phrase_id'
    (an index into the tree's preorder positions). The label is restored
    after the tree has been written.
    """
    file_name = features['_filename']
    sentence_index = features['_sentence_id']
    phrase_index = features['_phrase_id']

    tree = treebank.parsed_sents(file_name)[sentence_index]
    position = tree.treepositions('preorder')[phrase_index]
    phrase = tree[position]

    original_label = treebank_helper.get_label(phrase)
    treebank_helper.set_label(phrase, '***' + original_label + '***')
    f.write(str(tree))
    f.write('\n')
    # put the real label back
    treebank_helper.set_label(phrase, original_label)
def treebank_accessor():
    '''
    Read files 1 .. TREEBANK_FILES of the Penn treebank sample and return
    the trees of all their sentences in one list.
    '''
    trees = []
    for file_index in range(1, TREEBANK_FILES + 1):
        # e.g. 7 -> 'wsj_0007.mrg'
        fileid = 'wsj_0%03d.mrg' % (file_index,)
        trees.extend(treebank.parsed_sents(fileid))
    return trees
def get_treebank_rules(cutoff=0, include_counts=False):
    """
    Return the lexical rules of the treebank that occur more than
    ``cutoff`` times.

    Rule counts are read from the 'treebank_rules' cache; when the cache
    gives nothing they are computed with ``lexical_rules`` over every
    parsed sentence and stored back.

    :param cutoff: a rule is kept only if its count is greater than this
    :param include_counts: return a ``{rule: count}`` dict instead of a
        set of rules
    """
    rule_counts = cache_utils.cache_get('treebank_rules', 'rules')
    if not rule_counts:
        log('Generating lexical rules from Penn Treebank', 4)
        from nltk.corpus import treebank
        rule_counts = dict()
        for tree in treebank.parsed_sents():
            for rule, count in lexical_rules(tree).items():
                rule_counts[rule] = rule_counts.get(rule, 0) + count
        cache_utils.cache_set('treebank_rules', 'rules', rule_counts)

    frequent = {
        rule: count for rule, count in rule_counts.items() if count > cutoff
    }
    return frequent if include_counts else set(frequent)
def read_treebank_files(files, extractor,fe):
    """Read the listed treebank files and collect function tagging examples
    from each tree.

    Every tree is converted to a ``ParentedTree`` and post-processed, then
    handed to ``find_examples_in_tree`` together with the user-provided
    feature extractor, the file name and the sentence's index within the
    file. That call fills two lists -- the extracted feature dicts and the
    true function tags -- which are returned.
    """
    X, Y = [], []
    for filename in files:
        for scount, raw_tree in enumerate(treebank.parsed_sents(filename)):
            tree = ParentedTree.convert(raw_tree)
            treebank_helper.postprocess(tree)
            find_examples_in_tree(
                tree, X, Y, extractor, fe, filename, scount, 0
            )
    return X, Y
def get_trees(fileids=None, verbose=False):
    """
    Get the CNF trees for the treebank fileids given, or for the entire
    treebank when ``fileids`` is empty or None.

    Each sentence tree is converted with ``ctc.convert_tree``. With
    ``verbose`` set, the number of trees is printed after loading and
    again after conversion.
    """
    if not fileids:
        # fall back to the whole Penn Treebank sample
        fileids = treebank.fileids()

    trees = []
    for file_id in fileids:
        trees.extend(treebank.parsed_sents(file_id))
    if verbose:
        print("obtained", len(trees), "trees from the corpus.")

    cnf_trees = list(map(ctc.convert_tree, trees))
    if verbose:
        print("converted", len(trees), "trees to cnf.")

    return cnf_trees
def create_forests(self, filename=None, treelist=None, clear=False):
    """ Build one Forest per input tree or sentence.

    Input is taken from the first available source: ``treelist`` if it is
    non-empty, else NLTK's treebank (files in ``NLTK_TREE_RANGE``) when
    NLTK is available, else the text file ``filename``. In the file, each
    non-empty line is one input; a line wrapped in ``[...]`` is parsed as
    a Python literal, any other line is kept as a sentence string.

    :param filename: treeset file to read; defaults to the document's
        default treeset file
    :param treelist: ready-made list of inputs
    :param clear: start with empty (not used by this method)
    :return: list of Forest objects
    """
    filename = filename or Document.get_default_treeset_file()

    forests = []
    input_trees = []

    shared_lexicon = load_lexicon(Document.get_default_lexicon_file())
    print('loaded shared_lexicon: ', shared_lexicon)
    if treelist:
        input_trees = treelist
    elif has_nltk:
        print(f"reading trees {NLTK_TREE_RANGE[0]}-{NLTK_TREE_RANGE[1]} from NLTK's treebank")
        for i in range(*NLTK_TREE_RANGE):  # 199
            trees = treebank.parsed_sents(f'wsj_0{str(i).rjust(3, "0")}.mrg')
            for tree in trees:
                tree.chomsky_normal_form()
                tree.collapse_unary()
                input_trees.append(as_list(tree))
    else:
        # `with` closes the file even if a line fails to parse
        with open(filename, 'r') as readfile:
            for line in readfile:
                line = line.strip()
                if not line:
                    continue
                if line.startswith('[') and line.endswith(']'):
                    input_trees.append(ast.literal_eval(line))
                else:
                    input_trees.append(line)

    for input_tree in input_trees:
        syn = classes.SyntaxAPI()
        syn.lexicon = shared_lexicon
        if isinstance(input_tree, list):
            syn.input_tree = input_tree
        else:
            syn.input_text = input_tree
        forest = Forest(heading_text=str(input_tree), syntax=syn)
        forests.append(forest)
    return forests
def train_pcfg():
    """
    Induce a PCFG rooted at 'S' from the first 20 treebank files and
    return it.

    Only 20 of the files are used to keep the grammar small and training
    quick. ``print`` is called with a single argument so the messages are
    the same under Python 2 and Python 3.
    """
    print('training grammar')
    productions = []
    # up to 199 less for shorter grammar for quicker training
    for fileid in treebank.fileids()[0:20]:
        for tree in treebank.parsed_sents(fileid):
            # perform optional tree transformations, e.g.:
            # Remove branches A->B->C into A->B+C so we can avoid infinite
            # productions
            tree.collapse_unary(collapsePOS=False)

            # Remove A->(B,C,D) into A->B,C+D->D (binarization req'd by
            # CKY parser). Horizontal and vertical Markovization
            # (horzMarkov / vertMarkov arguments) would remember parents
            # and siblings, at the cost of a much larger grammar and the
            # need for a tag forgetting method.
            tree.chomsky_normal_form()
            productions += tree.productions()

    S = nltk.Nonterminal('S')
    grammar = nltk.induce_pcfg(S, productions)
    print("grammar trained!")
    return grammar
#! /usr/bin/python
# -*- coding: utf-8 -*-

__author__ = "Osman Baskaya"

# Print each sentence of the listed treebank files as plain text, one per
# line, leaving out empty elements (POS tag '-NONE-').

from nltk.corpus import treebank

files = "cl23.mrg wsj_1695.mrg wsj_1778.mrg".split()

for f in files:
    for sentence in treebank.parsed_sents(f):
        s = [word for word, p in sentence.pos() if p != '-NONE-']
        # single-argument print works on Python 2 and Python 3 alike
        print(' '.join(s))

#f = '../data/senseval3/english-all-words.xml'
#soup = BeautifulSoup(open(f), 'xml')
#texts = soup.find_all('text')
#sentences = []
#quot_set = set(['"', ])
#quot = False
#sentence = []
#for t in texts:
    #tokens = t.text.split()
    #for token in tokens:
        #if token in quot_set:
            #quot = not quot
rules = [] results = re.findall("(\({0}\ {1}\))".format(rule,word), sent) for res in results: x = res.split(" ") if len(x) == 2: p,c = x rules.append("{0} -> '{1}'".format(p[1:], c[:-1])) return rules def check(productions, rules): i = 0 for x in productions: if str(x) in rules: i += 1 else: print x return (i,len(productions)) if __name__=="__main__": total, recall = 0,0 for s in treebank.parsed_sents(): sent = "".join(str(s).split("\n")) unaries = find_unary(sent) nonunaries = find_nonunary(sent) rules = unaries + nonunaries r, t = check(s.productions(), rules) recall+=r total+=t print "{0} out of {1}: {2}".format(recall,total, float(recall)/total)
def find_pronouns(tree):
    """
    Recursively collect the pronouns found among the leaves of ``tree``.

    A leaf counts when it is a string whose lower-cased form is in
    ``PRONOUNS``. Returns a list of ``(lowercased_pronoun, None)`` pairs.
    """
    pronouns = []
    for child in tree:
        # string children are leaves (words)
        if type(child) in [unicode, str] and child.lower() in PRONOUNS:
            pronouns.append((child.lower(), None))
        # subtrees are searched recursively
        if isinstance(child, ParentedTree):
            pronouns = pronouns + find_pronouns(child)
    return pronouns


# Per-file pronoun statistics over the whole treebank sample.
# NOTE(review): `stats`, `files`, `gendered` and `itits` are defined
# outside this view, and the loop nesting below was reconstructed from a
# flattened source -- confirm against the original file.
total = 0
for file in treebank.fileids():
    stats['name'] = file
    for tree in treebank.parsed_sents(file):
        tree = ParentedTree.convert(tree)
        for pronoun, np_node in find_pronouns(tree):
            if pronoun in gendered:
                stats['gendered'] += 1
            if pronoun in itits:
                stats['itits'] += 1
            stats['total'] += 1
            total += 1
    # NOTE(review): divides by zero if a file contains no pronouns
    stats['pct_gendered'] = stats['gendered']/float(stats['total'])
    print file, total
    files.append(stats.copy())
    # reset every counter for the next file
    stats = dict.fromkeys(stats, 0)
import nltk
from nltk.corpus import treebank

# show samples of treebank
t = treebank.parsed_sents('wsj_0001.mrg')[0]
# print(t)


# filter sentential complements
def filter(tree):
    """Return True for a VP node that has an S among its subtree children."""
    if tree.label() != 'VP':
        return False
    return any(
        isinstance(child, nltk.Tree) and child.label() == 'S'
        for child in tree
    )


# collect every matching subtree of the corpus, then print them
subtrees = []
for tree in treebank.parsed_sents():
    subtrees.extend(tree.subtrees(filter))

for st in subtrees:
    print(st)
# Extracts Penn Treebank from NLTK.
#
# Output format ('../data/penn_treebank', UTF-8): the number of sentences,
# then for each sentence its length, its words, its POS tags, the number
# of lines of its bracketed tree, and the tree itself.

from nltk.corpus import treebank
from operator import itemgetter
import codecs

words = treebank.sents()
# list(...) so each tag sequence is a real list on Python 3 as well
tagged_words = [list(map(itemgetter(1), sent))
                for sent in treebank.tagged_sents()]
parsed_sents = treebank.parsed_sents()
total_sents = len(parsed_sents)

# Check the three views line up before the output file is touched.
assert (len(words) == len(tagged_words) and len(words) == len(parsed_sents)), ' '.join(map(str, [len(words), len(tagged_words), len(parsed_sents)]))

# `with` closes (and flushes) the file even if an assertion below fails.
with codecs.open('../data/penn_treebank', 'w', 'utf-8') as f:
    f.write(str(total_sents) + '\n')

    for i in range(total_sents):
        sent_len = len(words[i])
        f.write(str(sent_len) + '\n')

        sent = ' '.join(words[i])
        pos = ' '.join(tagged_words[i])
        # one line each, and exactly one token/tag per word
        assert(sent.count('\n') == 0 and pos.count('\n') == 0
               and len(sent.split(' ')) == sent_len
               and len(pos.split(' ')) == sent_len)
        f.write(sent + '\n')
        f.write(pos + '\n')

        tree = str(parsed_sents[i]).split('\n')
        f.write(str(len(tree)) + '\n')
        f.write('\n'.join(tree) + '\n')
from nltk.corpus import treebank
from nltk import Tree, Nonterminal
from nltk.parse.viterbi import ViterbiParser
from nltk.grammar import induce_pcfg
from os import getcwd, walk
from pickle import dump

###############################
# 2) Remove numerical indices #
###############################
print("Loading treebank.")
sentenceStrings = [parsed.pprint() for parsed in treebank.parsed_sents()]

# Index numbers to strip, highest first, so that a multi-digit index is
# removed before any shorter index that is a prefix of it.
indexStrings = [str(number) for number in range(166, 0, -1)]

# (prefix, replacement) pairs: the text that precedes an index and what is
# left behind once prefix + index has been removed.
indexPrefixes = [("-", ""), ("=", "")]


def indiceRemedy(n):
    """Expand one (prefix, replacement) pair into a pair per index string."""
    prefix, replacement = n[0], n[1]
    return [(prefix + index, replacement) for index in indexStrings]


# Flatten the expansions of all prefixes into one ordered list of
# (target, replacement) pairs.
fixingTuples = []
for prefixPair in indexPrefixes:
    fixingTuples = fixingTuples + indiceRemedy(prefixPair)


def removeTargets(x):
    """Apply every (target, replacement) substitution to ``x`` in order."""
    cleaned = x
    for target, replacement in fixingTuples:
        cleaned = cleaned.replace(target, replacement)
    return cleaned


# This is where the removing takes place.
print("Cleaning POS tags.")
sentenceStrings = [removeTargets(text) for text in sentenceStrings]
sentenceTrees = [Tree(text) for text in sentenceStrings]
###################################################
import nltk
from nltk.corpus import treebank

# Show the third parsed sentence of the sample file wsj_0007.mrg.
wsj_0007_trees = treebank.parsed_sents('wsj_0007.mrg')
print(wsj_0007_trees[2])
from nltk.corpus import treebank
from nltk.grammar import ContextFreeGrammar, Nonterminal
from nltk.grammar import induce_pcfg
from nltk.treetransforms import chomsky_normal_form

''' tbank_productions = set(production for sent in treebank.parsed_sents() for production in sent.productions()) '''

# Collect the productions of every treebank tree after converting it to
# Chomsky normal form.
treebank_prods = []
# Iterate over the corpus' public file-id list rather than indexing the
# private ``_fileids`` attribute with a hard-coded count of 199.
for tbstuff in treebank.fileids():
    for tree in treebank.parsed_sents(tbstuff):
        tree.chomsky_normal_form()  # transforms the tree in place
        treebank_prods.extend(tree.productions())

# Induce the PCFG with start symbol S.  ``induce_pcfg`` is imported directly
# because this script never imports the top-level ``nltk`` package.
tTCpcfg = induce_pcfg(Nonterminal('S'), treebank_prods)
# PTCpcfg = nltk.induce_pcfg(tbank_grammar)
# treetransforms: chomsky_normal_form
print("done! You have your WeightedGrammar")
master_path = "./Data/"
train_filepath = master_path + "train.csv"
train_data = pd.read_csv(train_filepath)

dup_prob = []
row_count = 0
for row in train_data.iterrows():
    row_count += 1
    # iterrows() yields (index, Series); the Series carries the columns.
    record = row[1]
    q1 = record['question1']
    q2 = record['question2']

    # Show the parses for the first 18 rows only.  This used to be a
    # ``while`` whose condition never changed inside its body, so the loop
    # could not terminate.
    # NOTE(review): parsed_sents() is handed the question text here, while
    # NLTK documents that argument as corpus file id(s) -- confirm this
    # lookup is what was intended.
    if row_count < 19:
        print(treebank.parsed_sents(q1)[0])
        print(treebank.parsed_sents(q2)[0])

    # Missing questions are treated as having no words.
    q1_words = [] if pd.isnull(q1) else q1.split(' ')
    q2_words = [] if pd.isnull(q2) else q2.split(' ')

    wd_counter = 0
    sim_counter = 0
import nltk from nltk.corpus import treebank from nltk.probability import * from nltk.grammar import * ### RETRIEVE ALL TREES AND THEN SELECT THE FIRST 100. all_trees = treebank.parsed_sents() trees_100 = all_trees[0:100] ### FUNCTION EXTRACTING LEAVES OF NODES WITH LABEL AS A PARAMETER OF getAvgNodeLength(). def getAvgNodeLength(label): l_leaves = list() for tree in trees_100: for node in tree: if node.label() == label: l_leaves.append(node.leaves()) ### CREATED OWN LIST OF PUNCTUATION TO EXCLUDE SINCE USING string.punctuation WOULD ### HAVE DELETED WORDS SUCH AS "Dr.", "World-Wide", "U.S.", etc. WHICH ARE OF INTEREST. punct = [u"*", u",", u"&", u"'"] for wordlist in l_sbj: for word in wordlist: for i in punct: if i in word: wordlist.remove(word) ### CREATE LIST OF LENGTHS (IN WORDS) OF NODES. l_len = list() for wordlist in l_leaves:
# Tally how many times each item of every sentence occurs.
count = {}
for sentence in sentences:
    for word in sentence:
        count[word] = count.get(word, 0) + 1

## 3. Word transition statistics (2-gram model)
from sklearn.feature_extraction.text import CountVectorizer

# Split the text on periods; the piece after the last period is dropped.
stripped_text = texto.strip()
sentences = stripped_text.split('.')[:-1]
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1)
bigram_counts = bigram_vectorizer.fit_transform(sentences)
X_2 = bigram_counts.toarray()

## 4. Using NLTK do a Part of Speech tagging (POS tagging)
tokens = nltk.word_tokenize(sentence)
tagged = nltk.pos_tag(tokens)

## 5. Draw a lexical-grammatical tree with the help of NLTK
from nltk.corpus import treebank

first_file_trees = treebank.parsed_sents('wsj_0001.mrg')
t = first_file_trees[0]
t.draw()
def _report(*fields):
    """Print ``fields`` separated by single spaces on one line.

    Joining into one string keeps the output identical whether ``print`` is
    the Python 2 statement or the Python 3 function.
    """
    print(" ".join(str(field) for field in fields))


def main():
    """Score the Hobbs resolver against the answer key in ``coref_key.txt``.

    For every pronoun found in the parsed sentences of each file in
    ``FILENAMES``, the antecedent proposed by ``hobbs`` is compared with the
    next line of the key file.  Running, per-file and per-category results
    are printed; nothing is returned.
    """
    correct = 0
    total = 0
    prev_sentences = deque()

    # The context manager closes the key file even if resolution fails.
    with open('coref_key.txt', 'r') as answers:
        for fileid in FILENAMES:
            this_correct = 0
            this_total = 0
            # Earlier sentences are only kept within a single file.
            prev_sentences.clear()

            for tree in treebank.parsed_sents(fileid):
                tree = ParentedTree.convert(tree)
                for pronoun, np_node in find_pronouns(tree):
                    proposed = hobbs_to_string(
                        hobbs(np_node, pronoun.lower(), prev_sentences))
                    tree.pretty_print()
                    actual = answers.readline()
                    # Strip only the line terminator, so a final key line
                    # without a trailing newline keeps its last character.
                    if proposed == actual.rstrip('\n'):
                        update_pronoun_results(pronoun, 1)
                        correct += 1
                        this_correct += 1
                    # Called for every pronoun, hit or miss -- presumably
                    # this maintains the per-category totals; verify against
                    # update_pronoun_results.
                    update_pronoun_results(pronoun, 0)
                    total += 1
                    this_total += 1

                    print("Pronoun: '" + pronoun + "' Proposed: '" +
                          proposed + "' Actual: '" + actual + "'")
                    if total:
                        _report("Overall:\tCorrect:", correct, "\tTotal:",
                                total, "\tPercentage:",
                                correct / float(total), "\n")
                    print("*" * 100)
                    print("*" * 100)
                prev_sentences.append(tree)

            print("-" * 50)
            if this_correct:
                _report(fileid, ":\tCorrect:", this_correct, "\tTotal:",
                        this_total, "\tPercentage:",
                        this_correct / float(this_total), "\n")
            if total:
                _report("Overall:\tCorrect:", correct, "\tTotal:", total,
                        "\tPercentage:", correct / float(total), "\n")
            print("-" * 50)

    # Per-category summary: (label used in the report, PRONOUN_RESULTS key).
    categories = (
        ("Male", 'male'),
        ("Female", 'female'),
        ("Neutral", 'neutral'),
        ("Plural", 'they'),
        ("Reflexive", 'reflexive'),
    )
    for title, key in categories:
        _report(title + " correct:", PRONOUN_RESULTS[key],
                "\t" + title + " total:", PRONOUN_RESULTS[key + '_total'],
                "\tPercent correct:", PRONOUN_RESULTS[key + '_pct'])

    # With no pronoun evaluated there is nothing to divide by; report 0.0.
    overall_pct = correct / float(total) if total else 0.0
    _report("Total correct:", correct, "\tTotal:", total,
            "\tPercent correct:", overall_pct)