def compute_sentence_probability(grammar, tokens):
    """Score ``tokens`` under ``grammar``.

    A sentence the Viterbi parser accepts is scored by its average parse
    probability; a parseable prefix is scored by summing the probabilities
    of every Earley edge that spans the input and still expects a terminal.
    Returns 0 when the tokens cannot be charted at all.
    """
    invalid_prob = 1e-20  # kept for the disabled closest-match fallback below
    earley = nltk.EarleyChartParser(grammar, trace=0)
    viterbi = nltk.ViterbiParser(grammar)
    try:
        chart = earley.chart_parse(tokens)
    except ValueError:
        return 0
        # d, tokens = find_closest_tokens(language, tokens)
        # return invalid_prob ** d
    # Complete sentence: average the likelihood over all Viterbi parses.
    parses = viterbi.parse_all(tokens)
    if parses:
        return sum(tree.prob() for tree in parses) / len(parses)
    # Incomplete sentence: sum the probabilities of all chart edges that
    # end at the last token and whose next symbol is a terminal (a str).
    return sum(get_edge_prob(edge, chart, grammar)
               for edge in chart.edges()
               if edge.end() == len(tokens) and isinstance(edge.nextsym(), str))
def pcfg_parser(): # grammar = nltk.parse_pcfg(""" # S -> NP VP [1.0] # VP -> TV NP [0.4] # VP -> IV [0.3] # VP -> DatV NP NP [0.3] # TV -> 'saw' [1.0] # IV -> 'ate' [1.0] # DatV -> 'gave' [1.0] # NP -> 'telescopes' [0.8] # NP -> 'Jack' [0.2] # """) # alternative repr, or clause probs must sum to 1 grammar = nltk.parse_pcfg(""" S -> NP VP [1.0] VP -> TV NP [0.4] | IV [0.3] | DatV NP NP [0.3] TV -> 'saw' [1.0] IV -> 'ate' [1.0] DatV -> 'gave' [1.0] NP -> 'telescopes' [0.8] NP -> 'Jack' [0.2] """) print grammar viterbi_parser = nltk.ViterbiParser(grammar) print viterbi_parser.parse("Jack saw telescopes".split())
def Viterbi_fromfile(grammarfile): print 'Build a parser from ', grammarfile f = open(grammarfile) grammarstring = f.read() f.close() grammar = nltk.parse_pcfg(grammarstring) print 'Grammar size: ', len(grammar.productions()) return nltk.ViterbiParser(grammar)
def validate(text):
    """Viterbi-parse ``text`` character-by-character with the module grammar.

    Returns:
        (valid, location_name): ``valid`` is True when at least one parse
        exists; ``location_name`` is the joined leaves of the last parse
        tree (via ``tree_iterator``), or None when nothing parsed.
    """
    parser = nltk.ViterbiParser(nltk.PCFG.fromstring(grammar_str))
    valid, location_name = False, None
    for tree in parser.parse(list(text)):
        location_name = ''.join(tree_iterator(tree).leaves())
        valid = True
    return (valid, location_name)
def viterbi_parser(self, include_edgelabels=True):
    """Return a ViterbiParser over the corpus PCFG, building it lazily.

    The PCFG itself is loaded through the on-disk cache (``_cached``),
    and the constructed parser is memoised on the instance.
    """
    if self._pcfg_parser is None:
        self._pcfg = _cached(
            self._pcfg,
            TigerCorpusReader.STORAGE_ROOT + u"/" + TigerCorpusReader.PCFG_FILE_SUFFIX,
            lambda: self.pcfg(include_edgelabels))
        self._pcfg_parser = nltk.ViterbiParser(self._pcfg)
    return self._pcfg_parser
def _parser(self, tokens: List[str]):
    """Generates a Parse Tree from a list of tokens provided by the Lexer.

    Args:
        tokens: A tokenized list of commands and Entities, e.g.
            ['control_play', 'query_similar_entities', 'Justin Bieber']

    Returns:
        An nltk parse tree, as defined by the PCFG built below.
    """
    # TODO: Improve the CFG work for the following:
    #       - Play songs faster than despicito
    #       - Play something similar to despicito but faster
    #       - Play something similar to u2 and justin bieber

    def join_terminals(vals: List[str]):
        # TODO: Entries containing an apostrophe would break the quoted
        #       terminal syntax used by the NLTK grammar, so they are
        #       skipped for now; fix this eventually.
        return "' | '".join(v for v in vals if "\'" not in v) or "NONE"

    # A Probabilistic Context Free Grammar (PCFG) lets production weights
    # emulate operator precedence, sidestepping ambiguity in the grammar.
    grammar = nltk.PCFG.fromstring("""
        Root -> Terminal_Command Result [0.6]
        Root -> Terminal_Command [0.4]
        Result -> Entity [0.5]
        Result -> Unary_Command Result [0.1]
        Result -> Result Binary_Command Result [0.4]
        Entity -> '{}' [1.0]
        Unary_Command -> '{}' [1.0]
        Terminal_Command -> '{}' [1.0]
        Binary_Command -> '{}' [1.0]
    """.format(
        join_terminals(self.kb_named_entities),
        join_terminals(self.keywords.get("unary").keys()),
        join_terminals(self.keywords.get("terminal").keys()),
        join_terminals(self.keywords.get("binary").keys()),
    ))
    # TODO: Only the first (most probable) tree is returned; the case of
    #       an ambiguous grammar yielding several trees is not handled.
    return next(nltk.ViterbiParser(grammar).parse(tokens))
def sample_complete_sentence(grammar, tokens): # tokens should not exceed the longest possible sentence from the grammar # Set truncate=True when calling find_closest_tokens() to find a matched sentence complete_tokens = tokens[:] viterbi_parser = nltk.ViterbiParser(grammar) while not viterbi_parser.parse_all(complete_tokens): symbols, probs = predict_next_symbols(grammar, complete_tokens) try: complete_tokens.append(symbols[np.argmax(probs)]) except ValueError: # Cannot predict the next token (symbols and probs are empty lists), but the sentence is incomplete print tokens print complete_tokens return complete_tokens
def calc_logprior(grammar, tokens): invalid_prob=1e-20 viterbi_parser=nltk.ViterbiParser(grammar) try: v_parses=viterbi_parser.parse_all(tokens) if v_parses: prob=reduce(lambda a, b: a+b.prob(), v_parses, 0)/len(v_parses) else: print 'fail parse' return math.log(invalid_prob) except ValueError: return math.log(invalid_prob) return math.log(prob)
def get_processed_trees(self, grammar=None, trace=False):
    """Parse the question extract with a grammar file.

    When ``grammar`` is given it is resolved relative to the grammars root;
    otherwise the generic-question grammar for the question's category is
    used. Returns the iterator of parse trees.
    """
    import os
    base = os.path.dirname(__file__)
    if grammar:
        # Explicitly requested grammar takes priority.
        grammar_file = os.path.join(base, '../../../grammars', grammar)
    else:
        grammar_file = os.path.join(base,
                                    '../../../grammars/generic_question',
                                    self.question.category)
    with open(grammar_file, 'r') as fh:
        parser = nltk.ViterbiParser(nltk.PCFG.fromstring(fh.read()))
    if trace:
        parser.trace()
    return parser.parse(self.question.question_extract)
def main():
    """
    main function of the second and third part of PCL2 exercise 6
    call of the script via the command line. Example call:
    $ python aufgabe02.py -g grammar.txt -s sentences.txt -o out.tex
    required arguments:
    -g / --grammar: txt-file containing either a CFG or PCFG
    optional arguments:
    -s / --sents: txt-file containing sentences (one per line)
    -o / --out: tex-file where the trees should get written to
    Unless you set an output file, the parsed sentences are only displayed
    on the command line. Otherwise, the trees are written qtree-conform to
    the declared tex-file. Assuming you have LaTeX installed, you can create
    a pdf file with your trees by typing the following command in your
    command line:
    $ pdflatex outfile.tex
    If no sentences(-s) given, you can write your sentences directly on the
    command line. To finish the input-mode, press 'ctrl+D'.
    """
    # setting up the arguments with argparse
    argparser = argparse.ArgumentParser()
    argparser.add_argument('-o','--out', type=argparse.FileType('w'),\
        metavar='FILE', help='output file')
    argparser.add_argument('-g','--grammar', type=argparse.FileType('r'),\
        metavar='FILE', help='grammar file')
    argparser.add_argument('-s','--sents', type=argparse.FileType('r'), \
        default=sys.stdin, metavar='FILE', help='sentence file')
    args = argparser.parse_args()
    # try to form a string from the data of the grammar file
    # (args.grammar is None when -g was omitted, so "".join raises TypeError)
    try:
        grammar_string = "".join(args.grammar)
    # if no grammar is given, exit the script
    # (assuming that nobody wants to write the same grammar over and over again)
    except TypeError:
        print "try:\t$ python aufgabe02.py -g [grammar.txt] -s [sentence.txt] -o "\
            "[outfile.tex]\n\t(where '-s' and '-o' are optional arguments)"
        exit()
    # parsing grammar from string; a plain CFG gets a ChartParser
    try:
        grammar = nltk.CFG.fromstring(grammar_string)
        parser = nltk.ChartParser(grammar)
    except ValueError:
        # Part of Ex03
        # if the grammar contains probabilities, take the PCFG-method
        # (and use the ViterbiParser)
        grammar = nltk.PCFG.fromstring(grammar_string)
        parser = nltk.ViterbiParser(grammar)
    if DEBUG:
        print "parser used:", type(parser)
    # collecting input from the given file (or: stdin by default)
    sent_file = args.sents
    # assigning the output file to a new variable
    out_file = args.out
    # list containing all possible trees for every sentence
    all_trees_from_all_sentences = []
    # parse all sentences inside the sentence file
    for sent in sent_file:
        if DEBUG:
            print "\nparsing sentence: ", sent, type(sent)
        sent = sent.split()
        try:
            # parsing the sentence with the given parser
            # and appending it to the bigger list (all_trees_from_all_sentences)
            all_trees_from_all_sentences.append(
                build_tree_variations(parser, sent))
            if DEBUG:
                print "parsing of sentence: ", sent, "DONE"
        except ValueError as e:
            # sentence contains tokens unknown to the grammar, or similar
            print "\nERROR:", sent, "couldn't be parsed\nReason: %s" % e
    # printing the trees onto the command line with pretty print
    for tree_variations in all_trees_from_all_sentences:
        # extracting the data from the given tuple
        (sent, tree_variations) = tree_variations
        sent = " ".join(sent)
        print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
        print "sentence: '%s'" % sent
        v_enum = 1
        for tree in tree_variations:
            print "version %i" % v_enum
            # http://www.nltk.org/howto/tree.html
            tree.pretty_print(unicodelines=True, nodedist=4)
            v_enum += 1
        print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # compiling the .tex file
    if out_file:
        write_out_to_tex(all_trees_from_all_sentences, out_file, grammar)
    else:
        print "(no output file selected. write '-o outfile.tex' as an argument when " \
            "running the script if you want to generate a .tex file)"
S -> NP VP [0.9]| VP [0.1] VP -> TranV NP [0.3] VP -> InV [0.3] VP -> DatV NP PP [0.4] PP -> P NP [1.0] TranV -> "saw" [0.2] | "ate" [0.2] | "walked" [0.2] | "shot" [0.2] | "book" [0.2] InV -> "ate" [0.5] | "walked" [0.5] DatV -> "gave" [0.2] | "ate" [0.2] | "saw" [0.2] | "walked" [0.2] | "shot" [0.2] NP -> Prop [0.2]| Det N [0.4] | Det N PP [0.4] Prop -> "John" [0.25]| "Mary" [0.25] | "Bob" [0.25] | "I" [0.25] Det -> "a" [0.2] | "an" [0.2] | "the" [0.2] | "my" [0.2] | "that" [0.2] N -> "man" [0.15] | "dog" [0.15] | "cat" [0.15] | "park" [0.15] | "telescope" [0.1] | "flight" [0.1] | "elephant" [0.1] | "pajamas" [0.1] P -> "in" [0.2] | "on" [0.2] | "by" [0.2] | "with" [0.2] | "through" [0.2] """) viterbi_parser = nltk.ViterbiParser(prob_grammar) for tree in viterbi_parser.parse(['John', 'saw', 'a', 'telescope']): print(tree) ## Last week�s Exercise # Define sentences for the exercise (the last sentence is newly added here) sentex1 = "I want a flight through Houston".split() sentex2 = "Jack walked with the dog".split() sentex3 = "I want to book that flight".split() sentex4 = "John gave the dog a bone".split() # extend the flight grammar: flight_grammar = nltk.CFG.fromstring(""" S -> NP VP | VP VP -> V NP | V NP PP PP -> P NP
def pcfg_chartparser(grammarfile):
    """Build a ViterbiParser from a PCFG grammar file.

    Args:
        grammarfile: path to a text file containing a PCFG in NLTK notation.

    Returns:
        nltk.ViterbiParser over the loaded grammar.
    """
    # `with` guarantees the handle is closed even if fromstring() raises;
    # the original open()/close() pair leaked it on a parse error.
    with open(grammarfile) as f:
        grammar = nltk.PCFG.fromstring(f.read())
    return nltk.ViterbiParser(grammar)
grammar1 = nltk.CFG.fromstring(""" S -> NP VP VP -> V NP | V NP PP PP -> P NP V -> "saw" | "ate" | "walked" NP -> "John" | "Mary" | "Bob" | Det N | Det N PP Det -> "a" | "an" | "the" | "my" N -> "man" | "dog" | "cat" | "telescope" | "park" P -> "in" | "on" | "by" | "with" """) #Using different NLTK parsers with the same grammar parser1 = nltk.ChartParser(groucho_grammar) parser2 = nltk.RecursiveDescentParser(groucho_grammar) parser3 = nltk.ShiftReduceParser(groucho_grammar) parser4 = nltk.ViterbiParser(groucho_grammar) sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas'] '''print("Output according to Chart parser") for tree in parser1.parse(sent): print(tree) print("Output according to RD parser") for tree in parser2.parse(sent): print(tree) print("Output according to SR parser") for tree in parser3.parse(sent): print(tree)
treebank_productions[0:10] # add productions for each word, POS tag for word, tag in treebank.tagged_words(): t = nltk.Tree.fromstring("(" + tag + " " + word + ")") for production in t.productions(): treebank_productions.append(production) # build the PCFG based grammar treebank_grammar = nltk.grammar.induce_pcfg( Nonterminal('S'), treebank_productions ) # build the parser viterbi_parser = nltk.ViterbiParser(treebank_grammar) # get sample sentence tokens tokens = nltk.word_tokenize(sentence) # get parse tree for sample sentence result = list(viterbi_parser.parse(tokens)) # get tokens and their POS tags from pattern.en import tag as pos_tagger tagged_sent = pos_tagger(sentence) print tagged_sent # extend productions for sample sentence tokens
import nltk

# Load a probabilistic grammar from disk and Viterbi-parse one sentence.
grammar3 = nltk.data.load('file:pcfg2.cfg', 'pcfg')
parser1 = nltk.ViterbiParser(grammar3)

sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
for tree in parser1.parse(sent):
    print(tree)
def Viterbi_fromgrammar(grammar):
    """Wrap an already-constructed PCFG in a ViterbiParser."""
    return nltk.ViterbiParser(grammar)
P_assignment_grammar = nltk.PCFG.fromstring(""" S -> NP VP [1] VP -> V NP NP [0.25]| V TO VP ADVP [0.25]| V ADVP [0.50] ADVP -> ADV [0.25]| V ADV [0.25]| ADV ADVP [0.25]| ADV ADJ [0.25] NP -> Prop [0.43]| Prop N [0.1425]| NU N [0.1425]| Det ADJ N [0.1425]| N [0.1425] ADV -> "now" [.25]| "ago" [.25]| "not" [.25]| "always" [.25] ADJ -> "naive" [.5]| "nice" [.5] V -> "go" [.166]| "had" [.166]| "came" [.166]| "visit" [.166]| "may" [.166]| "are" [.166] Prop -> "We" [.2]| "She" [.2]| "me" [.2]| "You" [.2]| "Their" [.2] Det -> "a" [1] N -> "yesterday" [.25]| "days" [.25]| "kids" [.25]| "party" [.25] TO -> "to" [1] NU -> "two" [1] """) P_parser = nltk.ViterbiParser(P_assignment_grammar) P_sent1 = 'We had a nice party yesterday'.split() P_sent2 = 'She came to visit me two days ago'.split() P_sent3 = 'You may go now'.split() P_sent4 = 'Their kids are not always naive'.split() P_sent5 = 'Their kids had a nice party yesterday'.split() P_sent6 = 'You are not always nice'.split() P_sent7 = 'Their kids may go now'.split() P_sent8 = 'Their party had kids yesterday'.split() for treeP1 in P_parser.parse(P_sent1): print(treeP1) for treeP2 in P_parser.parse(P_sent2):
def parse(grammar, raw_sents, goldenStandard):
    """
    Parses the raw text with the provided grammar and compares chunking
    result to the golden standard. Counts false positives and false
    negatives of chunked noun phrases
    """
    parser = nltk.ViterbiParser(grammar)
    falsePositives = 0
    falseNegative = 0
    amountPosTest = 0   # NP constituents found in the parser output
    amountNegTest = 0   # NP constituents present in the gold standard
    posSucess = 0       # parser NPs confirmed by the gold standard
    negSucess = 0       # gold NPs found in the parser output
    for i in range(0, len(goldenStandard)):
        # skip long sentences — Viterbi parsing time grows quickly with length
        if len(raw_sents[i]) > 12:
            continue
        print("==== Parsing sentence " + str(i), flush=True)
        # This will raise an exception if the tokens in the test_sentence
        # are not covered by the grammar; should not happen.
        grammar.check_coverage(raw_sents[i])
        # Test prints for seeing each parsed sentenced
        '''
        print(raw_test_set[i])
        print("[" + str(i) + "] Reference parse:")
        print(test_set[i])
        print("[" + str(i) + "] Parse trees:")'''
        for tree in parser.parse(raw_sents[i]):
            #print(tree)
            # Pass 1: every NP-like node in the parser output (labels
            # containing 'NP' but excluding the proper-noun POS tags
            # NNP/NNPS) is checked against the gold standard.
            for parsedTree in tree.subtrees():
                if 'NP' in parsedTree.label() and parsedTree.label(
                ) != 'NNP' and parsedTree.label() != 'NNPS':
                    amountPosTest += 1
                    # snapshot the counter: if it did not grow after the
                    # inner search, no matching gold NP was found
                    checkSuccess = posSucess
                    for goldTree in goldenStandard[i].subtrees():
                        if 'NP' in goldTree.label() and goldTree.label(
                        ) != 'NNP' and goldTree.label() != 'NNPS':
                            # NPs are compared by their leaf (word) spans only
                            if parsedTree.leaves() == goldTree.leaves():
                                posSucess += 1
                                break
                    if checkSuccess == posSucess:
                        falsePositives += 1
                        print(
                            "FALSE POSITIVE, Noun phrase not in golden standard:",
                            parsedTree.leaves())
            # Pass 2: symmetric check — every NP-like node in the gold
            # standard is searched for in the parser output.
            for goldTree in goldenStandard[i].subtrees():
                if 'NP' in goldTree.label(
                ) and goldTree.label() != 'NNP' and goldTree.label() != 'NNPS':
                    amountNegTest += 1
                    checkSuccess = negSucess
                    for parsedTree in tree.subtrees():
                        if 'NP' in parsedTree.label() and parsedTree.label(
                        ) != 'NNP' and parsedTree.label() != 'NNPS':
                            if parsedTree.leaves() == goldTree.leaves():
                                negSucess += 1
                                break
                    if checkSuccess == negSucess:
                        falseNegative += 1
                        print(
                            "FALSE NEGATIVE, Noun phrase not in parsed tree:",
                            goldTree.leaves())
    print("false positives: ", falsePositives, "out of", amountPosTest,
          "tests")
    print("false negatives: ", falseNegative, "out of", amountNegTest,
          "tests")
    print("correctly parsed noun phrases:", posSucess, "out of",
          posSucess + falseNegative, "in gold standard")
def main(sentences, grammarfile, pcfg_grammar, algo, output, \ to_keeps, percent_discard, beam=0): grammar = nltk.data.load("file:%s" %(grammarfile)) chart_parser = ChartParser(grammar,strategy=EARLEY_STRATEGY,trace=0) f = open(pcfg_grammar) pcfgrammar = f.read() f.close() if algo == "viterbi": pcfg_parser = nltk.ViterbiParser(nltk.parse_pcfg(pcfgrammar)) elif algo == "inside": pcfg_parser = pchart.InsideChartParser(nltk.parse_pcfg(pcfgrammar),\ beam_size=beam) elif algo == "random": pcfg_parser = pchart.RandomChartParser(nltk.parse_pcfg(pcfgrammar),\ beam_size=beam) elif algo == "longest": pcfg_parser = pchart.LongestChartParser(nltk.parse_pcfg(pcfgrammar),\ beam_size=beam) elif algo == "unsorted": pcfg_parser = pchart.UnsortedChartParser(nltk.parse_pcfg(pcfgrammar),\ beam_size=beam) elif algo == "chart": pass else: print "unrecognized algorithm: %s" %(algo) return 1 forest = [] for sentence in sentences: parsed_sent = sentence.split() print "parsed_sent: %s" %(parsed_sent) start = datetime.now() if algo == "chart": trees = chart_parser.nbest_parse(parsed_sent) else: trees = pcfg_parser.nbest_parse(parsed_sent) end = datetime.now() elapsed = end - start print "parsing time elapsed: %s" %(elapsed) print "parsing time elapsed: %d us" %(elapsed.microseconds) if (len(trees) == 0): print "failed to parse: %s" %(sentence) return 1; forest.append(trees) all_productions = grammar.productions() # randomly shuffle the productions all_productions = all_productions[0:len(all_productions)] random.shuffle(all_productions) random.shuffle(all_productions) status = 0 for keep in to_keeps: for discard in percent_discard: status += create_pruned_grammar(forest, all_productions, keep,\ discard, output) return status
def pcfg_test(pcfg, trees, correct_trees, vocab):
    """Evaluate a PCFG by Viterbi-parsing test trees and scoring the output.

    Words absent from the grammar are replaced by "<UNK>" in the test trees,
    then each sentence is parsed and per-sentence Recall / Precision / F1
    against the reference leaves is printed.
    """
    s = str(pcfg.productions())
    missing = []
    edited_trees = []
    for t in trees:
        t_string = str(t)
        for word in t.leaves():
            # NOTE(review): substring membership in the productions dump can
            # match a word embedded in a longer token — confirm acceptable.
            if word not in s:
                print(word)
                missing.append(word)
                # BUGFIX: the word must be escaped before being used as a
                # regex pattern — metacharacters like '(' or '$' would
                # otherwise crash re.sub or replace the wrong text.
                t_string = re.sub(re.escape(word), "<UNK>", t_string)
        new = nltk.Tree.fromstring(t_string)
        edited_trees.append(new)
    print(
        str(len(missing)) +
        " words from testset missing from grammar vocabulary during testing. \n"
    )
    # Instantiate parser
    vp = nltk.ViterbiParser(pcfg)
    # A hypothesized constituent is correct when the reference contains a
    # constituent with the same start, end and nonterminal. Recall = correct
    # constituents / constituents in reference; Precision = correct
    # constituents / constituents in hypothesis; F1 = 2PR / (P + R).
    for i in range(len(edited_trees)):
        t = edited_trees[i]
        correct_terminals = correct_trees[i].leaves()
        print("Analyzing sentence: " + str(correct_terminals) + "\n")
        sent = t.leaves()
        print("Finding most probable parse for tokens: " + str(sent) + "\n")
        parses = vp.parse(sent)
        for p in parses:
            print("Predicted most likely parse tree: \n")
            print(p)
            print(p.leaves())
            print("Recall :" + str(Recall(correct_terminals, p.leaves())))
            print("Precision :" + str(Precision(correct_terminals, p.leaves())))
            print("F1 Score :" + str(F1Score(correct_terminals, p.leaves())))
            print("\n\n")
def pcfg_chartparser(grammarfile):
    """Load a PCFG from ``grammarfile`` and return a ViterbiParser for it."""
    # `with` guarantees the handle is closed even if read() raises; the
    # original open()/read()/close() sequence leaked it on error.
    with open(grammarfile) as f:
        grammar = f.read()
    # NOTE(review): nltk.parse_pcfg was removed in NLTK 3; newer code should
    # use nltk.PCFG.fromstring(grammar) instead.
    return nltk.ViterbiParser(nltk.parse_pcfg(grammar))
# Save the hand-built PCFG into a text file, one "rule [prob]" per line.
manual_pcfg = []
for key, prob in rules_prob.items():  # items() avoids a second dict lookup
    manual_pcfg.append(str(key) + ' [' + str(prob) + ']')
manual_pcfg = "\n".join(manual_pcfg)  # join the list directly — no copy needed

# `with` closes the file even if write() raises.
with open("pcfg_manual.txt", "w") as pcfg_file:
    pcfg_file.write(manual_pcfg)

# In order to validate our results we'll use the NLTK package to
# induce a PCFG from the collected rules and parse with it.
from nltk import induce_pcfg
S = Nonterminal('S')
grammar = induce_pcfg(S, gram_rules)

sent = "show me the meals on the flight from Phoenix".split()
inside_parser = nltk.InsideChartParser(grammar)
# enumerate() replaces the manual tree counter
for i, tree in enumerate(inside_parser.parse(sent), start=1):
    print('Tree number ' + str(i) + ":")
    print(tree)

print(
    "Using the Viterbi parser from NLTK to determine which tree is most likely"
)
viterbi_parser = nltk.ViterbiParser(grammar)
for tree in viterbi_parser.parse(sent):
    print(tree)
VP -> V NP [0.25]| V P VP [0.25]| Aux V [0.25]| V Adv Adv Adj [0.25] NP -> N [0.143]| Det Adj N [0.143]| Prop [0.571]| Prop N [0.143] PP -> Adj N Adv [0.666]| Adv [0.333] V -> "had" [0.2]| "came" [0.2]| "go" [0.2]| "visit" [0.2]| "are" [0.2] Prop -> "We" [0.2]| "She" [0.2]| "You" [0.2]| "Their" [0.2]| "me" [0.2] Det -> "a" [1] N -> "party" [0.25]| "kids" [0.25]| "yesterday" [0.25]| "days" [0.25] P -> "to" [1] Adj -> "nice" [0.333]| "naive" [0.333]| "two" [0.333] Adv -> "always" [0.25]| "ago" [0.25]| "now" [0.25]| "not" [0.25] Aux -> "may" [1] """) # In[3]: HW3_parser = nltk.ViterbiParser(HW3_grammar) # In[4]: # HW3_parser = nltk.RecursiveDescentParser(HW3_grammar) # In[5]: sen1 = "We had a nice party yesterday" tree1 = HW3_parser.parse(sen1.split()) for tree in list(tree1): print(tree) # In[6]: sen2 = "She came to visit me two days ago"