def pcfg_parser():
    """Build a toy PCFG, print it, then print the Viterbi parse of a sample.

    The three VP expansions are written on one line separated by ``|``.
    That is an alternative spelling of listing them as separate
    productions (``VP -> TV NP [0.4]``, ``VP -> IV [0.3]``,
    ``VP -> DatV NP NP [0.3]``); in either form the probabilities of all
    expansions of one left-hand side must sum to 1.
    """
    toy_grammar = nltk.parse_pcfg("""
    S -> NP VP [1.0]
    VP -> TV NP [0.4] | IV [0.3] | DatV NP NP [0.3]
    TV -> 'saw' [1.0]
    IV -> 'ate' [1.0]
    DatV -> 'gave' [1.0]
    NP -> 'telescopes' [0.8]
    NP -> 'Jack' [0.2]
    """)
    # Single parenthesised argument: same output under the Python 2 print
    # statement and the print function.
    print(toy_grammar)

    parser = nltk.ViterbiParser(toy_grammar)
    tokens = "Jack saw telescopes".split()
    print(parser.parse(tokens))
def pcfg_parser():
    """Print a small hand-written PCFG and the Viterbi parse of one sentence.

    The grammar uses the ``|`` shorthand for the VP rule; the equivalent
    long form is one production per line (``VP -> TV NP [0.4]``,
    ``VP -> IV [0.3]``, ``VP -> DatV NP NP [0.3]``).  Whichever form is
    used, the probabilities for a given left-hand side must sum to 1.
    """
    grammar_text = """
    S -> NP VP [1.0]
    VP -> TV NP [0.4] | IV [0.3] | DatV NP NP [0.3]
    TV -> 'saw' [1.0]
    IV -> 'ate' [1.0]
    DatV -> 'gave' [1.0]
    NP -> 'telescopes' [0.8]
    NP -> 'Jack' [0.2]
    """
    pcfg = nltk.parse_pcfg(grammar_text)
    # One parenthesised argument keeps the output identical whether print
    # is a statement (Python 2) or a function.
    print(pcfg)

    sentence_tokens = "Jack saw telescopes".split()
    best_parse = nltk.ViterbiParser(pcfg).parse(sentence_tokens)
    print(best_parse)
def Viterbi_fromfile(grammarfile):
    """Build an ``nltk.ViterbiParser`` from a PCFG stored in a text file.

    Args:
        grammarfile: Path of the file whose entire content is handed to
            ``nltk.parse_pcfg``.

    Returns:
        An ``nltk.ViterbiParser`` for the parsed grammar.

    Prints the file name and the number of productions as progress output.
    """
    # The two spaces before %s reproduce what the Python 2 statement
    # ``print 'Build a parser from ', grammarfile`` emitted; a single
    # pre-formatted argument behaves the same as statement or function.
    print('Build a parser from  %s' % (grammarfile,))

    # ``with`` guarantees the handle is closed even if read() raises.
    with open(grammarfile) as f:
        grammarstring = f.read()

    grammar = nltk.parse_pcfg(grammarstring)
    print('Grammar size:  %s' % (len(grammar.productions()),))
    return nltk.ViterbiParser(grammar)
def Viterbi_fromfile(grammarfile):
    """Read a PCFG from *grammarfile* and return a Viterbi parser for it.

    Args:
        grammarfile: Path of a text file; its full content is passed to
            ``nltk.parse_pcfg``.

    Returns:
        An ``nltk.ViterbiParser`` built on the parsed grammar.

    The file name and the production count are printed as progress output.
    """
    # Pre-formatted single argument: identical output to the original
    # Python 2 ``print 'Build a parser from ',grammarfile`` (which inserts
    # a second space), and valid when print is a function.
    print('Build a parser from  %s' % (grammarfile,))

    # Context manager closes the file on every path, including errors.
    with open(grammarfile) as f:
        grammarstring = f.read()

    grammar = nltk.parse_pcfg(grammarstring)
    print('Grammar size:  %s' % (len(grammar.productions()),))
    return nltk.ViterbiParser(grammar)
def main(sentences, grammarfile, pcfg_grammar, algo, output,
         to_keeps, percent_discard, beam=0):
    """Parse every sentence, then build pruned grammars from the parses.

    Args:
        sentences: Iterable of sentence strings; each is tokenised with
            ``str.split()``.
        grammarfile: Path loaded through ``nltk.data.load("file:...")``;
            used for the Earley chart parser and as the source of the
            productions that get shuffled and pruned.
        pcfg_grammar: Path of a text file holding the PCFG used by the
            probabilistic parsers.
        algo: Parser selector: ``"viterbi"``, ``"inside"``, ``"random"``,
            ``"longest"``, ``"unsorted"`` or ``"chart"``.
        output: Passed through unchanged to ``create_pruned_grammar``.
        to_keeps: Iterable of "keep" values; combined with every entry of
            ``percent_discard``.
        percent_discard: Iterable of "discard" values.
        beam: ``beam_size`` given to the ``pchart`` parsers (ignored for
            ``"viterbi"`` and ``"chart"``).

    Returns:
        1 if ``algo`` is not recognised or some sentence yields no parse;
        otherwise the sum of the values returned by
        ``create_pruned_grammar`` over all (keep, discard) combinations.
    """
    grammar = nltk.data.load("file:%s" % (grammarfile,))
    chart_parser = ChartParser(grammar, strategy=EARLEY_STRATEGY, trace=0)

    # ``with`` closes the file even when read() fails.
    with open(pcfg_grammar) as f:
        pcfgrammar = f.read()

    # pchart parser classes that all share the (grammar, beam_size=...)
    # constructor; looked up by name so only the selected one is touched.
    beam_parser_names = {
        "inside": "InsideChartParser",
        "random": "RandomChartParser",
        "longest": "LongestChartParser",
        "unsorted": "UnsortedChartParser",
    }

    if algo == "viterbi":
        parser = nltk.ViterbiParser(nltk.parse_pcfg(pcfgrammar))
    elif algo in beam_parser_names:
        parser_class = getattr(pchart, beam_parser_names[algo])
        parser = parser_class(nltk.parse_pcfg(pcfgrammar), beam_size=beam)
    elif algo == "chart":
        parser = chart_parser
    else:
        print("unrecognized algorithm: %s" % (algo,))
        return 1

    forest = []
    for sentence in sentences:
        parsed_sent = sentence.split()
        print("parsed_sent: %s" % (parsed_sent,))

        start = datetime.now()
        trees = parser.nbest_parse(parsed_sent)
        elapsed = datetime.now() - start

        print("parsing time elapsed: %s" % (elapsed,))
        # timedelta.microseconds is only the sub-second component; fold in
        # days and seconds so parses of one second or longer are reported
        # correctly.
        total_us = ((elapsed.days * 86400 + elapsed.seconds) * 10 ** 6
                    + elapsed.microseconds)
        print("parsing time elapsed: %d us" % (total_us,))

        if not trees:
            print("failed to parse: %s" % (sentence,))
            return 1
        forest.append(trees)

    # Work on a private copy so the grammar's own production list is not
    # reordered.
    all_productions = list(grammar.productions())
    # Shuffled twice, as before: the second pass adds nothing statistically
    # but is kept so results under a fixed random seed do not change.
    random.shuffle(all_productions)
    random.shuffle(all_productions)

    status = 0
    for keep in to_keeps:
        for discard in percent_discard:
            status += create_pruned_grammar(forest, all_productions, keep,
                                            discard, output)
    return status
def pcfg_chartparser(grammarfile):
    """Return an ``nltk.ViterbiParser`` for the PCFG text in *grammarfile*.

    Args:
        grammarfile: Path of a text file whose full content is passed to
            ``nltk.parse_pcfg``.

    Returns:
        An ``nltk.ViterbiParser`` built on the parsed grammar.
    """
    # Context manager closes the handle even if read() raises.
    with open(grammarfile) as f:
        grammar = f.read()
    return nltk.ViterbiParser(nltk.parse_pcfg(grammar))
def give(t):
    """Tree filter: VP whose first child contains 'give'/'gave'.

    Matches only when the node is labelled ``VP``, has more than two
    children, its second child is an ``NP`` and its third child is either
    ``PP-DTV`` or ``NP``.
    """
    if t.node != 'VP' or len(t) <= 2:
        return False
    if t[1].node != 'NP':
        return False
    if t[2].node not in ('PP-DTV', 'NP'):
        return False
    verb_leaves = t[0].leaves()
    return 'give' in verb_leaves or 'gave' in verb_leaves


def sent(t):
    """Join the leaves of *t* with spaces, dropping leaves that start with
    ``*``, ``-`` or ``0``."""
    kept = [token for token in t.leaves() if token[0] not in '*-0']
    return ' '.join(kept)


def print_node(t, width):
    """Print one line summarising the first three children of *t*.

    The line is cut to *width* characters and suffixed with ``...`` when
    it is longer than *width*.
    """
    head, first_arg, second_arg = t[0], t[1], t[2]
    fields = (sent(head),
              first_arg.node, sent(first_arg),
              second_arg.node, sent(second_arg))
    output = "%s %s: %s / %s: %s" % fields
    if len(output) > width:
        output = output[:width] + "..."
    print(output)


# List every matching VP found in the treebank sample.
for tree in nltk.corpus.treebank.parsed_sents():
    for t in tree.subtrees(give):
        print_node(t, 72)

# Small hand-written PCFG, one production per line.
grammar = nltk.parse_pcfg("""
    S -> NP VP [1.0]
    VP -> TV NP [0.4]
    VP -> IV [0.3]
    VP -> DatV NP NP [0.3]
    TV -> 'saw' [1.0]
    IV -> 'ate' [1.0]
    DatV -> 'gave' [1.0]
    NP -> 'telescopes' [0.8]
    NP -> 'Jack' [0.2]
    """)
print(grammar)

# NOTE(review): grammarDevelopmen is not defined in the visible part of
# the file -- confirm it exists elsewhere.
grammarDevelopmen()
# Build the PCFG rule lines, appending each production's probability.
pcfg = []

# parse_pcfg rejects a number of characters, and nonterminals made of
# non-alphanumeric characters, so the rule text is rewritten first.
# The substitutions are applied strictly in this order.
_rule_substitutions = [
    (",", "\",\""),
    ("``", "\"``\""),
    (".", "\".\""),
    ("=", "--"),
    (":", "\":\""),
    ("\'\'", "\"\'\'\""),
    ("#", "\"#\""),
    ("$", "SS"),
    ("-LRB-", "LRB-"),
    ("-NONE-", "NONE-"),
    ("-RRB-", "RRB-"),
    ("ADVP|PRT", "ADV-PRT"),
]

for p in probs:
    rule_text = str(p)
    for old, new in _rule_substitutions:
        rule_text = rule_text.replace(old, new)
    rule_text = rule_text.strip()

    # Rules whose rewritten text begins with a double quote are left out.
    if rule_text.startswith('\"'):
        continue
    probability = '{0:.10f}'.format(probs[str(p)])
    pcfg.append("".join([rule_text, " [", probability, "]"]))

# Build the grammar and its Viterbi parser.
grammar = nltk.parse_pcfg(pcfg)
viterbi_parser = nltk.ViterbiParser(grammar)

# Test sentence.
sent = 'I can finally drink a beer now'
print(sent)
sent = sent.split()

# Parse and print up to three best trees.
for tree in viterbi_parser.nbest_parse(sent, 3):
    print(tree)
## print "%-8s\t %-16s\t %s" %("WORD", "FWD_PROB", "SURPRISAL") ## print "-"*50 ## for i in range(len(pre_probs)): ## if i > 0: ## print "%-8s\t %-16s\t %s" %(pre_probs[i][0], str(pre_probs[i][1]),\ ## str(math.log(pre_probs[i-1][1]/pre_probs[i][1], 2))) ## else: ## print "%-8s\t %-16s\t %s" %(pre_probs[i][0], str(pre_probs[i][1]),\ ## str(math.log(1./pre_probs[i][1], 2))) if __name__ == "__main__": ftext = open('allsents.pcfg.txt').read() + open('allsents.lexicon.txt').read() PROB_RE = re.compile(r'( \[ [+\-]?(?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)? \] ) \s*', re.VERBOSE) nltk.grammar._PROBABILITY_RE = PROB_RE gram = nltk.parse_pcfg(ftext) pparser = PrefixParser(gram) sentences = '''The actor who was impressed by the critic humiliated the director. The actor who the critic was impressed by humiliated the director. The actor who impressed the critic humiliated the director. The actor who the critic impressed humiliated the director. The director humiliated the actor who impressed the critic. The director humiliated the actor who the critic impressed. The activist began the rebellion by organizing the strike. The actress was praised by the director filming the movie. The babysitter grounded the child and called the parents. The dictator was loved by the people and hated by the world. The crowd admired the vocalist of the band. The dog was attacked by the leopard from the zoo.