def rerank(args, unknownwords, printk, printprob, start, k, doreestimate):
    """Parse sentences with a coarse grammar and rerank the k-best trees
    using a fine grammar.

    For every sentence the coarse grammar produces a chart, the ``k`` best
    trees are extracted with ``lazykbest``, each one is scored with
    ``dopparseprob`` against the fine grammar, and the ``printk`` highest
    scoring trees are written to the output. Progress and timing
    information is printed to stderr.

    :param args: sequence of file names. ``args[0]``/``args[1]`` are passed
        to ``readbitpargrammar`` for the coarse grammar and
        ``args[2]``/``args[3]`` for the fine grammar. ``args[4]`` is the
        optional input file (stdin when absent); ``args[5]`` is the output
        file, used only when exactly six arguments are given (stdout
        otherwise). The input holds one word per line, with sentences
        separated by an empty line.
    :param unknownwords: forwarded to ``readbitpargrammar``; when true, the
        check that every word occurs in both lexicons is skipped.
    :param printk: number of reranked trees to write per sentence.
    :param printprob: when true, every tree is preceded by a
        ``parseprob=...`` line.
    :param start: label of the start symbol; must be known to the fine
        grammar.
    :param k: number of coarse derivations requested from ``lazykbest``.
    :param doreestimate: when true, ``reestimate(coarse, fine)`` is called
        before parsing.
    :raises AssertionError: if a fine label has no coarse counterpart, the
        start symbol is missing, or a word is unknown while
        ``unknownwords`` is false.
    """
    # Sentences with more words than this are skipped; the original author
    # flagged the value with '??'.
    maxlen = 999
    unparsed = 0
    coarse = readbitpargrammar(args[0], args[1], unknownwords, logprob=True)
    fine = readbitpargrammar(args[2], args[3], unknownwords,
                             freqs=False, logprob=True)
    # Every fine label must reduce to a coarse label once everything after
    # the last '@' is stripped.
    for label in fine.toid:
        assert label.rsplit("@", 1)[0] in coarse.toid, (
            "%s not in coarse grammar" % label)
    assert start in fine.toid, "Start symbol %r not in grammar." % start
    if doreestimate:
        reestimate(coarse, fine)
    infile = open(args[4]) if len(args) >= 5 else stdin
    out = open(args[5], "w") if len(args) == 6 else stdout
    times = [time.clock()]
    mapping = getgrammarmapping(coarse, fine)
    try:
        text = infile.read()
    finally:
        # Only close what this function opened; stdin is left alone.
        if infile is not stdin:
            infile.close()
    for n, block in enumerate(text.split("\n\n")):
        if not block.strip():
            continue
        sent = block.splitlines()
        if len(sent) > maxlen:
            continue
        for word in sent:
            assert unknownwords or (
                word in coarse.lexicon and word in fine.lexicon), (
                "unknown word and no open class tags supplied")
        print >> stderr, "parsing:", n, " ".join(sent)
        chart, _ = parse(sent, coarse, None)
        print >> stderr, ''
        if chart[0][len(sent)].get(coarse.toid[start]):
            trees = []
            candidates = lazykbest(chart, coarse.toid[start], 0, len(sent),
                                   k, coarse.tolabel)
            # The lexical chart is built once, from the best candidate, and
            # shared by all candidates of this sentence.
            lexchart = doplexprobs(Tree(candidates[0][0]), fine)
            for m, (tree, prob) in enumerate(candidates):
                trees.append(
                    (dopparseprob(Tree(tree), fine, mapping, lexchart), tree))
                print >> stderr, m, exp(-prob), exp(trees[-1][0])
            stdout.flush()
            results = nlargest(printk, trees)
            # print k-best parsetrees
            if printprob:
                out.writelines("parseprob=%.16g\n%s\n" % (exp(prob), tree)
                               for prob, tree in results)
            else:
                out.writelines("%s\n" % tree for _, tree in results)
        else:
            unparsed += 1
            print >> stderr, "No parse"
            out.write("No parse for \"%s\"\n" % " ".join(sent))
        out.write("\n")
        times.append(time.clock())
        print >> stderr, times[-1] - times[-2], "s"
        out.flush()
    print >> stderr, "raw cpu time", time.clock() - times[0]
    # One timestamp is appended per processed sentence, so the per-sentence
    # durations are the differences between *consecutive* timestamps.
    durations = [later - earlier
                 for earlier, later in zip(times, times[1:])]
    # Guard against input without any processed sentence.
    if durations:
        average = sum(durations) / len(durations)
    else:
        average = 0.0
    print >> stderr, "average time per sentence", average
    print >> stderr, "unparsed sentences:", unparsed
    print >> stderr, "finished"
    out.close()
def rerank(args, unknownwords, printk, printprob, start, k, doreestimate):
    """Coarse-to-fine reranking: parse with the coarse grammar, rescore the
    k-best trees with the fine grammar and write the best ones.

    Each sentence is parsed with the coarse grammar; ``lazykbest`` yields up
    to ``k`` candidate trees, ``dopparseprob`` scores each of them against
    the fine grammar, and the ``printk`` best scoring trees are written to
    the output, followed by an empty line. Diagnostics go to stderr.

    :param args: sequence of file names: ``args[0]``, ``args[1]`` are given
        to ``readbitpargrammar`` for the coarse grammar, ``args[2]``,
        ``args[3]`` for the fine grammar; ``args[4]`` (optional) is the
        input file, defaulting to stdin; ``args[5]`` is the output file,
        honoured only when there are exactly six arguments (otherwise
        stdout). Input consists of one word per line, sentences separated
        by an empty line.
    :param unknownwords: passed on to ``readbitpargrammar``; a true value
        also disables the lexicon membership check on input words.
    :param printk: how many reranked trees to write per sentence.
    :param printprob: if true, write a ``parseprob=...`` line before each
        tree.
    :param start: start symbol label, required to be in the fine grammar.
    :param k: number of candidates requested from ``lazykbest``.
    :param doreestimate: if true, run ``reestimate(coarse, fine)`` first.
    :raises AssertionError: on a fine label without coarse counterpart, a
        missing start symbol, or an unknown word when ``unknownwords`` is
        false.
    """
    # Longer sentences are skipped; the limit was marked '??' by the
    # original author.
    maxlen = 999
    unparsed = 0
    coarse = readbitpargrammar(args[0], args[1], unknownwords, logprob=True)
    fine = readbitpargrammar(args[2], args[3], unknownwords,
                             freqs=False, logprob=True)
    # The part of a fine label before its last '@' has to be a coarse label.
    for finelabel in fine.toid:
        assert finelabel.rsplit("@", 1)[0] in coarse.toid, (
            "%s not in coarse grammar" % finelabel)
    assert start in fine.toid, "Start symbol %r not in grammar." % start
    if doreestimate:
        reestimate(coarse, fine)
    infile = open(args[4]) if len(args) >= 5 else stdin
    out = open(args[5], "w") if len(args) == 6 else stdout
    times = [time.clock()]
    mapping = getgrammarmapping(coarse, fine)
    try:
        corpus = infile.read()
    finally:
        # Release the input file if it was opened here; never close stdin.
        if infile is not stdin:
            infile.close()
    for n, chunk in enumerate(corpus.split("\n\n")):
        if not chunk.strip():
            continue
        sent = chunk.splitlines()
        if len(sent) > maxlen:
            continue
        for word in sent:
            assert unknownwords or (
                word in coarse.lexicon and word in fine.lexicon), (
                "unknown word and no open class tags supplied")
        print >>stderr, "parsing:", n, " ".join(sent)
        chart, _ = parse(sent, coarse, None)
        print >>stderr, ''
        if chart[0][len(sent)].get(coarse.toid[start]):
            trees = []
            candidates = lazykbest(chart, coarse.toid[start], 0, len(sent),
                                   k, coarse.tolabel)
            # Built once from the first candidate and reused for all others.
            lexchart = doplexprobs(Tree(candidates[0][0]), fine)
            for m, (tree, prob) in enumerate(candidates):
                trees.append(
                    (dopparseprob(Tree(tree), fine, mapping, lexchart), tree))
                print >>stderr, m, exp(-prob), exp(trees[-1][0])
            stdout.flush()
            results = nlargest(printk, trees)
            # print k-best parsetrees
            if printprob:
                out.writelines("parseprob=%.16g\n%s\n" % (exp(prob), tree)
                               for prob, tree in results)
            else:
                out.writelines("%s\n" % tree for _, tree in results)
        else:
            unparsed += 1
            print >>stderr, "No parse"
            out.write("No parse for \"%s\"\n" % " ".join(sent))
        out.write("\n")
        times.append(time.clock())
        print >>stderr, times[-1] - times[-2], "s"
        out.flush()
    print >>stderr, "raw cpu time", time.clock() - times[0]
    # times holds the start time plus one entry per processed sentence;
    # pairing each entry with its predecessor covers every sentence.
    elapsed = [end - begin for begin, end in zip(times, times[1:])]
    # Nothing processed means nothing to average; avoid dividing by zero.
    if elapsed:
        average = sum(elapsed) / len(elapsed)
    else:
        average = 0.0
    print >>stderr, "average time per sentence", average
    print >>stderr, "unparsed sentences:", unparsed
    print >>stderr, "finished"
    out.close()
def simple(args, unknownwords, k, printprob, start):
    """Parse sentences with a single grammar and write the k-best trees.

    Every sentence is parsed with the grammar read by
    ``readbitpargrammar``; when the chart contains the start symbol over the
    whole sentence, the ``k`` best trees from ``lazykbest`` are written to
    the output. Otherwise a flat fallback tree ``(NP (w1 w1)(w2 w2)...)`` is
    written, in which every word is used as its own tag. Progress and timing
    are printed to stderr.

    :param args: sequence of file names. ``args[0]``/``args[1]`` are passed
        to ``readbitpargrammar``; ``args[2]`` is the optional input file
        (stdin when absent); ``args[3]`` is the output file, used only when
        exactly four arguments are given (stdout otherwise). The input holds
        one word per line, with sentences separated by an empty line.
    :param unknownwords: forwarded to ``readbitpargrammar``; when true,
        words missing from the lexicon are accepted.
    :param k: number of trees requested from ``lazykbest``.
    :param printprob: when true, every tree is preceded by a ``vitprob=...``
        line.
    :param start: label of the start symbol; must be known to the grammar.
    :raises AssertionError: if the start symbol is missing, a word is
        unknown while ``unknownwords`` is false, or ``lazykbest`` returns
        duplicate trees.
    """
    grammar = readbitpargrammar(args[0], args[1], unknownwords)
    assert start in grammar.toid, "Start symbol %r not in grammar." % start
    infile = open(args[2]) if len(args) >= 3 else stdin
    out = open(args[3], "w") if len(args) == 4 else stdout
    times = [time.clock()]
    try:
        text = infile.read()
    finally:
        # Only close what this function opened; stdin is left alone.
        if infile is not stdin:
            infile.close()
    for n, block in enumerate(text.split("\n\n")):
        if not block.strip():
            continue
        sent = block.splitlines()
        for word in sent:
            assert word in grammar.lexicon or unknownwords, (
                "unknown word %r and no open class tags supplied" % word)
        # Trailing comma: the elapsed time printed below ends this line.
        print >> stderr, "parsing:", n, " ".join(sent),
        stdout.flush()
        chart, _ = parse(sent, grammar, None)
        if chart[0][len(sent)].get(grammar.toid[start]):
            parsetrees = lazykbest(chart, grammar.toid[start], 0, len(sent),
                                   k, grammar.tolabel)
            # Sanity checks: no duplicate (tree, prob) pairs and no tree
            # occurring twice with different probabilities.
            assert len(parsetrees) == len(set(parsetrees))
            assert len(parsetrees) == len(
                set(tree for tree, _ in parsetrees))
            if printprob:
                out.writelines("vitprob=%.16g\n%s\n" % (exp(-prob), tree)
                               for tree, prob in parsetrees)
            else:
                out.writelines("%s\n" % tree for tree, _ in parsetrees)
        else:
            # No parse: emit a flat default tree rather than an error
            # message, so that the output keeps one tree per sentence.
            out.write("(NP %s)\n" % "".join(
                "(%s %s)" % (word, word) for word in sent))
        out.flush()
        times.append(time.clock())
        print >> stderr, times[-1] - times[-2], "s"
    print >> stderr, "raw cpu time", time.clock() - times[0]
    # One timestamp is appended per processed sentence, so the per-sentence
    # durations are the differences between *consecutive* timestamps.
    durations = [later - earlier
                 for earlier, later in zip(times, times[1:])]
    # Guard against input without any processed sentence.
    if durations:
        average = sum(durations) / len(durations)
    else:
        average = 0.0
    print >> stderr, "average time per sentence", average
    print >> stderr, "finished"
    out.close()
def simple(args, unknownwords, k, printprob, start):
    """Single-grammar parsing: write the k-best trees of every sentence.

    The grammar is loaded with ``readbitpargrammar``. For each sentence, if
    the chart has the start symbol spanning all words, the ``k`` best trees
    returned by ``lazykbest`` are written; if not, a flat fallback tree of
    the form ``(NP (w1 w1)(w2 w2)...)`` is written, each word doubling as
    its own tag. Diagnostics go to stderr.

    :param args: sequence of file names: ``args[0]``, ``args[1]`` are given
        to ``readbitpargrammar``; ``args[2]`` (optional) is the input file,
        defaulting to stdin; ``args[3]`` is the output file, honoured only
        when there are exactly four arguments (otherwise stdout). Input
        consists of one word per line, sentences separated by an empty line.
    :param unknownwords: passed on to ``readbitpargrammar``; a true value
        also lets words outside the lexicon through.
    :param k: number of trees requested from ``lazykbest``.
    :param printprob: if true, write a ``vitprob=...`` line before each
        tree.
    :param start: start symbol label, required to be in the grammar.
    :raises AssertionError: on a missing start symbol, an unknown word when
        ``unknownwords`` is false, or duplicate trees from ``lazykbest``.
    """
    grammar = readbitpargrammar(args[0], args[1], unknownwords)
    assert start in grammar.toid, "Start symbol %r not in grammar." % start
    infile = open(args[2]) if len(args) >= 3 else stdin
    out = open(args[3], "w") if len(args) == 4 else stdout
    times = [time.clock()]
    try:
        corpus = infile.read()
    finally:
        # Release the input file if it was opened here; never close stdin.
        if infile is not stdin:
            infile.close()
    for n, chunk in enumerate(corpus.split("\n\n")):
        if not chunk.strip():
            continue
        sent = chunk.splitlines()
        for word in sent:
            assert word in grammar.lexicon or unknownwords, (
                "unknown word %r and no open class tags supplied" % word)
        # The trailing comma keeps the cursor on this line; the timing
        # print further down completes it.
        print >>stderr, "parsing:", n, " ".join(sent),
        stdout.flush()
        chart, _ = parse(sent, grammar, None)
        if chart[0][len(sent)].get(grammar.toid[start]):
            parsetrees = lazykbest(chart, grammar.toid[start], 0, len(sent),
                                   k, grammar.tolabel)
            # Neither whole (tree, prob) entries nor bare trees may repeat.
            assert len(parsetrees) == len(set(parsetrees))
            assert len(parsetrees) == len(
                set(tree for tree, _ in parsetrees))
            if printprob:
                out.writelines("vitprob=%.16g\n%s\n" % (exp(-prob), tree)
                               for tree, prob in parsetrees)
            else:
                out.writelines("%s\n" % tree for tree, _ in parsetrees)
        else:
            # Fallback for unparsable input: a flat tree instead of a
            # "No parse" message, keeping one tree per sentence in the output.
            out.write("(NP %s)\n" % "".join(
                "(%s %s)" % (token, token) for token in sent))
        out.flush()
        times.append(time.clock())
        print >>stderr, times[-1] - times[-2], "s"
    print >>stderr, "raw cpu time", time.clock() - times[0]
    # times holds the start time plus one entry per processed sentence;
    # pairing each entry with its predecessor covers every sentence.
    elapsed = [end - begin for begin, end in zip(times, times[1:])]
    # Nothing processed means nothing to average; avoid dividing by zero.
    if elapsed:
        average = sum(elapsed) / len(elapsed)
    else:
        average = 0.0
    print >>stderr, "average time per sentence", average
    print >>stderr, "finished"
    out.close()