def rerank(args, unknownwords, printk, printprob, start, k, doreestimate):
    """Parse with the coarse grammar and rerank its k-best trees.

    For each sentence the ``k`` best coarse derivations are extracted with
    ``lazykbest``; every candidate is rescored with ``dopparseprob`` under
    the fine grammar, and the ``printk`` highest scoring trees are written.

    Input consists of sentences separated by blank lines, with one token
    per line. Progress and timing information goes to standard error.

    :param args: file names. ``args[0], args[1]`` are handed to
        ``readbitpargrammar`` for the coarse grammar and ``args[2], args[3]``
        for the fine grammar (presumably rules and lexicon files -- confirm
        against ``readbitpargrammar``). ``args[4]``, if present, is the input
        file (default: stdin); ``args[5]`` is the output file, used only when
        exactly six arguments are given (default: stdout).
    :param unknownwords: passed on to ``readbitpargrammar``; when false,
        every word must occur in both lexicons.
    :param printk: number of reranked trees to write per sentence.
    :param printprob: if true, precede each tree with a ``parseprob=`` line.
    :param start: label of the start symbol; must be in the fine grammar.
    :param k: number of coarse derivations to consider per sentence.
    :param doreestimate: if true, call ``reestimate(coarse, fine)`` first.
    :raises AssertionError: on a fine label without coarse counterpart, an
        unknown start symbol, or an unknown word without ``unknownwords``.
    """
    maxlen = 999  # sentences with more tokens than this are skipped
    unparsed = 0
    coarse = readbitpargrammar(args[0], args[1], unknownwords, logprob=True)
    fine = readbitpargrammar(args[2], args[3], unknownwords,
            freqs=False, logprob=True)
    for label in fine.toid:
        # the part of a fine label before its last "@" must be a coarse label
        assert label.rsplit("@", 1)[0] in coarse.toid, (
                "%s not in coarse grammar" % label)
    assert start in fine.toid, "Start symbol %r not in grammar." % start
    if doreestimate:
        reestimate(coarse, fine)

    # Read all input up front so the input handle can be released at once.
    infile = open(args[4]) if len(args) >= 5 else stdin
    try:
        text = infile.read()
    finally:
        if infile is not stdin:
            infile.close()

    out = open(args[5], "w") if len(args) == 6 else stdout
    try:
        times = [time.clock()]
        mapping = getgrammarmapping(coarse, fine)
        for n, block in enumerate(text.split("\n\n")):
            if not block.strip():
                continue
            sent = block.splitlines()
            if len(sent) > maxlen:
                continue
            for word in sent:
                assert unknownwords or (
                        word in coarse.lexicon and word in fine.lexicon), (
                        "unknown word and no open class tags supplied")
            print >> stderr, "parsing:", n, " ".join(sent)
            chart, _ = parse(sent, coarse, None)
            print >> stderr, ''
            if chart[0][len(sent)].get(coarse.toid[start]):
                trees = []
                candidates = lazykbest(chart, coarse.toid[start], 0,
                        len(sent), k, coarse.tolabel)
                # lexical chart is computed once, from the best candidate,
                # and shared by all candidates of this sentence
                lexchart = doplexprobs(Tree(candidates[0][0]), fine)
                for m, (tree, prob) in enumerate(candidates):
                    trees.append((dopparseprob(
                            Tree(tree), fine, mapping, lexchart), tree))
                    print >> stderr, m, exp(-prob), exp(trees[-1][0])
                    stdout.flush()
                # (score, tree) tuples: nlargest orders by score first
                results = nlargest(printk, trees)
                if printprob:
                    out.writelines(
                            "parseprob=%.16g\n%s\n" % (exp(prob), tree)
                            for prob, tree in results)
                else:
                    out.writelines("%s\n" % tree for _, tree in results)
            else:
                unparsed += 1
                print >> stderr, "No parse"
                out.write("No parse for \"%s\"\n" % " ".join(sent))
            out.write("\n")  # blank line terminates the sentence's output
            times.append(time.clock())
            print >> stderr, times[-1] - times[-2], "s"
            out.flush()

        print >> stderr, "raw cpu time", time.clock() - times[0]
        # One duration per processed sentence: differences of consecutive
        # time stamps. The first one includes building the grammar mapping.
        durations = [after - before
                for before, after in zip(times, times[1:])]
        if durations:  # nothing to average when no sentence was processed
            print >> stderr, "average time per sentence", (
                    sum(durations) / len(durations))
        print >> stderr, "unparsed sentences:", unparsed
        print >> stderr, "finished"
    finally:
        # never close the process-wide stdout, only a file opened here
        if out is not stdout:
            out.close()
def rerank(args, unknownwords, printk, printprob, start, k, doreestimate):
    """Parse with the coarse grammar and rerank its k-best trees.

    For each sentence the ``k`` best coarse derivations are extracted with
    ``lazykbest``; every candidate is rescored with ``dopparseprob`` under
    the fine grammar, and the ``printk`` highest scoring trees are written.

    Input consists of sentences separated by blank lines, with one token
    per line. Progress and timing information goes to standard error.

    :param args: file names. ``args[0], args[1]`` are handed to
        ``readbitpargrammar`` for the coarse grammar and ``args[2], args[3]``
        for the fine grammar (presumably rules and lexicon files -- confirm
        against ``readbitpargrammar``). ``args[4]``, if present, is the input
        file (default: stdin); ``args[5]`` is the output file, used only when
        exactly six arguments are given (default: stdout).
    :param unknownwords: passed on to ``readbitpargrammar``; when false,
        every word must occur in both lexicons.
    :param printk: number of reranked trees to write per sentence.
    :param printprob: if true, precede each tree with a ``parseprob=`` line.
    :param start: label of the start symbol; must be in the fine grammar.
    :param k: number of coarse derivations to consider per sentence.
    :param doreestimate: if true, call ``reestimate(coarse, fine)`` first.
    :raises AssertionError: on a fine label without coarse counterpart, an
        unknown start symbol, or an unknown word without ``unknownwords``.
    """
    maxlen = 999  # sentences with more tokens than this are skipped
    unparsed = 0
    coarse = readbitpargrammar(args[0], args[1], unknownwords, logprob=True)
    fine = readbitpargrammar(args[2], args[3], unknownwords,
            freqs=False, logprob=True)
    for label in fine.toid:
        # the part of a fine label before its last "@" must be a coarse label
        assert label.rsplit("@", 1)[0] in coarse.toid, (
                "%s not in coarse grammar" % label)
    assert start in fine.toid, "Start symbol %r not in grammar." % start
    if doreestimate:
        reestimate(coarse, fine)

    # Read all input up front so the input handle can be released at once.
    infile = open(args[4]) if len(args) >= 5 else stdin
    try:
        text = infile.read()
    finally:
        if infile is not stdin:
            infile.close()

    out = open(args[5], "w") if len(args) == 6 else stdout
    try:
        times = [time.clock()]
        mapping = getgrammarmapping(coarse, fine)
        for n, block in enumerate(text.split("\n\n")):
            if not block.strip():
                continue
            sent = block.splitlines()
            if len(sent) > maxlen:
                continue
            for word in sent:
                assert unknownwords or (
                        word in coarse.lexicon and word in fine.lexicon), (
                        "unknown word and no open class tags supplied")
            print >> stderr, "parsing:", n, " ".join(sent)
            chart, _ = parse(sent, coarse, None)
            print >> stderr, ''
            if chart[0][len(sent)].get(coarse.toid[start]):
                trees = []
                candidates = lazykbest(chart, coarse.toid[start], 0,
                        len(sent), k, coarse.tolabel)
                # lexical chart is computed once, from the best candidate,
                # and shared by all candidates of this sentence
                lexchart = doplexprobs(Tree(candidates[0][0]), fine)
                for m, (tree, prob) in enumerate(candidates):
                    trees.append((dopparseprob(
                            Tree(tree), fine, mapping, lexchart), tree))
                    print >> stderr, m, exp(-prob), exp(trees[-1][0])
                    stdout.flush()
                # (score, tree) tuples: nlargest orders by score first
                results = nlargest(printk, trees)
                if printprob:
                    out.writelines(
                            "parseprob=%.16g\n%s\n" % (exp(prob), tree)
                            for prob, tree in results)
                else:
                    out.writelines("%s\n" % tree for _, tree in results)
            else:
                unparsed += 1
                print >> stderr, "No parse"
                out.write("No parse for \"%s\"\n" % " ".join(sent))
            out.write("\n")  # blank line terminates the sentence's output
            times.append(time.clock())
            print >> stderr, times[-1] - times[-2], "s"
            out.flush()

        print >> stderr, "raw cpu time", time.clock() - times[0]
        # One duration per processed sentence: differences of consecutive
        # time stamps. The first one includes building the grammar mapping.
        durations = [after - before
                for before, after in zip(times, times[1:])]
        if durations:  # nothing to average when no sentence was processed
            print >> stderr, "average time per sentence", (
                    sum(durations) / len(durations))
        print >> stderr, "unparsed sentences:", unparsed
        print >> stderr, "finished"
    finally:
        # never close the process-wide stdout, only a file opened here
        if out is not stdout:
            out.close()
def simple(args, unknownwords, k, printprob, start):
    """Parse sentences with a single grammar and write the k-best trees.

    Input consists of sentences separated by blank lines, with one token
    per line. Progress and timing information goes to standard error.

    :param args: file names. ``args[0], args[1]`` are handed to
        ``readbitpargrammar`` (presumably rules and lexicon files -- confirm
        against ``readbitpargrammar``). ``args[2]``, if present, is the input
        file (default: stdin); ``args[3]`` is the output file, used only when
        exactly four arguments are given (default: stdout).
    :param unknownwords: passed on to ``readbitpargrammar``; when false,
        every word must occur in the lexicon.
    :param k: number of derivations to extract per sentence.
    :param printprob: if true, precede each tree with a ``vitprob=`` line.
    :param start: label of the start symbol; must be in the grammar.
    :raises AssertionError: on an unknown start symbol, an unknown word
        without ``unknownwords``, or duplicate trees in the k-best list.
    """
    grammar = readbitpargrammar(args[0], args[1], unknownwords)
    assert start in grammar.toid, "Start symbol %r not in grammar." % start

    # Read all input up front so the input handle can be released at once.
    infile = open(args[2]) if len(args) >= 3 else stdin
    try:
        text = infile.read()
    finally:
        if infile is not stdin:
            infile.close()

    out = open(args[3], "w") if len(args) == 4 else stdout
    try:
        times = [time.clock()]
        for n, block in enumerate(text.split("\n\n")):
            if not block.strip():
                continue
            sent = block.splitlines()
            for word in sent:
                assert word in grammar.lexicon or unknownwords, (
                        "unknown word %r and no open class tags supplied"
                        % word)
            # trailing comma: the elapsed time is appended to this line
            print >> stderr, "parsing:", n, " ".join(sent),
            stdout.flush()
            chart, _ = parse(sent, grammar, None)
            if chart[0][len(sent)].get(grammar.toid[start]):
                parsetrees = lazykbest(chart, grammar.toid[start], 0,
                        len(sent), k, grammar.tolabel)
                # sanity checks: no duplicate items and no duplicate trees
                assert len(parsetrees) == len(set(parsetrees))
                assert len(parsetrees) == len(
                        set(tree for tree, _ in parsetrees))
                if printprob:
                    out.writelines(
                            "vitprob=%.16g\n%s\n" % (exp(-prob), tree)
                            for tree, prob in parsetrees)
                else:
                    out.writelines("%s\n" % tree for tree, _ in parsetrees)
            else:
                # Fallback instead of a "No parse" message: a flat tree
                # labelled NP in which every word serves as its own tag.
                out.write("(NP %s)\n" % "".join(
                        "(%s %s)" % (word, word) for word in sent))
            out.flush()
            times.append(time.clock())
            print >> stderr, times[-1] - times[-2], "s"

        print >> stderr, "raw cpu time", time.clock() - times[0]
        # One duration per processed sentence: differences of consecutive
        # time stamps.
        durations = [after - before
                for before, after in zip(times, times[1:])]
        if durations:  # nothing to average when no sentence was processed
            print >> stderr, "average time per sentence", (
                    sum(durations) / len(durations))
        print >> stderr, "finished"
    finally:
        # never close the process-wide stdout, only a file opened here
        if out is not stdout:
            out.close()
def simple(args, unknownwords, k, printprob, start):
    """Parse sentences with a single grammar and write the k-best trees.

    Input consists of sentences separated by blank lines, with one token
    per line. Progress and timing information goes to standard error.

    :param args: file names. ``args[0], args[1]`` are handed to
        ``readbitpargrammar`` (presumably rules and lexicon files -- confirm
        against ``readbitpargrammar``). ``args[2]``, if present, is the input
        file (default: stdin); ``args[3]`` is the output file, used only when
        exactly four arguments are given (default: stdout).
    :param unknownwords: passed on to ``readbitpargrammar``; when false,
        every word must occur in the lexicon.
    :param k: number of derivations to extract per sentence.
    :param printprob: if true, precede each tree with a ``vitprob=`` line.
    :param start: label of the start symbol; must be in the grammar.
    :raises AssertionError: on an unknown start symbol, an unknown word
        without ``unknownwords``, or duplicate trees in the k-best list.
    """
    grammar = readbitpargrammar(args[0], args[1], unknownwords)
    assert start in grammar.toid, "Start symbol %r not in grammar." % start

    # Read all input up front so the input handle can be released at once.
    infile = open(args[2]) if len(args) >= 3 else stdin
    try:
        text = infile.read()
    finally:
        if infile is not stdin:
            infile.close()

    out = open(args[3], "w") if len(args) == 4 else stdout
    try:
        times = [time.clock()]
        for n, block in enumerate(text.split("\n\n")):
            if not block.strip():
                continue
            sent = block.splitlines()
            for word in sent:
                assert word in grammar.lexicon or unknownwords, (
                        "unknown word %r and no open class tags supplied"
                        % word)
            # trailing comma: the elapsed time is appended to this line
            print >> stderr, "parsing:", n, " ".join(sent),
            stdout.flush()
            chart, _ = parse(sent, grammar, None)
            if chart[0][len(sent)].get(grammar.toid[start]):
                parsetrees = lazykbest(chart, grammar.toid[start], 0,
                        len(sent), k, grammar.tolabel)
                # sanity checks: no duplicate items and no duplicate trees
                assert len(parsetrees) == len(set(parsetrees))
                assert len(parsetrees) == len(
                        set(tree for tree, _ in parsetrees))
                if printprob:
                    out.writelines(
                            "vitprob=%.16g\n%s\n" % (exp(-prob), tree)
                            for tree, prob in parsetrees)
                else:
                    out.writelines("%s\n" % tree for tree, _ in parsetrees)
            else:
                # Fallback instead of a "No parse" message: a flat tree
                # labelled NP in which every word serves as its own tag.
                out.write("(NP %s)\n" % "".join(
                        "(%s %s)" % (word, word) for word in sent))
            out.flush()
            times.append(time.clock())
            print >> stderr, times[-1] - times[-2], "s"

        print >> stderr, "raw cpu time", time.clock() - times[0]
        # One duration per processed sentence: differences of consecutive
        # time stamps.
        durations = [after - before
                for before, after in zip(times, times[1:])]
        if durations:  # nothing to average when no sentence was processed
            print >> stderr, "average time per sentence", (
                    sum(durations) / len(durations))
        print >> stderr, "finished"
    finally:
        # never close the process-wide stdout, only a file opened here
        if out is not stdout:
            out.close()
def ctf(args, unknownwords, k, printprob, start, threshold, posterior,
        doreestimate, mpd):
    """Coarse-to-fine parsing: prune the fine grammar's chart with a coarse one.

    Each sentence is first analysed with the coarse grammar. If the start
    symbol spans the sentence, a whitelist for the fine grammar is derived
    (from inside-outside scores when ``posterior`` is true, otherwise from
    the coarse chart via ``whitelistfromkbest``), the sentence is parsed
    with ``parse_sparse`` under the fine grammar, and the result of
    ``marginalize`` is used to write the ``k`` highest scoring trees.

    Input consists of sentences separated by blank lines, with one token
    per line. Progress and timing information goes to standard error.

    :param args: file names. ``args[0], args[1]`` are handed to
        ``readbitpargrammar`` for the coarse grammar and ``args[2], args[3]``
        for the fine grammar (presumably rules and lexicon files -- confirm
        against ``readbitpargrammar``). ``args[4]``, if present, is the input
        file (default: stdin); ``args[5]`` is the output file, used only when
        exactly six arguments are given (default: stdout).
    :param unknownwords: passed on to ``readbitpargrammar``; when false,
        every word must occur in both lexicons.
    :param k: number of trees to write per sentence.
    :param printprob: if true, precede each tree with a probability line.
    :param start: label of the start symbol; must be in the fine grammar.
    :param threshold: pruning threshold handed to the whitelist function;
        with ``posterior`` it is passed through ``exp`` first.
    :param posterior: select pruning on inside-outside scores instead of
        pruning based on the coarse chart.
    :param doreestimate: if true, call ``reestimate(coarse, fine)`` first.
    :param mpd: passed on to ``marginalize``; also selects the label
        ``derivprob`` instead of ``parseprob`` in the output.
    :raises AssertionError: on a fine label without coarse counterpart, an
        unknown start symbol, an unknown word without ``unknownwords``, or
        a sentence the fine grammar cannot parse after pruning.
    """
    m = 10000  # number of derivations from fine grammar to marginalize
    if posterior:
        threshold = exp(threshold)
    maxlen = 999  # sentences with more tokens than this are skipped
    unparsed = 0
    coarse = readbitpargrammar(args[0], args[1], unknownwords,
            logprob=not posterior)
    fine = readbitpargrammar(args[2], args[3], unknownwords, freqs=False)
    for label in fine.toid:
        # the part of a fine label before its last "@" must be a coarse label
        assert label.rsplit("@", 1)[0] in coarse.toid, (
                "%s not in coarse grammar" % label)
    assert start in fine.toid, "Start symbol %r not in grammar." % start
    if doreestimate:
        reestimate(coarse, fine)

    # Read all input up front so the input handle can be released at once.
    infile = open(args[4]) if len(args) >= 5 else stdin
    try:
        text = infile.read()
    finally:
        if infile is not stdin:
            infile.close()

    out = open(args[5], "w") if len(args) == 6 else stdout
    try:
        times = [time.clock()]
        # Charts are allocated once for the maximum length and reused.
        if posterior:
            inside = np.zeros((maxlen, maxlen + 1, len(coarse.toid)),
                    dtype='d')
            outside = np.zeros_like(inside)
        else:
            coarsechart = np.empty((maxlen, maxlen + 1, len(coarse.toid)),
                    dtype='d')
            coarsechart.fill(np.inf)
            # No chart is preallocated for the fine grammar: the whitelist
            # functions below return a fresh one for every sentence.
        mapping = nonterminalmapping(coarse, fine)
        for n, block in enumerate(text.split("\n\n")):
            if not block.strip():
                continue
            sent = block.splitlines()
            if len(sent) > maxlen:
                continue
            for word in sent:
                assert unknownwords or (
                        word in coarse.lexicon and word in fine.lexicon), (
                        "unknown word and no open class tags supplied")
            print >> stderr, "parsing:", n, " ".join(sent)
            if posterior:
                inside, outside = doinsideoutside(
                        sent, coarse, inside, outside)
                goalitem = inside[0, len(sent), coarse.toid[start]]
            else:
                # reset only the part of the chart this sentence will use
                coarsechart[:len(sent), :len(sent) + 1, :] = np.inf
                chart, coarsechart = parse(sent, coarse, coarsechart)
                goalitem = chart[0][len(sent)].get(coarse.toid[start])
            if goalitem:
                # trailing comma: stay on the same line of the log
                print >> stderr, "pruning ...",
                stdout.flush()
                if posterior:
                    finechart = whitelistfromposteriors2(inside, outside,
                            coarse.toid[start], len(sent), coarse, fine,
                            mapping, threshold)
                else:
                    finechart = whitelistfromkbest(chart,
                            coarse.toid[start], len(sent), coarse, fine,
                            threshold, mapping)
                chart = parse_sparse(sent, fine, finechart)
                assert chart[0][len(sent)][fine.toid[start]], (
                        "sentence covered by coarse grammar could not be "
                        "parsed by fine grammar")
                parsetrees = marginalize(chart, fine.toid[start],
                        fine.tolabel, sent, n=m, mpd=mpd)
                results = nlargest(k, parsetrees, key=parsetrees.get)
                if printprob:
                    label = "derivprob" if mpd else "parseprob"
                    out.writelines(
                            "%s=%.16g\n%s\n" % (label, parsetrees[tree], tree)
                            for tree in results)
                else:
                    out.writelines("%s\n" % tree for tree in results)
            else:
                unparsed += 1
                print >> stderr, "No parse"
                out.write("No parse for \"%s\"\n" % " ".join(sent))
            out.write("\n")  # blank line terminates the sentence's output
            times.append(time.clock())
            print >> stderr, times[-1] - times[-2], "s"
            out.flush()

        print >> stderr, "raw cpu time", time.clock() - times[0]
        # One duration per processed sentence: differences of consecutive
        # time stamps. The first one includes the chart allocation.
        durations = [after - before
                for before, after in zip(times, times[1:])]
        if durations:  # nothing to average when no sentence was processed
            print >> stderr, "average time per sentence", (
                    sum(durations) / len(durations))
        print >> stderr, "unparsed sentences:", unparsed
        print >> stderr, "finished"
    finally:
        # never close the process-wide stdout, only a file opened here
        if out is not stdout:
            out.close()
def ctf(args, unknownwords, k, printprob, start, threshold, posterior,
        doreestimate, mpd):
    """Coarse-to-fine parsing: prune the fine grammar's chart with a coarse one.

    Each sentence is first analysed with the coarse grammar. If the start
    symbol spans the sentence, a whitelist for the fine grammar is derived
    (from inside-outside scores when ``posterior`` is true, otherwise from
    the coarse chart via ``whitelistfromkbest``), the sentence is parsed
    with ``parse_sparse`` under the fine grammar, and the result of
    ``marginalize`` is used to write the ``k`` highest scoring trees.

    Input consists of sentences separated by blank lines, with one token
    per line. Progress and timing information goes to standard error.

    :param args: file names. ``args[0], args[1]`` are handed to
        ``readbitpargrammar`` for the coarse grammar and ``args[2], args[3]``
        for the fine grammar (presumably rules and lexicon files -- confirm
        against ``readbitpargrammar``). ``args[4]``, if present, is the input
        file (default: stdin); ``args[5]`` is the output file, used only when
        exactly six arguments are given (default: stdout).
    :param unknownwords: passed on to ``readbitpargrammar``; when false,
        every word must occur in both lexicons.
    :param k: number of trees to write per sentence.
    :param printprob: if true, precede each tree with a probability line.
    :param start: label of the start symbol; must be in the fine grammar.
    :param threshold: pruning threshold handed to the whitelist function;
        with ``posterior`` it is passed through ``exp`` first.
    :param posterior: select pruning on inside-outside scores instead of
        pruning based on the coarse chart.
    :param doreestimate: if true, call ``reestimate(coarse, fine)`` first.
    :param mpd: passed on to ``marginalize``; also selects the label
        ``derivprob`` instead of ``parseprob`` in the output.
    :raises AssertionError: on a fine label without coarse counterpart, an
        unknown start symbol, an unknown word without ``unknownwords``, or
        a sentence the fine grammar cannot parse after pruning.
    """
    m = 10000  # number of derivations from fine grammar to marginalize
    if posterior:
        threshold = exp(threshold)
    maxlen = 999  # sentences with more tokens than this are skipped
    unparsed = 0
    coarse = readbitpargrammar(args[0], args[1], unknownwords,
            logprob=not posterior)
    fine = readbitpargrammar(args[2], args[3], unknownwords, freqs=False)
    for label in fine.toid:
        # the part of a fine label before its last "@" must be a coarse label
        assert label.rsplit("@", 1)[0] in coarse.toid, (
                "%s not in coarse grammar" % label)
    assert start in fine.toid, "Start symbol %r not in grammar." % start
    if doreestimate:
        reestimate(coarse, fine)

    # Read all input up front so the input handle can be released at once.
    infile = open(args[4]) if len(args) >= 5 else stdin
    try:
        text = infile.read()
    finally:
        if infile is not stdin:
            infile.close()

    out = open(args[5], "w") if len(args) == 6 else stdout
    try:
        times = [time.clock()]
        # Charts are allocated once for the maximum length and reused.
        if posterior:
            inside = np.zeros((maxlen, maxlen + 1, len(coarse.toid)),
                    dtype='d')
            outside = np.zeros_like(inside)
        else:
            coarsechart = np.empty((maxlen, maxlen + 1, len(coarse.toid)),
                    dtype='d')
            coarsechart.fill(np.inf)
            # No chart is preallocated for the fine grammar: the whitelist
            # functions below return a fresh one for every sentence.
        mapping = nonterminalmapping(coarse, fine)
        for n, block in enumerate(text.split("\n\n")):
            if not block.strip():
                continue
            sent = block.splitlines()
            if len(sent) > maxlen:
                continue
            for word in sent:
                assert unknownwords or (
                        word in coarse.lexicon and word in fine.lexicon), (
                        "unknown word and no open class tags supplied")
            print >> stderr, "parsing:", n, " ".join(sent)
            if posterior:
                inside, outside = doinsideoutside(
                        sent, coarse, inside, outside)
                goalitem = inside[0, len(sent), coarse.toid[start]]
            else:
                # reset only the part of the chart this sentence will use
                coarsechart[:len(sent), :len(sent) + 1, :] = np.inf
                chart, coarsechart = parse(sent, coarse, coarsechart)
                goalitem = chart[0][len(sent)].get(coarse.toid[start])
            if goalitem:
                # trailing comma: stay on the same line of the log
                print >> stderr, "pruning ...",
                stdout.flush()
                if posterior:
                    finechart = whitelistfromposteriors2(inside, outside,
                            coarse.toid[start], len(sent), coarse, fine,
                            mapping, threshold)
                else:
                    finechart = whitelistfromkbest(chart,
                            coarse.toid[start], len(sent), coarse, fine,
                            threshold, mapping)
                chart = parse_sparse(sent, fine, finechart)
                assert chart[0][len(sent)][fine.toid[start]], (
                        "sentence covered by coarse grammar could not be "
                        "parsed by fine grammar")
                parsetrees = marginalize(chart, fine.toid[start],
                        fine.tolabel, sent, n=m, mpd=mpd)
                results = nlargest(k, parsetrees, key=parsetrees.get)
                if printprob:
                    label = "derivprob" if mpd else "parseprob"
                    out.writelines(
                            "%s=%.16g\n%s\n" % (label, parsetrees[tree], tree)
                            for tree in results)
                else:
                    out.writelines("%s\n" % tree for tree in results)
            else:
                unparsed += 1
                print >> stderr, "No parse"
                out.write("No parse for \"%s\"\n" % " ".join(sent))
            out.write("\n")  # blank line terminates the sentence's output
            times.append(time.clock())
            print >> stderr, times[-1] - times[-2], "s"
            out.flush()

        print >> stderr, "raw cpu time", time.clock() - times[0]
        # One duration per processed sentence: differences of consecutive
        # time stamps. The first one includes the chart allocation.
        durations = [after - before
                for before, after in zip(times, times[1:])]
        if durations:  # nothing to average when no sentence was processed
            print >> stderr, "average time per sentence", (
                    sum(durations) / len(durations))
        print >> stderr, "unparsed sentences:", unparsed
        print >> stderr, "finished"
    finally:
        # never close the process-wide stdout, only a file opened here
        if out is not stdout:
            out.close()