示例#1
0
def rerank(args, unknownwords, printk, printprob, start, k, doreestimate):
    """Rerank the coarse grammar's k-best trees with a fine grammar.

    Every sentence is parsed with the coarse grammar; up to `k` candidate
    trees are taken from the chart with `lazykbest`, each candidate is
    rescored with `dopparseprob` against the fine grammar, and the `printk`
    highest-scoring trees are written to the output.

    Parameters:
        args: sequence of file names. args[0] and args[1] are handed to
            `readbitpargrammar` for the coarse grammar, args[2] and args[3]
            for the fine grammar (presumably rules and lexicon files --
            confirm against `readbitpargrammar`). args[4], when present, is
            the input file (default: stdin). args[5] is the output file and
            is only used when exactly six arguments are given (default:
            stdout).
        unknownwords: forwarded to `readbitpargrammar`; when it is false,
            every input word must occur in both lexicons.
        printk: number of reranked trees written per sentence.
        printprob: when true, each tree is preceded by a "parseprob=" line.
        start: label of the start symbol; must occur in the fine grammar.
        k: number of coarse candidates requested from `lazykbest`.
        doreestimate: when true, `reestimate(coarse, fine)` is called before
            parsing.

    Input format: sentences are separated by an empty line, with one token
    per line. Progress and timing information is printed to stderr.

    Raises:
        AssertionError: if a fine label has no coarse counterpart, the start
            symbol is missing from the fine grammar, or a word is unknown
            while `unknownwords` is false.
    """
    maxlen = 999  # sentences with more tokens than this are skipped
    unparsed = 0
    coarse = readbitpargrammar(args[0], args[1], unknownwords, logprob=True)
    fine = readbitpargrammar(args[2], args[3], unknownwords,
                             freqs=False, logprob=True)
    for a in fine.toid:
        # the part of a fine label before its last "@" must be a coarse label
        assert a.rsplit("@", 1)[0] in coarse.toid, (
            "%s not in coarse grammar" % a)
    assert start in fine.toid, "Start symbol %r not in grammar." % start
    if doreestimate:
        reestimate(coarse, fine)
    infile = open(args[4]) if len(args) >= 5 else stdin
    out = None
    try:
        out = open(args[5], "w") if len(args) == 6 else stdout
        times = [time.clock()]
        mapping = getgrammarmapping(coarse, fine)
        for n, a in enumerate(infile.read().split("\n\n")):
            if not a.strip():
                continue
            sent = a.splitlines()
            if len(sent) > maxlen:
                continue
            for word in sent:
                assert unknownwords or (
                    word in coarse.lexicon and word in fine.lexicon), (
                        "unknown word and no open class tags supplied")
            print >> stderr, "parsing:", n, " ".join(sent)
            chart, _ = parse(sent, coarse, None)
            print >> stderr, ''
            if chart[0][len(sent)].get(coarse.toid[start]):
                trees = []
                candidates = lazykbest(chart, coarse.toid[start], 0,
                                       len(sent), k, coarse.tolabel)
                # computed once from the first candidate and reused for all
                lexchart = doplexprobs(Tree(candidates[0][0]), fine)
                for m, (tree, prob) in enumerate(candidates):
                    trees.append((dopparseprob(Tree(tree), fine, mapping,
                                               lexchart), tree))
                    print >> stderr, m, exp(-prob), exp(trees[-1][0])
                    stdout.flush()
                # (score, tree) pairs: nlargest orders by the fine score
                results = nlargest(printk, trees)
                if printprob:
                    out.writelines("parseprob=%.16g\n%s\n" % (exp(prob), tree)
                                   for prob, tree in results)
                else:
                    out.writelines("%s\n" % tree for _, tree in results)
            else:
                unparsed += 1
                print >> stderr, "No parse"
                out.write("No parse for \"%s\"\n" % " ".join(sent))
            out.write("\n")
            times.append(time.clock())
            print >> stderr, times[-1] - times[-2], "s"
            out.flush()
        print >> stderr, "raw cpu time", time.clock() - times[0]
        # One timestamp is appended per processed sentence, so consecutive
        # differences give one duration per sentence.
        durations = [b - a for a, b in zip(times, times[1:])]
        # Empty input yields no durations; avoid dividing by zero.
        average = sum(durations) / len(durations) if durations else 0.0
        print >> stderr, "average time per sentence", average
        print >> stderr, "unparsed sentences:", unparsed
        print >> stderr, "finished"
    finally:
        # Only close what this function opened; stdin/stdout belong to the
        # caller and must stay usable afterwards.
        if infile is not stdin:
            infile.close()
        if out is not None and out is not stdout:
            out.close()
示例#2
0
def rerank(args, unknownwords, printk, printprob, start, k, doreestimate):
	"""Rerank the coarse grammar's k-best trees with a fine grammar.

	Every sentence is parsed with the coarse grammar; up to `k` candidate
	trees are taken from the chart with `lazykbest`, each candidate is
	rescored with `dopparseprob` against the fine grammar, and the `printk`
	highest-scoring trees are written to the output.

	Parameters:
		args: sequence of file names. args[0] and args[1] are handed to
			`readbitpargrammar` for the coarse grammar, args[2] and args[3]
			for the fine grammar (presumably rules and lexicon files --
			confirm against `readbitpargrammar`). args[4], when present, is
			the input file (default: stdin). args[5] is the output file and
			is only used when exactly six arguments are given (default:
			stdout).
		unknownwords: forwarded to `readbitpargrammar`; when it is false,
			every input word must occur in both lexicons.
		printk: number of reranked trees written per sentence.
		printprob: when true, each tree is preceded by a "parseprob=" line.
		start: label of the start symbol; must occur in the fine grammar.
		k: number of coarse candidates requested from `lazykbest`.
		doreestimate: when true, `reestimate(coarse, fine)` is called before
			parsing.

	Input format: sentences are separated by an empty line, with one token
	per line. Progress and timing information is printed to stderr.

	Raises:
		AssertionError: if a fine label has no coarse counterpart, the start
			symbol is missing from the fine grammar, or a word is unknown
			while `unknownwords` is false.
	"""
	maxlen = 999  # sentences with more tokens than this are skipped
	unparsed = 0
	coarse = readbitpargrammar(args[0], args[1], unknownwords, logprob=True)
	fine = readbitpargrammar(args[2], args[3], unknownwords,
			freqs=False, logprob=True)
	for a in fine.toid:
		# the part of a fine label before its last "@" must be a coarse label
		assert a.rsplit("@", 1)[0] in coarse.toid, (
				"%s not in coarse grammar" % a)
	assert start in fine.toid, "Start symbol %r not in grammar." % start
	if doreestimate:
		reestimate(coarse, fine)
	infile = open(args[4]) if len(args) >= 5 else stdin
	out = None
	try:
		out = open(args[5], "w") if len(args) == 6 else stdout
		times = [time.clock()]
		mapping = getgrammarmapping(coarse, fine)
		for n, a in enumerate(infile.read().split("\n\n")):
			if not a.strip():
				continue
			sent = a.splitlines()
			if len(sent) > maxlen:
				continue
			for word in sent:
				assert unknownwords or (
					word in coarse.lexicon and word in fine.lexicon), (
					"unknown word and no open class tags supplied")
			print >>stderr, "parsing:", n, " ".join(sent)
			chart, _ = parse(sent, coarse, None)
			print >>stderr, ''
			if chart[0][len(sent)].get(coarse.toid[start]):
				trees = []
				candidates = lazykbest(chart, coarse.toid[start], 0,
						len(sent), k, coarse.tolabel)
				# computed once from the first candidate and reused for all
				lexchart = doplexprobs(Tree(candidates[0][0]), fine)
				for m, (tree, prob) in enumerate(candidates):
					trees.append((dopparseprob(Tree(tree),
							fine, mapping, lexchart), tree))
					print >>stderr, m, exp(-prob), exp(trees[-1][0])
					stdout.flush()
				# (score, tree) pairs: nlargest orders by the fine score
				results = nlargest(printk, trees)
				if printprob:
					out.writelines("parseprob=%.16g\n%s\n" % (exp(prob), tree)
							for prob, tree in results)
				else:
					out.writelines("%s\n" % tree for _, tree in results)
			else:
				unparsed += 1
				print >>stderr, "No parse"
				out.write("No parse for \"%s\"\n" % " ".join(sent))
			out.write("\n")
			times.append(time.clock())
			print >>stderr, times[-1] - times[-2], "s"
			out.flush()
		print >>stderr, "raw cpu time", time.clock() - times[0]
		# One timestamp is appended per processed sentence, so consecutive
		# differences give one duration per sentence.
		durations = [b - a for a, b in zip(times, times[1:])]
		# Empty input yields no durations; avoid dividing by zero.
		average = sum(durations) / len(durations) if durations else 0.0
		print >>stderr, "average time per sentence", average
		print >>stderr, "unparsed sentences:", unparsed
		print >>stderr, "finished"
	finally:
		# Only close what this function opened; stdin/stdout belong to the
		# caller and must stay usable afterwards.
		if infile is not stdin:
			infile.close()
		if out is not None and out is not stdout:
			out.close()
示例#3
0
def simple(args, unknownwords, k, printprob, start):
    """Parse each input sentence with one grammar and write its k-best trees.

    Parameters:
        args: sequence of file names. args[0] and args[1] are handed to
            `readbitpargrammar` (presumably rules and lexicon files --
            confirm against `readbitpargrammar`). args[2], when present, is
            the input file (default: stdin). args[3] is the output file and
            is only used when exactly four arguments are given (default:
            stdout).
        unknownwords: forwarded to `readbitpargrammar`; when it is false,
            every input word must occur in the grammar's lexicon.
        k: number of trees requested from `lazykbest` per sentence.
        printprob: when true, each tree is preceded by a "vitprob=" line.
        start: label of the start symbol; must occur in the grammar.

    Input format: sentences are separated by an empty line, with one token
    per line. When the chart holds no start-symbol item spanning the whole
    sentence, a flat "(NP ...)" bracketing is written instead, in which
    every word is also used as its own preterminal label. Progress and
    timing information is printed to stderr.

    Raises:
        AssertionError: if the start symbol is missing from the grammar, a
            word is unknown while `unknownwords` is false, or `lazykbest`
            returns duplicate trees.
    """
    grammar = readbitpargrammar(args[0], args[1], unknownwords)
    assert start in grammar.toid, "Start symbol %r not in grammar." % start
    infile = open(args[2]) if len(args) >= 3 else stdin
    out = None
    try:
        out = open(args[3], "w") if len(args) == 4 else stdout
        times = [time.clock()]
        for n, a in enumerate(infile.read().split("\n\n")):
            if not a.strip():
                continue
            sent = a.splitlines()
            for word in sent:
                assert word in grammar.lexicon or unknownwords, (
                    "unknown word %r and no open class tags supplied" % word)
            # trailing comma: the timing print below continues this line
            print >> stderr, "parsing:", n, " ".join(sent),
            stdout.flush()
            chart, _ = parse(sent, grammar, None)
            if chart[0][len(sent)].get(grammar.toid[start]):
                parsetrees = lazykbest(chart, grammar.toid[start], 0,
                                       len(sent), k, grammar.tolabel)
                # sanity checks: no duplicate (tree, prob) pairs or trees
                assert len(parsetrees) == len(set(parsetrees))
                assert len(parsetrees) == len(
                    set(tree for tree, _ in parsetrees))
                if printprob:
                    out.writelines("vitprob=%.16g\n%s\n" % (exp(-prob), tree)
                                   for tree, prob in parsetrees)
                else:
                    out.writelines("%s\n" % tree for tree, _ in parsetrees)
            else:
                # fallback bracketing; each word doubles as its own tag
                out.write("(NP %s)\n" % "".join(
                    "(%s %s)" % (a, a) for a in sent))
            # NOTE: no blank line is written between sentences' outputs.
            out.flush()
            times.append(time.clock())
            print >> stderr, times[-1] - times[-2], "s"
        print >> stderr, "raw cpu time", time.clock() - times[0]
        # One timestamp is appended per processed sentence, so consecutive
        # differences give one duration per sentence.
        durations = [b - a for a, b in zip(times, times[1:])]
        # Empty input yields no durations; avoid dividing by zero.
        average = sum(durations) / len(durations) if durations else 0.0
        print >> stderr, "average time per sentence", average
        print >> stderr, "finished"
    finally:
        # Only close what this function opened; stdin/stdout belong to the
        # caller and must stay usable afterwards.
        if infile is not stdin:
            infile.close()
        if out is not None and out is not stdout:
            out.close()
示例#4
0
def simple(args, unknownwords, k, printprob, start):
	"""Parse each input sentence with one grammar and write its k-best trees.

	Parameters:
		args: sequence of file names. args[0] and args[1] are handed to
			`readbitpargrammar` (presumably rules and lexicon files --
			confirm against `readbitpargrammar`). args[2], when present, is
			the input file (default: stdin). args[3] is the output file and
			is only used when exactly four arguments are given (default:
			stdout).
		unknownwords: forwarded to `readbitpargrammar`; when it is false,
			every input word must occur in the grammar's lexicon.
		k: number of trees requested from `lazykbest` per sentence.
		printprob: when true, each tree is preceded by a "vitprob=" line.
		start: label of the start symbol; must occur in the grammar.

	Input format: sentences are separated by an empty line, with one token
	per line. When the chart holds no start-symbol item spanning the whole
	sentence, a flat "(NP ...)" bracketing is written instead, in which
	every word is also used as its own preterminal label. Progress and
	timing information is printed to stderr.

	Raises:
		AssertionError: if the start symbol is missing from the grammar, a
			word is unknown while `unknownwords` is false, or `lazykbest`
			returns duplicate trees.
	"""
	grammar = readbitpargrammar(args[0], args[1], unknownwords)
	assert start in grammar.toid, "Start symbol %r not in grammar." % start
	infile = open(args[2]) if len(args) >= 3 else stdin
	out = None
	try:
		out = open(args[3], "w") if len(args) == 4 else stdout
		times = [time.clock()]
		for n, a in enumerate(infile.read().split("\n\n")):
			if not a.strip():
				continue
			sent = a.splitlines()
			for word in sent:
				assert word in grammar.lexicon or unknownwords, (
					"unknown word %r and no open class tags supplied" % word)
			# trailing comma: the timing print below continues this line
			print >>stderr, "parsing:", n, " ".join(sent),
			stdout.flush()
			chart, _ = parse(sent, grammar, None)
			if chart[0][len(sent)].get(grammar.toid[start]):
				parsetrees = lazykbest(chart, grammar.toid[start], 0,
						len(sent), k, grammar.tolabel)
				# sanity checks: no duplicate (tree, prob) pairs or trees
				assert len(parsetrees) == len(set(parsetrees))
				assert len(parsetrees) == len(
						set(tree for tree, _ in parsetrees))
				if printprob:
					out.writelines("vitprob=%.16g\n%s\n" % (exp(-prob), tree)
						for tree, prob in parsetrees)
				else:
					out.writelines("%s\n" % tree for tree, _ in parsetrees)
			else:
				# fallback bracketing; each word doubles as its own tag
				out.write("(NP %s)\n" % "".join(
						"(%s %s)" % (a, a) for a in sent))
			# NOTE: no blank line is written between sentences' outputs.
			out.flush()
			times.append(time.clock())
			print >>stderr, times[-1] - times[-2], "s"
		print >>stderr, "raw cpu time", time.clock() - times[0]
		# One timestamp is appended per processed sentence, so consecutive
		# differences give one duration per sentence.
		durations = [b - a for a, b in zip(times, times[1:])]
		# Empty input yields no durations; avoid dividing by zero.
		average = sum(durations) / len(durations) if durations else 0.0
		print >>stderr, "average time per sentence", average
		print >>stderr, "finished"
	finally:
		# Only close what this function opened; stdin/stdout belong to the
		# caller and must stay usable afterwards.
		if infile is not stdin:
			infile.close()
		if out is not None and out is not stdout:
			out.close()