示例#1
0
def main():
	"""Read one tree per line from stdin, preprocess it, and print it.

	For every input line this:
	  1. parses the line with parsetree.read_tree,
	  2. asserts that parsetree.regularize leaves the tree's string
	     form unchanged,
	  3. optionally (if duplicate_top_item is set) wraps the tree in an
	     extra "TOP" node,
	  4. runs parsetree.preprocess on the tree,
	  5. asserts that leaf headtags/headwords and internal-node labels
	     are consistent with the vocab tables,
	  6. prints the resulting tree string to stdout.

	The script takes no command-line arguments. Any failed sanity
	check raises AssertionError.
	"""
	assert len(sys.argv) == 1

	for idx, line in enumerate(sys.stdin):
		# Sentences are numbered starting from 1.
		sentence = idx + 1

		# Guard against an empty line string.
		assert line

		tree = parsetree.read_tree(line)
		assert tree is not None

		# Sanity check that the tree's already been regularized.
		before = tree.to_string()
		tree = parsetree.regularize(tree)
		assert tree.to_string() == before

		if duplicate_top_item:
			# Add a second TOP label, s.t. we can raise punctuation
			# above the first TOP label
			top = parsetree.Node()
			top.isleaf = 0
			top.label = "TOP"
			top.children = [tree]
			tree = parsetree.refresh(top)
		# Preprocessing happens whether or not the extra TOP was added.
		tree = parsetree.preprocess(tree)

		for leaf in tree.leaves():
			# Make sure that the headtag is a terminal label (POS tag)
			assert vocab.label_to_idx[leaf.headtag][1] == 1
			# Make sure that the headword is in the vocabulary
			assert vocab.vocab_to_idx[leaf.headword] > 0

		for node in tree.internal_nodes():
			# Make sure that the label is a constituent label
			assert vocab.label_to_idx[node.label][1] == 0

		print(tree.to_string())

		# Report every 100th sentence at debug level 1, all others at
		# debug level 2.
		if sentence % 100 == 0:
			level = 1
		else:
			level = 2
		debug(level, "Sentence #%d done" % sentence)
示例#2
0
def main():
	"""Read one tree per line from stdin, regularize it, and print it.

	For every input line this parses the line with parsetree.read_tree,
	passes the result through parsetree.regularize, asserts that the
	regularized tree's string form survives a read_tree/to_string round
	trip unchanged, and prints that string to stdout.

	The script takes no command-line arguments. Any failed sanity
	check raises AssertionError.
	"""
	assert len(sys.argv) == 1

	for idx, line in enumerate(sys.stdin):
		# Sentences are numbered starting from 1.
		sentence = idx + 1

		# Guard against an empty line string.
		assert line

		tree = parsetree.read_tree(line)
		# Check the parse result before handing it to regularize(), so
		# a failed read is reported here rather than further downstream.
		assert tree is not None
		tree = parsetree.regularize(tree)
		assert tree is not None

		# SANITY CHECK:
		# Ensure that the cleaned output is "stable", i.e. that
		# this script will produce identical output if we pipe
		# cleaned output from this script back into it.
		cleaned = tree.to_string()
		assert cleaned == parsetree.read_tree(cleaned).to_string()
		print(cleaned)

		# Report every 100th sentence at debug level 1, all others at
		# debug level 2.
		if sentence % 100 == 0:
			level = 1
		else:
			level = 2
		debug(level, "Sentence #%d done" % sentence)