Пример #1
0
def preprocess(tree, delexicalize=True):
	"""Apply the enabled preprocessing transforms to a parse tree, in order.

	Every step except delexicalization is switched by a flag that is not
	defined in this block (convert_PRT_to_ADVP, remove_quotation_marks,
	remove_outermost_punctuation, raise_punctuation,
	remove_unary_projections_to_self, add_basal_nps, use_S_transform,
	use_SPRIME_transform, lowercase_vocabulary). The order of the steps is
	significant; see the comments on the individual steps.

	Args:
		tree: Parse tree object. It is modified in place (node labels and
			leaf headwords are reassigned), and some steps rebind it to the
			value returned by a transform.
		delexicalize: If true, every leaf whose headword is not a key of
			vocab.vocab_to_idx has its headword replaced by '*rare*'.

	Returns:
		The value of refresh(tree) after all enabled steps have run.

	Raises:
		AssertionError: If use_S_transform or use_SPRIME_transform is set
			together with add_basal_nps, or if one of the S_transform sanity
			checks fails.
	"""
	if convert_PRT_to_ADVP:
		for n in tree.internal_nodes():
			if n.label == 'PRT':
				n.label = 'ADVP'

	if remove_quotation_marks:
		tree.prune_labels(["``", "''"])

	if remove_outermost_punctuation:
		tree.remove_leftmost_punctuation()
		tree.remove_rightmost_punctuation()

	if raise_punctuation:
		tree.raise_punctuation()

	# Noise that can be removed reliably should be removed as early as possible.
	if remove_unary_projections_to_self:
		tree.remove_productions_to_self()

	# This should occur *after* we remove unary projections to self.
	if add_basal_nps:
		tree = basenp.transform(tree)

	# This transform must occur *after* we raise punctuation.
	if use_S_transform:
		# The S transform and basal-NP insertion are mutually exclusive.
		assert not add_basal_nps
		tree = S_transform(tree)

		# Sanity check: an S node that starts with NP and ends with VP
		# must not have any further NP or VP among its middle children.
		for n in tree.internal_nodes():
			if n.label != "S": continue
			if len(n.children) < 2: continue
			if n.children[0].label == "NP" and n.children[-1].label == "VP":
				assert len([c for c in n.children[1:-1] if c.label == "NP"]) == 0
				assert len([c for c in n.children[1:-1] if c.label == "VP"]) == 0

		# Sanity check that the transform is stable: applying it a second
		# time must leave the string form of the tree unchanged.
		treestr = tree.to_string()
		tree = S_transform(tree)
		assert tree.to_string() == treestr

	# This transform must occur *after* we raise punctuation.
	if use_SPRIME_transform:
		# The S' transform and basal-NP insertion are mutually exclusive.
		assert not add_basal_nps
		tree = SPRIME_transform(tree)

		# NOTE: the stability check for SPRIME_transform is disabled.
#		# Sanity check that the transform is stable.
#		treestr = tree.to_string()
#		tree = SPRIME_transform(tree)
#		assert tree.to_string() == treestr

	if lowercase_vocabulary:
		for n in tree.leaves():
			# Use the str method instead of string.lower(): the module-level
			# function was removed in Python 3, while the method gives the
			# same result on both Python 2 and Python 3.
			n.headword = n.headword.lower()

	if delexicalize:
		# Delexicalize infrequent words (words not in the vocabulary)
		vocab.read()
		for n in tree.leaves():
			if n.headword not in vocab.vocab_to_idx:
				n.headword = '*rare*'

	tree = refresh(tree)

	return tree
Пример #2
0
#   12. Output the tree.
#
#
#   $Id: preprocess.py 1657 2006-06-04 03:03:05Z turian $
#
#
#######################################################################
# Copyright (c) 2004-2006, New York University. All rights reserved
#######################################################################


from variables import *

import parsetree
import vocab
# Executed at import time, immediately after `import vocab`.
# NOTE(review): presumably this loads the vocabulary data used by the rest
# of the module -- confirm against the vocab module.
vocab.read()

import sys

if mysys == "Linux" and not profile:
	try:
		debug(1, "Linux detected... using psyco")
		import psyco
		psyco.full(memory=4096)

#	psyco.full()

#	psyco.log()
#	psyco.full(memory=128)
#	psyco.profile(0.05, memory=1024)
#	psyco.profile(0.2)