Exemplo n.º 1
0
def get_depend_graph(semantics):
    """Build the dependency graph of ``semantics`` in index form and tagged form.

    Each position of the semantics is labelled with a tag of the form
    ``"<roman name>,<function>"``, where the roman name returned by
    ``coordinate_to_roman_name`` has every ``"-"`` and ``"b"`` removed.

    Args:
        semantics: semantics object accepted by the grammar formalism's
            ``semantics_to_coordinates`` / ``semantics_to_functions`` and by
            ``semantics_to_dependency_graph``.

    Returns:
        A two-element list ``[gold_graph, depend_graph_tags]``: the evaluated
        result of ``get_graph_index()`` followed by the evaluated result of
        ``get_graph_pos(tags)``.
    """
    # 'coord', 'xycoord', 'alpha' or 'roman'
    # (presumably the accepted values of the tsformat option -- TODO confirm)
    grammar = get_grammar()
    formalism = grammar.formalism
    formalism.cl_output_options("tsformat=coord")

    # Keep only the first element of every entry.  The previous
    # ``zip(*entries)[0]`` raised IndexError when there were no entries and
    # cannot be subscripted at all on Python 3.
    coords = [entry[0] for entry in formalism.semantics_to_coordinates(semantics)]
    funs = [entry[0] for entry in formalism.semantics_to_functions(semantics)]

    tags = [
        "%s,%s" % (coordinate_to_roman_name(coord).replace("-", "").replace("b", ""), fun)
        for coord, fun in zip(coords, funs)
    ]

    # The second element of the returned pair is not needed here.
    graph, _time_map = semantics_to_dependency_graph(semantics)

    # NOTE(review): eval() executes whatever text the graph object renders.
    # If that text is always a plain literal, ast.literal_eval would be a
    # safer drop-in -- confirm against the graph class before changing.
    depend_graph_tags = eval("%s" % graph.get_graph_pos(tags))
    gold_graph = eval("%s" % graph.get_graph_index())
    return [gold_graph, depend_graph_tags]
def main():	
	"""Collect n-gram feature counts from the gold semantics of parse-result files.

	Every file matched by ``PARSES_FILES`` is loaded as a ``ParseResults``
	object.  Files that fail to load, or whose ``semantics`` list is empty,
	are skipped.  For the remaining files the gold semantics are turned into
	two dependency graphs (``gold_graph`` from ``get_graph_index()`` and
	``depend_graph`` from ``get_graph_pos(tags)``), and unigram, bigram and
	trigram keys built from them are counted in the ``features`` dict.

	NOTE(review): a key is stored as 0 the first time it is seen and only
	incremented afterwards, so every stored value is (occurrences - 1).
	Confirm this is intended.

	NOTE(review): ``features`` is neither returned nor written out within
	the lines reviewed here.
	"""

	# Maps feature key (e.g. "UNIGRAM:...", "BIGRAM:...", "TRIGRAM:...") to its count
	features = {}
	input_files = glob.glob(PARSES_FILES)
	
	for file_results in input_files:
		# We read in the whole file (it's pickled, so we have to), but don't 
		#  keep the pres object after the loop iteration, because it can 
		#  be very big
		try:
			pres = ParseResults.from_file(file_results)
		except ParseResults.LoadError, err:
			# NOTE(review): `options` and `errors` are not defined in this
			#  function; presumably module-level names -- TODO confirm
			if options.errors:
				# Print all load errors
				print >>sys.stderr, "Error loading file: %s" % (err)
			errors.append(file_results)
			continue

		print file_results
		# Nothing to extract when the file holds no semantics entries
		if len(pres.semantics) == 0:
			continue
			
		# NOTE(review): top_result is assigned but not used in the lines reviewed here
		top_result = pres.semantics[0][1]
		gold_result = pres.get_gold_semantics()

		# 'coord', 'xycoord', 'alpha' or 'roman'
		grammar = get_grammar()
		grammar.formalism.cl_output_options("tsformat=coord")	
		# Keep only the first element of each entry returned by the formalism
		coords = zip(*grammar.formalism.semantics_to_coordinates(gold_result))[0]
		funs = zip(*grammar.formalism.semantics_to_functions(gold_result))[0]
		gold_seq = zip(coords, funs)

		# One "<roman name>,<function>" tag per (coordinate, function) pair.
		# NOTE(review): unlike get_depend_graph, the roman name is not
		#  stripped of "-" and "b" here -- confirm which form is intended
		tags = []
		for g in gold_seq:
			t = "%s,%s" % (coordinate_to_roman_name(g[0]), g[1])
			tags.append(t)

		# NOTE(review): gold_time_map is not used in the lines reviewed here
		gold_graph,gold_time_map = semantics_to_dependency_graph(gold_result)	
		# Both graphs are rebuilt by eval()-ing their string form; the code
		#  below indexes each entry as entry[0] / entry[1], where entry[1]
		#  may be the string "ROOT"
		depend_graph = eval("%s" % gold_graph.get_graph_pos(tags))
		gold_graph = eval("%s" % gold_graph.get_graph_index())

		# Words
		# The text before the first comma of entry[0] is used as the "word"
		for g in gold_graph:
			word1 = g[0].split(",")
			uni_word = "UNIGRAM:"+str(word1[0])
			if uni_word not in features:
				features[uni_word] = 0
			else:
				features[uni_word] += 1

		for dep in depend_graph:
			word1 = dep[0].split(",")
			uni_word = "UNIGRAM:"+str(word1[0])
			if uni_word not in features:
				features[uni_word] = 0
			else:
				features[uni_word] += 1

		# Tags
		# The text after the first comma of entry[0] is used as the "tag".
		#  Tag keys share the "UNIGRAM:" prefix with the word keys above.
		for dep in depend_graph:
			word1 = dep[0].split(",")
			uni_tag = "UNIGRAM:"+str(word1[1])
			if uni_tag not in features:
				features[uni_tag] = 0
			else:
				features[uni_tag] += 1

		# Bigram Words
		# Pairs the word of entry[0] with the word of entry[1], or with the
		#  literal "ROOT" when entry[1] is "ROOT"
		for g in gold_graph:
			word1 = g[0].split(",")
			if g[1] == "ROOT":
				bigram_word = "BIGRAM:"+str(word1[0])+":ROOT"
			else:
				word2 = g[1].split(",")
				bigram_word = "BIGRAM:"+str(word1[0])+":"+str(word2[0])
			if bigram_word not in features:
				features[bigram_word] = 0	
			else:
				features[bigram_word] += 1

		for dep in depend_graph:
			word1 = dep[0].split(",")
			if dep[1] == "ROOT":
				bigram_word = "BIGRAM:"+str(word1[0])+":ROOT"
			else:
				word2 = dep[1].split(",")
				bigram_word = "BIGRAM:"+str(word1[0])+":"+str(word2[0])
			if bigram_word not in features:
				features[bigram_word] = 0	
			else:
				features[bigram_word] += 1

		# Bigram Tags
		for dep in depend_graph:
			word1 = dep[0].split(",")
			if dep[1] == "ROOT":
				bigram_tag = "BIGRAM:"+str(word1[1])+":ROOT"
			else:
				word2 = dep[1].split(",")
				bigram_tag = "BIGRAM:"+str(word1[1])+":"+str(word2[1])
			if bigram_tag not in features:
				features[bigram_tag] = 0			
			else:
				features[bigram_tag] += 1

		# Bigram Words/Tags
		for dep in depend_graph:
			word1 = dep[0].split(",")
			if dep[1] == "ROOT":
				bigram_words_tags = "BIGRAM:"+str(word1[0])+":"+str(word1[1])+":ROOT"
			else:
				word2 = dep[1].split(",")
				bigram_words_tags = "BIGRAM:"+str(word1[0])+":"+str(word1[1])+":"+str(word2[0])+":"+str(word2[1])
			if bigram_words_tags not in features:
				features[bigram_words_tags] = 0
			else:
				features[bigram_words_tags] += 1

		# Trigram words
		# A trigram is emitted for each entry whose entry[1] is "ROOT" when
		#  neither of the two preceding entries is itself a "ROOT" entry.
		# NOTE(review): for i < 2 the indices i-1 / i-2 are negative and wrap
		#  around to the end of the list -- confirm this is intended
		for i in range(len(gold_graph)):
			if gold_graph[i][1] == "ROOT":
				# Get trigram
				if gold_graph[i-1][1] != "ROOT" and gold_graph[i-2][1] != "ROOT":
					head_root_word = gold_graph[i][0].split(",")[0]
					head_i1_word = gold_graph[i-1][0].split(",")[0]
					head_i2_word = gold_graph[i-2][0].split(",")[0]
					trigram_word = "TRIGRAM:" + head_root_word + ":" + head_i1_word + ":" + head_i2_word
					if trigram_word not in features:
						features[trigram_word] = 0
					else:
						features[trigram_word] += 1

		for i in range(len(depend_graph)):
			if depend_graph[i][1] == "ROOT":
				# Get trigram
				if depend_graph[i-1][1] != "ROOT" and depend_graph[i-2][1] != "ROOT":
					head_root_word = depend_graph[i][0].split(",")[0]
					head_i1_word = depend_graph[i-1][0].split(",")[0]
					head_i2_word = depend_graph[i-2][0].split(",")[0]
					trigram_word = "TRIGRAM:" + head_root_word + ":" + head_i1_word + ":" + head_i2_word
					if trigram_word not in features:
						features[trigram_word] = 0
					else:
						features[trigram_word] += 1

		# Trigram tags
		for i in range(len(depend_graph)):
			if depend_graph[i][1] == "ROOT":
				# Get trigram
				if depend_graph[i-1][1] != "ROOT" and depend_graph[i-2][1] != "ROOT":
					head_root_tag = depend_graph[i][0].split(",")[1]
					head_i1_tag = depend_graph[i-1][0].split(",")[1]
					head_i2_tag = depend_graph[i-2][0].split(",")[1]
					trigram_tag = "TRIGRAM:" + head_root_tag + ":" + head_i1_tag + ":" + head_i2_tag
					if trigram_tag not in features:
						features[trigram_tag] = 0
					else:
						features[trigram_tag] += 1

		# Trigram words/tags
		for i in range(len(depend_graph)):
			if depend_graph[i][1] == "ROOT":
				# Get trigram
				if depend_graph[i-1][1] != "ROOT" and depend_graph[i-2][1] != "ROOT":
					head_root = depend_graph[i][0].split(",")
					head_root_word_tag = head_root[0] + ":" + head_root[1]
					# words/tags
					head_i1 = depend_graph[i-1][0].split(",")
					head_i2 = depend_graph[i-2][0].split(",")
					head_i1_word_tag = head_i1[0] + ":" + head_i1[1]
					head_i2_word_tag = head_i2[0] + ":" + head_i2[1]

					trigram_word_tag = "TRIGRAM:" + head_root_word_tag + ":" + head_i1_word_tag + ":" + head_i2_word_tag
					if trigram_word_tag not in features:
						features[trigram_word_tag] = 0
					else:
						features[trigram_word_tag] += 1