def min_non_pronoun(cluster, text, parses, heads, check_head=False):
	"""Return the smallest mention in cluster that is not a pronoun.

	Every mention whose coreference.mention_type is 'pronoun' is ignored.
	When check_head is true, a mention is also ignored if the span of its
	head (looked up with default_last=True, in the mention's own sentence)
	is typed as a pronoun.

	Mentions are ranked with the ordinary comparison operators.  Returns
	None if nothing survives the filtering, including for an empty cluster.
	"""
	candidates = []
	for span in cluster:
		if coreference.mention_type(span, text, parses, heads) == 'pronoun':
			continue
		if check_head:
			head_info = coreference.mention_head(span, text, parses, heads, default_last=True)
			# head_info[0] holds the (start, end) of the head; pair it with
			# the sentence index of the mention to form a full span
			head_span = (span[0], head_info[0][0], head_info[0][1])
			if coreference.mention_type(head_span, text, parses, heads) == 'pronoun':
				continue
		candidates.append(span)
	return min(candidates) if candidates else None
def cluster_error_properties(cluster, text, parses, heads, gold_doc):
	"""Summarise a cluster of mentions as a flat list of properties.

	The returned list holds, in order:
	  0    number of mentions in the cluster
	  1-3  how many mentions are typed 'name', 'nominal' and 'pronoun'
	  4    lower-cased text of the pronoun when the cluster consists of
	       exactly one pronoun and exactly one name/nominal, else None
	  5    number of pronouns that compare smaller than the mention chosen
	       by min_non_pronoun(..., check_head=True); 0 if there is none
	  6    sorted list of the distinct gold_doc['ner'] values found for
	       the mentions
	  7    True if the mentions have exactly one distinct lower-cased text
	  8    True if the mentions have exactly one distinct lower-cased head
	       (element [1] of coreference.mention_head, presumably the head
	       word)
	"""
	ans = []

	# How big is the cluster
	ans.append(len(cluster))

	# Type of every mention, looked up once and reused below
	typed = [(mention, coreference.mention_type(mention, text, parses, heads))
			for mention in cluster]
	pronouns = [mention for mention, mtype in typed if mtype == 'pronoun']

	# Counts of each type in the cluster (any other type is not counted)
	slot = {'name': 0, 'nominal': 1, 'pronoun': 2}
	counts = [0, 0, 0]
	for _, mtype in typed:
		if mtype in slot:
			counts[slot[mtype]] += 1
	ans += counts

	# If it is one pronoun and something else, more info on the pronoun
	if counts[0] + counts[1] == 1 and counts[2] == 1:
		ans.append(coreference_rendering.mention_text(text, pronouns[0]).lower())
	else:
		ans.append(None)

	# Number of cataphoric pronouns
	cataphora = 0
	non_pronoun = min_non_pronoun(cluster, text, parses, heads, True)
	# A cluster made up only of pronouns has no anchor to compare against;
	# comparing a mention with None is an error in Python 3
	if non_pronoun is not None:
		cataphora = sum(1 for mention in pronouns if mention < non_pronoun)
	ans.append(cataphora)

	# NER types
	ner_lookup = gold_doc['ner']
	ner = set(ner_lookup[mention] for mention in cluster if mention in ner_lookup)
	ans.append(sorted(ner))

	# Are all the mentions the same?
	mtext = set(coreference_rendering.mention_text(text, mention).lower()
			for mention in cluster)
	ans.append(len(mtext) == 1)

	# Are all the heads the same?
	mhead = set(coreference.mention_head(mention, text, parses, heads)[1].lower()
			for mention in cluster)
	ans.append(len(mhead) == 1)

	return ans
def mention_error_properties(mention, cluster, text, parses, heads, gold_doc):
	"""Describe one mention relative to the other mentions of its cluster.

	cluster must be a set (set difference is used to obtain the other
	mentions).  The returned list holds, in order:
	  0     the mention type from coreference.mention_type
	  1     the lower-cased mention text with whitespace replaced by '_'
	  2     'text_match' if another mention has identical lower-cased text,
	        else 'no_text_match'
	  3     'head_match' if another mention has the same lower-cased head,
	        else 'no_head_match'
	  4     'nested_outside' if another mention of the same sentence lies
	        strictly inside this one, 'nested_inside' if this one lies
	        strictly inside another, 'nested_both' if both happen,
	        else 'not_nested'
	  5     True if the mention is the minimum of the cluster
	  6     True if the mention is the maximum of the cluster
	  7     True if the mention compares smaller than the cluster's
	        smallest non-pronoun mention
	  8-11  for 'ner', 'number', 'person' and 'gender': '<name>_unknown'
	        if either side has no values, '<name>_matches' if the values of
	        the mention and of the rest overlap, else '<name>_does_not_match'
	"""
	ans = []
	rest = cluster.difference({mention})

	# Type of mention
	ans.append(coreference.mention_type(mention, text, parses, heads))

	# Text of mention
	mtext = coreference_rendering.mention_text(text, mention).lower()
	ans.append('_'.join(mtext.split()))

	# Does it have a string match with something in the cluster?
	text_match = any(
			coreference_rendering.mention_text(text, omention).lower() == mtext
			for omention in rest)
	ans.append('text_match' if text_match else 'no_text_match')

	# Does it have a head match with something in the cluster?
	mhead = coreference.mention_head(mention, text, parses, heads)[1].lower()
	head_match = any(
			coreference.mention_head(omention, text, parses, heads)[1].lower() == mhead
			for omention in rest)
	ans.append('head_match' if head_match else 'no_head_match')

	# Is it nested within another mention in the cluster
	# Both boundaries must differ: spans sharing a start or an end are not
	# counted as nested
	contains_other = False
	inside_other = False
	for omention in rest:
		if omention[0] != mention[0]:
			continue
		if mention[1] < omention[1] and omention[2] < mention[2]:
			contains_other = True
		elif omention[1] < mention[1] and mention[2] < omention[2]:
			inside_other = True
		if contains_other and inside_other:
			break
	if contains_other and inside_other:
		nested = 'nested_both'
	elif contains_other:
		nested = 'nested_outside'
	elif inside_other:
		nested = 'nested_inside'
	else:
		nested = 'not_nested'
	ans.append(nested)

	# Was it first in the cluster?
	ans.append(mention == min(cluster))

	# Was it last in the cluster?
	ans.append(mention == max(cluster))

	# Is it a case of cataphora?
	non_pronoun = min_non_pronoun(cluster, text, parses, heads)
	ans.append(non_pronoun is not None and mention < non_pronoun)

	# Do NER, number, person, or gender of mention and cluster match?
	# get_cluster_info is indexed 0-3 here, in the order of the names below
	cluster_properties = get_cluster_info(rest, gold_doc)
	mention_properties = get_cluster_info({mention}, gold_doc)
	for i, word in enumerate(['ner', 'number', 'person', 'gender']):
		mention_values = mention_properties[i]
		cluster_values = cluster_properties[i]
		if len(mention_values) == 0 or len(cluster_values) == 0:
			ans.append(word + '_unknown')
		elif len(mention_values.intersection(cluster_values)) > 0:
			ans.append(word + '_matches')
		else:
			ans.append(word + '_does_not_match')

	return ans
def match_boundaries(gold_mention_set, auto_mention_set, auto_mentions, auto_clusters, text, parses, heads):
	"""Replace system mentions by gold mentions that differ only in boundaries.

	Two passes are made over the mentions that are unique to one side:
	  1. A system mention is paired with a gold mention of the same
	     sentence when both cover the same span once leading "the",
	     trailing "'s" and leading/trailing one-character tokens that are
	     not ASCII letters have been stripped (at least one token is
	     always kept).  Each gold mention is used at most once.
	  2. Remaining mentions are grouped by (sentence, head span); a group
	     with exactly one system and one gold mention gives a pair.

	For every pair, auto_mention_set, auto_mentions (mention -> cluster id)
	and auto_clusters (cluster id -> list of mentions) are updated IN PLACE
	so that the gold span replaces the system span.

	Returns a list with one tuple per replaced mention:
	  (system mention, gold mention, "in the parse" / "not in the parse",
	   pre_extra_text, pre_missing_text, post_extra_text, post_missing_text,
	   pre_extra_nodes, pre_missing_nodes, post_extra_nodes,
	   post_missing_nodes)
	where "extra" is material only the system span covers, "missing" is
	material only the gold span covers, and entries that do not apply are
	None.

	NOTE(review): a second definition of match_boundaries, taking an extra
	auto_cluster_set argument, appears later in this source; if both live
	in one module the later definition shadows this one.
	"""
	changed = set()

	def is_symbol(token):
		# A single character that is not an ASCII letter
		return len(token) == 1 and token[0] not in string.ascii_letters

	def trim(mention):
		# Strip ignorable tokens from both ends, keeping at least one token
		sentence, start, end = mention
		words = text[sentence]
		while start < end - 1 and (words[start] == "the" or is_symbol(words[start])):
			start += 1
		while start < end - 1 and (words[end - 1] == "'s" or is_symbol(words[end - 1])):
			end -= 1
		return (sentence, start, end)

	def apply_mapping(mapping):
		# Swap each system mention for its gold counterpart everywhere
		for amention, gmention in mapping.items():
			auto_mention_set.remove(amention)
			auto_mention_set.add(gmention)
			cluster_id = auto_mentions.pop(amention)
			auto_mentions[gmention] = cluster_id
			auto_clusters[cluster_id].remove(amention)
			auto_clusters[cluster_id].append(gmention)
			changed.add((amention, gmention))

	def describe(sentence, start, end):
		# Lower-cased words and spanning parse node labels of a token range
		words = ' '.join(text[sentence][start:end]).lower()
		nodes = parses[sentence].get_spanning_nodes(start, end)
		return words, ' '.join([node.label for node in nodes])

	# Apply changes for cases where the difference is only leading or trailing punctuation
	unique_to_gold = gold_mention_set.difference(auto_mention_set)
	unique_to_auto = auto_mention_set.difference(gold_mention_set)
	# Trim every gold mention once and index it by the trimmed span (which
	# includes the sentence), instead of re-trimming it per system mention
	gold_by_trimmed = defaultdict(list)
	for gmention in unique_to_gold:
		gold_by_trimmed[trim(gmention)].append(gmention)
	mapping = {}
	for amention in unique_to_auto:
		candidates = gold_by_trimmed.get(trim(amention))
		if candidates:
			# Consume exactly one gold mention, so the others stay
			# available to later system mentions
			mapping[amention] = candidates.pop(0)
	apply_mapping(mapping)

	# Create a mapping based on heads
	head_dict = defaultdict(lambda: {'auto': [], 'gold': []})
	remaining = (
		('auto', auto_mention_set.difference(gold_mention_set)),
		('gold', gold_mention_set.difference(auto_mention_set)),
	)
	for source, mentions in remaining:
		for mention in mentions:
			head = coreference.mention_head(mention, text, parses, heads, default_last=True)
			# This will default to last word if the mention is not a
			# constituent, is there an alternative?
			if head is not None:
				head_dict[(mention[0], head[0])][source].append(mention)

	mapping = {}
	for group in head_dict.values():
		if len(group['auto']) == 1 and len(group['gold']) == 1:
			mapping[group['auto'][0]] = group['gold'][0]
	apply_mapping(mapping)

	# Add notes
	nchanges = []
	for smention, gmention in changed:
		# Both passes only pair mentions of the same sentence
		sentence = smention[0]
		pre_extra = pre_missing = post_extra = post_missing = (None, None)
		if smention[1] < gmention[1]:
			pre_extra = describe(sentence, smention[1], gmention[1])
		elif smention[1] > gmention[1]:
			pre_missing = describe(sentence, gmention[1], smention[1])
		if smention[2] < gmention[2]:
			post_missing = describe(sentence, smention[2], gmention[2])
		elif smention[2] > gmention[2]:
			post_extra = describe(sentence, gmention[2], smention[2])
		snode = parses[sentence].get_nodes('lowest', smention[1], smention[2])
		in_parse = "in the parse" if snode is not None else "not in the parse"
		nchanges.append((
			smention, gmention, in_parse,
			pre_extra[0], pre_missing[0], post_extra[0], post_missing[0],
			pre_extra[1], pre_missing[1], post_extra[1], post_missing[1],
		))
	return nchanges
def split_merge_properties(part, cluster, auto, gold, text, parses, heads, gold_mentions, gold_clusters, auto_mentions, gold_doc):
	"""Describe a split/merge operation on part, a non-empty subset of cluster.

	part and cluster are sets of mentions; rest is cluster minus part.
	auto is an iterable of system clusters, gold_mentions maps a gold
	mention to a key of gold_clusters, and auto_mentions is tested for
	membership of system mentions.  The gold argument is not used.

	The returned list holds, in order:
	  0      size of part
	  1      size of rest
	  2      text of the mention ('_'-joined, lower-cased) if part has
	         exactly one mention, else None
	  3      '<n>_cataphoric': system pronouns in part that compare smaller
	         than the smallest non-pronoun system mention of the cluster
	         (counted only when that mention is not itself in part)
	  4-6    number of names, nominals, pronouns in part
	  7-9    number of names, nominals, pronouns in rest
	  10     True if no mention of part is a gold mention
	  11     True if no mention of rest is a gold mention
	  12     'string_match' / 'no_string_match': identical lower-cased text
	         between a non-pronoun in part and a non-pronoun in rest
	  13     'head_match' / 'no_head_match': same, comparing heads
	  14     'delete', 'merge' or 'nothing', judged against the gold clusters
	  15     'introduce', 'split' or 'nothing', judged against the system
	         clusters
	  16-27  for each of the four get_cluster_info properties (presumably
	         NER, number, person, gender): whether part and rest have equal
	         values, then 'part_<values>' and 'cluster_<values>'

	Raises StopIteration if part is empty and KeyError if a mention has a
	type other than 'name', 'nominal' or 'pronoun'.
	"""
	ans = []
	rest = cluster.difference(part)

	def count_types(mentions):
		# Tally mention types and remember which mentions are not pronouns
		counts = {'pronoun': 0, 'name': 0, 'nominal': 0}
		non_pronouns = []
		for mention in mentions:
			mtype = coreference.mention_type(mention, text, parses, heads)
			counts[mtype] += 1
			if mtype != 'pronoun':
				non_pronouns.append(mention)
		return counts, non_pronouns

	def any_pair_equal(left, right, key):
		# True if some mention of left and some mention of right share a key.
		# Nothing is looked up when there is nothing to compare against.
		if not right:
			return False
		for lmention in left:
			lkey = key(lmention)
			for rmention in right:
				if lkey == key(rmention):
					return True
		return False

	# Size of part
	ans.append(len(part)) # 0

	# Size of rest
	ans.append(len(rest)) # 1

	# If size 1, what the text is
	mtext = None
	if len(part) == 1:
		mention = next(iter(part))
		mtext = '_'.join(coreference_rendering.mention_text(text, mention).lower().split())
	ans.append(mtext) # 2

	# Does this part have any cataphoric pronouns
	count = 0
	system_cluster = set(mention for mention in cluster if mention in auto_mentions)
	non_pronoun = min_non_pronoun(system_cluster, text, parses, heads)
	if non_pronoun is not None and non_pronoun not in part:
		for mention in part:
			if mention in auto_mentions and mention < non_pronoun:
				if coreference.mention_type(mention, text, parses, heads) == 'pronoun':
					count += 1
	ans.append("%d_cataphoric" % count) # 3

	# Number of pronouns, nominals, names present in it
	part_counts, part_non_pronouns = count_types(part)
	ans.append(part_counts['name']) # 4
	ans.append(part_counts['nominal']) # 5
	ans.append(part_counts['pronoun']) # 6

	# Number of pronouns, nominals, names, in rest
	rest_counts, rest_non_pronouns = count_types(rest)
	ans.append(rest_counts['name']) # 7
	ans.append(rest_counts['nominal']) # 8
	ans.append(rest_counts['pronoun']) # 9

	# Whether this is extra
	ans.append(not any(mention in gold_mentions for mention in part)) # 10

	# Whether the rest is all extra
	ans.append(not any(mention in gold_mentions for mention in rest)) # 11

	# Whether there is an exact string match between a mention in the part and cluster (excluding pronouns)
	string_match = any_pair_equal(
			part_non_pronouns, rest_non_pronouns,
			lambda m: coreference_rendering.mention_text(text, m).lower())
	ans.append('string_match' if string_match else 'no_string_match') # 12

	# Whether there is a head match between a mention in the part and cluster (excluding pronouns)
	head_match = any_pair_equal(
			part_non_pronouns, rest_non_pronouns,
			lambda m: coreference.mention_head(m, text, parses, heads)[1].lower())
	ans.append('head_match' if head_match else 'no_head_match') # 13

	# What has happened, or will happen
	example = next(iter(part))
	action = 'nothing'
	if example not in gold_mentions:
		action = 'delete'
	elif part != set(gold_clusters[gold_mentions[example]]):
		action = 'merge'
	ans.append(action) # 14

	action = 'nothing'
	if example not in auto_mentions:
		action = 'introduce'
	else:
		# Only the first system cluster holding the example is inspected
		for candidate in auto:
			if example in candidate:
				if candidate != part:
					action = 'split'
				break
	ans.append(action) # 15

	# NER, number, person, gender
	cproperties = get_cluster_info(rest, gold_doc)
	pproperties = get_cluster_info(part, gold_doc)
	for prop in range(4):
		ans.append(cproperties[prop] == pproperties[prop]) # 16, 19, 22, 25
		ans.append('part_' + '_'.join(sorted(pproperties[prop]))) # 17, 20, 23, 26
		ans.append('cluster_' + '_'.join(sorted(cproperties[prop]))) # 18, 21, 24, 27

	return ans
def match_boundaries(gold_mention_set, auto_mention_set, auto_mentions,
		auto_clusters, auto_cluster_set, text, parses, heads):
	"""Replace system mentions by gold mentions that differ only in boundaries.

	Two passes are made over the mentions that are unique to one side:
	  1. A system mention is paired with a gold mention of the same
	     sentence when both cover the same span once leading "the",
	     trailing "'s" and leading/trailing one-character tokens that are
	     not ASCII letters have been stripped (at least one token is
	     always kept).  Each gold mention is used at most once.
	  2. Remaining mentions are grouped by (sentence, head span); a group
	     with exactly one system and one gold mention gives a pair.

	For every pair the gold span replaces the system span IN PLACE in
	auto_mention_set, auto_mentions (mention -> cluster id), auto_clusters
	(cluster id -> list of mentions) and auto_cluster_set (set of tuples
	of mentions, where the tuple holding the mention is swapped for an
	updated tuple).  Nothing is returned.

	Raises KeyError if no tuple of auto_cluster_set contains a mention
	that is being replaced.
	"""
	def is_symbol(token):
		# A single character that is not an ASCII letter
		return len(token) == 1 and token[0] not in string.ascii_letters

	def trim(mention):
		# Strip ignorable tokens from both ends, keeping at least one token
		sentence, start, end = mention
		words = text[sentence]
		while (end - start > 1
				and (words[start] == "the" or is_symbol(words[start]))):
			start += 1
		while (end - start > 1
				and (words[end - 1] == "'s" or is_symbol(words[end - 1]))):
			end -= 1
		return (sentence, start, end)

	def apply_mapping(mapping):
		# Swap each system mention for its gold counterpart everywhere
		for amention, gmention in mapping.items():
			auto_mention_set.remove(amention)
			auto_mention_set.add(gmention)
			cluster_id = auto_mentions.pop(amention)
			auto_mentions[gmention] = cluster_id
			auto_clusters[cluster_id].remove(amention)
			auto_clusters[cluster_id].append(gmention)
			# Tuples are immutable, so the cluster holding the mention is
			# taken out of the set and a rewritten copy is put back
			to_remove = None
			for cluster in auto_cluster_set:
				if amention in cluster:
					to_remove = cluster
			auto_cluster_set.remove(to_remove)
			auto_cluster_set.add(tuple(
					gmention if other == amention else other
					for other in to_remove))

	# Apply changes for cases where the difference is only leading or trailing
	# punctuation
	unique_to_gold = gold_mention_set.difference(auto_mention_set)
	unique_to_auto = auto_mention_set.difference(gold_mention_set)
	# Trim every gold mention once and index it by the trimmed span (which
	# includes the sentence), instead of re-trimming it per system mention
	gold_by_trimmed = defaultdict(list)
	for gmention in unique_to_gold:
		gold_by_trimmed[trim(gmention)].append(gmention)
	mapping = {}
	for amention in unique_to_auto:
		candidates = gold_by_trimmed.get(trim(amention))
		if candidates:
			# Consume exactly one gold mention, so the others stay
			# available to later system mentions
			mapping[amention] = candidates.pop(0)
	apply_mapping(mapping)

	# Create a mapping based on heads
	head_dict = defaultdict(lambda: {'auto': [], 'gold': []})
	remaining = (
		('auto', auto_mention_set.difference(gold_mention_set)),
		('gold', gold_mention_set.difference(auto_mention_set)),
	)
	for source, mentions in remaining:
		for mention in mentions:
			head = coreference.mention_head(mention,
					text,
					parses,
					heads,
					default_last=True)
			# This will default to last word if the mention is not a
			# constituent, is there an alternative?
			if head is not None:
				head_dict[(mention[0], head[0])][source].append(mention)

	mapping = {}
	for group in head_dict.values():
		if len(group['auto']) == 1 and len(group['gold']) == 1:
			mapping[group['auto'][0]] = group['gold'][0]
	apply_mapping(mapping)