def min_non_pronoun(cluster, text, parses, heads, check_head=False):
	"""Return the smallest mention in `cluster` that is not a pronoun.

	A mention is skipped when coreference.mention_type() labels it 'pronoun'.
	When `check_head` is true, a mention is also skipped if the span returned
	by coreference.mention_head() (combined with the mention's first field)
	is itself labelled 'pronoun'.

	"Smallest" is decided by the mentions' own `>` comparison.
	NOTE(review): mentions look like (sentence, start, end) tuples, so the
	smallest would be the earliest in the document - confirm.

	Returns None when the cluster is empty or every mention was skipped.
	"""
	earliest = None
	for candidate in cluster:
		if coreference.mention_type(candidate, text, parses, heads) == 'pronoun':
			continue
		if check_head:
			# Re-type just the head span of the mention
			head_span = coreference.mention_head(candidate, text, parses, heads, default_last=True)[0]
			head_mention = (candidate[0], head_span[0], head_span[1])
			if coreference.mention_type(head_mention, text, parses, heads) == 'pronoun':
				continue
		# Keep the current answer unless the candidate sorts before it
		if earliest is not None and not (earliest > candidate):
			continue
		earliest = candidate
	return earliest
def print_pre_change_info(out, auto, gold, auto_mentions, gold_mention_set, text, parses, heads, gold_clusters, gold_mentions, gold_doc, auto_clusters):
	"""Print a "Cataphoric properties" line for every pronoun mention.

	For each mention typed 'pronoun' in either `gold` or `auto`, a three slot
	list is built:
	  [0] True/False - in its gold cluster, it sorts before the cluster's
	      smallest non-pronoun mention (None if it is not a gold pronoun)
	  [1] the same test within its auto cluster (None if not an auto pronoun)
	  [2] only set when [0] and [1] are both True: whether the smallest
	      non-pronoun mention of its auto cluster equals that of its gold
	      cluster (otherwise left as None)

	The list and the lowercased mention text are written to out['out'].
	`auto_mentions` / `gold_mentions` map a mention to the key of its cluster
	in `auto_clusters` / `gold_clusters`. `gold_mention_set` and `gold_doc`
	are not used here.
	"""
	# Cataphora
	flags = defaultdict(lambda: [None, None, None])

	# Slot 0 is filled from the gold clusters, slot 1 from the auto clusters
	for slot, clusters in enumerate((gold, auto)):
		for cluster in clusters:
			first_non_pronoun = min_non_pronoun(cluster, text, parses, heads)
			for mention in cluster:
				if coreference.mention_type(mention, text, parses, heads) != 'pronoun':
					continue
				flags[mention][slot] = first_non_pronoun is not None and mention < first_non_pronoun

	# For pronouns that are cataphoric on both sides, compare the antecedents
	cataphoric_in_both = []
	for mention in flags:
		gold_flag, auto_flag = flags[mention][0], flags[mention][1]
		if gold_flag and auto_flag:
			cataphoric_in_both.append(mention)
	for mention in cataphoric_in_both:
		auto_cluster = auto_clusters[auto_mentions[mention]]
		gold_cluster = gold_clusters[gold_mentions[mention]]
		auto_first = min_non_pronoun(auto_cluster, text, parses, heads)
		gold_first = min_non_pronoun(gold_cluster, text, parses, heads)
		flags[mention][2] = (auto_first == gold_first)

	for mention in flags:
		mtext = coreference_rendering.mention_text(text, mention).lower()
		print >> out['out'], "Cataphoric properties", flags[mention], mtext
def categorise(auto, gold, changes, text, parses, heads, gold_mention_set, auto_mentions, gold_doc):
	"""Regroup low level change operations into higher level error categories.

	`changes` is modified in place and also returned. Operations are read
	and removed from changes['split'], changes['remove'], changes['merge']
	and changes['introduce']; new entries are appended to
	changes['extra entity'], changes['missing entity'],
	changes['extra mention'] and changes['missing mention'] (these keys must
	already exist, or `changes` must be a defaultdict of lists).

	Only these fields of each operation are used here:
	  split:     split[0] the part, split[1] the cluster it came from,
	             split[2] a string compared against ''
	  merge:     merge[0] the part, merge[1] the cluster it joins
	  remove:    remove[0] a collection of mentions
	  introduce: introduce[0] a collection of mentions

	`gold` is an iterable of gold clusters (sets of mentions),
	`gold_mention_set` and `auto_mentions` are used for membership tests only.
	`auto` is not used by this function.
	"""
	# Not an Entity
	# A set of splits to singles that cover an entire cluster
	to_add = defaultdict(list)
	for split in changes['split']:
		# The cluster may only overlap with gold on pronouns ...
		has_gold_non_pronoun = any(
			mention in gold_mention_set and
			coreference.mention_type(mention, text, parses, heads) != 'pronoun'
			for mention in split[1])
		if has_gold_non_pronoun:
			continue
		# ... and the part being split off must not be in gold at all
		if any(mention in gold_mention_set for mention in split[0]):
			continue
		# Key on a frozenset so equal clusters always group together,
		# whatever order their mentions happen to iterate in
		to_add[frozenset(split[1])].append(split)
	for cluster_key in to_add:
		splits = to_add[cluster_key]
		split_cluster = set()
		for split in splits:
			split_cluster.update(split[0])
		if len(split_cluster) == 1:
			continue
		properties = ['extra'] + cluster_error_properties(split_cluster, text, parses, heads, gold_doc)
		changes['extra entity'].append((split_cluster, set(cluster_key), properties))
		for split in splits:
			changes['split'].remove(split)
			# Also drop the first 'remove' that contains a mention from this split
			paired_remove = None
			for remove in changes['remove']:
				if next(iter(split[0])) in remove[0]:
					paired_remove = remove
					break
			if paired_remove is not None:
				changes['remove'].remove(paired_remove)

	# Missed Entity
	# A set of merges of singles that form a single cluster
	to_remove = []
	for cluster in gold:
		is_disjoint = True
		missing = 0
		for mention in cluster:
			if mention not in auto_mentions:
				missing += 1
			elif coreference.mention_type(mention, text, parses, heads) != 'pronoun':
				is_disjoint = False
				break
		if not (is_disjoint and missing > 1):
			continue
		properties = ['missing'] + cluster_error_properties(cluster, text, parses, heads, gold_doc)
		changes['missing entity'].append((cluster.copy(), properties))
		for mention in cluster:
			if mention in auto_mentions:
				continue
			# Either operation may be absent, so record None rather than
			# assuming both were found
			merge_op = next((merge for merge in changes['merge']
				if len(merge[0]) == 1 and mention in merge[0]), None)
			introduce_op = next((introduce for introduce in changes['introduce']
				if len(introduce[0]) == 1 and mention in introduce[0]), None)
			to_remove.append((merge_op, introduce_op))
	for merge, introduce in to_remove:
		if merge is not None:
			changes['merge'].remove(merge)
		if introduce is not None:
			changes['introduce'].remove(introduce)

	# Remove the splits and merges that involve the earliest non-pronoun mentions in the cluster
	to_remove = []
	for split in changes['split']:
		part_first = min_non_pronoun(split[0], text, parses, heads)
		cluster_first = min_non_pronoun(split[1], text, parses, heads)
		if part_first == cluster_first:
			# With no non-pronoun on either side, fall back to the smallest mention
			if part_first is None and min(split[0]) != min(split[1]):
				continue
			paired_remove = next((remove for remove in changes['remove']
				if split[0] == remove[0]), None)
			to_remove.append((split, paired_remove))
	for split, remove in to_remove:
		changes['split'].remove(split)
		if remove is not None:
			changes['remove'].remove(remove)
	to_remove = []
	for merge in changes['merge']:
		part_first = min_non_pronoun(merge[0], text, parses, heads)
		cluster_first = min_non_pronoun(merge[1], text, parses, heads)
		if part_first == cluster_first:
			if part_first is None and min(merge[0]) != min(merge[1]):
				continue
			paired_introduce = next((introduce for introduce in changes['introduce']
				if introduce[0] == merge[0]), None)
			to_remove.append((merge, paired_introduce))
	for merge, introduce in to_remove:
		changes['merge'].remove(merge)
		if introduce is not None:
			changes['introduce'].remove(introduce)

	# Remaining cases of splitting a singleton, which does not get merged, are incorrectly referential
	# NOTE(review): "does not get merged" is tested as split[2] != '' - confirm what split[2] holds
	singleton_splits = [split for split in changes['split']
		if len(split[0]) == 1 and split[2] != '']
	for split in singleton_splits:
		changes['split'].remove(split)
		mention = next(iter(split[0]))
		paired_remove = next((remove for remove in changes['remove']
			if mention in remove[0]), None)
		if paired_remove is not None:
			changes['remove'].remove(paired_remove)
		properties = ['extra'] + mention_error_properties(mention, split[1], text, parses, heads, gold_doc)
		changes['extra mention'].append((split[0], split, properties))

	# Pair up introduces and merges to form incorrectly non-referential
	to_remove = []
	for merge in changes['merge']:
		if len(merge[0]) != 1:
			continue
		mention = next(iter(merge[0]))
		# Skip mentions that are also split off as a singleton somewhere
		elsewhere = any(len(split[0]) == 1 and next(iter(split[0])) == mention
			for split in changes['split'])
		if elsewhere:
			continue
		if mention != min_non_pronoun(merge[1], text, parses, heads) and mention not in auto_mentions:
			properties = ['missing'] + mention_error_properties(mention, merge[1], text, parses, heads, gold_doc)
			changes['missing mention'].append(({mention}, merge[1], merge, properties))
			# The merge is only removed when a matching singleton introduce exists
			for introduce in changes['introduce']:
				if len(introduce[0]) == 1 and mention in introduce[0]:
					to_remove.append((merge, introduce))
					break
	for merge, introduce in to_remove:
		changes['merge'].remove(merge)
		changes['introduce'].remove(introduce)

	return changes
def cluster_error_properties(cluster, text, parses, heads, gold_doc):
	"""Describe a cluster as a flat list of properties.

	The returned list holds, in order:
	  0    number of mentions in the cluster
	  1-3  how many mentions are typed 'name', 'nominal' and 'pronoun'
	  4    lowercased text of the pronoun when the cluster is exactly one
	       pronoun plus one name/nominal, otherwise None
	  5    number of pronouns that sort before the smallest non-pronoun
	       mention (found with min_non_pronoun(..., check_head=True))
	  6    sorted list of the distinct gold_doc['ner'] values of its mentions
	  7    True when all mentions have the same lowercased text
	  8    True when all mentions have the same lowercased head word

	Mentions must be usable as keys of gold_doc['ner'].
	"""
	ans = []

	# How big is the cluster
	ans.append(len(cluster))

	# Counts of each type in the cluster (types are looked up once and reused)
	typed_mentions = []
	for mention in cluster:
		typed_mentions.append((mention, coreference.mention_type(mention, text, parses, heads)))
	counts = [0, 0, 0]
	type_slot = {'name': 0, 'nominal': 1, 'pronoun': 2}
	pronouns = []
	for mention, mtype in typed_mentions:
		if mtype in type_slot:
			counts[type_slot[mtype]] += 1
		if mtype == 'pronoun':
			pronouns.append(mention)
	ans += counts

	# If it is one pronoun and something else, more info on the pronoun
	if counts[0] + counts[1] == 1 and counts[2] == 1:
		ans.append(coreference_rendering.mention_text(text, pronouns[-1]).lower())
	else:
		ans.append(None)

	# Number of cataphoric pronouns
	cataphora = 0
	non_pronoun = min_non_pronoun(cluster, text, parses, heads, True)
	# Without a non-pronoun mention nothing can be cataphoric; test this
	# explicitly rather than relying on how mentions compare with None
	if non_pronoun is not None:
		for mention in pronouns:
			if mention < non_pronoun:
				cataphora += 1
	ans.append(cataphora)

	# NER types
	ner_lookup = gold_doc['ner']
	ans.append(sorted(set(ner_lookup[mention] for mention in cluster if mention in ner_lookup)))

	# Are all the mentions the same?
	mtext = set(coreference_rendering.mention_text(text, mention).lower() for mention in cluster)
	ans.append(len(mtext) == 1)

	# Are all the heads the same?
	mhead = set(coreference.mention_head(mention, text, parses, heads)[1].lower() for mention in cluster)
	ans.append(len(mhead) == 1)

	return ans
def mention_error_properties(mention, cluster, text, parses, heads, gold_doc):
	"""Describe one mention relative to the other mentions of `cluster`.

	`cluster` must be a set (its difference() method is used) and must not
	be empty. The returned list holds, in order:
	  - the mention type from coreference.mention_type()
	  - the lowercased mention text, whitespace replaced by '_'
	  - 'text_match' / 'no_text_match': same text as another mention
	  - 'head_match' / 'no_head_match': same head word as another mention
	  - 'nested_outside', 'nested_inside', 'nested_both' or 'not_nested'
	  - whether the mention is min(cluster)
	  - whether the mention is max(cluster)
	  - whether it sorts before the cluster's smallest non-pronoun mention
	  - for each of ner, number, person and gender: '<name>_unknown',
	    '<name>_matches' or '<name>_does_not_match'
	"""
	others = cluster.difference({mention})
	props = []

	# Type of mention
	props.append(coreference.mention_type(mention, text, parses, heads))

	# Text of mention
	mtext = coreference_rendering.mention_text(text, mention).lower()
	props.append('_'.join(mtext.split()))

	# Does it have a string match with something in the cluster?
	has_text_match = any(
		coreference_rendering.mention_text(text, other).lower() == mtext
		for other in others)
	props.append('text_match' if has_text_match else 'no_text_match')

	# Does it have a head match with something in the cluster?
	mhead = coreference.mention_head(mention, text, parses, heads)[1].lower()
	has_head_match = any(
		mhead == coreference.mention_head(other, text, parses, heads)[1].lower()
		for other in others)
	props.append('head_match' if has_head_match else 'no_head_match')

	# Is it nested within another mention in the cluster
	# Only mentions with the same first field are compared, and both
	# boundaries have to be strictly inside for a span to count as nested
	contains_other = False
	inside_other = False
	for other in others:
		if other[0] != mention[0]:
			continue
		if mention[1] < other[1] and other[2] < mention[2]:
			contains_other = True
		if other[1] < mention[1] and mention[2] < other[2]:
			inside_other = True
		if contains_other and inside_other:
			break
	if contains_other and inside_other:
		props.append('nested_both')
	elif contains_other:
		props.append('nested_outside')
	elif inside_other:
		props.append('nested_inside')
	else:
		props.append('not_nested')

	# Was it first in the cluster?
	props.append(mention == min(cluster))

	# Was it last in the cluster?
	props.append(mention == max(cluster))

	# Is it a case of cataphora?
	first_non_pronoun = min_non_pronoun(cluster, text, parses, heads)
	props.append(first_non_pronoun is not None and mention < first_non_pronoun)

	# Do NER, number, person, or gender of mention and cluster match?
	cluster_properties = get_cluster_info(others, gold_doc)
	mention_properties = get_cluster_info({mention}, gold_doc)
	for i, word in enumerate(['ner', 'number', 'person', 'gender']):
		mine = mention_properties[i]
		theirs = cluster_properties[i]
		if len(mine) == 0 or len(theirs) == 0:
			suffix = '_unknown'
		elif len(mine.intersection(theirs)) > 0:
			suffix = '_matches'
		else:
			suffix = '_does_not_match'
		props.append(word + suffix)

	return props
def split_merge_properties(part, cluster, auto, gold, text, parses, heads, gold_mentions, gold_clusters, auto_mentions, gold_doc):
	"""Describe a split or merge of `part` from/into `cluster` as a flat list.

	`part` must be non-empty and `cluster` must be a set; "rest" below means
	cluster.difference(part). The number before each entry is its index in
	the returned list:
	  0      size of part
	  1      size of rest
	  2      text of part ('_' joined, lowercased) if it is a single mention,
	         otherwise None
	  3      "<n>_cataphoric": pronouns in part that sort before the smallest
	         non-pronoun mention among the cluster's auto mentions, counted
	         only when that mention is not itself in part
	  4-6    number of names, nominals and pronouns in part
	  7-9    number of names, nominals and pronouns in rest
	  10     True if no mention of part is in gold_mentions
	  11     True if no mention of rest is in gold_mentions
	  12     'string_match' / 'no_string_match' between non-pronouns of
	         part and rest
	  13     'head_match' / 'no_head_match' between non-pronouns of part
	         and rest
	  14     'delete', 'merge' or 'nothing', judged against gold
	  15     'introduce', 'split' or 'nothing', judged against auto
	  16-27  for each of the four get_cluster_info() properties: whether
	         rest and part have equal values, then 'part_...' and
	         'cluster_...' strings of the sorted values

	`gold` is not used by this function.
	"""
	ans = []
	rest = cluster.difference(part)

	# Size of part
	ans.append(len(part)) # 0

	# Size of rest
	ans.append(len(rest)) # 1

	# If size 1, what the text is
	mtext = None
	if len(part) == 1:
		mention = next(iter(part))
		mtext = '_'.join(coreference_rendering.mention_text(text, mention).lower().split())
	ans.append(mtext) # 2

	# Does this part have any cataphoric pronouns
	count = 0
	auto_part_of_cluster = set(mention for mention in cluster if mention in auto_mentions)
	non_pronoun = min_non_pronoun(auto_part_of_cluster, text, parses, heads)
	if non_pronoun is not None and non_pronoun not in part:
		for mention in part:
			if mention in auto_mentions and mention < non_pronoun:
				mtype = coreference.mention_type(mention, text, parses, heads)
				if mtype == 'pronoun':
					count += 1
	ans.append("%d_cataphoric" % count) # 3

	# Number of pronouns, nominals, names present in it
	# The non-pronoun mentions are remembered for the match checks below
	type_counts = {'pronoun': 0, 'name': 0, 'nominal': 0}
	part_non_pronouns = []
	for mention in part:
		mtype = coreference.mention_type(mention, text, parses, heads)
		type_counts[mtype] += 1
		if mtype != 'pronoun':
			part_non_pronouns.append(mention)
	ans.append(type_counts['name']) # 4
	ans.append(type_counts['nominal']) # 5
	ans.append(type_counts['pronoun']) # 6

	# Number of pronouns, nominals, names, in rest
	type_counts = {'pronoun': 0, 'name': 0, 'nominal': 0}
	rest_non_pronouns = []
	for mention in rest:
		mtype = coreference.mention_type(mention, text, parses, heads)
		type_counts[mtype] += 1
		if mtype != 'pronoun':
			rest_non_pronouns.append(mention)
	ans.append(type_counts['name']) # 7
	ans.append(type_counts['nominal']) # 8
	ans.append(type_counts['pronoun']) # 9

	# Whether this is extra
	ans.append(not any(mention in gold_mentions for mention in part)) # 10

	# Whether the rest is all extra
	ans.append(not any(mention in gold_mentions for mention in rest)) # 11

	# Whether there is an exact string match between a mention in the part and cluster (excluding pronouns)
	match_present = 'no_string_match'
	if part_non_pronouns and rest_non_pronouns:
		rest_texts = set(coreference_rendering.mention_text(text, rmention).lower()
			for rmention in rest_non_pronouns)
		for smention in part_non_pronouns:
			if coreference_rendering.mention_text(text, smention).lower() in rest_texts:
				match_present = 'string_match'
				break
	ans.append(match_present) # 12

	# Whether there is a head match between a mention in the part and cluster (excluding pronouns)
	match_present = 'no_head_match'
	if part_non_pronouns and rest_non_pronouns:
		rest_heads = set(coreference.mention_head(rmention, text, parses, heads)[1].lower()
			for rmention in rest_non_pronouns)
		for smention in part_non_pronouns:
			if coreference.mention_head(smention, text, parses, heads)[1].lower() in rest_heads:
				match_present = 'head_match'
				break
	ans.append(match_present) # 13

	# What has happened, or will happen
	example = next(iter(part))
	action = 'nothing'
	if example not in gold_mentions:
		action = 'delete'
	elif part != set(gold_clusters[gold_mentions[example]]):
		action = 'merge'
	ans.append(action) # 14

	action = 'nothing'
	if example not in auto_mentions:
		action = 'introduce'
	else:
		# Only the first auto cluster containing the example is considered
		for acluster in auto:
			if example in acluster:
				if acluster != part:
					action = 'split'
				break
	ans.append(action) # 15

	# NER, number, person, gender
	cproperties = get_cluster_info(rest, gold_doc)
	pproperties = get_cluster_info(part, gold_doc)
	for prop in range(4):
		ans.append(cproperties[prop] == pproperties[prop])
		ans.append('part_' + '_'.join(sorted(pproperties[prop])))
		ans.append('cluster_' + '_'.join(sorted(cproperties[prop])))

	return ans