def min_non_pronoun(cluster, text, parses, heads, check_head=False):
    """Return the smallest mention in cluster that is not a pronoun.

    A mention is skipped when coreference.mention_type reports 'pronoun'
    for it.  When check_head is true, a mention is also skipped if the span
    built from its head (as returned by coreference.mention_head with
    default_last=True) is typed as a pronoun.

    Returns None when every mention is skipped or the cluster is empty.
    """
    def is_pronoun(span):
        return coreference.mention_type(span, text, parses, heads) == 'pronoun'

    earliest = None
    for candidate in cluster:
        if is_pronoun(candidate):
            continue
        if check_head:
            head = coreference.mention_head(candidate, text, parses, heads,
                                            default_last=True)
            # Same sentence index as the mention, span taken from the head.
            head_span = (candidate[0], head[0][0], head[0][1])
            if is_pronoun(head_span):
                continue
        if earliest is not None and not (earliest > candidate):
            continue
        earliest = candidate
    return earliest
def print_pre_change_info(out, auto, gold, auto_mentions, gold_mention_set, text, parses, heads, gold_clusters, gold_mentions, gold_doc, auto_clusters):
    """Write one "Cataphoric properties" line per pronoun to out['out'].

    For every pronoun mention found in the gold or auto clusters a
    three-slot record is kept:
      [0] pronoun precedes the first non-pronoun of its gold cluster
      [1] pronoun precedes the first non-pronoun of its auto cluster
      [2] when [0] and [1] both hold, whether the first non-pronoun of the
          two clusters is the same mention
    A slot stays None when it was never evaluated.
    """
    # Cataphora
    flags = defaultdict(lambda: [None, None, None])

    # Slot 0 is filled from the gold clusters, slot 1 from the auto clusters.
    for slot, clustering in enumerate((gold, auto)):
        for cluster in clustering:
            first_non_pronoun = min_non_pronoun(cluster, text, parses, heads)
            for mention in cluster:
                if coreference.mention_type(mention, text, parses, heads) != 'pronoun':
                    continue
                flags[mention][slot] = bool(
                    first_non_pronoun is not None and mention < first_non_pronoun)

    cataphoric_in_both = [
        mention for mention in flags
        if flags[mention][0] and flags[mention][1]
    ]

    for mention in cataphoric_in_both:
        auto_cluster = auto_clusters[auto_mentions[mention]]
        gold_cluster = gold_clusters[gold_mentions[mention]]
        auto_first = min_non_pronoun(auto_cluster, text, parses, heads)
        gold_first = min_non_pronoun(gold_cluster, text, parses, heads)
        flags[mention][2] = bool(auto_first == gold_first)

    for mention in flags:
        mtext = coreference_rendering.mention_text(text, mention).lower()
        print >> out['out'], "Cataphoric properties", flags[mention], mtext
def _drop_remove_for_split(changes, split_mentions):
    """Delete the first 'remove' change containing a mention of the split."""
    removes = changes['remove']
    if not removes:
        return
    representative = next(iter(split_mentions))
    for remove in removes:
        if representative in remove[0]:
            removes.remove(remove)
            return


def categorise(auto, gold, changes, text, parses, heads, gold_mention_set, auto_mentions, gold_doc):
    """Regroup low-level edit operations into higher-level error categories.

    changes is modified in place and also returned.  It is read through the
    keys 'split', 'remove', 'merge' and 'introduce', and extended through the
    keys 'extra entity', 'missing entity', 'extra mention' and
    'missing mention'; every value must support append/remove.

    NOTE(review): the layout of the individual operations is inferred from
    how they are indexed here - item [0] is the set of mentions the operation
    moves, item [1] the cluster involved, and split[2] is compared with ''.
    Confirm against the code that builds changes.

    The passes run in this order, each consuming operations so that later
    passes do not see them:
      1. extra entity    - splits that take apart a cluster with no gold
                           non-pronoun content, into pieces made only of
                           non-gold mentions
      2. missing entity  - gold clusters with more than one mention absent
                           from the system output and no non-pronoun present
      3. splits / merges whose two sides share the same earliest non-pronoun
         are dropped
      4. extra mention   - remaining single-mention splits
      5. missing mention - remaining single-mention merges of mentions that
                           the system did not produce
    """
    # Not an Entity
    # A set of splits to singles that cover an entire cluster
    to_add = defaultdict(list)
    for split in changes['split']:
        has_gold_non_pronoun = any(
            mention in gold_mention_set and
            coreference.mention_type(mention, text, parses, heads) != 'pronoun'
            for mention in split[1])
        if has_gold_non_pronoun:
            continue
        if any(mention in gold_mention_set for mention in split[0]):
            continue
        to_add[tuple(split[1])].append(split)

    for cluster_key, splits in to_add.items():
        split_cluster = set()
        for split in splits:
            split_cluster.update(split[0])
        if len(split_cluster) == 1:
            continue
        properties = ['extra'] + cluster_error_properties(split_cluster, text, parses, heads, gold_doc)
        changes['extra entity'].append((split_cluster, set(cluster_key), properties))
        for split in splits:
            changes['split'].remove(split)
            _drop_remove_for_split(changes, split[0])

    # Missed Entity
    # A set of merges of singles that form a single cluster
    paired = []
    for cluster in gold:
        is_disjoint = True
        missing = 0
        for mention in cluster:
            if mention not in auto_mentions:
                missing += 1
            elif coreference.mention_type(mention, text, parses, heads) != 'pronoun':
                is_disjoint = False
                break
        if not (is_disjoint and missing > 1):
            continue
        properties = ['missing'] + cluster_error_properties(cluster, text, parses, heads, gold_doc)
        changes['missing entity'].append((cluster.copy(), properties))
        for mention in cluster:
            if mention in auto_mentions:
                continue
            merge = next((op for op in changes['merge']
                          if len(op[0]) == 1 and mention in op[0]), None)
            introduce = next((op for op in changes['introduce']
                              if len(op[0]) == 1 and mention in op[0]), None)
            # Either operation may be absent; keep the slots separate so the
            # removal loop below never has to unpack a short tuple.
            paired.append((merge, introduce))
    for merge, introduce in paired:
        if merge is not None:
            changes['merge'].remove(merge)
        if introduce is not None:
            changes['introduce'].remove(introduce)

    # Remove the splits and merges that involve the earliest non-pronoun
    # mentions in the cluster
    paired = []
    for split in changes['split']:
        first = min_non_pronoun(split[0], text, parses, heads)
        if first != min_non_pronoun(split[1], text, parses, heads):
            continue
        # With no non-pronoun on either side, fall back to the first mention.
        if first is None and min(split[0]) != min(split[1]):
            continue
        remove = next((op for op in changes['remove'] if split[0] == op[0]), None)
        paired.append((split, remove))
    for split, remove in paired:
        changes['split'].remove(split)
        if remove is not None:
            changes['remove'].remove(remove)

    paired = []
    for merge in changes['merge']:
        first = min_non_pronoun(merge[0], text, parses, heads)
        if first != min_non_pronoun(merge[1], text, parses, heads):
            continue
        if first is None and min(merge[0]) != min(merge[1]):
            continue
        introduce = next((op for op in changes['introduce'] if op[0] == merge[0]), None)
        paired.append((merge, introduce))
    for merge, introduce in paired:
        changes['merge'].remove(merge)
        if introduce is not None:
            changes['introduce'].remove(introduce)

    # Remaining cases of splitting a singleton, which does not get merged,
    # are incorrectly referential
    singleton_splits = [split for split in changes['split']
                        if len(split[0]) == 1 and split[2] != '']
    for split in singleton_splits:
        changes['split'].remove(split)
        _drop_remove_for_split(changes, split[0])
        mention = next(iter(split[0]))
        properties = ['extra'] + mention_error_properties(mention, split[1], text, parses, heads, gold_doc)
        changes['extra mention'].append((split[0], split, properties))

    # Pair up introduces and merges to form incorrectly non-referential
    # changes['split'] is not modified in this pass, so the mentions that are
    # still split off on their own can be collected once.
    still_split = set(next(iter(split[0])) for split in changes['split']
                      if len(split[0]) == 1)
    paired = []
    for merge in changes['merge']:
        if len(merge[0]) != 1:
            continue
        mention = next(iter(merge[0]))
        if mention in still_split:
            continue
        if mention == min_non_pronoun(merge[1], text, parses, heads) or mention in auto_mentions:
            continue
        properties = ['missing'] + mention_error_properties(mention, merge[1], text, parses, heads, gold_doc)
        changes['missing mention'].append(({mention}, merge[1], merge, properties))
        introduce = next((op for op in changes['introduce']
                          if len(op[0]) == 1 and mention in op[0]), None)
        if introduce is not None:
            paired.append((merge, introduce))
    for merge, introduce in paired:
        changes['merge'].remove(merge)
        changes['introduce'].remove(introduce)

    return changes
def cluster_error_properties(cluster, text, parses, heads, gold_doc):
    """Describe a cluster as a flat list of properties.

    The returned list has, in order:
      0  number of mentions in the cluster
      1  number of mentions typed 'name'
      2  number of mentions typed 'nominal'
      3  number of mentions typed 'pronoun'
      4  lower-cased text of the pronoun when the cluster is exactly one
         pronoun plus one name/nominal, otherwise None
      5  number of pronouns that sort before the earliest non-pronoun
         (as found by min_non_pronoun with head checking enabled)
      6  sorted list of the distinct gold_doc['ner'] values of the mentions
      7  True when all mentions have the same lower-cased text
      8  True when all mentions have the same lower-cased head word
    """
    # Each mention's type is needed several times below; look it up once.
    mention_types = {}
    for mention in cluster:
        mention_types[mention] = coreference.mention_type(mention, text, parses, heads)

    # How big is the cluster
    ans = [len(cluster)]

    # Counts of each type in the cluster; any other type is ignored
    counts = {'name': 0, 'nominal': 0, 'pronoun': 0}
    for mention in cluster:
        mtype = mention_types[mention]
        if mtype in counts:
            counts[mtype] += 1
    ans += [counts['name'], counts['nominal'], counts['pronoun']]

    # If it is one pronoun and something else, more info on the pronoun
    if counts['name'] + counts['nominal'] == 1 and counts['pronoun'] == 1:
        pronoun = next(m for m in cluster if mention_types[m] == 'pronoun')
        ans.append(coreference_rendering.mention_text(text, pronoun).lower())
    else:
        ans.append(None)

    # Number of cataphoric pronouns
    cataphora = 0
    non_pronoun = min_non_pronoun(cluster, text, parses, heads, True)
    # Without a non-pronoun there is nothing to precede.  Python 2 evaluated
    # `mention < None` as False, so the count was already 0 in that case;
    # the explicit guard keeps that result and avoids comparing with None.
    if non_pronoun is not None:
        for mention in cluster:
            if mention < non_pronoun and mention_types[mention] == 'pronoun':
                cataphora += 1
    ans.append(cataphora)

    # NER types
    ner = set(gold_doc['ner'][mention] for mention in cluster
              if mention in gold_doc['ner'])
    ans.append(sorted(ner))

    # Are all the mentions the same?
    texts = set(coreference_rendering.mention_text(text, mention).lower()
                for mention in cluster)
    ans.append(len(texts) == 1)

    # Are all the heads the same?
    head_words = set(coreference.mention_head(mention, text, parses, heads)[1].lower()
                     for mention in cluster)
    ans.append(len(head_words) == 1)

    return ans
def mention_error_properties(mention, cluster, text, parses, heads, gold_doc):
    """Describe a mention relative to the rest of its cluster.

    cluster must be a set; the mention itself is excluded before comparing.

    The returned list has, in order:
      0   mention type as given by coreference.mention_type
      1   lower-cased mention text with whitespace replaced by '_'
      2   'text_match' / 'no_text_match' against the other mentions
      3   'head_match' / 'no_head_match' against the other mentions
      4   'not_nested', 'nested_inside', 'nested_outside' or 'nested_both'
      5   True when the mention is min(cluster)
      6   True when the mention is max(cluster)
      7   True when the mention sorts before the earliest non-pronoun
      8-11  for 'ner', 'number', 'person', 'gender': '<word>_unknown',
            '<word>_matches' or '<word>_does_not_match', based on the first
            four entries returned by get_cluster_info
    """
    ans = []
    rest = cluster.difference({mention})

    # Type of mention
    ans.append(coreference.mention_type(mention, text, parses, heads))

    # Text of mention
    mtext = coreference_rendering.mention_text(text, mention).lower()
    ans.append('_'.join(mtext.split()))

    # Does it have a string match with something in the cluster?
    text_match = any(
        coreference_rendering.mention_text(text, other).lower() == mtext
        for other in rest)
    ans.append('text_match' if text_match else 'no_text_match')

    # Does it have a head match with something in the cluster?
    mhead = coreference.mention_head(mention, text, parses, heads)[1].lower()
    head_match = any(
        mhead == coreference.mention_head(other, text, parses, heads)[1].lower()
        for other in rest)
    ans.append('head_match' if head_match else 'no_head_match')

    # Is it nested within another mention in the cluster
    # NOTE(review): assumes a mention is (sentence, start, end) - confirm.
    # Both bounds are compared strictly, so spans sharing an edge do not count.
    nested = 'not_nested'
    for other in rest:
        if other[0] != mention[0]:
            continue
        if mention[1] < other[1] and other[2] < mention[2]:
            if nested == 'nested_inside':
                nested = 'nested_both'
                break
            nested = 'nested_outside'
        if other[1] < mention[1] and mention[2] < other[2]:
            if nested == 'nested_outside':
                nested = 'nested_both'
                break
            nested = 'nested_inside'
    ans.append(nested)

    # Was it first in the cluster?
    ans.append(mention == min(cluster))

    # Was it last in the cluster?
    ans.append(mention == max(cluster))

    # Is it a case of cataphora?
    non_pronoun = min_non_pronoun(cluster, text, parses, heads)
    ans.append(non_pronoun is not None and mention < non_pronoun)

    # Do NER, number, person, or gender of mention and cluster match?
    cluster_properties = get_cluster_info(rest, gold_doc)
    mention_properties = get_cluster_info({mention}, gold_doc)
    words = ['ner', 'number', 'person', 'gender']
    for i, word in enumerate(words):
        mention_values = mention_properties[i]
        cluster_values = cluster_properties[i]
        if len(mention_values) == 0 or len(cluster_values) == 0:
            ans.append(word + '_unknown')
        elif len(mention_values.intersection(cluster_values)) > 0:
            ans.append(word + '_matches')
        else:
            ans.append(word + '_does_not_match')

    return ans
def split_merge_properties(part, cluster, auto, gold, text, parses, heads, gold_mentions, gold_clusters, auto_mentions, gold_doc):
    """Describe a split or merge of part relative to the rest of cluster.

    part and cluster must be sets and part must not be empty.

    The returned list has, in order:
      0      size of part
      1      size of the rest of the cluster
      2      text of the mention ('_'-joined, lower case) when part has a
             single mention, otherwise None
      3      '<n>_cataphoric': pronouns in part that are in auto_mentions and
             sort before the earliest non-pronoun among the cluster's
             mentions in auto_mentions (0 if that mention is itself in part)
      4-6    counts of name / nominal / pronoun mentions in part
      7-9    counts of name / nominal / pronoun mentions in the rest
      10     True when no mention of part is in gold_mentions
      11     True when no mention of the rest is in gold_mentions
      12     'string_match' / 'no_string_match' between non-pronouns of part
             and non-pronouns of the rest
      13     'head_match' / 'no_head_match', same comparison on head words
      14     'delete', 'merge' or 'nothing' (relative to the gold clusters)
      15     'introduce', 'split' or 'nothing' (relative to the auto clusters)
      16-27  for each of the first four get_cluster_info entries: whether
             part and rest agree, then 'part_<values>' and 'cluster_<values>'

    Raises KeyError when coreference.mention_type returns something other
    than 'name', 'nominal' or 'pronoun'.
    """
    ans = []
    rest = cluster.difference(part)

    # Size of part
    ans.append(len(part))

    # Size of rest
    ans.append(len(rest))

    # If size 1, what the text is
    mtext = None
    if len(part) == 1:
        mention = next(iter(part))
        mtext = '_'.join(coreference_rendering.mention_text(text, mention).lower().split())
    ans.append(mtext)

    # Does this part have any cataphoric pronouns
    count = 0
    acluster = set(mention for mention in cluster if mention in auto_mentions)
    non_pronoun = min_non_pronoun(acluster, text, parses, heads)
    if non_pronoun is not None and non_pronoun not in part:
        for mention in part:
            if mention in auto_mentions and mention < non_pronoun:
                if coreference.mention_type(mention, text, parses, heads) == 'pronoun':
                    count += 1
    ans.append("%d_cataphoric" % count)

    # Look up every mention's type once; it is reused for the counts and for
    # the string / head comparisons below.
    part_types = {}
    for mention in part:
        part_types[mention] = coreference.mention_type(mention, text, parses, heads)
    rest_types = {}
    for mention in rest:
        rest_types[mention] = coreference.mention_type(mention, text, parses, heads)

    # Number of names, nominals, pronouns in part, then in rest
    for types in (part_types, rest_types):
        type_counts = {'pronoun': 0, 'name': 0, 'nominal': 0}
        for mtype in types.values():
            type_counts[mtype] += 1
        ans.append(type_counts['name'])
        ans.append(type_counts['nominal'])
        ans.append(type_counts['pronoun'])

    # Whether this is extra
    ans.append(not any(mention in gold_mentions for mention in part))

    # Whether the rest is all extra
    ans.append(not any(mention in gold_mentions for mention in rest))

    part_content = [m for m in part if part_types[m] != 'pronoun']
    rest_content = [m for m in rest if rest_types[m] != 'pronoun']

    # Whether there is an exact string match between a mention in the part
    # and cluster (excluding pronouns)
    match_present = 'no_string_match'
    if part_content and rest_content:
        rest_texts = set(coreference_rendering.mention_text(text, m).lower()
                         for m in rest_content)
        if any(coreference_rendering.mention_text(text, m).lower() in rest_texts
               for m in part_content):
            match_present = 'string_match'
    ans.append(match_present)

    # Whether there is a head match between a mention in the part and cluster
    # (excluding pronouns)
    match_present = 'no_head_match'
    if part_content and rest_content:
        rest_heads = set(coreference.mention_head(m, text, parses, heads)[1].lower()
                         for m in rest_content)
        if any(coreference.mention_head(m, text, parses, heads)[1].lower() in rest_heads
               for m in part_content):
            match_present = 'head_match'
    ans.append(match_present)

    # What has happened, or will happen
    example = next(iter(part))
    action = 'nothing'
    if example not in gold_mentions:
        action = 'delete'
    elif part != set(gold_clusters[gold_mentions[example]]):
        action = 'merge'
    ans.append(action)

    action = 'nothing'
    if example not in auto_mentions:
        action = 'introduce'
    else:
        # Only the first auto cluster holding the example is inspected.
        for auto_cluster in auto:
            if example in auto_cluster:
                if auto_cluster != part:
                    action = 'split'
                break
    ans.append(action)

    # NER, number, person, gender
    cproperties = get_cluster_info(rest, gold_doc)
    pproperties = get_cluster_info(part, gold_doc)
    for prop in range(4):
        ans.append(cproperties[prop] == pproperties[prop])
        ans.append('part_' + '_'.join(sorted(pproperties[prop])))
        ans.append('cluster_' + '_'.join(sorted(cproperties[prop])))

    return ans