def get_edges(parse, edges, spines, head_map, symbol_map):
    """Collect the normal (non-trace) dependency edges of a parse.

    The tree is walked bottom-up. For every non-terminal, each child whose
    head differs from the non-terminal's own head contributes one edge.

    Args:
        parse: tree node providing `subtrees`, `is_terminal()`, `label` and
            `unique_id`.
        edges: list that is extended in place with 7-tuples of
            (child head position, child symbol, parent head position,
            parent symbol, "_", False, False).
        spines: not read here; only passed through the recursion.
        head_map: head information handed to `head_finder.get_head`.
        symbol_map: mapping from a node's `unique_id` to its spine symbol.
    """
    # Handle the children first so that edges are emitted bottom-up.
    for subparse in parse.subtrees:
        get_edges(subparse, edges, spines, head_map, symbol_map)

    if parse.is_terminal():
        return

    phead = head_finder.get_head(head_map, parse, True)
    # Normal edges, added by looking at which subparses are not the head of
    # this non-terminal
    for subparse in parse.subtrees:
        chead = head_finder.get_head(head_map, subparse, True)
        if phead is None or chead is None or phead[0] == chead[0]:
            continue
        psym = symbol_map[parse.unique_id]
        placeholder = subparse.label + "_0"
        if subparse.is_terminal():
            csym = placeholder
        else:
            # The original assigned this lookup to a misspelt name (`cysm`),
            # so the spine symbol of a non-terminal child was never used.
            # The placeholder is kept as a fallback for unmapped children.
            csym = symbol_map.get(subparse.unique_id, placeholder)
        edges.append(
            (chead[0][1], csym, phead[0][1], psym, "_", False, False))
def label_level(parse, head_map, label=None):
    """Count nodes carrying `label` on the head chain below `parse`.

    Starting at `parse`, repeatedly step to the first child that has the
    same head as `parse`. Each child stepped into whose label (with
    coindexation removed) equals `label` adds one to the result. When
    `label` is None, the de-coindexed label of `parse` itself is used.
    """
    target_head = head_finder.get_head(head_map, parse, True)
    if label is None:
        label = treebanks.remove_coindexation_from_label(parse.label)

    matches = 0
    node = parse
    while True:
        for child in node.subtrees:
            child_label = treebanks.remove_coindexation_from_label(child.label)
            if target_head == head_finder.get_head(head_map, child, True):
                break
        else:
            # No child shares the head: the chain ends here.
            return matches
        node = child
        if child_label == label:
            matches += 1
def mention_head(mention, text, parses, heads, default_last=True):
    """Return the head of a mention given as (sentence, start, end).

    The 'lowest' node covering the mention span is looked up in the
    sentence's parse. If there is none, the node covering only the last
    word is used when `default_last` is true; otherwise None is returned.
    `text` is accepted but not used.
    """
    sentence, start, end = mention
    tree = parses[sentence]
    node = tree.get_nodes('lowest', start, end)
    if node is None:
        if not default_last:
            return None
        node = tree.get_nodes('lowest', end - 1, end)
    return head_finder.get_head(heads[sentence], node)
def get_spines(parse, head_map, spines, symbol_map, traces):
    """Build the spine of every non-trace terminal under `parse`.

    For each such terminal, the tree is climbed through parent links for as
    long as the ancestor has the same head as the terminal. Every ancestor
    other than the root adds one entry to the spine and is recorded in
    `symbol_map` as "<label>_<n>", where n counts earlier uses of that label
    on this spine. A tuple (word end position, terminal label, spine, word)
    is appended to `spines`.
    """
    if not parse.is_terminal():
        for child in parse.subtrees:
            get_spines(child, head_map, spines, symbol_map, traces)
        return
    if parse.is_trace():
        return

    # Add spine, happens on the terminal that this spine will be assigned to
    word_head = head_finder.get_head(head_map, parse, True)
    spine = []
    label_counts = defaultdict(int)
    null_counts = defaultdict(int)

    # Walk up the parse via parent links, gradually building the spine
    node = parse.parent
    while (node is not None and
           word_head == head_finder.get_head(head_map, node, True)):
        if node.parent is not None:  # Avoid the case of the ROOT
            # Add null elements that need to live here.
            # Most complex case:
            # ( (S
            #     (ADJP-TPC-1 (RB Not) (RB likely) )
            #     (, ,)
            #     (NP-SBJ (PRP I) )
            #     (VP (VBP think)
            #       (SBAR (-NONE- 0)
            #         (S
            #           (NP-SBJ (-NONE- *) )
            #           (ADJP-PRD (-NONE- *T*-1) ))))
            #     (. .) ))
            entry = ''
            for child in node.subtrees:
                if child.wordspan[0] == child.wordspan[1]:
                    compressed = compress_null_for_spine(
                        child, null_counts, symbol_map, traces)
                    if compressed is not None:
                        entry += compressed + "_"

            # Add the non-terminal
            label = treebanks.remove_coindexation_from_label(node.label)
            index = label_counts[label]
            label_counts[label] = index + 1
            symbol_map[node.unique_id] = "{}_{}".format(label, index)
            spine.append(entry + label)
        node = node.parent

    spines.append((parse.wordspan[1], parse.label, spine, parse.word))
def mention_head(mention, text, parses, heads, default_last=True):
    """Return the head of a mention given as (sentence, start, end).

    The 'lowest' node covering the mention span is looked up in the
    sentence's parse. If there is none, the node covering only the last
    word is used when `default_last` is true; otherwise None is returned.
    `text` is accepted but not used.
    """
    # Canasai's addition (currently disabled):
    #   mention = special_cases(mention, text, parses, heads)
    sentence, start, end = mention
    tree = parses[sentence]
    node = tree.get_nodes('lowest', start, end)
    if node is None:
        if not default_last:
            return None
        node = tree.get_nodes('lowest', end - 1, end)
    return head_finder.get_head(heads[sentence], node)
def mention_text(text, mention, parses=None, heads=None, colour=None):
    """Render the words of a mention as a single space-joined string.

    `mention` is (sentence, start, end). When both `parses` and `heads` are
    given and the mention is longer than one word, the head word is wrapped
    in the ANSI underline sequence. When `colour` is given it is prefixed to
    the text (and re-applied after an underlined head), and the result ends
    with the ANSI reset sequence.
    """
    sentence, start, end = mention

    head = None
    if parses is not None and heads is not None and end - start > 1:
        node = parses[sentence].get_nodes('lowest', start, end)
        if node is not None:
            head = head_finder.get_head(heads[sentence], node)

    tokens = []
    for i in range(start, end):
        token = text[sentence][i]
        if head is not None and head[0][0] == i:
            token = "\033[4m" + token + "\033[0m"
        tokens.append(token)
    rendered = ' '.join(tokens)

    if colour is None:
        return rendered

    # The underline's reset also ends the colour, so colour each side of it.
    pieces = rendered.split("\033[0m")
    if len(pieces) == 1 or len(pieces[1]) == 0:
        return colour + pieces[0] + "\033[0m"
    return colour + pieces[0] + "\033[0m" + colour + pieces[1] + "\033[0m"
def _mention_colour(in_missing, in_extra):
    """Return the 256-colour code for a missing/extra combination."""
    if in_missing and in_extra:
        return '5'
    if in_missing:
        return '4'
    if in_extra:
        return '1'
    return '15'


def _span_within_any(mention, others):
    """Return True if the span of `mention` lies inside any span in `others`."""
    return any(other[1] <= mention[1] and mention[2] <= other[2]
               for other in others)


def _bracket_markup(boundaries, missing, extra, brackets):
    """Render coloured brackets for mentions starting or ending at a word.

    `boundaries` holds (sort key, [is gold, is auto], mention) tuples and
    `brackets` gives the characters for both / gold-only / auto-only.
    """
    markup = ''
    for _, flags, mention in boundaries:
        if flags[0] and flags[1]:
            character = brackets[0]
        elif flags[0]:
            character = brackets[1]
        elif flags[1]:
            character = brackets[2]
        else:
            character = ''
        colour = _mention_colour(_span_within_any(mention, missing),
                                 _span_within_any(mention, extra))
        markup += "\033[38;5;{}m{}\033[0m".format(colour, character)
    return markup


def print_mention_text(out, gold_mentions, auto_mention_set, gold_parses,
                       gold_heads, text):
    '''Document text with both system and gold mentions marked:

     - Gold mentions are marked with '[ ... ]'
     - System mentions are marked with '( ... )'
     - Mentions that occur in both are marked with '{ ... }'

    Colour is used to indicate missing and extra mentions.  Blue for missing,
    red for extra, and purple where they overlap.  Words that are the head of
    a mention (according to the gold parses and heads) are underlined.

    Mentions are (sentence, start, end) tuples.  Each sentence is written to
    `out` as one line followed by a blank line; `out` of None means standard
    output, as with the `print >> out` statement this replaces.
    '''
    if out is None:
        import sys
        out = sys.stdout

    # (sentence, word) positions that are the head of some mention
    heads = set()
    for mentions in (gold_mentions, auto_mention_set):
        for mention in mentions:
            node = gold_parses[mention[0]].get_nodes(
                'lowest', mention[1], mention[2])
            if node is not None:
                head = head_finder.get_head(gold_heads[mention[0]], node)
                heads.add((mention[0], head[0][0]))

    # Maps (sentence, word) -> mention -> [is gold, is auto]
    words = defaultdict(lambda: defaultdict(lambda: [False, False]))
    for flag, mentions in enumerate((gold_mentions, auto_mention_set)):
        for mention in mentions:
            for i in range(mention[1], mention[2]):
                words[mention[0], i][mention][flag] = True

    # Printing
    for sentence, sentence_words in enumerate(text):
        output = []
        for word, text_word in enumerate(sentence_words):
            if (sentence, word) in words:
                mention_dict = words[(sentence, word)]
                missing = set(m for m, flags in mention_dict.items()
                              if flags[0] and not flags[1])
                extra = set(m for m, flags in mention_dict.items()
                            if not flags[0] and flags[1])

                # Outermost brackets first: longest mention opens first and
                # the mention that started latest closes first.
                starts = sorted(
                    ((m[2], flags, m) for m, flags in mention_dict.items()
                     if m[1] == word),
                    reverse=True)
                ends = sorted(
                    ((m[1], flags, m) for m, flags in mention_dict.items()
                     if m[2] - 1 == word),
                    reverse=True)
                start = _bracket_markup(starts, missing, extra, '{[(')
                end = _bracket_markup(ends, missing, extra, '}])')

                colour = _mention_colour(len(missing) > 0, len(extra) > 0)
                # head
                if (sentence, word) in heads:
                    colour += ';4'
                text_word = (start +
                             "\033[38;5;{}m{}\033[0m".format(colour, text_word) +
                             end)
            output.append(text_word)
        out.write(' '.join(output) + '\n' + '\n')
def shg_format(parse, depth=0, head_map=None, traces=None, edges=None):
    """Render a parse as text: comment header, then one line per word.

    The header has the bracketed tree ("# Parse" lines), the numbered words
    ("# Sent"), the resolved trace tables ("# Trace", for debugging) and the
    graph type.  Each word line holds position, token, POS and spine,
    followed by " | parent label type trace" for every edge from that word,
    with normal ('_') edges placed before trace edges.

    The `head_map`, `traces` and `edges` arguments are recomputed here and
    `depth` is not read; passed-in values are ignored.
    """
    parse.calculate_spans()
    traces = treebanks.resolve_traces(parse)
    base_parse = treebanks.remove_traces(parse, False)
    head_map = head_finder.pennconverter_find_heads(base_parse)

    # Prefix
    lines = ["# Parse " + row
             for row in text_tree(parse, False, True).split("\n")]
    numbered = [" {} {}".format(position + 1, token)
                for position, token in enumerate(text_words(parse).split())]
    lines.append("# Sent" + ''.join(numbered))

    # Trace info (for debugging)
    for index in range(5):
        if index in (1, 2):
            for num in traces[index]:
                for tparse in traces[index][num]:
                    lines.append("# Trace {} {} {} {}".format(
                        index, num, tparse, tparse.span))
        else:
            for signature in traces[index]:
                lines.append("# Trace {} {} {}".format(
                    index, signature, traces[index][signature]))

    # The root edge, then everything found in the tree
    edges = []
    spines = []
    root_label = treebanks.remove_coindexation_from_label(parse.label)
    root_head = head_finder.get_head(head_map, parse, True)
    root_level = label_level(parse, head_map)
    edges.append(
        (root_head[0][1], '_', 0, root_label + "_" + str(root_level), "_"))
    get_edges(parse, edges, spines, head_map, traces)

    # Graph properties
    undirected = []
    for edge in edges:
        child, parent = int(edge[0]), int(edge[2])
        undirected.append(
            (child, parent) if child < parent else (parent, child))
    if check_proj(undirected):
        crossing = " proj"
    elif check_1ec(undirected):
        crossing = " 1ec"
    else:
        crossing = "other"
    shape = ' tree' if check_tree(undirected) else ' graph'
    lines.append('# Graph type - ' + crossing + shape)

    # Spines and edges
    spines.sort()
    for position, pos_tag, chain, token in spines:
        spine_text = '_'.join(chain) if chain else '_'
        row = "{} {} {} {}".format(position, token, pos_tag, spine_text)
        normal_parts = []
        trace_parts = []
        for edge in edges:
            if edge[0] != position:
                continue
            part = " | {} {} {} {}".format(edge[2], edge[1], edge[3], edge[4])
            if edge[4] == '_':
                # Each normal edge goes in front of those seen so far.
                normal_parts.insert(0, part)
            else:
                trace_parts.append(part)
        lines.append(row + ''.join(normal_parts) + ''.join(trace_parts))

    lines.append('')
    return "\n".join(lines)
def get_edges(parse, edges, spines, head_map, traces):
    """Collect spines and edges (normal and trace) for `parse`, recursively.

    Args:
        parse: tree node; the code reads `subtrees`, `parent`, `label`,
            `span`, `wordspan`, `word`, `is_terminal()`, `is_trace()` and
            `word_yield()`.
        edges: list extended in place with 5-tuples of
            (child head position, parent symbol, parent head position,
            child/trace symbol, trace info). Trace info is "_" for normal
            edges and "=" for the (P=# ...) links.
        spines: list extended in place with
            (word end position, terminal label, chain, word) tuples.
        head_map: head information handed to `head_finder.get_head`.
        traces: indexable collection of tables; entries 0, 3 and 4 are
            looked up by (span, label) and entries 1 and 2 by trace number.
            NOTE(review): presumably the result of
            `treebanks.resolve_traces` - confirm against the caller.
    """
    # Add spine
    chead = head_finder.get_head(head_map, parse, True)
    if parse.is_terminal() and not parse.is_trace():
        chain = []
        cur = parse.parent
        # Climb while the ancestor still has this terminal's head.
        while cur is not None and chead == head_finder.get_head(head_map, cur, True):
            chain.append(treebanks.remove_coindexation_from_label(cur.label))
            signature = (cur.span, cur.label)
            target, null_cur = None, None
            if signature in traces[3]:
                target, null_cur, onum = traces[3][signature]
            if signature in traces[4]:
                target, null_cur = traces[4][signature]
            if target is not None:
                # Record the null element's word and the labels between it
                # and `target`, outermost first, in brackets on this entry.
                null = [null_cur.word]
                null_cur = null_cur.parent
                while null_cur != target:
                    null.append(treebanks.remove_coindexation_from_label(null_cur.label))
                    null_cur = null_cur.parent
                null.reverse()
                chain[-1] += "({})".format("_".join(null))
            cur = cur.parent
        spines.append((parse.wordspan[1], parse.label, chain, parse.word))

    # Add edges
    if not parse.is_terminal():
        # Normal edges
        for subparse in parse.subtrees:
            shead = head_finder.get_head(head_map, subparse, True)
            if shead is not None and chead is not None:
                # Only children headed by a different word get an edge.
                if shead[0] != chead[0]:
                    plabel = treebanks.remove_coindexation_from_label(parse.label)
                    clabel = treebanks.remove_coindexation_from_label(subparse.label)
                    plevel = label_level(parse, head_map)
                    clevel = label_level(subparse, head_map)
                    edges.append((shead[0][1], plabel + '_' + str(plevel), chead[0][1], clabel + "_" + str(clevel), "_"))

        # Traces
        signature = (parse.span, parse.label)
        # A trace where both locations are NONE
        if signature in traces[3]:
            cparent, cparse, num = traces[3][signature]
            chead = head_finder.get_head(head_map, parse, True)
            clabel = treebanks.remove_coindexation_from_label(cparse.parent.label)
            clevel = label_level(cparse.parent, head_map)
            if num in traces[1]:
                for subparse in traces[1][num]:
                    trace_type = clabel + '_' + str(clevel)
                    # Climb to the nearest ancestor that has a head.
                    parent = subparse
                    while head_finder.get_head(head_map, parent, True) is None and parent.parent is not None:
                        parent = parent.parent
                    phead = head_finder.get_head(head_map, parent, True)
                    plabel = treebanks.remove_coindexation_from_label(parent.label)
                    ilabel = treebanks.remove_coindexation_from_label(subparse.parent.label)
                    # Append the trace word with its final '-' field dropped.
                    ilabel += "_"+ '-'.join(subparse.word.split('-')[:-1])
                    level = label_level(parent, head_map)
                    edges.append((chead[0][1], plabel + '_' + str(level), phead[0][1], trace_type, ilabel))

        # The realisation point of the trace (either with or without an observed word)
        if signature in traces[0]:
            num = traces[0][signature][0]
            if num in traces[1]:
                # If this is the middle of a chain of traces, follow the chain
                thead = chead
                tparse = parse
                in_chain = False
                working = True
                # `working` stops the loop once a pass finds no further link.
                while thead is None and working:
                    working = False
                    word = tparse.subtrees[0].word
                    if word is None:
                        # Ugh, these are messy cases, just find something to follow
                        for option in tparse.word_yield(None, True):
                            if '-' in option:
                                word = option
                    if '-' in word:
                        onum = word.split('-')[-1]
                        # Note: this loop rebinds the outer `signature`.
                        for signature in traces[0]:
                            if traces[0][signature][0] == onum:
                                tparse = traces[0][signature][1]
                                thead = head_finder.get_head(head_map, tparse, True)
                                working = True
                                in_chain = True
                for subparse in traces[1][num]:
                    slabel = treebanks.remove_coindexation_from_label(tparse.label)
                    slevel = label_level(tparse, head_map)
                    trace_type = "{}_{}".format(slabel, slevel)
                    parent = subparse.parent # Attachment point
                    plabel = treebanks.remove_coindexation_from_label(parent.parent.label)
                    plevel = label_level(parent.parent, head_map)
                    null_wrap = treebanks.remove_coindexation_from_label(parent.label)
                    # Append the trace word with its final '-' field dropped.
                    null_wrap += "_"+ '-'.join(subparse.word.split('-')[:-1])
                    # Climb to the nearest ancestor that has a head.
                    while head_finder.get_head(head_map, parent, True) is None and parent.parent is not None:
                        parent = parent.parent
                    phead = head_finder.get_head(head_map, parent, True)
                    if thead is not None:
                        edges.append((thead[0][1], plabel + '_' + str(plevel), phead[0][1], trace_type, null_wrap))
                    elif in_chain:
                        # Not handled by the null - null case above
                        # Retry one level up; `tparse` and `thead` stay
                        # updated for later iterations of this loop.
                        tparse = tparse.parent
                        slevel = label_level(tparse, head_map, slabel)
                        thead = head_finder.get_head(head_map, tparse, True)
                        trace_type = "{}_{}".format(slabel, slevel)
                        if thead is not None:
                            edges.append((thead[0][1], plabel + '_' + str(plevel), phead[0][1], trace_type, null_wrap))

            # For each (P-# ... ) add a link from all (P=# ... ) that match
            if num in traces[2]:
                phead = head_finder.get_head(head_map, parse, True)
                for subparse in traces[2][num]:
                    shead = head_finder.get_head(head_map, subparse, True)
                    plabel = treebanks.remove_coindexation_from_label(parse.label)
                    clabel = treebanks.remove_coindexation_from_label(subparse.label)
                    plevel = label_level(parse, head_map)
                    clevel = label_level(subparse, head_map)
                    if phead is None:
                        # Fall back to the parent node when this one has no head.
                        phead = head_finder.get_head(head_map, parse.parent, True)
                        plabel = treebanks.remove_coindexation_from_label(parse.parent.label)
                        plevel = label_level(parse.parent, head_map)
                    if shead is None:
                        print "# Failed on = with (P=# (NONE))"
                    else:
                        edges.append((shead[0][1], plabel + '_' + str(plevel), phead[0][1], clabel + "_" + str(clevel), "="))

    # Recurse into the children after this node has been handled.
    for subparse in parse.subtrees:
        get_edges(subparse, edges, spines, head_map, traces)