Пример #1
0
def get_edges(parse, edges, spines, head_map, symbol_map):
    """Recursively collect the ordinary (non-trace) edges below `parse`.

    The subtrees are visited first, so edges from deeper levels are
    appended before the edges of `parse` itself.  For every child of a
    non-terminal whose head differs from the head of `parse`, one tuple is
    appended to `edges`:

        (child head position, child symbol, parent head position,
         parent symbol, "_", False, False)

    The child symbol is `symbol_map[child.unique_id]` for a non-terminal
    child and `child.label + "_0"` for a terminal child.

    parse: tree node exposing `subtrees`, `is_terminal()`, `label` and
        `unique_id`.
    edges: list that receives the edge tuples (modified in place).
    spines: not used here, only passed on to the recursive calls.
    head_map: head table handed to `head_finder.get_head`.
    symbol_map: maps a node's `unique_id` to its symbol string.

    Raises KeyError if a needed node is absent from `symbol_map`.
    """
    for subparse in parse.subtrees:
        get_edges(subparse, edges, spines, head_map, symbol_map)

    if parse.is_terminal():
        return

    phead = head_finder.get_head(head_map, parse, True)
    # A child whose head differs from the parent's head depends on the parent.
    for subparse in parse.subtrees:
        chead = head_finder.get_head(head_map, subparse, True)
        if phead is not None and chead is not None and phead[0] != chead[0]:
            psym = symbol_map[parse.unique_id]
            if subparse.is_terminal():
                csym = subparse.label + "_0"
            else:
                # Fixed: the original assigned this to a misspelled name
                # (`cysm`), so the looked-up symbol was silently discarded.
                csym = symbol_map[subparse.unique_id]
            edges.append(
                (chead[0][1], csym, phead[0][1], psym, "_", False, False))
Пример #2
0
def label_level(parse, head_map, label=None):
  """Count same-labelled nodes on the head path below `parse`.

  Starting at `parse`, repeatedly step into the first child whose head
  (as reported by `head_finder.get_head`) equals the head of `parse`, and
  count how many of the nodes stepped into carry `label` once
  coindexation has been stripped.  `parse` itself is not counted.  The
  walk stops at the first node with no child sharing that head.

  parse: tree node exposing `label` and `subtrees`.
  head_map: head table handed to `head_finder.get_head`.
  label: label to count; defaults to the stripped label of `parse`.

  Returns the number of matching nodes.
  """
  head = head_finder.get_head(head_map, parse, True)
  if label is None:
    label = treebanks.remove_coindexation_from_label(parse.label)

  matches = 0
  node = parse
  while True:
    for child in node.subtrees:
      child_label = treebanks.remove_coindexation_from_label(child.label)
      if head == head_finder.get_head(head_map, child, True):
        if child_label == label:
          matches += 1
        node = child
        break
    else:
      # No child continues the head path, so the walk is over.
      return matches
Пример #3
0
def label_level(parse, head_map, label=None):
    """Return how many head-path descendants of `parse` share a label.

    The head path is followed downwards from `parse`: at each node the
    first child whose head equals the head of `parse` is taken.  Every
    node reached this way whose coindexation-stripped label equals the
    wanted label adds one to the result; `parse` itself is never counted.

    parse: tree node exposing `label` and `subtrees`.
    head_map: head table handed to `head_finder.get_head`.
    label: the wanted label; when None, the stripped label of `parse`.
    """
    head = head_finder.get_head(head_map, parse, True)
    wanted = label
    if wanted is None:
        wanted = treebanks.remove_coindexation_from_label(parse.label)

    count = 0
    node = parse
    descended = True
    while descended:
        descended = False
        for child in node.subtrees:
            stripped = treebanks.remove_coindexation_from_label(child.label)
            if head == head_finder.get_head(head_map, child, True):
                count += 1 if stripped == wanted else 0
                node = child
                descended = True
                # Only the first head-sharing child is followed.
                break
    return count
def mention_head(mention, text, parses, heads, default_last=True):
	"""Return the head of a mention, as found by `head_finder.get_head`.

	mention: (sentence, start, end) triple.
	text: not used by this function.
	parses: indexable by sentence; each entry provides `get_nodes`.
	heads: indexable by sentence; each entry is passed to `get_head`.
	default_last: when no node covers (start, end), fall back to the node
		for the mention's last word (end - 1, end) instead of giving up.

	Returns None when no node covers the span and `default_last` is false;
	otherwise whatever `head_finder.get_head` returns.
	"""
	sentence, start, end = mention
	tree = parses[sentence]
	node = tree.get_nodes('lowest', start, end)
	if node is None and not default_last:
		return None
	if node is None:
		node = tree.get_nodes('lowest', end - 1, end)
	return head_finder.get_head(heads[sentence], node)
Пример #5
0
def get_spines(parse, head_map, spines, symbol_map, traces):
    """Collect one spine per non-trace terminal below `parse`.

    For each such terminal the parent links are followed upwards for as
    long as the ancestor has the same head as the terminal.  Every
    ancestor on the way that itself has a parent contributes one string to
    the spine: the null elements among its children (each followed by
    "_"), then its coindexation-stripped label.

    parse: tree node to process; non-terminals are recursed into.
    head_map: head table handed to `head_finder.get_head`.
    spines: list that receives (parse.wordspan[1], parse.label, chain,
        parse.word) tuples (modified in place).
    symbol_map: filled in place with unique_id -> "<label>_<n>", where n
        counts earlier uses of that label within the same spine.
    traces: passed through to `compress_null_for_spine`.
    """
    if not parse.is_terminal():
        for child in parse.subtrees:
            get_spines(child, head_map, spines, symbol_map, traces)
        return
    if parse.is_trace():
        return

    # The spine is built on the terminal it will be assigned to.
    word_head = head_finder.get_head(head_map, parse, True)
    label_counts = defaultdict(int)
    null_counts = defaultdict(int)
    chain = []
    ancestor = parse.parent
    while ancestor is not None:
        if not (word_head == head_finder.get_head(head_map, ancestor, True)):
            break
        if ancestor.parent is not None:  # Skip the ROOT
            # Null elements that need to live at this level.
            # Most complex case:
            # ( (S
            #     (ADJP-TPC-1 (RB Not) (RB likely) )
            #     (, ,)
            #     (NP-SBJ (PRP I) )
            #     (VP (VBP think)
            #       (SBAR (-NONE- 0)
            #         (S
            #           (NP-SBJ (-NONE- *) )
            #           (ADJP-PRD (-NONE- *T*-1) ))))
            #     (. .) ))
            entry = ''
            for child in ancestor.subtrees:
                if child.wordspan[0] == child.wordspan[1]:
                    null_part = compress_null_for_spine(
                        child, null_counts, symbol_map, traces)
                    if null_part is not None:
                        entry += null_part + "_"
            # Then the non-terminal itself.
            stripped = treebanks.remove_coindexation_from_label(
                ancestor.label)
            seen = label_counts[stripped]
            label_counts[stripped] += 1
            symbol_map[ancestor.unique_id] = "{}_{}".format(stripped, seen)
            chain.append(entry + stripped)
        ancestor = ancestor.parent
    spines.append((parse.wordspan[1], parse.label, chain, parse.word))
Пример #6
0
def mention_head(mention, text, parses, heads, default_last=True):
	"""Look up the head of the mention (sentence, start, end).

	The lowest node covering (start, end) in `parses[sentence]` is handed
	to `head_finder.get_head` together with `heads[sentence]`.  If no such
	node exists, the node for the last word, (end - 1, end), is used
	instead when `default_last` is true; otherwise None is returned.

	`text` is accepted but not used.
	"""
	# A `special_cases(mention, text, parses, heads)` preprocessing step
	# (Canasai's addition) used to run here; it is disabled.
	sentence, start, end = mention
	node = parses[sentence].get_nodes('lowest', start, end)
	if node is not None:
		return head_finder.get_head(heads[sentence], node)
	if not default_last:
		return None
	fallback = parses[sentence].get_nodes('lowest', end - 1, end)
	return head_finder.get_head(heads[sentence], fallback)
def mention_text(text, mention, parses=None, heads=None, colour=None):
	"""Return the words of a mention as one space-separated string.

	text: indexable by sentence, then by word position.
	mention: (sentence, start, end) triple; words start..end-1 are used.
	parses, heads: optional; when both are given and the mention is longer
		than one word, the head word (position head[0][0] of the result of
		`head_finder.get_head`) is wrapped in underline escape codes.
	colour: optional escape-sequence prefix.  When given, the text before
		and after the underlined head is wrapped in `colour` ... reset.

	Works on both Python 2 and Python 3 (`range` replaces the Python 2-only
	`xrange`).
	"""
	sentence, start, end = mention
	head = None
	if parses is not None and heads is not None and end - start > 1:
		node = parses[sentence].get_nodes('lowest', start, end)
		if node is not None:
			head = head_finder.get_head(heads[sentence], node)

	words = []
	for i in range(start, end):
		word = text[sentence][i]
		if head is not None and head[0][0] == i:
			word = "\033[4m" + word + "\033[0m"
		words.append(word)
	ans = ' '.join(words)

	if colour is not None:
		# The reset after the underlined head would also end the colour, so
		# the colour is re-applied to the text that follows it.
		parts = ans.split("\033[0m")
		if len(parts) == 1 or len(parts[1]) == 0:
			ans = colour + parts[0] + "\033[0m"
		else:
			ans = colour + parts[0] + "\033[0m" + colour + parts[1] + "\033[0m"
	return ans
def _mention_status_colour(in_missing, in_extra):
	"""Map missing/extra membership to the colour code used in the output."""
	if in_missing and in_extra:
		return '5'
	if in_missing:
		return '4'
	if in_extra:
		return '1'
	return '15'


def _mention_bracket(flags, span, brackets, missing, extra):
	"""Return one coloured bracket character for the mention `span`.

	flags is the mention's [in gold, in system] pair and brackets holds the
	characters for (both, gold only, system only).  The colour reflects
	whether `span` lies inside a missing and/or an extra mention.
	"""
	if flags[0] and flags[1]:
		character = brackets[0]
	elif flags[0]:
		character = brackets[1]
	elif flags[1]:
		character = brackets[2]
	else:
		character = ''
	in_missing = any(m[1] <= span[1] and span[2] <= m[2] for m in missing)
	in_extra = any(m[1] <= span[1] and span[2] <= m[2] for m in extra)
	colour = _mention_status_colour(in_missing, in_extra)
	return "\033[38;5;{}m{}\033[0m".format(colour, character)


def print_mention_text(out, gold_mentions, auto_mention_set, gold_parses, gold_heads, text):
	'''Write the document text with both system and gold mentions marked:
	 - Gold mentions are marked with '[ ... ]'
	 - System mentions are marked with '( ... )'
	 - Mentions that occur in both are marked with '{ ... }'
	Colour is used to indicate missing and extra mentions.  Blue for missing,
	red for extra, and purple where they overlap.  Words that head a mention
	are additionally underlined.

	out: file-like object with `write`; None means standard output.
	gold_mentions, auto_mention_set: iterables of hashable
		(sentence, start, end) mentions.
	gold_parses, gold_heads: indexable by sentence; used to find heads.
	text: indexable by sentence, then by word position.

	Each sentence is written on its own line followed by a blank line.
	Works on both Python 2 and Python 3.'''
	if out is None:
		# `print >> None` wrote to standard output; keep that behaviour.
		import sys
		out = sys.stdout

	# (sentence, word) positions that head some gold or system mention.
	heads = set()
	for mentions in (gold_mentions, auto_mention_set):
		for mention in mentions:
			node = gold_parses[mention[0]].get_nodes('lowest', mention[1], mention[2])
			if node is None:
				continue
			head = head_finder.get_head(gold_heads[mention[0]], node)
			# get_head may yield no head (mention_text guards the same way).
			if head is not None:
				heads.add((mention[0], head[0][0]))

	# (sentence, word) -> mention -> [covered by gold, covered by system]
	words = defaultdict(lambda: defaultdict(lambda: [False, False]))
	for mention in gold_mentions:
		for i in range(mention[1], mention[2]):
			words[mention[0], i][mention][0] = True
	for mention in auto_mention_set:
		for i in range(mention[1], mention[2]):
			words[mention[0], i][mention][1] = True

	# Index-based loops: `text` only needs len() and integer indexing.
	for sentence in range(len(text)):
		output = []
		for word in range(len(text[sentence])):
			text_word = text[sentence][word]
			if (sentence, word) in words:
				mention_dict = words[(sentence, word)]
				missing = {m for m, flags in mention_dict.items() if flags[0] and not flags[1]}
				extra = {m for m, flags in mention_dict.items() if flags[1] and not flags[0]}

				# Mentions opening here, the one ending last first; mentions
				# closing here, the one starting last first.
				starts = sorted(
					((m[2], flags, m) for m, flags in mention_dict.items() if m[1] == word),
					reverse=True)
				ends = sorted(
					((m[1], flags, m) for m, flags in mention_dict.items() if m[2] - 1 == word),
					reverse=True)
				start = ''.join(
					_mention_bracket(flags, m, '{[(', missing, extra) for _, flags, m in starts)
				end = ''.join(
					_mention_bracket(flags, m, '}])', missing, extra) for _, flags, m in ends)

				colour = _mention_status_colour(len(missing) > 0, len(extra) > 0)
				if (sentence, word) in heads:
					colour += ';4'
				text_word = start + "\033[38;5;{}m{}\033[0m".format(colour, text_word) + end
			output.append(text_word)
		# Same bytes as the old `print >> out, line + '\n'`.
		out.write(' '.join(output) + '\n\n')
Пример #9
0
def shg_format(parse, depth=0, head_map=None, traces=None, edges=None):
  """Render `parse` as text: a commented header, then one line per spine.

  The output consists of
    - "# Parse" lines holding the tree as printed by `text_tree`,
    - a "# Sent" line listing the words with 1-based positions,
    - "# Trace" debugging lines for the entries of traces[0] .. traces[4],
    - a "# Graph type" line (proj / 1ec / other, and tree / graph),
    - one line per spine, "<position> <word> <POS> <chain>", followed by
      " | <parent> <label> <type> <trace>" for each edge that starts at
      that position, and
    - a final empty line.

  NOTE: `depth`, `head_map`, `traces` and `edges` are accepted but their
  values are ignored; the last three are recomputed from `parse` here.

  Returns the whole text as a single newline-joined string.
  """
  parse.calculate_spans()
  traces = treebanks.resolve_traces(parse)
  head_map = head_finder.pennconverter_find_heads(
      treebanks.remove_traces(parse, False))

  # Header: the tree itself and the numbered sentence.
  lines = ["# Parse  " + row
           for row in text_tree(parse, False, True).split("\n")]
  sent = "# Sent"
  for position, token in enumerate(text_words(parse).split(), 1):
    sent += "  {} {}".format(position, token)
  lines.append(sent)

  # Trace info (for debugging).  Kinds 0, 3 and 4 are keyed by signature,
  # kinds 1 and 2 by trace number with a list of nodes each.
  for kind in range(6):
    if kind in (0, 3, 4):
      for signature in traces[kind]:
        lines.append("# Trace {} {} {}".format(
            kind, signature, traces[kind][signature]))
    elif kind in (1, 2):
      for num in traces[kind]:
        for tparse in traces[kind][num]:
          lines.append("# Trace {} {} {} {}".format(
              kind, num, tparse, tparse.span))

  # The root edge (to position 0) comes first, then the edges and spines
  # gathered from the tree.
  spines = []
  root_label = treebanks.remove_coindexation_from_label(parse.label)
  root_head = head_finder.get_head(head_map, parse, True)
  root_level = label_level(parse, head_map)
  edges = [(root_head[0][1], '_', 0, root_label + "_" + str(root_level), "_")]
  get_edges(parse, edges, spines, head_map, traces)

  # Graph properties, computed on (smaller, larger) position pairs.
  pairs = []
  for edge in edges:
    a, b = int(edge[0]), int(edge[2])
    pairs.append((a, b) if a < b else (b, a))
  if check_proj(pairs):
    shape = " proj"
  elif check_1ec(pairs):
    shape = "  1ec"
  else:
    shape = "other"
  structure = ' tree' if check_tree(pairs) else ' graph'
  lines.append('# Graph type - ' + shape + structure)

  # Spines and edges.
  spines.sort()
  for word, POS, chain, token in spines:
    chain_text = '_'.join(chain) if chain else '_'
    line = "{} {} {} {}".format(word, token, POS, chain_text)
    plain, traced = [], []
    for edge in edges:
      if edge[0] == word:
        part = " | {} {} {} {}".format(edge[2], edge[1], edge[3], edge[4])
        if edge[4] == '_':
          plain.append(part)
        else:
          traced.append(part)
    # Ordinary edges go first, most recently found first; trace edges
    # follow in the order they were found.
    lines.append(line + ''.join(plain[::-1] + traced))
  lines.append('')
  return "\n".join(lines)
Пример #10
0
def get_edges(parse, edges, spines, head_map, traces):
  """Recursively collect spines and edges for the tree rooted at `parse`.

  Both result lists are modified in place.

  spines receives one (parse.wordspan[1], label, chain, word) tuple per
  terminal that is not a trace.  chain holds the coindexation-stripped
  labels of the ancestors sharing the terminal's head, innermost first; an
  ancestor listed in traces[3] or traces[4] gets its null element added as
  a "(...)" suffix.

  edges receives 5-tuples.  For an ordinary edge these are
    (child head position, "<parent label>_<level>", parent head position,
     "<child label>_<level>", "_"),
  where <level> comes from `label_level`.  Trace edges have the same shape
  but carry the null wrapper in the last field, and (P=#) links use "=".

  traces is indexed 0-4.
  NOTE(review): its layout is inferred from the accesses below
  (0: signature -> (num, node, ...), 1 and 2: num -> nodes,
  3: signature -> (parent, node, num), 4: signature -> (target, node));
  confirm against treebanks.resolve_traces.

  head_map is the head table handed to `head_finder.get_head`.
  """
  # Add spine
  chead = head_finder.get_head(head_map, parse, True)
  if parse.is_terminal() and not parse.is_trace():
    chain = []
    cur = parse.parent
    while cur is not None and chead == head_finder.get_head(head_map, cur, True):
      chain.append(treebanks.remove_coindexation_from_label(cur.label))
      signature = (cur.span, cur.label)
      target, null_cur = None, None
      if signature in traces[3]:
        target, null_cur, onum = traces[3][signature]
      if signature in traces[4]:
        target, null_cur = traces[4][signature]
      if target is not None:
        # Spell out the null element: its word, then the labels on the way
        # up to (but excluding) the target, written outermost first.
        null = [null_cur.word]
        null_cur = null_cur.parent
        while null_cur != target:
          null.append(treebanks.remove_coindexation_from_label(null_cur.label))
          null_cur = null_cur.parent
        null.reverse()
        chain[-1] += "({})".format("_".join(null))
      cur = cur.parent
    spines.append((parse.wordspan[1], parse.label, chain, parse.word))

  # Add edges
  if not parse.is_terminal():
    # Normal edges
    for subparse in parse.subtrees:
      shead = head_finder.get_head(head_map, subparse, True)
      if shead is not None and chead is not None:
        if shead[0] != chead[0]:
          plabel = treebanks.remove_coindexation_from_label(parse.label)
          clabel = treebanks.remove_coindexation_from_label(subparse.label)
          plevel = label_level(parse, head_map)
          clevel = label_level(subparse, head_map)
          edges.append((shead[0][1], plabel + '_' + str(plevel), chead[0][1], clabel + "_" + str(clevel), "_"))

    # Traces
    signature = (parse.span, parse.label)

    # A trace where both locations are NONE
    if signature in traces[3]:
      cparent, cparse, num = traces[3][signature]
      chead = head_finder.get_head(head_map, parse, True)
      clabel = treebanks.remove_coindexation_from_label(cparse.parent.label)
      clevel = label_level(cparse.parent, head_map)
      if num in traces[1]:
        for subparse in traces[1][num]:
          trace_type = clabel + '_' + str(clevel)
          # Climb to the nearest ancestor that has a head.
          parent = subparse
          while head_finder.get_head(head_map, parent, True) is None and parent.parent is not None:
            parent = parent.parent
          phead = head_finder.get_head(head_map, parent, True)
          plabel = treebanks.remove_coindexation_from_label(parent.label)
          ilabel = treebanks.remove_coindexation_from_label(subparse.parent.label)
          # The trace word without its trailing "-<num>" index.
          ilabel += "_"+ '-'.join(subparse.word.split('-')[:-1])
          level = label_level(parent, head_map)
          edges.append((chead[0][1], plabel + '_' + str(level), phead[0][1], trace_type, ilabel))

    # The realisation point of the trace (either with or without an observed word)
    if signature in traces[0]:
      num = traces[0][signature][0]
      if num in traces[1]:
        # If this is the middle of a chain of traces, follow the chain
        thead = chead
        tparse = parse
        in_chain = False
        working = True
        while thead is None and working:
          working = False
          word = tparse.subtrees[0].word
          if word is None:
            # Ugh, these are messy cases, just find something to follow
            for option in tparse.word_yield(None, True):
              if '-' in option:
                word = option
          # Fixed: `word` can still be None when nothing in the yield has an
          # index; testing `'-' in None` raised TypeError.  In that case
          # there is nothing to follow and the loop ends.
          if word is not None and '-' in word:
            onum = word.split('-')[-1]
            # A separate loop variable keeps `signature` (this node's own
            # signature) intact.
            for other_signature in traces[0]:
              if traces[0][other_signature][0] == onum:
                tparse = traces[0][other_signature][1]
                thead = head_finder.get_head(head_map, tparse, True)
                working = True
                in_chain = True

        for subparse in traces[1][num]:
          slabel = treebanks.remove_coindexation_from_label(tparse.label)
          slevel = label_level(tparse, head_map)
          trace_type = "{}_{}".format(slabel, slevel)
          parent = subparse.parent # Attachment point
          plabel = treebanks.remove_coindexation_from_label(parent.parent.label)
          plevel = label_level(parent.parent, head_map)
          null_wrap = treebanks.remove_coindexation_from_label(parent.label)
          null_wrap += "_"+ '-'.join(subparse.word.split('-')[:-1])
          while head_finder.get_head(head_map, parent, True) is None and parent.parent is not None:
            parent = parent.parent
          phead = head_finder.get_head(head_map, parent, True)
          if thead is not None:
            edges.append((thead[0][1], plabel + '_' + str(plevel), phead[0][1], trace_type, null_wrap))
          elif in_chain:
            # Not handled by the null - null case above
            # Note: tparse and thead are updated here and stay updated for
            # the remaining iterations of this loop.
            tparse = tparse.parent
            slevel = label_level(tparse, head_map, slabel)
            thead = head_finder.get_head(head_map, tparse, True)
            trace_type = "{}_{}".format(slabel, slevel)
            if thead is not None:
              edges.append((thead[0][1], plabel + '_' + str(plevel), phead[0][1], trace_type, null_wrap))

      # For each (P-# ... ) add a link from all (P=# ... ) that match
      if num in traces[2]:
        phead = head_finder.get_head(head_map, parse, True)
        for subparse in traces[2][num]:
          shead = head_finder.get_head(head_map, subparse, True)
          plabel = treebanks.remove_coindexation_from_label(parse.label)
          clabel = treebanks.remove_coindexation_from_label(subparse.label)
          plevel = label_level(parse, head_map)
          clevel = label_level(subparse, head_map)
          if phead is None:
            # Fall back to the parent when this node has no head.
            phead = head_finder.get_head(head_map, parse.parent, True)
            plabel = treebanks.remove_coindexation_from_label(parse.parent.label)
            plevel = label_level(parse.parent, head_map)
          if shead is None:
            # Call form prints the same text on Python 2 and Python 3.
            print("# Failed on = with (P=# (NONE))")
          else:
            edges.append((shead[0][1], plabel + '_' + str(plevel), phead[0][1], clabel + "_" + str(clevel), "="))

    for subparse in parse.subtrees:
      get_edges(subparse, edges, spines, head_map, traces)