예제 #1
0
def _get_min_dep_path(sentence, span1, span2):
    """Return the shortest dependency path between two Span objects

    Args:
        sentence: a list of Word objects
        span1: the first Span
        span2: the second Span
    Returns: a list of DepEdge objects
    """
    min_path = None
    min_path_length = 200  # ridiculously high number?
    for i in range(span1.begin_word_id, span1.begin_word_id + span1.length):
        for j in range(span2.begin_word_id, span2.begin_word_id + span2.length):
            p = dep_path_between_words(sentence, i, j)
            if len(p) < min_path_length:
                min_path = p
    return min_path
예제 #2
0
def _get_min_dep_path(sentence, span1, span2):
    """Return the shortest dependency path between two Span objects

    Args:
        sentence: a list of Word objects
        span1: the first Span
        span2: the second Span
    Returns: a list of DepEdge objects
    """
    min_path = None
    min_path_length = 200  # ridiculously high number?
    for i in range(span1.begin_word_id, span1.begin_word_id + span1.length):
        for j in range(span2.begin_word_id,
                       span2.begin_word_id + span2.length):
            p = dep_path_between_words(sentence, i, j)
            if len(p) < min_path_length:
                min_path = p
    return min_path
예제 #3
0
def get_recurrent_features(row):
  line = row.strip().split('\t')
  dep_graph_str = string.replace(line[1], '\\t', '\t')
  dep_graph_str = string.replace(dep_graph_str, '\\n', '\n')
  #dep_graph_str = string.replace(dep_graph_str, '\\\'', '\'')
  lemma_str = line[3]
  words_str = line[2]
  words_str = string.replace(words_str, "\",\"", "~^~")
  # skip sentences with empty dependency graphs
  #if dep_graph_str == "":
  #  return ""
  types = [line[9], line[13]]
  starts = [line[14], line[16]]
  ends = [line[15], line[17]]
  lemma = lemma_str.split(ARR_DELIM)
  dep_graph = dep_graph_str.split("\n")
  #PATTERN = re.compile(r'''((?:"[^"]*")+)''')
  #words = PATTERN.split(words_str[1:-1])[1::2]
  words = words_str.split(",")
  for i,word in enumerate(words):
	if word == "~^~":
		words[i] = ','
  mention_ids = [line[7], line[11]]
  mention_words = [[words[int(starts[0]): int(ends[0])]],[words[int(starts[1]):int(ends[1])]]]
  # create a list of mentions
  mentions = zip(mention_ids, mention_words, types, starts, ends)
  mentions = map(lambda x: {"mention_id" : x[0], "word" : x[1], "type" : x[2], "start" : int(x[3]), "end" : int(x[4])}, mentions)

  relation = None
  if len(line) == 21:
	relation = line[18]
  # get a list of Word objects
  obj = {}
  obj['lemma'] = lemma
  obj['words'] = words
  obj['dep_graph'] = dep_graph
  word_obj_list = ddlib.unpack_words(obj, lemma='lemma', words='words', dep_graph='dep_graph', dep_graph_parser=dep_format_parser)
  # at this point we have a list of the mentions in this sentence

  # go through all pairs of mentions
  for m1 in mentions:
    start1 = m1["start"]
    end1 = m1["end"]

    #if m1["type"] not in ["PERSON", "ORGANIZATION"]:
    #  continue

    for m2 in mentions:
      #if m1["mention_id"] == m2["mention_id"]:
        #continue

      start2 = m2["start"]
      end2 = m2["end"]

      edges = ddlib.dep_path_between_words(word_obj_list, end1 - 1, end2 - 1)
      #print edges
      if len(edges) > 0:
        num_roots = 0 # the number of root nodes
        num_left = 0 # the number of edges to the left of the root
        num_right = 0 # the number of edges to the right of the root
        left_path = "" # the dependency path to the left of the root
        right_path = "" # the dependency path to the right of the root

        # find the index of the switch from up to down
        switch_direction_index = -1
        for i in range(len(edges)):
          if not edges[i].is_bottom_up:
            switch_direction_index = i
            break
        
        # iterate through the edge list
        for i in range(len(edges)):
          curr_edge = edges[i]

          # count the number of roots; if there are more than 1 root then our dependency
          # path is disconnected
          if curr_edge.label == 'ROOT':
            num_roots += 1

          # going from the left to the root
          if curr_edge.is_bottom_up:
            num_left += 1

            # if this is the edge pointing to the root (word2 is the root)
            if i == switch_direction_index - 1:
              left_path = left_path + ("--" + curr_edge.label + "->")
              root = curr_edge.word2.lemma.lower()
	      #root = curr_edge.word2.word
            # this edge does not point to the root
            else:
              # if we are at the last edge, don't include the word (part of the mention)
              if i == len(edges) - 1:
                left_path = left_path + ("--" + curr_edge.label + "->")
              else:
                left_path = left_path + ("--" + curr_edge.label + "->" + curr_edge.word2.lemma.lower())
	        #left_path = left_path + ("--" + curr_edge.label + "->" + curr_edge.word2.word)

          # going from the root to the right
          else:
            num_right += 1

            # the first edge to the right of the root
            if i == switch_direction_index:
              right_path = right_path + "<-" + curr_edge.label + "--"
	      #right_path = right_path + "<-" + curr_edge.label + "--"

            # this edge does not point from the root
            else:
              # if we are at the first edge, don't include the word (part of the mention)
              if i == 0:
                right_path = right_path + ("<-" + curr_edge.label + "--")
              else:
                # word1 is the parent for right to left
                right_path = right_path + (curr_edge.word1.lemma.lower() + "<-" + curr_edge.label + "--")
		#right_path = right_path + (curr_edge.word1.word + "<-" + curr_edge.label + "--")

        # if the root is at the end or at the beginning (direction was all up or all down)
        if num_right == 0:
          root = "|SAMEPATH"
        elif num_left == 0:
          root = "SAMEPATH|"

        # if the edges have a disconnect
        elif num_roots > 1:
          root = "|NONEROOT|"

        # this is a normal tree with a connected root in the middle
        else:
          root = "|" + root + "|"

        path = left_path + root + right_path

        feat = [m1["word"], m2["word"], m1["type"], m2["type"], path]
        # make sure each of the strings we will output is encoded as utf-8
	if relation is not None:
		feat.append(relation[1:-1])
	return feat
  return [m1["word"], m2["word"], m1["type"], m2["type"], ""]