Exemplo n.º 1
0
 def test_tokens_between_spans(self):
   # Exercise dd.tokens_between_spans on self.words with the two spans given
   # in both orders and with the same span passed twice.
   first = dd.Span(0, 2)
   second = dd.Span(3, 5)
   # Each case: (left span, right span, expected full slice of the result).
   # NOTE(review): the leading boolean presumably flags that the spans were
   # passed in reverse order -- confirm against the dd implementation.
   cases = (
       (first, second, (False, ["Jake"])),
       (second, first, (True, ["Jake"])),
       (first, first, (False, [])),
   )
   for left, right, expected in cases:
     outcome = dd.tokens_between_spans(self.words, left, right)
     self.assertEqual(outcome[:], expected)
Exemplo n.º 2
0
 def test_tokens_between_spans(self):
     # Shorthand: run tokens_between_spans on the fixture words and return
     # the full slice of its result so it can be compared against a tuple.
     def between(left, right):
         return dd.tokens_between_spans(self.words, left, right)[:]

     head = dd.Span(0, 2)
     tail = dd.Span(3, 5)

     # Spans in the given order: flag is False, one token in between.
     self.assertEqual(between(head, tail), (False, ["Jake"]))
     # Spans swapped: same token, flag is True.
     self.assertEqual(between(tail, head), (True, ["Jake"]))
     # Same span twice: flag is False and no tokens in between.
     self.assertEqual(between(head, head), (False, []))
Exemplo n.º 3
0
 def test_tokens_between_spans(self):
   # Exercise dd.tokens_between_spans on self.words with the spans in both
   # orders and with one span compared against itself.
   span_a = dd.Span(0, 2)
   span_b = dd.Span(3, 5)
   # Each entry: ((left span, right span), expected [flag, tokens]).
   expectations = [
       ((span_a, span_b), [False, ["Jake"]]),
       ((span_b, span_a), [True, ["Jake"]]),
       ((span_a, span_a), [False, []]),
   ]
   for (left, right), wanted in expectations:
     found = dd.tokens_between_spans(self.words, left, right)
     # The second field is copied into a list before comparing, so its
     # concrete sequence type does not affect the equality check.
     self.assertEqual([found[0], list(found[1])], wanted)
Exemplo n.º 4
0
 def test_tokens_between_spans(self):
     # Reduce a result to [first field, second field as a list] so the
     # comparison does not depend on the concrete type of the second field.
     def as_pair(result):
         return [result[0], list(result[1])]

     lead = dd.Span(0, 2)
     trail = dd.Span(3, 5)

     # Spans in the given order.
     forward = dd.tokens_between_spans(self.words, lead, trail)
     self.assertEqual(as_pair(forward), [False, ["Jake"]])

     # Spans swapped: same token, first field flips to True.
     backward = dd.tokens_between_spans(self.words, trail, lead)
     self.assertEqual(as_pair(backward), [True, ["Jake"]])

     # Same span on both sides: nothing in between.
     degenerate = dd.tokens_between_spans(self.words, lead, lead)
     self.assertEqual(as_pair(degenerate), [False, []])
  
  # Get all fields from a row.
  # `parts` and ARR_DELIM are defined outside this fragment; presumably
  # `parts` is one input line already split into columns -- TODO confirm.
  # Column 0: the words joined by ARR_DELIM; column 1: relation id;
  # column 2: text of the second phrase; columns 3+: four integers giving
  # the start and length of each phrase.
  words = parts[0].split(ARR_DELIM)
  relation_id = parts[1]
  p2_text = parts[2]
  p1_start, p1_length, p2_start, p2_length = [int(x) for x in parts[3:]]

  # Build one ddlib.Span per phrase from its start word index and length.
  span1 = ddlib.Span(begin_word_id=p1_start, length=p1_length)
  span2 = ddlib.Span(begin_word_id=p2_start, length=p2_length)

  # Features for this pair are collected here (a set, so duplicates collapse).
  features = set()
  
  # Feature 1: Bag of words between the two phrases, capped by
  # nbWordsBetweenPeopleCompanyConsidered (defined outside this fragment).
  # NOTE(review): count starts at 1 and the test is a strict `<`, so at most
  # (nbWordsBetweenPeopleCompanyConsidered - 1) words are added -- confirm
  # this is the intended cap.
  words_between = ddlib.tokens_between_spans(words, span1, span2)
  count = 1
  for word in words_between.elements:
    if count < nbWordsBetweenPeopleCompanyConsidered:
      features.add("word_between=" + word)
    count +=1
    

  # Feature 2: Number of words between the two phrases.
  # Uses the full count; the cap applied in Feature 1 does not apply here.
  features.add("num_words_between=%s" % len(words_between.elements))

  # Feature 3: Is the last name of the founder included in the name of the company?
  # Takes the last element of the materialized first span and tests it with
  # `in` against p2_text (a substring test if p2_text is a string).
  last_word_left = ddlib.materialize_span(words, span1)[-1]
  if (last_word_left in p2_text):
    features.add("potential_last_name_match")
Exemplo n.º 6
0
import sys, json
import ddlib     # DeepDive python utility

# For each input tuple: every line read from stdin is parsed as one JSON
# object describing a sentence and a pair of phrases within it.
for row in sys.stdin:
  obj = json.loads(row)
  words = obj["words"]
  # Build one ddlib.Span per phrase from its start word index and length.
  span1 = ddlib.Span(begin_word_id=obj['p1_start'], length=obj['p1_length'])
  span2 = ddlib.Span(begin_word_id=obj['p2_start'], length=obj['p2_length'])

  # Features for this pair are collected here (a set, so duplicates collapse).
  features = set()

  # Feature 1: Bag of words between the two phrases.
  # One "word_between=<word>" feature per element returned by ddlib.
  words_between = ddlib.tokens_between_spans(words, span1, span2)
  for word in words_between.elements:
    features.add("word_between=" + word)

  # Feature 2: Number of words between the two phrases
  features.add("num_words_between=%s" % len(words_between.elements))

  # Feature 3: Does the last word (last name) match?
  # Compares the last element of each materialized span for exact equality.
  last_word_left = ddlib.materialize_span(words, span1)[-1]
  last_word_right = ddlib.materialize_span(words, span2)[-1]
  if (last_word_left == last_word_right):
    features.add("potential_last_name_match")

  ########################
  # Improved Feature Set #
  ########################
Exemplo n.º 7
0
# Each line read from stdin is parsed as one JSON object describing a
# sentence (its words and lemmas) and a pair of phrases within it.
for row in sys.stdin:

  # Unpack input into tuples.
  # Reads the "words" and "lemma" entries, then builds one ddlib.Span per
  # phrase from its start position and length.
  obj = json.loads(row)
  words, lemmas = obj["words"], obj["lemma"]
  span1 = ddlib.Span(begin_word_id=obj['p1.start_position'], length=obj['p1.length'])
  span2 = ddlib.Span(begin_word_id=obj['p2.start_position'], length=obj['p2.length'])

  # Features for this pair are collected here (a set, so duplicates collapse).
  features = set()

  # Feature 1: Find out if a lemma of marry occurs.
  # A better feature would ensure this is on the dependency path between the two.
  # Adds "important_word=<lemma>" for each lemma between the spans that is
  # one of the entries in married_words.
  lemma_between = ddlib.tokens_between_spans(lemmas, span1, span2)
  married_words = ('marry', 'widow')
  for lemma in lemma_between.elements:
    if lemma in married_words:
      features.add("important_word=%s" % lemma)

  # Feature 2: The number of words between the two phrases.
  # Intuition: if they are close by, the link may be stronger.
  # The conditional applies to the whole formatted string: fewer than 5
  # words yields "num_words_between=<count>", otherwise the single bucket
  # "many_words_between".
  words_between = ddlib.tokens_between_spans(words, span1, span2)
  l = len(words_between.elements)
  features.add("num_words_between=%s" % l if l<5 else "many_words_between")

  # Feature 3: Check if the last name matches heuristically.
  # Takes the last element of the materialized first span; the rest of this
  # feature lies beyond the visible fragment.
  last_word_left = ddlib.materialize_span(words, span1)[-1]
Exemplo n.º 8
0
# Each line read from stdin is parsed as one JSON object describing a
# sentence (its words and lemmas) and a pair of phrases within it.
for row in sys.stdin:

  # Unpack input into tuples.
  # Reads the "words" and "lemma" entries, then builds one ddlib.Span per
  # phrase from its start position and length.
  obj = json.loads(row)
  words, lemmas = obj["words"], obj["lemma"]
  span1 = ddlib.Span(begin_word_id=obj['p1.start_position'], length=obj['p1.length'])
  span2 = ddlib.Span(begin_word_id=obj['p2.start_position'], length=obj['p2.length'])

  # Features for this pair are collected here (a set, so duplicates collapse).
  features = set()

  # Feature 1: Find out if a lemma of marry occurs.
  # A better feature would ensure this is on the dependency path between the two.
  # Adds "important_word=<lemma>" for each lemma between the spans that is
  # one of the entries in married_words.
  lemma_between = ddlib.tokens_between_spans(lemmas, span1, span2)
  married_words = ('marry', 'widow')
  for lemma in lemma_between.elements:
    if lemma in married_words:
      features.add("important_word=%s" % lemma) 

  # Feature 2: The number of words between the two phrases.
  # Intuition: if they are close by, the link may be stronger.
  # The conditional applies to the whole formatted string: fewer than 5
  # words yields "num_words_between=<count>", otherwise the single bucket
  # "many_words_between".
  words_between = ddlib.tokens_between_spans(words, span1, span2)
  l = len(words_between.elements)
  features.add("num_words_between=%s" % l if l<5 else "many_words_between")

  # Feature 3: Check if the last name matches heuristically.
  # Takes the last element of the materialized first span; the rest of this
  # feature lies beyond the visible fragment.
  last_word_left = ddlib.materialize_span(words, span1)[-1]