示例#1
0
def is_corruption_shuffled(entry):
    """Detect a pair whose two sentences use exactly the same tokens.

    `entry` is a mapping with at least 'sentence_A' and 'sentence_B';
    'sentence_A_original' is read only when the token check succeeds.

    Returns:
        0 when the whitespace-split tokens of the two sentences differ as
        multisets. Otherwise 1 if sentence_A is farther (by
        `edit_distance` on `norm`-ed text) from the original than
        sentence_B is, else 2 (ties included).
    """
    sent_a = entry['sentence_A']
    sent_b = entry['sentence_B']

    # Different bags of tokens: not a reordering of one another.
    if sorted(sent_a.split()) != sorted(sent_b.split()):
        return 0

    # NOTE(review): presumably the sentence farther from the original is
    # the corrupted one -- confirm against the callers.
    reference = norm(entry['sentence_A_original'])
    dist_a = edit_distance(reference, norm(sent_a))
    dist_b = edit_distance(reference, norm(sent_b))
    return 1 if dist_a > dist_b else 2
示例#2
0
def are_sem_opposites(entry):
    """Flag a one-word-different pair that is not labelled as entailment.

    `entry` is a mapping with 'sentence_A', 'sentence_B' and
    'entailment_label'; 'sentence_A_original' is read only when every
    check below passes.

    Returns:
        0 when the sentences are not a one-word difference (per
        `one_word_diff`), when the label is 'ENTAILMENT', or when
        `is_corruption_det_replace` reports a non-zero result. Otherwise
        1 if sentence_A is farther (by `edit_distance` on `norm`-ed text)
        from the original than sentence_B is, else 2 (ties included).
    """
    sent_a = entry['sentence_A']
    sent_b = entry['sentence_B']

    # Are near synonyms but not qualifier replacement
    if not one_word_diff(sent_a, sent_b):
        return 0

    # S1S3 same set pairs only have a 0.9% chance of being labeled entailment
    if entry['entailment_label'] == 'ENTAILMENT':
        return 0

    # Pairs that differ only by a determiner are handled elsewhere.
    if is_corruption_det_replace(entry):
        return 0

    reference = norm(entry['sentence_A_original'])
    dist_a = edit_distance(reference, norm(sent_a))
    dist_b = edit_distance(reference, norm(sent_b))
    return 1 if dist_a > dist_b else 2
示例#3
0
def is_corruption_det_replace(entry):
    """Detect a pair that differs only by the articles 'a' / 'the' (or case).

    `entry` is a mapping with at least 'sentence_A' and 'sentence_B';
    'sentence_A_original' is read only when the comparison succeeds.

    Returns:
        0 when the lower-cased token lists, with 'a' and 'the' removed,
        differ. Otherwise 1 if sentence_A is farther (by `edit_distance`
        on `norm`-ed text) from the original than sentence_B is, else 2
        (ties included).
    """
    def content_tokens(sentence):
        # Lower-case, split on whitespace, and drop the two articles.
        return [tok for tok in sentence.lower().split()
                if tok not in ('a', 'the')]

    sent_a = entry['sentence_A']
    sent_b = entry['sentence_B']

    if content_tokens(sent_a) != content_tokens(sent_b):
        return 0

    reference = norm(entry['sentence_A_original'])
    dist_a = edit_distance(reference, norm(sent_a))
    dist_b = edit_distance(reference, norm(sent_b))
    return 1 if dist_a > dist_b else 2
def closest_match(sent, dataset):
  """
  Using edit distance, find the closest key match to the given sentence.

  The search has two stages: every key is scored with `edit_distance`
  over whitespace-split tokens, then the keys sharing the lowest token
  score are re-scored with `edit_distance` over the raw strings. On a
  further tie the key encountered first wins.

  Args:
    sent: sentence string to look up.
    dataset: 'FLICKR' (searches `flickr_reverse`) or 'SEMEVAL'
      (searches `msr_reverse`).

  Returns:
    `sent` itself when it is already a key, otherwise the closest key.

  Raises:
    ValueError: if `dataset` is not one of the two known names, or if
      the selected mapping has no keys to match against.
  """
  if dataset == 'FLICKR':
    data_reverse = flickr_reverse
  elif dataset == 'SEMEVAL':
    data_reverse = msr_reverse
  else:
    raise ValueError('unknown dataset %s' % dataset)

  # take what you can get
  if sent in data_reverse:
    return sent

  keys = list(data_reverse)
  if not keys:
    # An empty mapping used to fail inside min() with an unrelated message;
    # report the actual problem instead (still a ValueError).
    raise ValueError('no match found for %s' % sent)

  # Stage 1: token-based edit distance, keeping only the best-scoring keys.
  sent_tokens = sent.split()
  token_dists = [edit_distance(sent_tokens, key.split()) for key in keys]
  lowest = min(token_dists)
  candidates = [key for key, dist in zip(keys, token_dists) if dist == lowest]

  # Stage 2: break ties with edit distance over the raw strings.
  min_dist = float('inf')
  match = None
  for key in candidates:
    dist = edit_distance(sent, key)

    # guaranteed okay
    if dist == 0:
      return key

    if dist < min_dist:
      min_dist = dist
      match = key

  # Internal invariant: candidates is non-empty, so a match must exist.
  assert match is not None, 'no match found for %s' % sent

  return match
示例#5
0
def closest_match(sent, dataset):
    """
    Using edit distance, find the closest key match to the given sentence.

    The search has two stages: every key is scored with `edit_distance`
    over whitespace-split tokens, then the keys sharing the lowest token
    score are re-scored with `edit_distance` over the raw strings. On a
    further tie the key encountered first wins.

    Args:
        sent: sentence string to look up.
        dataset: 'FLICKR' (searches `flickr_reverse`) or 'SEMEVAL'
            (searches `msr_reverse`).

    Returns:
        `sent` itself when it is already a key, otherwise the closest key.

    Raises:
        ValueError: if `dataset` is not one of the two known names, or if
            the selected mapping has no keys to match against.
    """
    if dataset == 'FLICKR':
        data_reverse = flickr_reverse
    elif dataset == 'SEMEVAL':
        data_reverse = msr_reverse
    else:
        raise ValueError('unknown dataset %s' % dataset)

    # take what you can get
    if sent in data_reverse:
        return sent

    keys = list(data_reverse)
    if not keys:
        # An empty mapping used to fail inside min() with an unrelated
        # message; report the actual problem instead (still a ValueError).
        raise ValueError('no match found for %s' % sent)

    # Stage 1: token-based edit distance, keeping only the best-scoring keys.
    sent_tokens = sent.split()
    token_dists = [edit_distance(sent_tokens, key.split()) for key in keys]
    lowest = min(token_dists)
    candidates = [
        key for key, dist in zip(keys, token_dists) if dist == lowest
    ]

    # Stage 2: break ties with edit distance over the raw strings.
    min_dist = float('inf')
    match = None
    for key in candidates:
        dist = edit_distance(sent, key)

        # guaranteed okay
        if dist == 0:
            return key

        if dist < min_dist:
            min_dist = dist
            match = key

    # Internal invariant: candidates is non-empty, so a match must exist.
    assert match is not None, 'no match found for %s' % sent

    return match