def is_corruption_shuffled(entry):
    """Detect whether the sentence pair in *entry* is a word-shuffle pair.

    Returns 0 when sentence_A and sentence_B are not word-level
    permutations of each other. Otherwise returns 1 when the normalized
    sentence_A is strictly farther (by edit distance) from the normalized
    original sentence than sentence_B is, else 2.
    """
    first = entry['sentence_A']
    second = entry['sentence_B']
    # Permutation check: same multiset of whitespace tokens.
    if sorted(first.split()) != sorted(second.split()):
        return 0
    origin = norm(entry['sentence_A_original'])
    dist_first = edit_distance(origin, norm(first))
    dist_second = edit_distance(origin, norm(second))
    return 1 if dist_first > dist_second else 2
def are_sem_opposites(entry):
    """Detect whether the sentence pair in *entry* differs by one word swap.

    Returns 0 unless the sentences differ in exactly one word, the pair
    is not labeled ENTAILMENT, and the difference is not a mere
    determiner replacement. Otherwise returns 1 when the normalized
    sentence_A is strictly farther (by edit distance) from the normalized
    original than sentence_B is, else 2.
    """
    sent_a = entry['sentence_A']
    sent_b = entry['sentence_B']
    # Are near synonyms but not qualifier replacement
    if not one_word_diff(sent_a, sent_b):
        return 0
    # S1S3 same set pairs only have a 0.9% chance of being labeled entailment
    if entry['entailment_label'] == 'ENTAILMENT' or is_corruption_det_replace(entry):
        return 0
    origin = norm(entry['sentence_A_original'])
    dist_a = edit_distance(origin, norm(sent_a))
    dist_b = edit_distance(origin, norm(sent_b))
    return 1 if dist_a > dist_b else 2
def is_corruption_det_replace(entry):
    """Detect whether the sentence pair in *entry* differs only in articles.

    Returns 0 unless sentence_A and sentence_B are identical after
    lowercasing and removing the articles 'a'/'the'. Otherwise returns 1
    when the normalized sentence_A is strictly farther (by edit distance)
    from the normalized original than sentence_B is, else 2.
    """
    def strip_articles(sentence):
        # Lowercase, tokenize on whitespace, and drop bare articles.
        return [tok for tok in sentence.lower().split() if tok not in ('a', 'the')]

    sent_a = entry['sentence_A']
    sent_b = entry['sentence_B']
    if strip_articles(sent_a) != strip_articles(sent_b):
        return 0
    origin = norm(entry['sentence_A_original'])
    dist_a = edit_distance(origin, norm(sent_a))
    dist_b = edit_distance(origin, norm(sent_b))
    return 1 if dist_a > dist_b else 2
def closest_match(sent, dataset):
    """Using edit distance, find the closest key match to the given sentence.

    Candidates are first binned by token-level edit distance (cheap
    coarse filter); ties within the best bin are then broken by
    character-level edit distance, keeping the first minimum seen.

    Args:
        sent: sentence string to match.
        dataset: 'FLICKR' (searches flickr_reverse) or 'SEMEVAL'
            (searches msr_reverse).

    Returns:
        The key of the chosen reverse index closest to ``sent``;
        ``sent`` itself when it is already a key.

    Raises:
        ValueError: if ``dataset`` names an unknown dataset.
    """
    if dataset == 'FLICKR':
        data_reverse = flickr_reverse
    elif dataset == 'SEMEVAL':
        data_reverse = msr_reverse
    else:
        # was: bare Exception — ValueError is the idiomatic (and
        # backward-compatible) type for a bad argument value.
        raise ValueError('unknown dataset %s' % dataset)
    # take what you can get: an exact key match short-circuits the search
    # (this also makes a character-distance of 0 impossible below).
    if sent in data_reverse:
        return sent
    bins = defaultdict(list)
    tokens = sent.split()  # hoisted out of the loop
    for s in data_reverse:
        # Token-based edit distance
        bins[edit_distance(tokens, s.split())].append(s)
    candidates = bins[min(bins)]
    assert candidates, 'no match found for %s' % sent
    # First-seen minimum, matching the original strict `<` tie-breaking.
    return min(candidates, key=lambda s: edit_distance(sent, s))