def get_labels(self):
    """Return one label per aligned slot: 'O' for correct, 'X' for incorrect.

    When the sausage is already correct, every slot is labelled 'O' without
    running an alignment.  Otherwise the reference is globally aligned
    against the best hypothesis and each aligned pair is labelled.
    """
    if self.correct():
        # Nothing to compare: all slots are right.
        return ['O'] * self.num_slots()

    # Align the reference (first) against the best hypothesis (second).
    ref_seq = Sequence(self.ref())
    hyp_seq = Sequence(self.best_hyp())
    vocab = Vocabulary()
    ref_codes = vocab.encodeSequence(ref_seq)
    hyp_codes = vocab.encodeSequence(hyp_seq)
    aligner = StrictGlobalSequenceAligner(SimpleScoring(2, -1), -2)
    _, encodeds = aligner.align(ref_codes, hyp_codes, backtrace=True)
    alignment = vocab.decodeSequenceAlignment(encodeds[0])

    def label_for(ref_tok, hyp_tok):
        # A gap on the reference side paired with an explicit '*DELETE*'
        # hypothesis token counts as a match.
        matched = (ref_tok == hyp_tok
                   or (ref_tok == '-' and hyp_tok == '*DELETE*'))
        return 'O' if matched else 'X'

    return [label_for(ref_tok, hyp_tok)
            for ref_tok, hyp_tok in zip(alignment.first, alignment.second)]
def align_ref_long(hyp, ref):
    """Align a ref to a sausage-aligned hyp using the alignment library.

    Returns a list built from the first optimal global alignment of `hyp`
    (passed first to the aligner) and `ref` (passed second), padded with
    `delete_token` up to ``len(hyp)``.  `delete_token` is not defined in this
    function and must be provided by the enclosing scope.
    """
    ref_seq = Sequence(ref)
    hyp_seq = Sequence(hyp)
    vocab = Vocabulary()
    ref_codes = vocab.encodeSequence(ref_seq)
    hyp_codes = vocab.encodeSequence(hyp_seq)

    # Global alignment: match +2, mismatch -1, gap -2.
    aligner = StrictGlobalSequenceAligner(SimpleScoring(2, -1), -2)
    _, encodeds = aligner.align(hyp_codes, ref_codes, backtrace=True)

    # Only the first optimal alignment is used.
    alignment = vocab.decodeSequenceAlignment(encodeds[0])

    # Keep element 0 of every aligned pair whose element 0 is not a gap.
    kept = [pair[0] for pair in alignment if pair[0] != '-']

    # NOTE(review): gaps were already filtered out of `kept` above, so the
    # '-' -> delete_token substitution below appears unreachable.  Confirm
    # whether element 1 of each pair was meant to be collected instead.
    ref_align = [delete_token if tok == '-' else tok for tok in kept]

    # Pad with delete tokens so the result is at least as long as `hyp`.
    ref_align.extend(delete_token for _ in range(len(hyp) - len(kept)))
    return ref_align
def get_labels(self):
    """Label every slot of the sausage: 'O' means correct, 'X' incorrect.

    A fully correct sausage short-circuits to all 'O'.  Otherwise the
    reference and the best hypothesis are globally aligned and each aligned
    position is labelled individually.
    """
    if self.correct():
        # Everything is correct, no alignment needed.
        return ['O'] * self.num_slots()

    # Reference is the first sequence, best hypothesis the second.
    reference = Sequence(self.ref())
    hypothesis = Sequence(self.best_hyp())
    vocab = Vocabulary()
    encoded_ref = vocab.encodeSequence(reference)
    encoded_hyp = vocab.encodeSequence(hypothesis)
    scoring = SimpleScoring(2, -1)
    aligner = StrictGlobalSequenceAligner(scoring, -2)
    _, encodeds = aligner.align(encoded_ref, encoded_hyp, backtrace=True)
    alignment = vocab.decodeSequenceAlignment(encodeds[0])

    labels = []
    for ref_tok, hyp_tok in zip(alignment.first, alignment.second):
        # A reference gap opposite an explicit '*DELETE*' token is correct.
        is_deleted_gap = ref_tok == '-' and hyp_tok == '*DELETE*'
        labels.append('O' if (ref_tok == hyp_tok or is_deleted_gap) else 'X')
    return labels
def align_sequences(seq_a, seq_b):
    """Globally align two token sequences and return the first alignment.

    The value returned is the first *encoded* alignment produced by the
    aligner (it is not decoded back through the vocabulary here).
    """
    def escape_gaps(tokens):
        # The alignment library uses '-' as its gap marker, so a literal
        # '-' token has to be escaped before encoding.
        return ['\\-' if tok == '-' else tok for tok in tokens]

    escaped_a = escape_gaps(seq_a)
    escaped_b = escape_gaps(seq_b)

    vocab = Vocabulary()
    encoded_a = vocab.encodeSequence(Sequence(escaped_a))
    encoded_b = vocab.encodeSequence(Sequence(escaped_b))

    aligner = StrictGlobalSequenceAligner(
        SimpleScoring(matchScore=3, mismatchScore=-1),
        gapScore=-2,
    )
    _, encodeds = aligner.align(encoded_a, encoded_b, backtrace=True)
    return encodeds[0]
def __init__(self):
    """Create the scoring scheme and the strict global aligner.

    Sets two attributes:
      * ``scoring`` -- ``SimpleScoring(2, -1)``
      * ``aligner`` -- ``StrictGlobalSequenceAligner(self.scoring, -2)``
    """
    # The constructor takes no arguments, so there is nothing to capture
    # from locals(); attributes are assigned explicitly.
    self.scoring = SimpleScoring(2, -1)
    self.aligner = StrictGlobalSequenceAligner(self.scoring, -2)
class EditFinder(object):
    """Find and apply character-level edits between a word and an error form.

    The two strings are globally aligned; in the aligned sequences ``'-'``
    is the gap marker.  An edit is a ``(from_gram, to_gram)`` pair of
    strings, and ``'^'`` is used inside edits as a start-of-word marker.
    """

    def __init__(self):
        """Create the scoring scheme and the global aligner used by `align`."""
        self.scoring = SimpleScoring(2, -1)
        self.aligner = StrictGlobalSequenceAligner(self.scoring, -2)

    def _gap_extent(self, aligned, start):
        """Count the consecutive ``'-'`` entries of `aligned` from `start`."""
        extent = 0
        for c in aligned[start:]:
            if c != '-':
                break
            extent += 1
        return extent

    def align(self, word, error):
        """Align `word` with `error`.

        Returns:
            The ``first`` and ``second`` members of the highest-scoring
            decoded alignment.  On ties the earliest alignment wins.
        """
        vocab = Vocabulary()
        word_codes = vocab.encodeSequence(Sequence(word))
        error_codes = vocab.encodeSequence(Sequence(error))
        _, encodings = self.aligner.align(
            word_codes, error_codes, backtrace=True)

        # Choose the highest-score alignment.
        best_score = -sys.maxsize
        best_alignment = None
        for encoding in encodings:
            candidate = vocab.decodeSequenceAlignment(encoding)
            if candidate.score > best_score:
                best_alignment = candidate
                best_score = candidate.score
        return best_alignment.first, best_alignment.second

    def edit_is_rotation(self, first, second, start, end):
        """True if ``first[start..end]`` equals ``second[start..end]``
        reversed and neither span contains a gap."""
        first_span = first[start:end + 1]
        second_span = list(reversed(second[start:end + 1]))
        return (first_span == second_span
                and '-' not in first_span
                and '-' not in second_span)

    def build_rotation(self, first, second, start, end):
        """Build the edit for a rotation over ``[start, end]``.

        Both grams are prefixed with one character of left context: ``'^'``
        at the start of the word, otherwise ``first[start - 1]``.
        """
        first_span = first[start:end + 1]
        second_span = second[start:end + 1]
        context = '^' if start == 0 else first[start - 1]
        first_span.insert(0, context)
        second_span.insert(0, context)
        return (''.join(first_span), ''.join(second_span))

    def edit_is_transposition(self, first, second, start, end):
        """True if the spans mirror each other and both mirrored spans
        begin with a gap."""
        first_span = first[start:end + 1]
        second_span = list(reversed(second[start:end + 1]))
        return (first_span == second_span
                and first_span[0] == '-'
                and second_span[0] == '-')

    def build_transposition(self, first, second, start, end):
        """Build the edit for a transposition, dropping the gap column at
        each end (``first[start]`` and ``second[end]``)."""
        first_span = first[start + 1:end + 1]
        second_span = second[start:end]
        return (''.join(first_span), ''.join(second_span))

    def edit_is_insertion(self, first, second, start, end):
        """True if `first` has a gap at `start`."""
        return first[start] == '-'

    def build_insertion(self, first, second, start, end):
        """Build the edit for the run of gaps in `first` beginning at `start`.

        Gap markers are removed from both grams before they are returned.
        """
        extent = self._gap_extent(first, start)
        if start == 0:
            first_span = "^" + ''.join(first[:1])
            second_span = first_span[:-1] + ''.join(second[0:extent])
        else:
            first_span = ''.join(first[max(0, start - 2):start + 1])
            second_span = first_span[:-2] + ''.join(
                second[max(0, start - 1):start + extent])
        first_span = ''.join(c for c in first_span if c != "-")
        second_span = ''.join(c for c in second_span if c != "-")
        return (first_span, second_span)

    def edit_is_deletion(self, first, second, start, end):
        """True if `second` has a gap at `start`."""
        return second[start] == '-'

    def build_deletion(self, first, second, start, end):
        """Build the edit for a deletion beginning at `start`.

        At the start of the word only ``first[0]`` is covered, with ``'^'``
        as context.  Elsewhere the whole run of gaps in `second` is covered,
        with ``first[start - 1]`` as context.
        """
        if start == 0:
            return ('^' + first[start], '^')
        extent = self._gap_extent(second, start)
        first_span = ''.join(first[start - 1:start + extent])
        second_span = first[start - 1]
        return (first_span, second_span)

    def edit_is_substitution(self, first, second, start, end):
        """True if neither side has a gap at `start` and the two differ."""
        left = first[start]
        right = second[start]
        return '-' not in [left, right] and left != right

    def build_substitution(self, first, second, start, end):
        """Build the edit for the run of mismatched, gap-free columns
        beginning at `start`, with one character of left context when
        available."""
        extent = 0
        for f, s in zip(first[start:], second[start:]):
            if f == s or f == "-" or s == "-":
                break
            extent += 1
        lo = max(0, start - 1)
        hi = start + extent
        return (''.join(first[lo:hi]), ''.join(second[lo:hi]))

    def build_edits(self, first, second):
        """Turn two aligned sequences into a list of edits.

        Raises:
            ValueError: if a differing position matches none of the edit
                types.
        """
        # Alignment columns where the two sequences disagree.
        positions = [i for i, item in enumerate(first) if item != second[i]]

        edits = []
        # Budget of columns already covered by the previous edit; it is
        # reduced by the distance between successive differing positions.
        skip_next = 0
        last = len(positions) - 1
        for i, start in enumerate(positions):
            end = positions[i + 1] if i < last else start

            if skip_next:
                skip_next -= start - positions[i - 1]
                if skip_next > 0:
                    continue
                skip_next = 0

            if self.edit_is_rotation(first, second, start, end):
                edit = self.build_rotation(first, second, start, end)
                skip_next = len(edit[1])
            elif self.edit_is_transposition(first, second, start, end):
                edit = self.build_transposition(first, second, start, end)
                skip_next = 3
            elif self.edit_is_insertion(first, second, start, end):
                edit = self.build_insertion(first, second, start, end)
                skip_next = len(edit[1]) - 1
            elif self.edit_is_deletion(first, second, start, end):
                edit = self.build_deletion(first, second, start, end)
                skip_next = len(edit[0]) - 1
            elif self.edit_is_substitution(first, second, start, end):
                edit = self.build_substitution(first, second, start, end)
                skip_next = len(edit[0]) - 1
            else:
                raise ValueError('did not find any edits in %s => %s' % (
                    first, second))
            edits.append(edit)

        return edits

    def find(self, word, error):
        """Align `word` with `error` and return the edits between them."""
        first, second = self.align(word, error)
        return self.build_edits(first, second)

    def apply(self, word, edits):
        """Apply `edits` to `word` and return the resulting string.

        Each ``from_gram`` is located at its first occurrence in
        ``'^' + word``.

        Raises:
            ValueError: if any ``from_gram`` does not occur in the word.
        """
        word = "^" + word
        planned = []
        for from_gram, to_gram in edits:
            index = word.find(from_gram)
            if index != -1:
                planned.append(
                    (index, len(from_gram), len(to_gram), to_gram))
        if len(planned) < len(edits):
            raise ValueError('could not apply all edits to "%s"' % word)

        # Apply right-to-left so earlier indices stay valid.
        planned.sort(reverse=True)
        new_word = word
        for index, size, _, to_gram in planned:
            new_word = new_word[:index] + to_gram + new_word[index + size:]
        return new_word.strip("^")

    def remove_dashes(self, index, word):
        """Strip ``'-'`` from `word` and shift `index` left by the number of
        dashes that preceded it.

        Returns:
            ``(new_index, new_word)``.
        """
        kept = []
        new_index = index
        for i, c in enumerate(word):
            if c != "-":
                kept.append(c)
                continue
            if i < index:
                new_index -= 1
        return new_index, ''.join(kept)