示例#1
0
    def get_labels(self):
        """Label the sausage slots: 'O' marks a correct slot, 'X' an incorrect one.

        When ``self.correct()`` is true, every slot is labelled 'O'.  Otherwise
        the reference is globally aligned against the best hypothesis and one
        label is produced per aligned pair.
        """
        # Fast path: nothing to align when the whole sausage is already correct.
        if self.correct():
            return ['O'] * self.num_slots()

        # Encode the reference first and the best hypothesis second, sharing
        # one vocabulary so both sequences use the same symbol ids.
        ref_seq = Sequence(self.ref())
        hyp_seq = Sequence(self.best_hyp())
        vocab = Vocabulary()
        ref_encoded = vocab.encodeSequence(ref_seq)
        hyp_encoded = vocab.encodeSequence(hyp_seq)

        # Global alignment with scoring (2, -1) and a gap score of -2.
        aligner = StrictGlobalSequenceAligner(SimpleScoring(2, -1), -2)
        _, encoded_alignments = aligner.align(
            ref_encoded, hyp_encoded, backtrace=True)
        alignment = vocab.decodeSequenceAlignment(encoded_alignments[0])

        def slot_is_correct(ref_tok, hyp_tok):
            # A gap on the reference side paired with an explicit '*DELETE*'
            # on the hypothesis side also counts as correct.
            if ref_tok == hyp_tok:
                return True
            return ref_tok == '-' and hyp_tok == '*DELETE*'

        return [
            'O' if slot_is_correct(ref_tok, hyp_tok) else 'X'
            for ref_tok, hyp_tok in zip(alignment.first, alignment.second)
        ]
示例#2
0
def align_ref_long(hyp, ref):
    ''' Aligns a ref to a sausage-aligned hyp using the align library.

        The two sequences are globally aligned (hyp first, ref second), the
        non-gap entries taken from index 0 of each aligned pair are collected,
        and the result is padded at the end with ``delete_token`` until it is
        as long as ``hyp``.

        Returns the padded list. '''

    # Wrap and encode ref before hyp so the vocabulary ids are assigned in
    # the same order as before.
    sr = Sequence(ref)
    sh = Sequence(hyp)

    v = Vocabulary()
    rEncoded = v.encodeSequence(sr)
    hEncoded = v.encodeSequence(sh)

    # Create a scoring and align the sequences using the global aligner
    # (scoring arguments 2 and -1, gap score -2).
    scoring = SimpleScoring(2, -1)
    aligner = StrictGlobalSequenceAligner(scoring, -2)
    _, encodeds = aligner.align(hEncoded, rEncoded, backtrace=True)

    # Only the first returned alignment is used.
    alignment = v.decodeSequenceAlignment(encodeds[0])

    # Gap markers ('-') are dropped here, so no later gap -> delete_token
    # substitution is possible; the former per-token check for '-' could
    # never fire and has been removed.
    # NOTE(review): token[0] is presumably the element of the FIRST sequence
    # passed to align(), i.e. the hyp side, not the ref side -- confirm that
    # this is intended (token[1] may have been meant).
    ref_align = [token[0] for token in alignment if token[0] != '-']

    # Pad at the end so the result is as long as hyp.  The count is fixed
    # before any padding is appended.
    for _ in range(len(hyp) - len(ref_align)):
        ref_align.append(delete_token)

    return ref_align
示例#3
0
    def get_labels(self):
        """Return a list of slot labels ('O' = correct, 'X' = incorrect).

        A fully correct sausage yields ``num_slots()`` 'O' labels.  Otherwise
        the labels follow the global alignment of the reference against the
        best hypothesis, one label per aligned pair.
        """
        if self.correct():
            # Everything is correct, so no alignment is needed.
            return ['O'] * self.num_slots()

        # Build both sequences (reference first), then encode them with a
        # shared vocabulary.
        reference = Sequence(self.ref())
        hypothesis = Sequence(self.best_hyp())
        vocabulary = Vocabulary()
        encoded_ref = vocabulary.encodeSequence(reference)
        encoded_hyp = vocabulary.encodeSequence(hypothesis)

        # Scoring arguments (2, -1) and gap score -2; keep the first alignment.
        scoring = SimpleScoring(2, -1)
        aligner = StrictGlobalSequenceAligner(scoring, -2)
        _, candidates = aligner.align(encoded_ref, encoded_hyp, backtrace=True)
        best = vocabulary.decodeSequenceAlignment(candidates[0])

        labels = []
        for ref_item, hyp_item in zip(best.first, best.second):
            # A reference gap matched with an explicit '*DELETE*' hypothesis
            # entry is treated as correct, just like an exact match.
            matched = ref_item == hyp_item
            deleted = ref_item == '-' and hyp_item == '*DELETE*'
            labels.append('O' if (matched or deleted) else 'X')
        return labels
示例#4
0
def align_sequences(seq_a, seq_b):
    """Globally align two sequences and return the first encoded alignment.

    The vocabulary used for encoding is local to this function; the returned
    alignment is still in its encoded form.
    """

    def escape_gaps(seq):
        # The alignment library uses '-' as its gap marker, so literal '-'
        # items must be escaped before encoding.
        return ['\\-' if item == '-' else item for item in seq]

    safe_a = escape_gaps(seq_a)
    safe_b = escape_gaps(seq_b)

    vocab = Vocabulary()
    encoded_a = vocab.encodeSequence(Sequence(safe_a))
    encoded_b = vocab.encodeSequence(Sequence(safe_b))

    aligner = StrictGlobalSequenceAligner(
        SimpleScoring(matchScore=3, mismatchScore=-1),
        gapScore=-2,
    )
    _, alignments = aligner.align(encoded_a, encoded_b, backtrace=True)
    return alignments[0]
示例#5
0
文件: edits.py 项目: ndronen/spelling
 def __init__(self):
     """Create the scoring scheme and the global aligner kept on the instance."""
     # The former ``self.__dict__.update(locals())`` / ``del self.self`` pair
     # was dropped: with no constructor arguments it only added a ``self``
     # attribute and immediately removed it again.
     self.scoring = SimpleScoring(2, -1)
     # Gap score of -2 for the strict global aligner.
     self.aligner = StrictGlobalSequenceAligner(self.scoring, -2)
示例#6
0
文件: edits.py 项目: ndronen/spelling
class EditFinder(object):
    def __init__(self):
        """Create the scoring scheme and the global aligner used by ``align``."""
        # The former ``self.__dict__.update(locals())`` / ``del self.self``
        # pair was dropped: with no constructor arguments it only added a
        # ``self`` attribute and immediately removed it again.
        self.scoring = SimpleScoring(2, -1)
        # Gap score of -2 for the strict global aligner.
        self.aligner = StrictGlobalSequenceAligner(self.scoring, -2)

    def align(self, word, error):
        """Align ``word`` with ``error`` and return the best aligned pair.

        Returns the ``first`` and ``second`` sequences of the decoded
        alignment with the highest score; on ties the earliest one wins.
        """
        vocab = Vocabulary()
        encoded_word = vocab.encodeSequence(Sequence(word))
        encoded_error = vocab.encodeSequence(Sequence(error))
        _, encodings = self.aligner.align(
            encoded_word, encoded_error, backtrace=True)

        # Scan every decoded alignment and keep the first one whose score is
        # strictly greater than anything seen so far.
        best_alignment, best_score = None, -sys.maxsize
        for candidate in map(vocab.decodeSequenceAlignment, encodings):
            if candidate.score > best_score:
                best_alignment, best_score = candidate, candidate.score

        return best_alignment.first, best_alignment.second

    def edit_is_rotation(self, first, second, start, end):
        first_span = first[start:end+1]
        second_span = [c for c in reversed(second[start:end+1])]
        return first_span == second_span and \
            '-' not in first_span and '-' not in second_span

    def build_rotation(self, first, second, start, end):
        first_span = first[start:end+1]
        second_span = second[start:end+1]
        if start == 0:
            first_span.insert(0, '^')
            second_span.insert(0, '^')
        else:
            first_span.insert(0, first[start-1])
            second_span.insert(0, first[start-1])
        return (''.join(first_span), ''.join(second_span))

    def edit_is_transposition(self, first, second, start, end):
        first_span = first[start:end+1]
        second_span = [c for c in reversed(second[start:end+1])]
        return first_span == second_span and \
            first_span[0] == '-' and second_span[0] == '-'

    def build_transposition(self, first, second, start, end):
        first_span = first[start+1:end+1]
        second_span = second[start:end]
        return (''.join(first_span), ''.join(second_span))

    def edit_is_insertion(self, first, second, start, end):
        ret = first[start] == '-'
        #print('edit_is_insertion', first, second, start, end, first[start] == '-', ret)
        return ret

    def build_insertion(self, first, second, start, end):
        extent = 0
        for c in first[start:]:
            if c != '-':
                break
            extent += 1
        if start == 0:
            first_span = "^" + ''.join(first[:1])
            second_span = first_span[:-1] + ''.join(second[max(0,start-1):start+extent])
        else:
            first_span = ''.join(first[max(0,start-2):start+1])
            second_span = first_span[:-2] + ''.join(second[max(0,start-1):start+extent])

        first_span = ''.join(c for c in first_span if c != "-")
        second_span = ''.join(c for c in second_span if c != "-")
        return (first_span, second_span)

    def edit_is_deletion(self, first, second, start, end):
        #ret = start == end and second[start] == '-'
        ret = second[start] == '-'
        #print('edit_is_deletion', first, second, start, end, second[start] == '-', ret)
        return ret

    def build_deletion(self, first, second, start, end):
        #print('build_deletion', first, second, start, end, len(first))
        extent = 0
        for c in second[start:]:
            if c != '-':
                break
            extent += 1
        if start == 0:
            first_span = '^' + first[start]
            second_span = '^'
        else:
            first_span = ''.join(first[start-1:start+extent])
            second_span = first[start-1]
        return (first_span, second_span)

    def edit_is_substitution(self, first, second, start, end):
        ret = '-' not in [first[start], second[start]] and \
                first[start] != second[start]
        #print('edit_is_substitution', first, second, start, end, first[start] == '-', ret)
        return ret

    def build_substitution(self, first, second, start, end):
        #print('build_substitution', first, second, start, end)
        extent = 0
        for f,s in zip(first[start:],second[start:]):
            if f==s or f == "-" or s == "-":
                break
            extent += 1
        return (''.join(first[max(0,start-1):start+extent]), ''.join(second[max(0,start-1):start+extent]))
        #return (first[start], second[start])

    def build_edits(self, first, second):
        positions = []

        for i in range(len(first)):
            if first[i] != second[i]:
                positions.append(i)

        edits = []
        edit_indices = []

        #print('positions', positions)

        skip_next = 0

        for i in range(len(positions)):
            start = positions[i]
            try:
                end = positions[i+1]
            except IndexError:
                end = start

            if skip_next:
                skip_next -= positions[i] - positions[i-1]
                #print skip_next
            if skip_next:
                if skip_next > 0:
                    continue
                else:
                    skip_next = 0

            #print('i', i, 'start', start, 'end', end, 'edits', edits)

            edit_indices.append(i)

            if self.edit_is_rotation(first, second, start, end):
                #print('found a rotation in ' + str(first) + ' -> ' + str(second))
                edits.append(self.build_rotation(first, second, start, end))
                skip_next = len(edits[-1][1])
            elif self.edit_is_transposition(first, second, start, end):
                #print('found a transposition in ' + str(first) + ' -> ' + str(second))
                edits.append(self.build_transposition(first, second, start, end))
                skip_next = 3
            elif self.edit_is_insertion(first, second, start, end):
                #print('found an insertion in ' + str(first) + ' -> ' + str(second))
                edits.append(self.build_insertion(first, second, start, end))
                skip_next = len(edits[-1][1])-1
                #print "for edit",edits[-1]
                #print "setting skip next to",skip_next
            elif self.edit_is_deletion(first, second, start, end):
                #print('found a deletion in ' + str(first) + ' -> ' + str(second))
                edits.append(self.build_deletion(first, second, start, end))
                skip_next = len(edits[-1][0])-1
            elif self.edit_is_substitution(first, second, start, end):
                #print('found a substitution in ' + str(first) + ' -> ' + str(second))
                edits.append(self.build_substitution(first, second, start, end))
                skip_next = len(edits[-1][0])-1
            else:
                raise ValueError('did not find any edits in %s => %s' % (
                    first, second))

        return edits

    def find(self, word, error):
        first, second = self.align(word, error)
        edits = self.build_edits(first, second)
        return edits

    def apply(self, word, edits):
        word = "^" + word
        planned = []
        for from_gram, to_gram in edits:
            index = word.find(from_gram)
            if index != -1:
                planned.append((index, len(from_gram), len(to_gram), to_gram))
        if len(planned) < len(edits):
            raise ValueError('could not apply all edits to "%s"' % word)
        planned.sort(reverse=True)
        new_word = word
        for index, size, _, to_gram in planned:
            #print new_word
            new_word = new_word[:index] + to_gram + new_word[index+size:]
        new_word = new_word.strip("^")
        return new_word

    def remove_dashes(self, index, word):
        new_word = []
        new_index = index
        for i,c in enumerate(word):
            if c == "-":
                if i < index:
                    new_index -= 1
            else:
                new_word.append(c)
        return new_index, ''.join(new_word)