def get_opcodes(self):
    """Return the opcodes for the two stored strings, caching the result.

    A truthy value already stored in ``self._opcodes`` is returned as is.
    Otherwise the opcodes are computed with ``opcodes``: from the stored
    edit operations plus both strings when ``self._editops`` is truthy,
    or from the two strings alone when it is not. The computed value is
    stored in ``self._opcodes`` before being returned.

    Note that a falsy cached value (for example an empty list) is not
    treated as a cache hit, so it is recomputed on every call.
    """
    # Fast path: reuse whatever was computed by an earlier call.
    if self._opcodes:
        return self._opcodes

    if self._editops:
        # Edit operations are already known; pass them along with the
        # two strings instead of starting from the strings alone.
        self._opcodes = opcodes(self._editops, self._str1, self._str2)
    else:
        self._opcodes = opcodes(self._str1, self._str2)
    return self._opcodes
def alignments(data):
    """Convert pairs of words into a list of alignments between them.

    ``data`` is an iterable of ``(s, p)`` word pairs. For each pair the
    result contains one alignment: a list of ``(chunk_of_s, chunk_of_p)``
    tuples that alternates between stretches where the words agree and
    stretches where they differ. An alignment looks like this:

        [('#pseud', '#pseud'), ('o', 'ó'), ('pod', 'pod'), ('i', ''), ('o#', 'o#')]

    The chunks are derived from ``opcodes(s, p)``; consecutive opcodes of
    the same kind ("equal" versus anything else) are merged into a single
    chunk pair, and pairs in which both chunks are empty are dropped.

    As noted by the original author, the alignment relies on the
    Levenshtein distance and therefore assumes the two words are close
    (only a few letters differ). For Russian and English another method
    will be needed.
    """
    result = []
    for source, target in data:
        pairs = []
        left_parts, right_parts = [], []

        def flush():
            # Emit the chunk pair collected so far, unless both are empty.
            left, right = ''.join(left_parts), ''.join(right_parts)
            if left or right:
                pairs.append((left, right))

        # Tracks whether the run being accumulated is an "equal" run.
        # Starting as True means a leading non-equal opcode triggers a
        # (no-op) flush before accumulation begins.
        in_equal_run = True
        for tag, src_start, src_end, dst_start, dst_end in opcodes(source, target):
            is_equal = tag == 'equal'
            if is_equal != in_equal_run:
                # The kind of run changed: close the current chunk pair
                # and start collecting a new one.
                in_equal_run = is_equal
                flush()
                left_parts, right_parts = [], []
            left_parts.append(source[src_start:src_end])
            right_parts.append(target[dst_start:dst_end])

        # Whatever is still pending after the last opcode.
        flush()
        result.append(pairs)
    return result