def get_best(self, error):
    """Method to calculate channel model probability for errors."""
    candidates = self.gen_candidates(error.lower())
    p = [0] * len(candidates)
    for i, candidate_ in enumerate(candidates):
        candidate = candidate_[-1]
        p_ew_candidate = []
        for res in editops(candidate, error):
            editop, w_idx, e_idx = res
            if editop == 'replace':
                e = error[e_idx]
                w = candidate[w_idx]
            elif editop == 'insert':
                e = error[e_idx - 1:e_idx + 1]
                w = candidate[w_idx - 1]
            elif editop == 'delete':
                if e_idx != 0:
                    e = error[e_idx - 1]
                    w = candidate[w_idx - 1:w_idx + 1]
                else:
                    e = error[e_idx]
                    w = candidate[w_idx:w_idx + 2]
            else:
                print(editops(candidate, error))
                return error
            p_ew_candidate.append(self.pm.P_ew(editop, e, w))
        p[i] = self.pm.P_w(candidate) * reduce(
            lambda x, y: x * y, p_ew_candidate) / len(p_ew_candidate)
    try:
        best_idx = p.index(max(p))
        return candidates[best_idx][-1]
    except ValueError:
        return error
def get_editops(self):
    if not self._editops:
        if self._opcodes:
            self._editops = editops(self._opcodes, self._str1, self._str2)
        else:
            self._editops = editops(self._str1, self._str2)
    return self._editops
def get_result(self):
    min_pair = min(itertools.combinations(self.input_content, 2),
                   key=lambda pair: len(editops(*pair)))
    pos_to_remove = [pos[1] for pos in editops(*min_pair)]
    final_str = ''.join([
        char for i, char in enumerate(min_pair[0]) if i not in pos_to_remove
    ])
    return final_str
def find_best_pair(left_transcriptions, right_transcriptions):
    prep_left, prep_right = prepare_transcriptions_for_levenshtein(
        left_transcriptions[0], right_transcriptions[0])
    best_ops = prepare_ops(editops(prep_left, prep_right))
    best_phone_error_rate = (best_ops['replace'] + best_ops['delete'] +
                             best_ops['insert']) / float(len(prep_left))
    best_pair = (left_transcriptions[0], right_transcriptions[0])
    for cur_left in left_transcriptions:
        for cur_right in right_transcriptions:
            prep_left, prep_right = prepare_transcriptions_for_levenshtein(
                cur_left, cur_right)
            cur_ops = prepare_ops(editops(prep_left, prep_right))
            cur_phone_error_rate = (cur_ops['replace'] + cur_ops['delete'] +
                                    cur_ops['insert']) / float(len(prep_left))
            if cur_phone_error_rate < best_phone_error_rate:
                best_phone_error_rate = cur_phone_error_rate
                best_ops = copy.copy(cur_ops)
                best_pair = (cur_left, cur_right)
    return best_pair, best_ops
def phonetic_distance(self, other, normalized=False):
    phonemes1 = self.phonemes()
    phonemes2 = other.phonemes()
    p2chr = {}
    for p in phonemes1 + phonemes2:
        pstr = p.phon_str
        if pstr not in p2chr:
            p2chr[pstr] = chr(len(p2chr))
    chr1 = ''.join([p2chr[p.phon_str] for p in phonemes1])
    chr2 = ''.join([p2chr[p.phon_str] for p in phonemes2])

    from Levenshtein import editops
    dist = 0.0
    for edit_type, index1, index2 in editops(chr1, chr2):
        if edit_type != 'replace':
            dist += 1
            continue
        try:
            p1 = phonemes1[index1]
            p2 = phonemes2[index2]
            dist += p1.distance(p2)
        except IndexError:
            dist += 1

    if normalized:
        return dist / float(max(len(phonemes1), len(phonemes2)))
    return dist
def __call__(self, word, candidate_correction):
    prob = 0.
    word = u'$' + word
    candidate_correction = u'$' + candidate_correction
    ops = editops(word, candidate_correction)
    for op, spos, dpos in ops:
        nominator, denominator = None, None
        if op == "insert":
            c2 = candidate_correction[dpos]
            c1 = word[spos - 1]
            nominator = self.insertion_cm[self.char_to_index[c1],
                                          self.char_to_index[c2]]
            denominator = float(self.counts_dict[c1])
        if op == "delete":
            c2 = word[spos]
            c1 = word[spos - 1]
            nominator = self.deletion_cm[self.char_to_index[c1],
                                         self.char_to_index[c2]]
            denominator = float(self.counts_dict[c1 + c2])
        if op == "replace":
            c2 = candidate_correction[dpos]
            c1 = word[spos]
            nominator = self.substitution_cm[self.char_to_index[c1],
                                             self.char_to_index[c2]]
            denominator = float(self.counts_dict[c1])
        if nominator is not None and denominator is not None:
            prob += np.log(nominator / denominator)
    return prob
def p_of_w_given_c(w, c):
    prob = 0.
    w = u'$' + w
    candidate_correction = u'$' + c
    ops = editops(w, candidate_correction)
    for op, spos, dpos in ops:
        nominator, denominator = None, None
        if op == "insert":
            c2 = candidate_correction[dpos]
            c1 = w[spos - 1]
            nominator = insertion_cm[char_to_index[c1], char_to_index[c2]]
            denominator = float(counts_dict[c1])
        if op == "delete":
            c2 = w[spos]
            c1 = w[spos - 1]
            nominator = deletion_cm[char_to_index[c1], char_to_index[c2]]
            denominator = float(counts_dict[c1 + c2])
        if op == "replace":
            c2 = candidate_correction[dpos]
            c1 = w[spos]
            nominator = substitution_cm[char_to_index[c1], char_to_index[c2]]
            denominator = float(counts_dict[c1])
        if nominator is not None and denominator is not None:
            prob += np.log(nominator / denominator)
    return prob
def test_dependency(a, b):
    """
    Test the assumption that the number of editops is equivalent to the
    Levenshtein edit distance.
    """
    from Levenshtein import distance, editops  # type: ignore
    assert distance(a, b) == len(editops(a, b))
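# Hedged usage sketch (not from the original sources): a concrete check of the
# property asserted by test_dependency above, using the 'spam'/'park' pair from
# the python-Levenshtein documentation. Each editop is a tuple of
# (operation, source_index, destination_index).
def editops_distance_demo():
    from Levenshtein import distance, editops
    ops = editops('spam', 'park')
    # e.g. [('delete', 0, 0), ('insert', 3, 2), ('replace', 3, 3)]
    assert len(ops) == distance('spam', 'park') == 3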
def diffAlign(s1, s2):
    matching = matching_blocks(editops(s1, s2), s1, s2)
    s1 = [c for c in s1]
    s2 = [c for c in s2]
    s1n = []
    s2n = []
    popped1, added1 = 0, 0
    popped2, added2 = 0, 0
    for mb in matching:
        index1 = mb[0]
        index2 = mb[1]
        # Check index alignment
        while popped1 < index1:
            s1n.append(s1.pop(0))
            popped1 += 1
        while popped2 < index2:
            s2n.append(s2.pop(0))
            popped2 += 1
        while len(s1n) < len(s2n):
            s1n.append("$")
            added1 += 1
        while len(s2n) < len(s1n):
            s2n.append("$")
            added2 += 1
        seqLen = mb[2]
        for i in range(seqLen):
            s1n.append(s1.pop(0))
            s2n.append(s2.pop(0))
            popped1, popped2 = popped1 + 1, popped2 + 1
    s1n = "".join(s1n)
    s2n = "".join(s2n)
    return s1n, s2n
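# Hedged usage sketch for diffAlign above: the shorter string is padded with
# '$' so that both outputs have the same length and line up column by column.
def diffAlign_demo():
    assert diffAlign('cat', 'cart') == ('ca$t', 'cart')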
def wordsim2(word1, word2):
    '''Return a similarity score between the two words.

    Attempts to use a behavioral model of edit distances.
    '''
    dist = damerau_levenshtein_distance(str(word1), str(word2))
    if dist > 3:
        return 0
    edit = editops(str(word1), str(word2))
    return match_model(edit)
def __call__(self, s1, s2):
    ops = editops(s1, s2)
    replacements = [(spos, dpos) for op_name, spos, dpos in ops
                    if op_name == "replace"]
    count = 0
    for spos, dpos in replacements:
        if s1[spos] in diacritical_chars \
                and diacritical_chars[s1[spos]] == s2[dpos]:
            count += 1
    base_dist = len(ops) - (1 - diacritical_error_punishment) * count
    base_dist -= self.find_all_occurrences_of_substring(u"uw", s1) \
        * (1 - ow_punishment)
    return base_dist
def detect_mistakes(self, s1, s2):
    edits = editops(s1, s2)
    for edit_type, pos1, pos2 in edits:
        try:
            mistake = (edit_type, s1[pos1], s2[pos2])
            if mistake not in self.popular_mistakes:
                mistake = (edit_type, s2[pos2], s1[pos1])
            self.popular_mistakes[mistake] += 1
        except IndexError:
            continue
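# Hedged standalone sketch of the counting idea behind detect_mistakes above:
# tally (operation, source_char, target_char) triples with a plain Counter
# instead of self.popular_mistakes, skipping the order-flipping fallback of
# the original. The IndexError guard matters because insert/delete positions
# can point one past the end of a string; the (wrong, right) argument order
# is an assumption.
from collections import Counter
from Levenshtein import editops

def count_mistakes(pairs):
    mistakes = Counter()
    for wrong, right in pairs:
        for op, pos1, pos2 in editops(wrong, right):
            try:
                mistakes[(op, wrong[pos1], right[pos2])] += 1
            except IndexError:
                continue
    return mistakes

# e.g. count_mistakes([('adress', 'address')]) records one ('insert', 'r', 'd') entry.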
def edit_likelihood(x, y, alphabet_size=2, alpha=0.99):
    ops = editops(x, y)
    lp = log(alpha) * (len(y) - len(ops))  # all the unchanged
    for o, _, _ in ops:
        if o == 'equal':
            assert False  # should never get here
        elif o == 'replace':
            lp += log(1.0 - alpha) - log(3.0) - log(alphabet_size)
        elif o == 'insert':
            lp += log(1.0 - alpha) - log(3.0) - log(alphabet_size)
        elif o == 'delete':
            lp += log(1.0 - alpha) - log(3.0)
        else:
            assert False
    return lp
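# Hedged sanity check for the alpha-parameterised edit_likelihood above
# (assumes `from math import log` at module level, as the function itself
# does): identical strings only pay the per-symbol "unchanged" cost
# log(alpha) for each character of y.
def edit_likelihood_demo():
    from math import log, isclose
    assert isclose(edit_likelihood('ab', 'ab', alpha=0.99), 2 * log(0.99))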
def edit_likelihood(x, y, alphabet_size=2, noise=0.01):
    ops = editops(x, y)
    lp = log(1.0 - noise) * len(y)  # all the unchanged
    for o, _, _ in ops:
        if o == 'equal':
            lp += log(noise) - log(4.0)
        elif o == 'replace':
            lp += log(noise) - log(4.0) - log(alphabet_size)
        elif o == 'insert':
            lp += log(noise) - log(4.0) - log(alphabet_size)
        elif o == 'delete':
            lp += log(noise) - log(4.0)
        else:
            assert False
    # print lp, x, y
    return lp
def match_list(A, B, on_replace='delete'):
    """Match two lists of different sizes and return corresponding indices.

    Created by JR King.

    Parameters
    ----------
    A : list | array, shape (n,)
        The values of the first list.
    B : list | array, shape (m,)
        The values of the second list.

    Returns
    -------
    A_idx : array
        The indices of the A list that match those of the B.
    B_idx : array
        The indices of the B list that match those of the A.
    """
    from Levenshtein import editops  # pip install python-Levenshtein

    A = np.nan_to_num(np.squeeze(A))
    B = np.nan_to_num(np.squeeze(B))
    assert A.ndim == B.ndim == 1
    unique = np.unique(np.r_[A, B])
    label_encoder = dict((k, v) for v, k in enumerate(unique))

    def int_to_unicode(array):
        return ''.join([str(chr(label_encoder[ii])) for ii in array])

    changes = editops(int_to_unicode(A), int_to_unicode(B))
    B_sel = np.arange(len(B)).astype(float)
    A_sel = np.arange(len(A)).astype(float)
    for type, val_a, val_b in changes:
        if type == 'insert':
            B_sel[val_b] = np.nan
        elif type == 'delete':
            A_sel[val_a] = np.nan
        elif on_replace == 'delete':
            A_sel[val_a] = np.nan
            B_sel[val_b] = np.nan
        elif on_replace == 'keep':
            pass
        else:
            raise NotImplementedError
    B_sel = B_sel[np.where(~np.isnan(B_sel))]
    A_sel = A_sel[np.where(~np.isnan(A_sel))]
    assert len(B_sel) == len(A_sel)
    return A_sel.astype(int), B_sel.astype(int)
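# Hedged usage sketch for match_list above (assumes numpy imported as np and
# python-Levenshtein installed): value 3 only occurs in A and value 6 only in
# B, so their positions are dropped from both index arrays.
def match_list_demo():
    A_idx, B_idx = match_list([1, 2, 3, 4, 5], [1, 2, 4, 5, 6])
    assert list(A_idx) == [0, 1, 3, 4]
    assert list(B_idx) == [0, 1, 2, 3]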
def extractPair(correctw, incorrectw):
    matching = matching_blocks(editops(correctw, incorrectw),
                               correctw, incorrectw)
    correctw = [c for c in correctw]
    incorrectw = [c for c in incorrectw]
    for mb in matching:
        for i in range(mb[0], mb[0] + mb[2], 1):
            correctw[i] = None
        for i in range(mb[1], mb[1] + mb[2], 1):
            incorrectw[i] = None
    correctw = "".join([c for c in correctw if c])
    incorrectw = "".join([c for c in incorrectw if c])
    return (correctw, incorrectw)
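# Hedged usage sketch for extractPair above: characters inside matching blocks
# are blanked out, so only the differing material of each word survives.
def extractPair_demo():
    assert extractPair('cat', 'cart') == ('', 'r')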
def probabilities(self, word):
    """
    Count the number of occurrences of the word in a document class and
    normalize by all the words.
    """
    try:
        N = 0
        # all possible candidates - histogram count
        candidates = self.get_most_probable_candidates(word)
        for candidate in candidates:
            for c in self.lex.keys():
                N += self.lex[c][candidate[0]]
        probabilities = []
        for candidate in candidates:
            Nc = 0
            for c in self.lex.keys():
                Nc += self.lex[c][candidate[0]]
            P_c = (Nc + self.alpha) / \
                (N + self.alpha * self.total_words)
            # L -> 0      -> p(c) = 1
            # L -> len(c) -> p(c) = 0
            # p(c) = 1 - L/len(c)
            lev = candidate[1]
            edits = editops(candidate[0], word)
            mistake_mod = 0
            for edit_type, pos1, pos2 in edits:
                try:
                    mistake = (edit_type, candidate[0][pos1], word[pos2])
                    if mistake in self.popular_mistakes.keys():
                        mistake_mod += self.beta * \
                            self.popular_mistakes[mistake]
                    else:
                        mistake = (edit_type, word[pos2], candidate[0][pos1])
                        if mistake in self.popular_mistakes.keys():
                            mistake_mod += self.beta * \
                                self.popular_mistakes[mistake]
                except IndexError:
                    continue
            lev -= mistake_mod
            P_w_c = 1 - (lev / len(word))
            prob = P_w_c * (-1 * np.log(P_c))
            probabilities.append((candidate, prob))
        probabilities.sort(key=lambda x: x[1], reverse=True)
        return probabilities[:np.min([5, len(probabilities) - 1])]
    except KeyError:
        pass
def __init__(self, dump_path=None):
    with codecs.open("data/bledy.txt", "r", "utf-8") as f:
        errors, corrections = map(
            lambda x: list(x),
            zip(*[line.split(';') for line in f.read().splitlines()]))
    with open("data/alphabet.pkl", 'r') as f:
        self.alphabet = list(pickle.load(f)) + [u'$']
    n = len(self.alphabet)
    self.char_to_index = {c: i for i, c in enumerate(self.alphabet)}
    self.deletion_cm = np.ones((n, n))
    self.insertion_cm = np.ones((n, n))
    self.substitution_cm = np.ones((n, n))
    self.counts_dict = defaultdict(df)

    model = CountVectorizer(analyzer='char', ngram_range=(1, 2))
    errors = [u'$' + error for error in errors]
    corrections = [u'$' + correction for correction in corrections]
    counts_dense = model.fit_transform(errors + corrections).todense().sum(axis=0)
    for i, feature_name in enumerate(model.get_feature_names()):
        self.counts_dict[feature_name] += counts_dense[0, i]

    for error, correction in zip(errors, corrections):
        ops = editops(error, correction)
        for op, spos, dpos in ops:
            if op == 'insert':
                c2 = correction[dpos]
                c1 = error[spos - 1]
                self.insertion_cm[self.char_to_index[c1],
                                  self.char_to_index[c2]] += 1
            if op == 'delete':
                c2 = error[spos]
                c1 = error[spos - 1]
                self.deletion_cm[self.char_to_index[c1],
                                 self.char_to_index[c2]] += 1
            if op == 'replace':
                c2 = correction[dpos]
                c1 = error[spos]
                self.substitution_cm[self.char_to_index[c1],
                                     self.char_to_index[c2]] += 1

    if dump_path is not None:
        with h5py.File(dump_path, 'w') as f:
            f.create_dataset('i', data=self.insertion_cm)
            f.create_dataset('d', data=self.deletion_cm)
            f.create_dataset('s', data=self.substitution_cm)
        with open('data/counts.pkl', 'w') as f:
            pickle.dump(self.counts_dict, f)
        with open('data/char_to_index.pkl', 'w') as f:
            pickle.dump(self.char_to_index, f)
def select(self, token, pos):
    """Retrieve the best lemma: look it up among precomputed results if
    possible, otherwise compute it."""
    # first retrieve precomputed results
    if (token, pos) in self.precomputed:
        return self.precomputed[(token, pos)]
    # take subset of valid lemmas
    pos_pli = self.pli[pos]
    # test on smallest edit length
    candidates_le = []
    cur_len = 10000
    tok_len = len(token)
    for candidate in pos_pli:
        lc = len(candidate)
        if lc - tok_len > cur_len:
            # too many inserts required in best case scenario, skip
            continue
        if tok_len - lc > cur_len:
            # too many deletes required in best case scenario, skip
            continue
        edits = editops(token, candidate)
        le = len(edits)
        if le < cur_len:
            candidates_le = [(candidate, edits)]
            cur_len = le
        elif le == cur_len:
            candidates_le.append((candidate, edits))
    if len(candidates_le) == 1:
        self.precomputed[(token, pos)] = candidates_le[0][0]
        return candidates_le[0][0]
    # test on highest index
    cur_len = -1
    candidates_lo = []
    for candidate, edits in candidates_le:
        lo = sum(i[-1] for i in edits) / le
        if lo > cur_len:
            candidates_lo = [(candidate, edits)]
            cur_len = lo
        elif lo == cur_len:
            candidates_lo.append((candidate, edits))
    if len(candidates_lo) == 1:
        self.precomputed[(token, pos)] = candidates_lo[0][0]
        return candidates_lo[0][0]
    # test on most frequent
    slct = max(candidates_lo, key=lambda c: pos_pli[c[0]])[0]
    self.precomputed[(token, pos)] = slct
    return slct
def determine_fix_event(file_a: bytes, file_b: bytes) -> FixEvent:
    """
    For two source files with a Levenshtein distance of one, this returns
    the edit that converts the first file into the second file.
    """
    src = tokens2seq(java.tokenize(file_a))
    dest = tokens2seq(java.tokenize(file_b))
    ops = editops(src, dest)
    # This only works for files with one edit!
    assert len(ops) == 1
    # Decode editop's format.
    (type_name, src_pos, dest_pos), = ops
    edit_type = to_edit_type(type_name)
    new_token = None if edit_type is Deletion else from_pua(dest[dest_pos])
    edit = Edit(edit_type, dest_pos, new_token)
    return FixEvent(edit, edit, 0)
def __init__(self, a: Iterable[Token], b: Iterable[Token],
             convert: TokenConverter) -> None:
    self.src_toks = tuple(a)
    self.dest_toks = tuple(b)
    # Convert each token to an appropriate stringified representation.
    self.src_text = tuple(convert(tok) for tok in self.src_toks)
    self.dest_text = tuple(convert(tok) for tok in self.dest_toks)
    # Because python-Levenshtein calculates string distances exclusively,
    # synthesize "strings" by mapping each of the strings in the token
    # sequence to a single character.
    mapper = PrivateUseAreaMapper()
    src_str = ''.join(mapper[token] for token in self.src_text)
    dest_str = ''.join(mapper[token] for token in self.dest_text)
    # Determine the Levenshtein edit operations.
    self._edit_ops = editops(src_str, dest_str)
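# Hedged standalone sketch of the private-use-area trick used in __init__
# above: python-Levenshtein only compares strings, so every distinct token is
# mapped to a single stand-in character (here from U+E000 upwards) before
# calling editops. token_editops and its inline mapper are simplified
# stand-ins, not the original PrivateUseAreaMapper API.
from Levenshtein import editops

def token_editops(src_tokens, dest_tokens):
    mapping = {}

    def to_char(token):
        return chr(0xE000 + mapping.setdefault(token, len(mapping)))

    src_str = ''.join(to_char(t) for t in src_tokens)
    dest_str = ''.join(to_char(t) for t in dest_tokens)
    return editops(src_str, dest_str)

# token_editops(['if', 'x', ':'], ['if', 'y', ':']) yields a single ('replace', 1, 1) op.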
def edit_dist(r_string: str, s_string: str) -> int:
    """Computes edit distance.

    ED(r, s) = number of edit operations

    Parameters
    ----------
    r_string : str
        First string.
    s_string : str
        Second string.

    Returns
    -------
    Edit distance of r and s.
    """
    return round(len(editops(r_string, s_string)), 3)
def get_editops(str_from, str_to, allow_replace=True, allow_copy=True):
    """Gets edit operations from **str_from** to **str_to** according to
    Levenshtein distance. Supported edit operations are: `'delete'`,
    `'insert'`, `'replace'`, `'copy'`.

    Args:

    **str_from** (`str`): the source string.

    **str_to** (`str`): the target string.

    **allow_replace** (`bool`; default is `True`): whether to allow the
    **replace** edit operation.

    **allow_copy** (`bool`; default is `True`): whether to allow the
    **copy** edit operation.

    Returns the `tuple` of edit operations that is needed to transform
    **str_from** to **str_to**.
    """
    res = []
    for op, idx_dst, idx_src in editops(str_from, str_to):
        if op == 'delete':
            res.append(('d', idx_dst, None))
        else:
            ch_src = str_to[idx_src]
            if op == 'replace' and allow_replace:
                res.append(('r', idx_dst, ch_src))
            elif op in ['insert', 'replace']:
                op_prev, idx_prev, ch_prev = res[-1] if res else [0] * 3
                if allow_copy and idx_prev \
                        and str_from[idx_prev - 1] == ch_src \
                        and (op_prev == 'c' or idx_prev != idx_dst):
                    res.append(('c', idx_dst, None))
                else:
                    res.append(('i', idx_dst, str_to[idx_src]))
                if op == 'replace':
                    res.append(('d', idx_dst, None))
            else:
                raise ValueError("Unexpected operation code '{}'"
                                 .format(op))
    return tuple(res)
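# Hedged usage sketch for get_editops above: with the default flags a single
# substitution comes back as one 'r' (replace) operation; disallowing
# replacements decomposes the same change into an insert plus a delete at the
# same source position.
def get_editops_demo():
    assert get_editops('abc', 'abd') == (('r', 2, 'd'),)
    assert get_editops('abc', 'abd', allow_replace=False) == \
        (('i', 2, 'd'), ('d', 2, None))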
def process_changes(start: str, end: str, errors: dict) -> None:
    changes = editops(start, end)
    for change in changes:
        orig = ''
        update = ''
        if change[0] == 'replace':
            orig = start[change[1]]
            update = end[change[2]]
        elif change[0] == 'insert':
            orig = '*'
            update = end[change[2]]
        elif change[0] == 'delete':
            orig = start[change[1]]
            update = '*'
        else:
            print('Error: change value is ' + str(change[0]))
        k = (orig, update)
        if k in errors.keys():
            errors[k] += 1
        else:
            errors[k] = 1
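# Hedged usage sketch for process_changes above: the errors dict is updated in
# place with (original_char, corrected_char) counts, '*' standing in for
# "nothing" on inserts and deletes.
def process_changes_demo():
    errors = {}
    process_changes('color', 'colour', errors)
    process_changes('cat', 'cut', errors)
    assert errors == {('*', 'u'): 1, ('a', 'u'): 1}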
def phonetic_distance(self, other, normalized=False):
    phonemes1 = self.phonemes()
    phonemes2 = other.phonemes()
    p2chr = {}
    for p in phonemes1 + phonemes2:
        pstr = p.phon_str
        if pstr not in p2chr:
            p2chr[pstr] = unichr(len(p2chr))
    chr1 = u''.join([p2chr[p.phon_str] for p in phonemes1])
    chr2 = u''.join([p2chr[p.phon_str] for p in phonemes2])

    from Levenshtein import editops
    dist = 0.0
    for edit_type, index1, index2 in editops(chr1, chr2):
        if edit_type != 'replace':
            dist += 1
            continue
        try:
            p1 = phonemes1[index1]
            p2 = phonemes2[index2]
            dist += p1.distance(p2)
        except IndexError:
            dist += 1

    # add a distance point if the two words do not end with the same phoneme
    try:
        if phonemes1[-1] != phonemes2[-1]:
            dist += 2
    except IndexError:
        # @TODO: one of the phoneme lists is empty
        dist += 2

    if normalized:
        return dist / float(max(len(phonemes1), len(phonemes2)))
    return dist
def edit_operations(token1, token2):
    eds = editops(token1, token2)
    ops = ["none"] * max(len(token2), len(token1))
    for ed in eds:
        if ed[0] == "insert" or ed[0] == "replace":
            ops[ed[2]] = ed[0] + "_" + token2[ed[2]]
        elif ed[0] == "delete":
            ops[ed[2]] = ed[0] + "_" + token1[ed[2]]
    # Remove trailing "none"s
    for x in reversed(range(len(ops))):
        if ops[x] == "none":
            del ops[x]
        else:
            break
    return tuple(ops)
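# Hedged usage sketch for edit_operations above: positions index the second
# token, untouched positions stay "none", and trailing "none"s are trimmed.
def edit_operations_demo():
    assert edit_operations('cat', 'cart') == ('none', 'none', 'insert_r')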
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("title_file", help=ARG_HELP_STRINGS["title_file"])
    parser.add_argument("-m", "--match_threshold", type=float, default=0.9,
                        help=ARG_HELP_STRINGS["match_threshold"])
    parser.add_argument("-a", "--ask_threshold", type=float, default=0.8,
                        help=ARG_HELP_STRINGS["ask_threshold"])
    parser.add_argument("-c", "--colors", type=bool, default=COLORS_DEFAULT,
                        help=ARG_HELP_STRINGS["ansi_colors"])
    parser.add_argument("--start", type=int, default=0,
                        help=ARG_HELP_STRINGS["start"])
    parser.add_argument("--end", type=int, default=inf,
                        help=ARG_HELP_STRINGS["end"])
    args = parser.parse_args()

    header = None
    additional_fields = ["doi", "similarity"]
    with open(args.title_file, "r") as f:
        reader = csv.DictReader(f)
        title_field = None
        for field in reader.fieldnames:
            if field.lower() in TITLE_HEADER_WL:
                print(colorise("Using column '" + field + "' as title column",
                               "green"))
                title_field = field
                break
        else:
            print(colorise("ERROR: Could not find a column name which might "
                           "denote a title column", "red"))
            sys.exit()
        header = reader.fieldnames
        for field in additional_fields:
            if field not in header:
                header.append(field)
        modified_lines = []
        ask_count = 0
        for line in reader:
            line["ask"] = False
            if reader.line_num < args.start or reader.line_num > args.end:
                continue
            print(BREAK)
            title = line[title_field]
            head = "line " + str(reader.line_num) + ", query title:"
            print(colorise(head.ljust(L_JUST) + "'" + title + "'", "blue"))
            ret = crossref_query_title(title)
            retries = 0
            while not ret['success'] and retries < MAX_RETRIES_ON_ERROR:
                retries += 1
                msg = "Error while querying CrossRef API ({}), retrying ({})...".format(
                    ret["exception"], retries)
                print(colorise(msg, "red"))
                ret = crossref_query_title(title)
            result = ret["result"]
            msg_tail = "'{}' [{}]"
            msg_tail = msg_tail.format(result["crossref_title"], result["doi"])
            if result["similarity"] == 1.0:
                msg_head = "Perfect match found ({}):"
                msg_head = msg_head.format(round(result["similarity"], 2)).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "cyan"))
                line.update(result)
                line["ask"] = False
            elif result["similarity"] >= args.match_threshold:
                msg_head = "Good match found ({}):"
                msg_head = msg_head.format(round(result["similarity"], 2)).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "green"))
                line.update(result)
                line["ask"] = False
            elif result["similarity"] >= args.ask_threshold:
                msg_head = "Possible match found ({}):"
                msg_head = msg_head.format(round(result["similarity"], 2)).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "yellow"))
                line.update(result)
                line["line_num"] = reader.line_num
                line["ask"] = True
                ask_count += 1
            else:
                msg_head = "No match found, most similar was ({}):"
                msg_head = msg_head.format(round(result["similarity"], 2)).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "red"))
                line.update(EMPTY_RESULT)
                line["ask"] = False
            modified_lines.append(line)

    if ask_count > 0:
        print(BREAK)
        ask_msg = ("{} matches found with a similarity between {} and {} "
                   "will need manual confirmation:")
        ask_msg = ask_msg.format(ask_count, args.ask_threshold,
                                 args.match_threshold)
        print(colorise(ask_msg, "green"))
        for line in modified_lines:
            if line["ask"]:
                print(BREAK)
                query_t = line[title_field]
                xref_t = line["crossref_title"]
                # display matching segments in identical colors for easier recognition
                diff = matching_blocks(editops(query_t.lower(), xref_t.lower()),
                                       query_t, xref_t)
                query_print = query_t
                xref_print = xref_t
                # ANSI codes increase string length, so we need an offset to compensate
                offset = 0
                for i in range(len(diff)):
                    a, b, c = diff[i]
                    a += offset
                    b += offset
                    offset += 9
                    color = CMP_COLORS[i % len(CMP_COLORS)]
                    query_print = colorise_text_segment(query_print, a, a + c, color)
                    xref_print = colorise_text_segment(xref_print, b, b + c, color)
                query_head = colorise("line {}, query title:".format(line["line_num"]),
                                      "blue")
                xref_head = colorise("Possible match ({}):".format(
                    round(line["similarity"], 2)), "yellow")
                print(query_head.ljust(L_JUST) + query_print)
                print(xref_head.ljust(L_JUST) + xref_print)
                answer = input("Do you want to accept the DOI for the match title? (y/n):")
                while answer not in ["y", "n"]:
                    answer = input("Please type 'y' or 'n':")
                if answer == "n":
                    line.update(EMPTY_RESULT)

    with open("out.csv", "w") as out:
        dialect = csv.excel
        dialect.quoting = csv.QUOTE_ALL
        writer = csv.DictWriter(out, header, extrasaction='ignore',
                                dialect=dialect)
        writer.writeheader()
        writer.writerows(modified_lines)
def renew_ann(old_txt_fn, old_ann_fn, new_txt_fn, save_new_ann_to, rewrite=False): """If we have a brat annotation for some txt-file already done, and we have to change that txt slightly, this method helps you to adjust the annotation for new version of the txt. :param old_txt_fn: a path to the old txt-file. :param old_ann_fn: a path to the old ann-file. :param new_txt_fn: a path to the new txt-file. :param save_new_ann_to: a path where the renewed ann will be saved to. :param rewrite: if ``True``, allow *save_new_ann_to* be equal to *old_ann_fn*. Default is ``False``. """ assert rewrite or save_new_ann_to != old_ann_fn, \ 'ERROR: use `rewrite=True` param if you really want to change ' \ 'original ann-file' from Levenshtein import editops with io.open(old_txt_fn, 'rt', encoding='utf-8', newline='') as f: old_txt = f.read() with io.open(new_txt_fn, 'rt', encoding='utf-8', newline='') as f: new_txt = f.read() if '\r' in new_txt: print('WARNING: The new txt file ("{}") file contain "CR" ' 'symbols that may cause errors ("nonprintable characters") ' 'in brat engine. Consider to remove "CR" symbols from new ' 'txt file and renew annotations again'.format(new_txt_fn)) with open(old_ann_fn, 'rt', encoding='utf-8') as f: old_ann = f.read().split('\n') transfer_map, shift, prev_idx = list(range(len(old_txt) + 1)), 0, 0 for op, idx_src, idx_dst in editops(old_txt, new_txt): if shift: for idx in range(prev_idx, idx_src): transfer_map[idx] += shift if op == 'insert': shift += 1 prev_idx = idx_src elif op == 'replace': transfer_map[idx_src] += shift prev_idx = idx_src + 1 elif op == 'delete': transfer_map[idx_src] = None shift -= 1 prev_idx = idx_src + 1 else: raise ValueError('ERROR: Unknown operation "{}"'.format(op)) if shift: for idx in range(prev_idx, len(old_txt) + 1): transfer_map[idx] += shift new_ann = [] len_old_txt, len_new_txt = len(old_txt), len(new_txt) old_aids, new_aids, aid_map, all_spans = set(), set(), {}, {} for line_no, ann in enumerate(old_ann, start=1): ann = ann.split('\t') if not ann: continue aid = ann[0] old_aids.add(aid) chunks_old = ann[1].split(';') if len(ann) >= 2 else [] chunks_new, spans, fragments = [], [], [] for chunk_old in chunks_old: chunk_new, span, idx0 = [], [], None for token in chunk_old.split(): if token.isdigit(): idx_ = int(token) assert idx_ <= len_old_txt, \ 'ERROR: Position "{}" in line {} is outside of ' \ 'bounds of the file {}' \ .format(idx_, line_no, old_ann_fn) # if we have a start of the fragment if not span: # search for transfer not to None for idx in transfer_map[idx_:]: if idx is not None: idx0 = idx if idx == len_new_txt: idx = None else: # if the old fragment starts after ' ', # the new one should do, too if idx_ == 0 \ or old_txt[idx_ - 1].isspace(): for i in range(idx, 0, -1): if new_txt[i - 1].isspace(): idx = i break else: idx = 0 # anyway, we can't point to ' ' for i in range(idx, len_new_txt): if not new_txt[i].isspace(): idx = i break else: idx = None #idx0 = idx if idx is None: token = None else: span.append(idx) token = str(idx) break else: break # end of the fragment else: for idx in reversed(transfer_map[:idx_ + 1]): if idx is not None: if idx <= idx0: idx = span = None else: # if the old fragment ends with ' ', # the new one should do, too if idx_ == len_old_txt \ or old_txt[idx_].isspace(): for i in range(idx, len_new_txt): if new_txt[i].isspace(): idx = i break else: idx = len_new_txt # anyway, we don't want to have ' ' # in the end for i in range(idx, 0, -1): if not new_txt[i - 1].isspace(): idx = i break else: idx = None if 
idx is None: token = None else: span.append(idx) token = str(idx) break else: break if token is None or (token in old_aids and token not in new_aids): token = aid_map.get(token) if not token: chunk_new = None break chunk_new.append(token) if span: if len(span) != 2: print(span) assert len(span) == 2, 'ERROR: Invalid line {} in {} file' \ .format(line_no, old_ann_fn) for span_ in spans: if span[0] >= span_[0] and span[0] < span_[1]: span[0] = span_[1] if span[1] > span_[0] and span[1] <= span_[1]: span[1] = span_[0] if span[1] > span[0]: for i, span_ in enumerate(reversed(spans), start=1): if span[0] == span_[1]: span_[1] = span[1] elif span[1] == span_[0]: span_[0] = span[0] else: continue ### frag_start, frag_end = span frags_ = new_txt[frag_start:frag_end].split('\n') fragment = '' chunk_new = [str(frag_start), str(frag_end)] for i, frag in enumerate(frags_): frag_len = len(frag) if frag_len: frag_end = frag_start + frag_len frag_start = frag_end + 1 chunk_new[0] += ' ' + str(frag_end) + \ ';' + str(frag_start) fragment += frag + ' ' fragment += frags_[-1] chunks_new[-i][-2:] = chunk_new fragments[-i] = fragment ### #chunks_new[-i][-2:] = [str(span_[0]), str(span_[1])] #fragments[-i] = new_txt[span_[0]:span_[1]] chunk_new = None break else: spans.append(span) if len(chunk_new) > 2: chunks_new.append(chunk_new[:-2]) ### frag_start, frag_end = span frags_ = new_txt[frag_start:frag_end].split('\n') fragment = '' chunk_new = [str(frag_start), str(frag_end)] for i, frag in enumerate(frags_[:-1]): frag_len = len(frag) if frag_len: frag_end = frag_start + frag_len frag_start = frag_end + 1 chunk_new[0] += ' ' + str(frag_end) + \ ';' + str(frag_start) fragment += frag + ' ' fragment += frags_[-1] fragments.append(fragment) ### #chunk_new = [str(span[0]), str(span[1])] #fragments.append(new_txt[span[0]:span[1]]) else: chunk_new = None if chunk_new is None: if not chunks_new: break else: chunks_new.append(chunk_new) if spans: order_ = [ i for _, i in sorted((x, i) for i, x in enumerate(spans)) ] spans = [spans[i] for i in order_] chunk_ = ' '.join(chunks_new[0]) chunks_new_ = [chunks_new[i + 1] for i in order_] fragments_ = [fragments[i] for i in order_] end_pos = None chunks_new, fragments = [], [] for span, chunk_new, fragment in zip(spans, chunks_new_, fragments_): if end_pos: for c in new_txt[end_pos:span[0]]: if c != ' ': chunks_new.append(chunk_new) fragments.append(fragment) break else: chunks_new[-1][1] = chunk_new[1] fragments[-1] += ' ' * (span[0] - end_pos) + fragment else: chunks_new.append(chunk_new) fragments.append(fragment) end_pos = span[1] chunks_new[0].insert(0, chunk_) if chunks_old and chunks_new: if chunks_new: chunks_new = ';'.join(' '.join(x) for x in chunks_new) if chunks_new in all_spans: aid_map[aid] = all_spans[chunks_new] continue all_spans[chunks_new] = aid chunks_new = [chunks_new] else: chunks_new = [] new_ann.append( '\t'.join([aid] + chunks_new + ([' '.join(fragments)] if fragments else ann[2:]))) new_aids.add(aid) with io.open(save_new_ann_to, 'wt', encoding='utf=8', newline='\n') as f: if new_ann: f.write('\n'.join(new_ann) + '\n')
def main(data_path, model_path, epochs):
    with open(os.path.join(model_path, 'train.txt')) as train_file:
        train = [x.strip() for x in train_file.readlines()]
    with open(os.path.join(model_path, 'test.txt')) as test_file:
        test = [x.strip() for x in test_file.readlines()]
    csv_logger = CSVLogger(os.path.join(model_path, 'Log1.csv'))
    dataset = Dataset(data_path)
    signal_seq = ExampleSequence(dataset, train, name='train',
                                 batch_size=batch_size)
    test_seq = ExampleSequence(dataset, test, name='test',
                               batch_size=batch_size)
    model = load_model(os.path.join(model_path, 'model.h5'),
                       custom_objects={'<lambda>': lambda y_true, y_pred: y_pred})
    model = multi_gpu_model(model, gpus=2)
    param = {
        'lr': 0.001,
        'beta_1': 0.9,
        'beta_2': 0.999,
        'epsilon': None,
        'clipvalue': 2
    }
    adam = optimizers.Adam(**param)
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=adam)
    model.fit_generator(signal_seq, validation_data=test_seq, epochs=epochs,
                        callbacks=[csv_logger])
    model.save(os.path.join(model_path, 'model_1.h5'))

    sub_model = model.get_layer('model_2')
    sub_model = sub_model.get_layer('model_1')
    im_model = Model(inputs=sub_model.get_input_at(0),
                     outputs=sub_model.get_layer('activation_1').output)

    dists = []
    ops = []
    lens = []
    pred_lens = []
    real = []
    predicted = []
    for j in range(len(test_seq)):
        batch = test_seq[j][0]
        preds = im_model.predict_on_batch(batch)
        val = K.ctc_decode(preds,
                           np.full(batch_size, batch['input_length'][0, 0]),
                           greedy=False)
        decoded = K.eval(val[0][0])
        for i in range(decoded.shape[0]):
            real_label = batch['the_labels'][i, :batch['label_length'][i, 0]]
            real_label = ''.join([str(int(x)) for x in real_label.tolist()])
            pred_label = list(filter(lambda x: x != -1, decoded[i, :].tolist()))
            pred_label = [str(x) for x in pred_label]
            pred_label = ''.join(pred_label)
            dists.append(distance(pred_label, real_label))
            ops.append(editops(pred_label, real_label))
            lens.append(len(real_label))
            pred_lens.append(len(pred_label))
            real.append(real_label)
            predicted.append(pred_label)

    op_counts = {'insert': 0, 'replace': 0, 'delete': 0}
    for op in ops:
        for x in op:
            op_counts[x[0]] += 1
    for key in op_counts.keys():
        op_counts[key] = op_counts[key] / sum(lens)
    metrics = {
        'LER': sum(dists) / sum(lens),
        'real_mean_length': np.mean(lens),
        'predicted_mean_length': np.mean(pred_lens)
    }
    metrics.update(op_counts)
    metrics_file_path = os.path.join(model_path, 'metrics_continue.json')
    write_dict_to_file(metrics_file_path, metrics)
def lcs(s1, s2):
    z = matching_blocks(editops(s1, s2), s1, s2)
    return np.max(list(zip(*z))[2])
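# Hedged check of lcs above (assumes the same numpy/Levenshtein imports the
# function relies on): the longest matching block of 'spam' and 'park' is
# 'pa', so the reported length is 2; the trailing zero-length matching block
# is only a terminator.
def lcs_demo():
    assert lcs('spam', 'park') == 2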
def _get_matching_blocks(query, text):
    return matching_blocks(editops(query, text), query, text)
def do(input_file_name, backward_target_length, dest_folder_path):
    cur_cnt = 0
    target_cnt = count_line_in_file(input_file_name)
    final_results = {}
    for target_set in open(input_file_name, 'r'):
        # The input file has the form "name : wild sequence : target".
        tmp = target_set.split(':')
        # strip whitespace from the file name
        file_name_no_ext = tmp[0].strip()
        # add the extension to the file name
        file_name = '{}.txt'.format(file_name_no_ext)
        # strip whitespace from the wild sequence
        wild_seq = tmp[1].strip()
        # strip whitespace from the target sequence
        target = tmp[2].strip()
        # validate the target sequence
        target = seq_validator(target)
        if not target:
            continue
        # validate the wild sequence
        wild_seq = seq_validator(wild_seq)
        if not wild_seq:
            continue
        # create the folder for saving results
        result_folder_name = os.path.join(BASE_DIR, 'analyse_results')
        if not os.path.exists(result_folder_name):
            os.makedirs(result_folder_name)
        try:
            # dict for temporarily storing results
            result = {
                'total_cnt': count_line_in_file(
                    os.path.join(dest_folder_path, file_name)),
                'mutated_cnt': 0,
                'mutated_rates': 0.0,
                'mutated_dict': {}
            }
            for line in open(os.path.join(dest_folder_path, file_name), 'r'):
                # validate the candidate sequence
                line = seq_validator(line)
                if not line:
                    continue
                # Use the wild sequence and the target to find the start and
                # end positions of the target within the wild sequence.
                # Used together with editops below.
                target_start_pos_in_wild = int(wild_seq.find(target))
                target_end_pos = target_start_pos_in_wild + len(target)
                # Compare the candidate sequence against the wild sequence and
                # extract the editops of the Levenshtein alignment.
                # Each editop has the form
                # (operation, position in the wild sequence,
                #  position in the candidate sequence).
                # For example, editops('test', 'teaasz') gives
                # [('insert', 2, 2), ('insert', 2, 3), ('replace', 3, 5)]:
                #   1st entry: insertion at position 2 of the wild sequence,
                #              position 2 of the candidate
                #   2nd entry: insertion at position 2 of the wild sequence,
                #              position 3 of the candidate
                #   3rd entry: replacement at position 3 of the wild sequence,
                #              position 5 of the candidate
                # So as long as we know exactly where the target sits in the
                # wild sequence, we can detect mutations at the user-specified
                # positions without knowing where in the candidate sequence
                # the change happened.
                for mutation_info in editops(wild_seq, line):
                    # check the user-specified region (up to
                    # backward_target_length positions back from the end of
                    # the target)
                    if target_end_pos - int(backward_target_length) \
                            <= mutation_info[1] <= target_end_pos:
                        # replacements are not counted as mutations;
                        # likewise, a non-replacement whose candidate
                        # character is 'N' is not counted
                        if mutation_info[0] != 'replace' \
                                and line[mutation_info[2]] != 'N':
                            # if we got here, count it as a mutation
                            result['mutated_cnt'] += 1
                            # store the mutated candidate sequence for the
                            # report and count identical sequences
                            if line not in result['mutated_dict'].keys():
                                result['mutated_dict'][line] = 1
                            else:
                                result['mutated_dict'][line] += 1
                            break
            # compute the mutation percentage
            try:
                result['mutated_rates'] = \
                    float(result['mutated_cnt']) / result['total_cnt'] * 100
            except:
                result['mutated_rates'] = 0
            # save the per-target results
            with open(os.path.join(result_folder_name, file_name), 'w') as f:
                f.write('{}\n'.format(wild_seq))
                f.write('--------\n')
                for mutated_seq, cnt in result['mutated_dict'].items():
                    f.write('{} X {}\n'.format(mutated_seq, cnt))
                f.write('--------\n')
                f.write('mutation rates : {} %'.format(
                    result['mutated_rates']))
        except Exception as e:
            print e
            print file_name, ' not found.'
            pass
        else:
            # if everything went fine, collect the result
            final_results[file_name_no_ext] = result
        # one target analysed; increment the counter
        cur_cnt += 1
        # display the progress
        progress_percentage = float(cur_cnt) / target_cnt * 100
        print '{} % done'.format(progress_percentage)
    # save the final results file
    with open(os.path.join(result_folder_name, 'result_info.txt'), 'w') as f:
        for name, data in final_results.items():
            f.write('{} : {} : {}/{}\n'.format(name, data['mutated_rates'],
                                               data['mutated_cnt'],
                                               data['total_cnt']))
def fix_triggers(events_meg, events_behavior, event_type):
    """
    Use this function when the triggers are not identical between the MEG
    and the behavioral file output by Matlab.
    """
    from nose.tools import assert_true
    from Levenshtein import editops

    # copy because can change data in place
    events_meg = np.copy(events_meg)
    events_behavior = events_behavior.copy()

    # initialize new field in behavioral file
    n_trials = len(events_behavior)
    events_type_labels = ['triggTarget', 'triggCue', 'triggProbe']
    for label in events_type_labels:
        events_behavior[label + '_missing'] = np.zeros(n_trials, bool)

    # concatenate all behavioral events into one long series of triggers
    events_behavior_triggers = np.reshape(np.vstack((
        events_behavior.triggTarget,
        events_behavior.triggCue,
        events_behavior.triggProbe)).T, [-1])

    # Identify missed, exchanged or additional trigger values in MEG as
    # compared to behavioral file
    def int_to_unicode(array):
        return ''.join([str(chr(int(ii))) for ii in array])

    changes = editops(int_to_unicode(events_behavior_triggers),
                      int_to_unicode(events_meg[:, 2]))

    # for each modification
    print(changes)
    for modification, from_trigger, _ in changes:
        if modification == 'delete':
            this_trial = np.floor(from_trigger / 3.)
            this_event_type = int(from_trigger % 3.)
            # set False value to trigg[Type]_missing
            this_key = events_type_labels[this_event_type] + '_missing'
            events_behavior.set_value(this_trial, this_key, True)
        else:
            # TODO: implement other types of deletion, replacement etc.
            raise NotImplementedError()

    # TODO: remove or add elements in events_meg
    events_behavior['trial'] = range(len(events_behavior))

    # make sure to have modulo 3
    events_meg = events_meg[:(3 * np.floor(len(events_meg) / 3.)), :]  # FIXME?
    assert_true((len(events_meg) % 3) == 0.)

    # delete trials absent from MEG triggers
    sel = np.where(events_behavior[event_type + '_missing'] == False)[0]
    events_behavior = events_behavior.iloc[sel]
    events_behavior.reset_index()

    # Returns specific types of events (Target, Cue or Probe)
    start = np.where([event_type == ii for ii in events_type_labels])[0][0]
    events_meg = events_meg[start::3, :]

    # check that same number of trials in MEG and behavior
    assert_true(len(events_meg) == len(events_behavior))
    events_behavior['meg_event_tsample'] = events_meg[:, 0]
    events_behavior['meg_file'] = events_meg[:, 1]
    events_behavior['meg_event_value'] = events_meg[:, 2]
    events_behavior = events_behavior.reset_index()
    return events_behavior
from Levenshtein import editops
from collections import Counter
import sys

orgA = open(sys.argv[1]).read().strip()
orgB = open(sys.argv[2]).read().strip()
bases = Counter(orgA) + Counter(orgB)
ops = editops(orgA, orgB)
mutations = [(orgA[a], orgB[b]) for (t, a, b) in ops if t == 'replace']
transitions = [(a, b) for (a, b) in mutations
               if (a, b) == ('a', 'g') or (a, b) == ('g', 'a')
               or (a, b) == ('c', 't') or (a, b) == ('t', 'c')]
a_t = [(a, b) for (a, b) in mutations
       if (a, b) == ('a', 't') or (a, b) == ('t', 'a')]
g_c = [(a, b) for (a, b) in mutations
       if (a, b) == ('c', 'g') or (a, b) == ('g', 'c')]
a_clg_t = [(a, b) for (a, b) in mutations
           if (a, b) == ('a', 'c') or (a, b) == ('c', 'a')
           or (a, b) == ('c', 'g') or (a, b) == ('g', 'c')]
print('transitions', len(transitions) / len(orgA + orgB))
print('a<->t', len(a_t) / (bases['a'] + bases['t']))
print('g<->c', len(g_c) / (bases['g'] + bases['c']))
print('a<->c|g<->t', len(a_clg_t) / len(orgA + orgB))
            minimum_distance = ld
    return best_alternative


def resolve_variations(alternating, refstr):
    while True:
        resolved = resolve_single_variation(alternating, refstr)
        if resolved == False:
            return alternating
        alternating = resolved


str2 = resolve_variations(str2, str1)
fh_disambiguated.write(str2)
eops = editops(str1, str2)

stash = {
    str1: {'i': 0, 'l': 0},
    str2: {'i': 0, 'l': 0},
}


def get_line(offset, text):
    global stash