def all_median(A):
    """Compute greedy Levenshtein medians of integer sequences in three ways.

    Encodes each integer sequence as a UTF-16 string (values 0..2**16),
    partitions the strings by pairwise edit distance, and returns the
    medians of (all orientations combined, forward partition, reverse
    partition) as three lists of ints.
    """
    # Encode integer sequences as UTF-16 strings so Levenshtein can use them.
    encoded = [''.join(unichr(value) for value in seq) for seq in A]
    # Pairwise edit-distance partitioning; split on the r matrix.
    partition = partion_fwd_rev(encoded, 1)
    fwd_strs, rev_strs = split(encoded, partition[1])
    # Approximate greedy medians: all orientations, forward-only, reverse-only.
    combined = fwd_strs + [s[::-1] for s in rev_strs]
    all_m = Levenshtein.median(combined)
    fwd_m = Levenshtein.median(fwd_strs)
    rev_m = Levenshtein.median(rev_strs)
    decode = lambda s: [ord(ch) for ch in s]
    return decode(all_m), decode(fwd_m), decode(rev_m)
def all_median(A, p):
    """Compute greedy Levenshtein medians for all/forward/reverse partitions.

    Encodes integer sequences as UTF-16 strings (values 0..2**16), splits
    them by the precomputed r matrix in ``p['r']``, and returns a dict of
    integer-list medians keyed 'all', '>' (forward) and '<' (reverse).
    """
    def encode(rows):
        # UTF-16 encoding of integer sequences (0..2**16 range).
        return [''.join(unichr(value) for value in row) for row in rows]

    all_u = encode(A)
    fwd_idx, rev_idx = split(A, p['r'])  # split using the r matrix
    fwd_u = encode(fwd_idx)
    rev_u = encode(rev_idx)
    print("all median")
    all_m = lv.median(all_u)
    print("fwd median")
    fwd_m = lv.median(fwd_u)
    print("rev median")
    rev_m = lv.median(rev_u)
    print('fwd==rev? %s' % (fwd_m == rev_m))
    decode = lambda s: [ord(ch) for ch in s]
    # '>' partition is always larger
    return {'all': decode(all_m), '>': decode(fwd_m), '<': decode(rev_m)}
def clean_hometown(sheet, threshold=0.93, csv_path="you_xu.csv"):
    """Cluster near-duplicate HOMETOWN values and replace each with its median.

    Greedy clustering: repeatedly take the first unresolved non-None value
    as a seed, pull in every remaining value whose Levenshtein ratio to the
    seed exceeds ``threshold``, and assign the cluster's Levenshtein median
    string to all members. The result is written to the HOMETOWN_NEW column
    of ``csv_path``.

    Args:
        sheet: spreadsheet object understood by get_col_number/get_data_list.
        threshold: similarity ratio above which values merge (default 0.93,
            the previously hard-coded value).
        csv_path: CSV file to read and rewrite (default "you_xu.csv",
            the previously hard-coded path).
    """
    col = get_col_number(sheet, 'HOMETOWN')
    lst = get_data_list(sheet, col)
    final = [None] * len(lst)
    remain = list(range(len(lst)))
    while remain:
        remain_2 = []
        similar_index = []
        similar = []
        current_index = 0
        # Skip leading None entries; originally this could run past the end
        # of `remain` and raise IndexError when only Nones were left.
        while current_index < len(remain) and lst[remain[current_index]] is None:
            current_index += 1
        if current_index == len(remain):
            break  # nothing but None values remain
        current = lst[remain[current_index]]
        similar.append(current)
        similar_index.append(remain[current_index])
        for j in range(current_index + 1, len(remain)):
            other_index = remain[j]
            other = lst[other_index]
            if other is None:
                continue
            if Levenshtein.ratio(current, other) > threshold:
                similar.append(other)
                similar_index.append(other_index)
            else:
                remain_2.append(other_index)
        median = Levenshtein.median(similar)
        for index in similar_index:
            final[index] = median
        # Plain rebind: remain_2 holds ints, deepcopy was pure overhead.
        remain = remain_2
    df = pd.read_csv(csv_path)
    df["HOMETOWN_NEW"] = final
    df.to_csv(csv_path, index=False)
def multiple_alignment_cost_baseline(sequences, alphabet_size=4):
    """Baseline multiple-alignment costs over a batch of sequences.

    For each batch element, converts the sequences to strings, computes the
    full pairwise edit-distance matrix, and accumulates four statistics:
    mean pairwise distance, the center-string distance (min row mean), and
    mean distances to the greedy median and the quick median.

    Returns a 4-tuple of running averages.
    """
    B, K, N = sequences.shape
    avg_distance = AverageMeter()
    min_distance = AverageMeter()
    median_distance = AverageMeter()
    quickmedian_distance = AverageMeter()
    for batch in range(B):
        strings = torch_to_string(sequences[batch], alphabet_size)
        distances = cross_distance_matrix(strings, strings)
        # Average distance between sequences.
        avg_distance.update(np.mean(distances))
        # Center string: sequence with minimal mean distance to the rest.
        min_distance.update(np.min(np.mean(distances, axis=1)))
        # Greedy median algorithm (Kruzslicz 1999).
        greedy = Levenshtein.median(strings)
        median_distance.update(np.mean(cross_distance_matrix([greedy], strings)))
        # Quick greedy median algorithm (Casacuberta & Antonio 1997).
        quick = Levenshtein.quickmedian(strings)
        quickmedian_distance.update(np.mean(cross_distance_matrix([quick], strings)))
    return (avg_distance.avg, min_distance.avg,
            median_distance.avg, quickmedian_distance.avg)
def check_dir_filename_distances(directory, threshold=10):
    '''
    Check a directory to be ingested for wildly divergent filenames.

    We currently only want to allow single-level directories of files that
    represent parts of a whole and thus have fairly similar filenames.

    Computes the Levenshtein median of the non-hidden file paths and counts
    how many paths diverge from that median by more than ``threshold``
    (previously hard-coded to 10).

    Returns:
        (outliers, outlierList): count of divergent paths and the list of
        their names.
    '''
    # Keep only regular, non-hidden files.
    names = [
        name for name in abspath_list(directory)
        if os.path.isfile(name) and not os.path.basename(name).startswith('.')
    ]
    # NOTE(review): the median is taken over absolute paths, not basenames;
    # the shared directory prefix cancels out when comparing within one dir.
    median = Levenshtein.median(names)
    outliers = 0  # count of files diverging from the median name
    outlierList = []  # and the divergent names themselves
    for name in names:
        if Levenshtein.distance(median, name) > threshold:
            outliers += 1
            outlierList.append(name)
    return outliers, outlierList
def get_umi_groups(bam_file, edit_distance):
    """Retrieve UMI groups with UMIs within a certain edit distance.

    Based on logic from:
    http://stackoverflow.com/a/35173198/252589

    Returns a dict mapping each observed UMI to its group's Levenshtein
    median, used as the group's canonical UMI.
    """
    # Collect the set of distinct UMIs from the RX tags.
    all_umis = set()
    with pysam.AlignmentFile(bam_file, "rb", check_sq=False) as bam_iter:
        for rec in bam_iter:
            all_umis.add(rec.get_tag("RX"))
    print(len(all_umis))
    groups = []
    for idx, umi in enumerate(sorted(all_umis)):
        if idx % 1000 == 0:
            print(idx, len(groups))  # progress
        if edit_distance == 0:
            groups.append([umi])
            continue
        # Greedy assignment: join the first group containing a close UMI.
        for group in groups:
            close = any(
                Levenshtein.distance(umi, member) <= edit_distance
                for member in group)
            if close:
                group.append(umi)
                break
        else:
            groups.append([umi])
    # Map every UMI to its group's median as the canonical representative.
    canonical = {}
    for group in groups:
        base = Levenshtein.median(group)
        for member in group:
            canonical[member] = base
    return canonical
def plot_distances(projects_filename):
    """Plot Levenshtein distances of HCV seeds from their genotype medians.

    Reads a projects JSON config, groups HCV seed regions by seed group,
    computes a Levenshtein median reference per group, and scatter-plots
    every seed's distance to every group median: green dots are seeds
    compared to their own group's median, red dots to other groups'.

    NOTE(review): Python 2 code ('rU' open mode, dict.iteritems/itervalues,
    in-place .sort() on dict.keys()).
    """
    with open(projects_filename, 'rU') as f:
        config = json.load(f)
    populate_key_references(config['regions'])
    # Group seed region names by their seed group; HCV-* groups only.
    groups = defaultdict(list)
    for name, region in config['regions'].iteritems():
        seed_group = region['seed_group']
        if seed_group and seed_group.startswith('HCV-'):
            groups[seed_group].append(name)
    del groups['HCV-seeds']
    group_names = groups.keys()
    group_names.sort()
    source_seed_names = []
    all_seeds = {}  # {name: (group_index, reference)}
    median_references = []
    group_labels = []
    for group_index, group_name in enumerate(group_names):
        logger.info('Grouping %s.', group_name)
        seed_names = groups[group_name]
        seed_names.sort()
        source_seed_names.append(seed_names[0])
        references = []
        for seed_name in seed_names:
            reference = ''.join(config['regions'][seed_name]['reference'])
            all_seeds[seed_name] = (group_index, reference)
            references.append(reference)
        # One generalized-median reference string per group.
        median_references.append(Levenshtein.median(references))
        group_labels.append(group_name[4:-6])  # trim HCV- and -seeds
    # Release the parsed config before the O(groups x seeds) distance pass.
    config = None
    intragroup_source_groups = []
    intragroup_distances = []
    intergroup_source_groups = []
    intergroup_distances = []
    for source_index, source_group_name in enumerate(group_names):
        logger.info('Processing %s.', source_group_name)
        source_reference = median_references[source_index]
        # dest_index here is the destination seed's group index.
        for dest_index, dest_reference in all_seeds.itervalues():
            distance = calculate_distance(source_reference, dest_reference)
            if source_index == dest_index:
                intragroup_source_groups.append(source_index)
                intragroup_distances.append(distance)
            else:
                intergroup_source_groups.append(source_index)
                intergroup_distances.append(distance)
    fig = plt.figure()
    ax = fig.add_subplot(
        111,
        title='Distance From Genotype Median Reference in Key Regions',
        xlabel='genotype',
        ylabel='Levenshtein distance',
        xticks=range(len(group_labels)),
        xticklabels=group_labels)
    ax.plot(intragroup_source_groups, intragroup_distances, 'go', alpha=0.4)
    ax.plot(intergroup_source_groups, intergroup_distances, 'ro', alpha=0.4)
    ax.margins(0.1)
    plt.show()
def get_popular_string(all_strings):
    '''Finds most likely OCR output string based on string median calculations.

    Arguments:
        all_strings: All different OCR strings corresponding to the same
            x-ray image.

    Returns:
        Most likely string (uppercased median of the candidates).
    '''
    uppercased = [candidate.upper() for candidate in all_strings]
    return lv.median(uppercased)
def median_word(words, word_counts):
    """Return the weighted Levenshtein median of `words`, refined to a
    fixed point with median_improve."""
    current = lev.median(words, word_counts)
    while True:
        improved = lev.median_improve(current, words, word_counts)
        if improved == current:
            # No further improvement possible: fixed point reached.
            return current
        current = improved
def plot_distances(projects_filename):
    """Plot Levenshtein distances of HCV seeds from their genotype medians.

    Reads a projects JSON config, groups HCV seed regions by seed group,
    computes a Levenshtein median reference per group, and scatter-plots
    every seed's distance to every group median: green dots are seeds
    compared to their own group's median, red dots to other groups'.

    NOTE(review): Python 2 code ('rU' open mode, dict.iteritems/itervalues,
    in-place .sort() on dict.keys()).
    """
    with open(projects_filename, 'rU') as f:
        config = json.load(f)
    populate_key_references(config['regions'])
    # Group seed region names by their seed group; HCV-* groups only.
    groups = defaultdict(list)
    for name, region in config['regions'].iteritems():
        seed_group = region['seed_group']
        if seed_group and seed_group.startswith('HCV-'):
            groups[seed_group].append(name)
    del groups['HCV-seeds']
    group_names = groups.keys()
    group_names.sort()
    source_seed_names = []
    all_seeds = {}  # {name: (group_index, reference)}
    median_references = []
    group_labels = []
    for group_index, group_name in enumerate(group_names):
        logger.info('Grouping %s.', group_name)
        seed_names = groups[group_name]
        seed_names.sort()
        source_seed_names.append(seed_names[0])
        references = []
        for seed_name in seed_names:
            reference = ''.join(config['regions'][seed_name]['reference'])
            all_seeds[seed_name] = (group_index, reference)
            references.append(reference)
        # One generalized-median reference string per group.
        median_references.append(Levenshtein.median(references))
        group_labels.append(group_name[4:-6])  # trim HCV- and -seeds
    # Release the parsed config before the O(groups x seeds) distance pass.
    config = None
    intragroup_source_groups = []
    intragroup_distances = []
    intergroup_source_groups = []
    intergroup_distances = []
    for source_index, source_group_name in enumerate(group_names):
        logger.info('Processing %s.', source_group_name)
        source_reference = median_references[source_index]
        # dest_index here is the destination seed's group index.
        for dest_index, dest_reference in all_seeds.itervalues():
            distance = calculate_distance(source_reference, dest_reference)
            if source_index == dest_index:
                intragroup_source_groups.append(source_index)
                intragroup_distances.append(distance)
            else:
                intergroup_source_groups.append(source_index)
                intergroup_distances.append(distance)
    fig = plt.figure()
    ax = fig.add_subplot(111,
                         title='Distance From Genotype Median Reference in Key Regions',
                         xlabel='genotype',
                         ylabel='Levenshtein distance',
                         xticks=range(len(group_labels)),
                         xticklabels=group_labels)
    ax.plot(intragroup_source_groups, intragroup_distances, 'go', alpha=0.4)
    ax.plot(intergroup_source_groups, intergroup_distances, 'ro', alpha=0.4)
    ax.margins(0.1)
    plt.show()
def get_median(words, counts):
    """Weighted Levenshtein median of `words`, iteratively refined with
    median_improve until it stops changing."""
    median = lev.median(words, counts)
    improved = lev.median_improve(median, words, counts)
    # Iterate refinement to a fixed point.
    while improved != median:
        median = improved
        improved = lev.median_improve(median, words, counts)
    return median
def find_median(self):
    """Return the representative string for this cluster's members.

    If the two most frequent members tie in count, return the weighted
    Levenshtein generalized median of all members; otherwise return the
    unique mode.

    Assumes self.members is a collections.Counter of strings.
    """
    most_common = self.members.most_common(2)
    # BUG FIX: the original tie test compared the top *count* against the
    # top *key* (most_common[0][1] == most_common[0][0]), which is never a
    # tie check for string keys. Compare the counts of the two most
    # frequent members, guarding against a single-entry counter.
    if len(most_common) > 1 and most_common[0][1] == most_common[1][1]:
        # Tie for the mode: fall back to the weighted set median.
        # list(...) so Python 3 dict views are accepted by the C extension.
        return Levenshtein.median(list(self.members.keys()),
                                  list(self.members.values()))
    else:
        return most_common[0][0]
def find_median(self):
    """Return the representative string for this cluster's members.

    If the two most frequent members tie in count, return the weighted
    Levenshtein generalized median of all members; otherwise return the
    unique mode.

    Assumes self.members is a collections.Counter of strings.
    """
    most_common = self.members.most_common(2)
    # BUG FIX: the original tie test compared the top *count* against the
    # top *key* (most_common[0][1] == most_common[0][0]), which is never a
    # tie check for string keys. Compare the counts of the two most
    # frequent members, guarding against a single-entry counter.
    if len(most_common) > 1 and most_common[0][1] == most_common[1][1]:
        # Tie for the mode: fall back to the weighted set median.
        # list(...) so Python 3 dict views are accepted by the C extension.
        return Levenshtein.median(list(self.members.keys()),
                                  list(self.members.values()))
    else:
        return most_common[0][0]
def CalculateMedoid(dico_vjunc, Dicoresult):
    """For each cluster in Dicoresult, compute the Levenshtein median of the
    junction strings (field index 3 in dico_vjunc) of its member sequences.

    Clusters with no member present in dico_vjunc are omitted from the
    returned dict.
    """
    centroid = {}
    for key in Dicoresult:
        junctions = [
            dico_vjunc[seq.rstrip()][3]
            for seq in Dicoresult[key]
            if seq.rstrip() in dico_vjunc
        ]
        # Skip clusters with no known members.
        if junctions:
            centroid[key] = Levenshtein.median(junctions)
    return centroid
def median_string(self, strings, string_counts):
    # Find a 'median string': a string with minimal sum of (weighted)
    # edit distances to every string in `strings`.
    # see: https://pypi.python.org/pypi/python-Levenshtein/0.11.2
    # `string_counts` weights each string (e.g. occurrence counts).
    candidate = Levenshtein.median(strings, string_counts)
    improved = Levenshtein.median_improve(candidate, strings, string_counts)
    # Refine until median_improve reaches a fixed point.
    while improved != candidate:
        candidate = improved
        improved = Levenshtein.median_improve(candidate, strings, string_counts)
    return candidate
def CalculateMedoid(Dicofasta, Dicoresult):
    # Compute a representative ("medoid") sequence for every cluster: the
    # Levenshtein generalized median of the cluster's member sequences.
    #
    # Dicofasta:  dict mapping sequence id -> sequence string
    #             (presumably from a fasta file; inferred from the warning
    #             message -- confirm against the caller)
    # Dicoresult: dict mapping cluster key -> iterable of sequence ids
    # Returns:    dict mapping cluster key -> median sequence string
    #
    # NOTE(review): Python 2 code (print statements).
    centroid = {}
    print "Calculating Medoid sequence of each cluster ... \n"
    for key in tqdm.tqdm(Dicoresult.keys()):
        listloc = []
        for seq in Dicoresult[key]:
            # Ids are rstripped before lookup (trailing newline/whitespace).
            if seq.rstrip() in Dicofasta.keys():
                listloc.append(Dicofasta[seq.rstrip()])
            else:
                # Warn about ids with no entry in the fasta dict; they are
                # silently excluded from the median computation.
                print "Caution the", seq.rstrip(), "is not in the fasta file"
        centroid[key] = Levenshtein.median(listloc)
    return centroid
def CalculateMedoid(dico_vjunc, Dicoresult):
    """For each cluster in Dicoresult, compute the Levenshtein median of
    field index 2 of its member entries in dico_vjunc.

    Members missing from dico_vjunc are reported and excluded from the
    median computation.
    """
    centroid = {}
    for key in Dicoresult:
        collected = []
        for raw in Dicoresult[key]:
            stripped = raw.rstrip()
            if stripped in dico_vjunc:
                collected.append(dico_vjunc[stripped][2])
            else:
                # Report ids with no entry; they are skipped.
                print("Caution the", stripped, "is not in the fasta file")
        centroid[key] = Levenshtein.median(collected)
    return centroid
def choose_normalized(connected_comp, choice='lvst'):
    '''
    Choose one normalized term for each connected component by a
    rule-based filter cascade.

    Each rule keeps only the names that satisfy it, unless that would
    empty the candidate set, in which case the rule is skipped.

    Args:
        connected_comp: iterable of components, each a list of name strings.
        choice: 'long'  -> longest surviving candidate,
                'short' -> shortest surviving candidate,
                'lvst'  -> Levenshtein set median of the survivors.

    Returns:
        A list with one chosen string per component, or None for an
        unknown choice method.
    '''
    res = list()
    for comp in connected_comp:
        # Rule-1: no period at the end (guard empty names: they used to
        # raise IndexError on name[-1]).
        temp = [name for name in comp if name and name[-1] != '.']
        comp = temp if temp else comp
        # Rule-2.1: no hyphen anywhere
        temp = [name for name in comp if '-' not in name]
        comp = temp if temp else comp
        # Rule-2.2: no '/' anywhere
        temp = [name for name in comp if '/' not in name]
        comp = temp if temp else comp
        # Rule-3: start with uppercase (same empty-name guard)
        temp = [name for name in comp if name and name[0].isupper()]
        comp = temp if temp else comp
        # Rule-4: strip non-ASCII (e.g. '\u2003') via NFKD normalization
        temp = [
            unicodedata.normalize('NFKD', name).encode('ASCII', 'ignore').decode('utf-8')
            for name in comp
        ]
        if temp:
            # BUG FIX: list(set(...)) yields nondeterministic order, so
            # length ties below resolved arbitrarily between runs.
            # dict.fromkeys dedups while preserving input order, making the
            # result deterministic (sort(key=len) is stable).
            comp = list(dict.fromkeys(temp))
        # Rule-5: choose by length or Levenshtein median
        comp.sort(key=len)
        if choice == 'long':
            res.append(comp[-1])
        elif choice == 'short':
            res.append(comp[0])
        elif choice == 'lvst':
            res.append(lvst.median(comp))
        else:
            print("Not a valid choice method")  # fixed typo: "methond"
            return None
    return res
def domainMedian(domainObjs, numSamples=200):
    """
    Compute the median Domain object from a list of Domain objects. The median
    is defined as the string that is computed from the per-domain-level
    Levenshtein string medians. if <numSamples> is set to a value > 0, this
    number of samples will be picked randomly from <domainObjs>, and the
    median is then computed from this set.

    returns a Domain object
    """
    # Optionally subsample to bound the cost of the median computation.
    if numSamples and len(domainObjs)>numSamples:
        domainObjs=list(random.sample(domainObjs, numSamples))
    # Pair each domain's reversed split view with a unit weight.
    data=[(d.rSplitView(), 1) for d in domainObjs]
    # Deepest level count across all domains drives the per-level loop.
    mxIdx=max([len(d) for d,_ in data])
    medianParts=[]
    for i in range(mxIdx):
        # Occurrences (with weights) of level i across the sampled domains.
        # NOTE(review): _getLD's exact return shape is defined elsewhere --
        # here it is unpacked as (level-string, weight) pairs.
        occurrencesWithWeights = _getLD(data, i)
        domainLevels,levelWeights = zip(*occurrencesWithWeights)
        try:
            # Weighted Levenshtein median for this domain level.
            ldMedian = lev.median(domainLevels, levelWeights)
        except TypeError:
            logging.error('median error: '+str(domainLevels))
        else:
            if ldMedian:
                """ ignore empty medians; prepend this level to output """
                medianParts.insert(0, ldMedian)
    """ we construct the final median now directly from the constructed parts, i.e. we don't let the DomainStr constructor split it in parts which might be different from the parts we found here, and would therefore impair the alignment for comparisons later. """
    medianObj = DomainStr(medianParts)
    return medianObj
def median(A):
    """Levenshtein median of integer sequences.

    Each row of A is encoded as a UTF-16 string, the greedy string median
    is computed, and the result is decoded back to a list of ints.
    """
    encoded = [''.join(unichr(value) for value in row) for row in A]
    median_str = lv.median(encoded)
    return [ord(ch) for ch in median_str]
# Write one line per name: the name, its chosen native-script form, and its
# chosen romanized form, tab-separated.
# NOTE(review): this fragment depends on names from enclosing scope --
# `name_all`, `output_file`, `findings`, and in particular `index`, which is
# never updated inside this loop, so every iteration reads the same
# findings entry -- confirm whether `index` should advance per name.
for name in name_all:
    output_file.write(name + "\t")
    lang_word = []
    lang_roman_word = []
    median_word = ""
    median_roman = ""
    if index in findings:
        # findings[index] holds (native_word, romanized_word) pairs.
        for pair in findings[index]:
            lang_word.append(pair[0])
            lang_roman_word.append(pair[1].lower())
    try:
        if len(lang_word) > 3:
            # Enough variants: pick the Levenshtein median of the romanized
            # forms and the native word at the same position.
            median_roman = Levenshtein.median(lang_roman_word)
            pos = lang_roman_word.index(median_roman)
            median_word = lang_word[pos]
            print("Got one median")
        else:
            # Too few variants: emit all of them, comma-joined.
            median_word = ", ".join(lang_word)
            median_roman = ", ".join(lang_roman_word)
    except Exception as e:
        # Best-effort fallback: on any failure, emit the comma-joined lists.
        median_word = ", ".join(lang_word)
        median_roman = ", ".join(lang_roman_word)
    output_file.write(median_word + "\t" + median_roman + "\n")
output_file.close()
## to get the romanized form of all greek names we have
# for strng , greek_forms in zip(names_strong,greek_name_forms):