def extract(self, x, y):
    """Return the Hamming distance or similarity between ``x`` and ``y``.

    Returns 0 when either value is ``None``. Otherwise, when
    ``self.similarity`` is truthy, returns ``1 - distance / max(len(x), len(y))``;
    when it is falsy, returns the raw Hamming distance.
    """
    if x is None or y is None:
        return 0
    distance = hamming_distance(unicode(x), unicode(y))
    if not self.similarity:
        return distance
    longest = max(len(x), len(y))
    if longest == 0:
        # Two empty values are identical; avoid dividing by zero.
        return 1.0
    return 1 - float(distance) / longest
def build_read_names_given_seq(target, read_names_by_seq_fpath, allowed_read_names_set,
                               is_interesting_seq, max_ham, verbose=True):
    """Collect allowed read names for interesting sequences in a file.

    Each non-blank line is split on whitespace: the first word is a sequence
    and the remaining words are read names. For every sequence accepted by
    ``is_interesting_seq`` the read names that are also in
    ``allowed_read_names_set`` are stored under that sequence. If the
    sequence is at least as long as ``target``, the window of
    ``len(target)`` characters with the smallest Hamming distance to
    ``target`` (first such window on ties) is located; when that distance is
    ``<= max_ham`` the same read names are also stored under the window.

    Returns a ``defaultdict(set)`` mapping sequence -> read names. With
    ``verbose`` a dot is written to stdout every 10000 lines.
    """
    interesting_reads = defaultdict(set)
    target_len = len(target)
    # The context manager closes the file even if a callback raises.
    with open(read_names_by_seq_fpath) as handle:
        for line_no, line in enumerate(handle):
            if verbose and line_no % 10000 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()

            words = line.strip().split()
            if not words:
                # Blank lines carry no sequence; skip instead of failing on words[0].
                continue
            seq = words[0]
            if not is_interesting_seq(seq):
                continue

            read_names = set(words[1:]) & allowed_read_names_set
            interesting_reads[seq].update(read_names)

            last_start = len(seq) - target_len
            if last_start < 0:
                continue
            # (distance, start) tuples: min() picks the smallest distance and,
            # on ties, the smallest start -- the first best window.
            min_ham, min_ham_idx = min(
                (hamming_distance(unicode(target),
                                  unicode(seq[start:start + target_len])), start)
                for start in range(last_start + 1))
            if min_ham <= max_ham:
                min_ham_seq = seq[min_ham_idx:min_ham_idx + target_len]
                interesting_reads[min_ham_seq].update(read_names)
    return interesting_reads
def distance_filter(df, c, thresh=3, suffix1='_x', suffix2='_y', col1=None, col2=None, nonull=None):
    """Keep the rows whose two compared columns are within ``thresh``.

    The compared columns are ``col1 + suffix1`` and ``col2 + suffix2`` when
    both ``col1`` and ``col2`` are given, otherwise ``c + suffix1`` and
    ``c + suffix2``. A ``distance`` column is written onto ``df`` itself and
    the rows with ``distance <= thresh`` are returned.

    When ``nonull`` is ``None`` a row with a null in either column gets the
    fixed distance 10; otherwise the Hamming distance is computed for every
    row. A frame with no rows yields a new empty ``DataFrame``.
    """
    if df.shape[0] == 0:
        return pd.DataFrame()

    explicit_pair = (col1 is not None) and (col2 is not None)
    left = (col1 if explicit_pair else c) + suffix1
    right = (col2 if explicit_pair else c) + suffix2

    def _always_compare(row):
        return jf.hamming_distance(row[left], row[right])

    def _null_aware(row):
        # Rows with a missing value get the sentinel distance 10.
        if pd.isnull(row[left]) | pd.isnull(row[right]):
            return 10
        return jf.hamming_distance(row[left], row[right])

    scorer = _always_compare if nonull is not None else _null_aware
    df['distance'] = df.apply(scorer, axis=1)
    return df[df.distance <= thresh]
def output_calculation(id_1, id_2, id_1_column1_name, id_2_column1_name,
                       id_1_column2_name, id_2_column2_name):
    """Return ``[id_1, id_2, distance_1, distance_2]`` as a list of strings.

    ``distance_1`` compares the two first-column values and ``distance_2``
    the two second-column values.
    """
    # Change the algorithm here: rebind `metric` to another comparison function.
    metric = jellyfish.hamming_distance
    row = [str(id_1), str(id_2)]
    row.append(str(metric(id_1_column1_name, id_2_column1_name)))
    row.append(str(metric(id_1_column2_name, id_2_column2_name)))
    return row
def write_distance2csv(column1_name, column2_name):
    """Write pairwise Hamming distances for two columns to a CSV file.

    Reads rows from the module-level ``path_1`` and, for each of its first
    100 rows, re-reads ``path_2`` from the start. For every pair whose
    ``EnterpriseID`` in ``path_1`` is smaller than the one in ``path_2``
    (plain ``>=`` comparison on the CSV values), a row
    ``id_1, id_2, distance(column1), distance(column2)`` is appended to
    ``<column1_name><column2_name>_similarity.csv``. Scanning ``path_2``
    stops at the first row with an empty ``EnterpriseID``.

    Failures while computing or writing a pair are logged and skipped.
    """
    total = 100  # cap on rows taken from path_1; also shown in the progress log
    out_name = column1_name + column2_name + '_similarity.csv'
    with open(path_1) as f1, open(out_name, 'w+', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow([
            'id_1', 'id_2', column1_name + '_similarity',
            column2_name + '_similarity'
        ])
        for process, id_1 in enumerate(csv.DictReader(f1), start=1):
            if process > total:
                break
            # path_2 is reopened per outer row so it is scanned from the top.
            with open(path_2) as f2:
                for process_sub, id_2 in enumerate(csv.DictReader(f2), start=1):
                    logger.info('processing: ' + str(process) + '/' +
                                str(total) + '__' + str(process_sub) + '/' +
                                str(total))
                    logger.info(id_1['EnterpriseID'])
                    logger.info(len(id_2['EnterpriseID']))
                    if len(id_2['EnterpriseID']) == 0:
                        break
                    if id_1['EnterpriseID'] >= id_2['EnterpriseID']:
                        continue
                    try:
                        writer.writerow([
                            str(id_1['EnterpriseID']),
                            str(id_2['EnterpriseID']),
                            str(jellyfish.hamming_distance(
                                id_1[column1_name], id_2[column1_name])),
                            str(jellyfish.hamming_distance(
                                id_1[column2_name], id_2[column2_name]))
                        ])
                    except Exception as e:
                        # Best effort: skip the pair, but log the real error.
                        # The previous call passed extra positional args with
                        # no placeholders, which breaks log formatting.
                        logger.info('%s: %s', type(e).__name__, e)
def get_closest_hamming(needle, haystack):
    """Return the item of ``haystack`` with the smallest Hamming distance to ``needle``.

    The first such item wins on ties. Returns ``None`` when ``haystack``
    yields no items.
    """
    best_item = None
    best_distance = None
    for candidate in haystack:
        distance = jellyfish.hamming_distance(needle, candidate)
        # Strict '<' keeps the earliest candidate among equally close ones.
        if best_distance is None or distance < best_distance:
            best_item, best_distance = candidate, distance
    return best_item
def get_closest_hamming(needle, haystack):
    """Return the entry of ``haystack`` closest to ``needle`` by Hamming distance.

    Ties go to the earliest entry; an empty ``haystack`` gives ``None``.
    """
    scored = [(entry, jellyfish.hamming_distance(needle, entry)) for entry in haystack]
    if not scored:
        return None
    # min() returns the first pair carrying the minimal distance.
    winner = min(scored, key=lambda pair: pair[1])
    return winner[0]
def compare_two_texts(self, string_a, string_b, normalize_value=True):
    """Return the Hamming distance between two strings of the same type.

    Both arguments must be ``unicode`` or both must be ``str``; any other
    combination raises ``TypeError``. With ``normalize_value`` the distance
    is passed through ``self.__normalized_value`` before being returned.
    """
    both_unicode = isinstance(string_a, unicode) and isinstance(string_b, unicode)
    both_str = isinstance(string_a, str) and isinstance(string_b, str)
    if not (both_unicode or both_str):
        raise TypeError

    distance = jellyfish.hamming_distance(string_a, string_b)
    if normalize_value:
        return self.__normalized_value(distance)
    return distance
def write_distance2csv(column1_name, column2_name):
    """Write pairwise Hamming distances for two columns to a CSV file.

    For each of the first 100 rows of the module-level ``path_1``, every row
    of ``path_2`` is visited until one with an empty ``EnterpriseID`` is
    met. Pairs whose ``path_1`` id is ``>=`` the ``path_2`` id are skipped;
    the rest are written as ``id_1, id_2, distance(column1),
    distance(column2)`` to ``<column1_name><column2_name>_similarity.csv``.

    Errors on a single pair are logged and that pair is skipped.
    """
    total = 100  # outer-row cap, reused as the denominator of the progress log
    headers = ['id_1', 'id_2', column1_name + '_similarity', column2_name + '_similarity']
    with open(path_1) as f1, \
            open(column1_name + column2_name + '_similarity.csv', 'w+', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(headers)
        process = 0
        for id_1 in csv.DictReader(f1):
            if process >= total:
                break
            process += 1
            # Reopen path_2 so every outer row sees it from the first record.
            with open(path_2) as f2:
                process_sub = 0
                for id_2 in csv.DictReader(f2):
                    process_sub += 1
                    logger.info('processing: ' + str(process) + '/' + str(total) + '__'
                                + str(process_sub) + '/' + str(total))
                    logger.info(id_1['EnterpriseID'])
                    logger.info(len(id_2['EnterpriseID']))
                    if len(id_2['EnterpriseID']) == 0:
                        break
                    if id_1['EnterpriseID'] >= id_2['EnterpriseID']:
                        continue
                    try:
                        first = jellyfish.hamming_distance(id_1[column1_name], id_2[column1_name])
                        second = jellyfish.hamming_distance(id_1[column2_name], id_2[column2_name])
                        writer.writerow([str(id_1['EnterpriseID']),
                                         str(id_2['EnterpriseID']),
                                         str(first), str(second)])
                    except Exception as e:
                        # Deliberately best-effort. Use lazy %-formatting: the old
                        # call passed unformattable positional args to logger.info.
                        logger.info('%s: %s', type(e).__name__, e)
def get_similar_v_genes():
    """Map each V gene to the V genes within 10% normalized Hamming distance.

    Parameters
    ----------
    None

    Returns
    -------
    v_to_include : dict
        Keys are the V genes from ``make_v_ref_dict()``; each value is the
        list of V genes (the key itself included) whose distance to the key
        is <= 0.1.
    """
    v_ref = make_v_ref_dict()
    genes = list(v_ref.keys())
    n_genes = len(v_ref)
    v_ham_mat = np.zeros(shape=(n_genes, n_genes))

    for row, gene_a in enumerate(v_ref):
        for col, gene_b in enumerate(v_ref):
            seq_a = v_ref[gene_a]
            seq_b = v_ref[gene_b]
            overlap = np.min([len(seq_a), len(seq_b)])
            # Compare the trailing `overlap` characters, i.e. go backwards
            # from where the CDR3 begins.
            v_ham_mat[row, col] = hamming_distance(seq_a[-overlap:],
                                                   seq_b[-overlap:]) / overlap

    return {
        gene: [other for col, other in enumerate(genes) if v_ham_mat[row, col] <= 0.1]
        for row, gene in enumerate(genes)
    }
def simple_example():
    """Print the result of several jellyfish functions on sample strings."""
    # String comparison.
    str1, str2 = u'jellyfish', u'smellyfish'
    comparison_names = (
        'levenshtein_distance',
        'damerau_levenshtein_distance',
        'hamming_distance',
        'jaro_distance',
        'jaro_similarity',
        'jaro_winkler',
        'jaro_winkler_similarity',
        'match_rating_comparison',
    )
    for name in comparison_names:
        # Resolve lazily so output order and failure point match a
        # straight-line sequence of calls.
        compare = getattr(jellyfish, name)
        print("jellyfish.{}({}, {}) = {}.".format(name, str1, str2, compare(str1, str2)))

    #--------------------
    # Phonetic encoding.
    ss = u'Jellyfish'
    for name in ('metaphone', 'soundex', 'nysiis', 'match_rating_codex'):
        encode = getattr(jellyfish, name)
        print("jellyfish.{}({}) = {}.".format(name, ss, encode(ss)))
def select_fitness(population, target, population_size):
    """Score every member of ``population`` against ``target``.

    Returns a list of length ``2 * population_size``. Slot ``k`` holds
    ``(k, string, hamming_distance(string, target))`` for the k-th member;
    slots beyond the population stay ``None``.
    """
    scored = [None] * (population_size * 2)
    for position, candidate in enumerate(population):
        fitness = hamming_distance(candidate, target)
        scored[position] = (position, candidate, fitness)  # index, string, fitness
    return scored
def alldist(filex, filey):
    """Compute several distance measures between the contents of two files.

    Returns the tuple ``(res, dres, jaro, jarowink, ham, kl, spsum)`` where
    ``res``, ``dres`` and ``ham`` are the Levenshtein, Damerau-Levenshtein
    and Hamming distances divided by 100; ``jaro`` and ``jarowink`` are
    ``1 - score``; ``spsum`` is ``(100 - spamsum match) / 100``; and ``kl``
    is ``kldiv`` over the tokenized texts. The scaled Hamming distance is
    also printed.
    """
    # Context managers close both handles; they were previously leaked.
    with open(filex, 'r') as handle_x:
        xread = handle_x.read()
    with open(filey, 'r') as handle_y:
        yread = handle_y.read()

    lvd = jellyfish.levenshtein_distance(xread, yread)
    dlvd = jellyfish.damerau_levenshtein_distance(xread, yread)
    spsum = spamsum.match(xread, yread)
    spsum = 100 - spsum
    spsum = float(spsum / 100.00)
    res = float(lvd / 100.00)
    dres = float(dlvd / 100.00)

    jaro = jellyfish.jaro_distance(xread, yread)
    ## Added jaro-winkler distance by fahim 20111011
    jarowink = jellyfish.jaro_winkler(xread, yread)
    # Turn the similarity scores into distances.
    jaro = 1.0 - jaro
    jarowink = 1.0 - jarowink

    ham = jellyfish.hamming_distance(xread, yread)
    ham = float(ham / 100.00)
    # Single-argument print emits the same text under Python 2 and 3.
    print(' '.join(["Hamming Distance = ", str(ham)]))

    kl = kldiv(tokenize(xread), tokenize(yread))
    return res, dres, jaro, jarowink, ham, kl, spsum
def correct_barcode_map(config_params, barcodes_in_data, barcode_to_gene):
    """Build a map from observed barcodes to known (original) barcodes.

    Every key of ``barcode_to_gene`` maps to itself. An observed barcode
    that is not a known barcode is mapped to a known barcode when their
    Hamming distance is at most ``config_params['barcode_tolerance']``.
    Observed barcodes within tolerance of more than one known barcode are
    ambiguous and are left out of the map.

    Returns a dict ``{observed_or_original_barcode: original_barcode}``.
    """
    barcode_tolerance = int(config_params['barcode_tolerance'])
    orig_barcodes = set(barcode_to_gene)
    full_map = {x: x for x in orig_barcodes}
    unmatched_barcodes = set(barcodes_in_data).difference(orig_barcodes)

    candidates = {}
    for orig_barcode in barcode_to_gene:
        for unmatched_barcode in unmatched_barcodes:
            barcode_dist = jf.hamming_distance(orig_barcode, unmatched_barcode)
            if barcode_dist > barcode_tolerance:
                continue
            if get_verbosity(config_params) >= 3:
                print('bad : corrected --> {0} : {1}'.format(unmatched_barcode, orig_barcode))
            candidates.setdefault(unmatched_barcode, []).append(orig_barcode)

    # Filter out unmatched barcodes that map to multiple original barcodes.
    # Previously only the last candidate was dropped, so an ambiguous barcode
    # was still assigned to whichever original happened to come first.
    correcting_map = {
        bad: origs[0] for bad, origs in candidates.items() if len(origs) == 1
    }

    # Update the identity mapping with the unambiguous corrections.
    full_map.update(correcting_map)
    return full_map
def ham_dist_vectorform(strings: list) -> np.array:
    """Return the condensed (upper-triangle) normalized Hamming distances.

    For every pair ``(i, j)`` with ``i < j`` the Hamming distance between
    ``strings[i]`` and ``strings[j]`` is stored in row-major pair order and
    the whole vector is divided by ``len(strings[0])``.

    Parameters
    ----------
    strings : list of strings

    Returns
    -------
    dists : np.array
        Vector of length ``n * (n - 1) / 2`` with ``np.float16`` precision;
        empty when fewer than two strings are given.
    """
    num_seqs = len(strings)
    if num_seqs == 0:
        # No strings means no pairs and no first element to normalize by.
        return np.zeros(0, dtype=np.float16)

    normalization = len(strings[0])
    num_entries = num_seqs * (num_seqs - 1) // 2
    dists = np.zeros(num_entries, dtype=np.float16)
    index = 0
    for i, s1 in enumerate(strings):
        for s2 in strings[i + 1:]:
            dists[index] = hamming_distance(s1, s2)
            index += 1
    return dists / normalization
def inverse_hamming_dist(row):
    """Return ``1 / hamming_distance`` of the two questions in ``row``.

    ``row["question1"]`` and ``row["question2"]`` are stringified, passed
    through ``UnicodeDammit`` and lower-cased before comparison. A distance
    of 0 is replaced by 1.0, so identical questions score 1.0.
    """
    first = UnicodeDammit(str(row["question1"])).markup.lower()
    second = UnicodeDammit(str(row["question2"])).markup.lower()
    distance = jellyfish.hamming_distance(first, second)
    return 1.0 / (distance or 1.0)
def get_max_ham_dists(min_len, max_len):
    """Estimate a Hamming-distance cutoff for each prefix length.

    Draws 50000 pairs of ``rand_seq(max_len)`` sequences and records the
    Hamming distance of their prefixes for every length from ``min_len`` to
    ``max_len``. Returns, per length in increasing order, the smaller of
    ``np.percentile(distances, 0.1)`` and ``int(length / 4)``.
    """
    lengths = range(min_len, max_len + 1)
    dists = defaultdict(list)
    for _ in xrange(50000):
        ref_seq = rand_seq(max_len)
        new_seq = rand_seq(max_len)
        for length in lengths:
            dists[length].append(
                hamming_distance(unicode(ref_seq[:length]), unicode(new_seq[:length])))
    return [min(np.percentile(dists[length], 0.1), int(length / 4)) for length in lengths]
def barcode_hamming(observed, barcodes):
    """Find the barcode with the smallest Hamming distance to ``observed``.

    observed -- object whose ``.seq.tostring()`` gives the observed barcode
    barcodes -- iterable of candidate barcodes (python strings)

    Returns the tuple ``(barcode, distance)`` of the closest candidate; the
    first one wins on ties.
    """
    obs_seq = observed.seq.tostring()
    scored = ((candidate, hamming_distance(obs_seq, candidate)) for candidate in barcodes)
    return min(scored, key=lambda pair: pair[1])  # tuple of (barcode, distance)
def hamming_pred(string, dictionary):
    """Return ``(closest_item, distance)`` for ``string`` within ``dictionary``.

    ``dictionary`` must be an indexable sequence; the first item with the
    minimal Hamming distance is returned.
    """
    from jellyfish import hamming_distance

    distances = [hamming_distance(string, item) for item in dictionary]
    min_distance = min(distances)
    # list.index gives the first position holding the minimum.
    return dictionary[distances.index(min_distance)], min_distance
def measure_mrn_similarity(ssn1, ssn2, sign):
    """Score the similarity of two identifiers.

    Returns 0 when either value is ``""`` or ``None``. Otherwise computes a
    Jaro-Winkler score and ``1 - hamming_distance / len(ssn1)``. With
    ``sign == "t"`` both scores are printed and ``None`` is returned; with
    ``sign == "w"`` the larger score is returned; any other ``sign`` gives
    ``None``.
    """
    missing = ("", None)
    if ssn1 in missing or ssn2 in missing:
        return 0

    jw_score = jellyfish.jaro_winkler(ssn1, ssn2)
    hd_score = 1 - jellyfish.hamming_distance(ssn1, ssn2) / len(ssn1)

    if sign == "t":
        print("jw-{} vs hd-{}".format(jw_score, hd_score))
        return None
    if sign == "w":
        return max(jw_score, hd_score)
    return None
def hamming(self):
    """Pair up members of ``self.group`` that lie within ``self.radius``.

    Each member not yet present in an earlier pair is compared with every
    later member; pairs whose Hamming distance (on ``str()`` of the
    members) is at most ``self.radius`` are appended to ``self.cluster``.
    Returns ``com_check(self.cluster)``.
    """
    self.cluster = []
    already = []
    members = self.group
    for i in range(len(members)):
        # Skip members already placed in a pair. This used to index with the
        # inner-loop variable `j`, which is unbound on the first pass.
        if members[i] in already:
            continue
        for j in range(i + 1, len(members)):
            if self.radius >= jf.hamming_distance(str(members[i]), str(members[j])):
                self.cluster.append([members[i], members[j]])
        # `already` is only read at the top of the outer loop, so refreshing
        # it once per outer iteration is sufficient.
        already = flatten(self.cluster)
    return com_check(self.cluster)
def test_hamming_distance(self):
    """Check ``jellyfish.hamming_distance`` against known (s1, s2, distance) cases."""
    expectations = (
        ("", "", 0),
        ("", "abc", 3),
        ("abc", "abc", 0),
        ("acc", "abc", 1),
        ("abcd", "abc", 1),
        ("abc", "abcd", 1),
        ("testing", "this is a test", 13),
    )
    for first, second, expected in expectations:
        actual = jellyfish.hamming_distance(first, second)
        self.assertEqual(actual, expected)
def measure_distance(word1, word2, distance_type):
    """Return a distance between two words for the chosen measure.

    ``distance_type`` is one of ``'lv'`` (Levenshtein), ``'dlv'``
    (Damerau-Levenshtein), ``'jw'`` (negated Jaro-Winkler similarity),
    ``'j'`` (negated Jaro similarity) or ``'hm'`` (Hamming). Similarities
    are negated so that smaller always means closer.

    Raises ``ValueError`` for any other ``distance_type``.
    """
    if distance_type == 'lv':
        return Levenshtein.eval(word1, word2)
    if distance_type == 'dlv':
        return jellyfish.damerau_levenshtein_distance(word1, word2)
    if distance_type == 'jw':
        # Jaro-Winkler is a similarity; negate it to order like a distance.
        return -jellyfish.jaro_winkler_similarity(word1, word2)
    if distance_type == 'j':
        return -jellyfish.jaro_similarity(word1, word2)
    if distance_type == 'hm':
        return jellyfish.hamming_distance(word1, word2)
    # Previously this fell through to `return distance` on an unbound local.
    raise ValueError("unknown distance_type: {!r}".format(distance_type))
def getSimilarity(str1, str2):
    """Compute the comparison scores enabled by the module-level metric names.

    Each ``distance_metricN`` global enables one jellyfish function when it
    equals the expected label. Returns a dict keyed by those configured
    values with the corresponding score.
    """
    # (configured value, label that enables it, jellyfish function name)
    checks = (
        (distance_metric1, "JaroWinkler", "jaro_winkler"),
        (distance_metric2, "Jaro", "jaro_distance"),
        (distance_metric3, "MatchRating", "match_rating_comparison"),
        (distance_metric4, "Levenshtein", "levenshtein_distance"),
        (distance_metric5, "Hamming", "hamming_distance"),
    )
    distance = {}
    for configured, label, func_name in checks:
        if configured == label:
            distance[configured] = getattr(jellyfish, func_name)(str1, str2)
    return distance
def remove_duplications(self):
    """Drop rows of ``self.data`` whose summary nearly repeats an earlier row.

    For every pair of row positions ``ix < yx`` the first 200 characters of
    ``summary`` are compared; when their Hamming distance is <= 20, ``yx``
    is marked. Rows whose index is marked are removed and the index reset.
    """
    print("\nRemoving duplications...")
    n_rows = len(self.data)
    flagged = set()
    for ix in range(n_rows):
        for yx in range(ix + 1, n_rows):
            left = str(self.data.summary[ix])[0:200]
            right = str(self.data.summary[yx])[0:200]
            if jf.hamming_distance(left, right) <= 20:
                flagged.add(yx)
    keep_mask = ~self.data.index.isin(list(flagged))
    self.data = self.data[keep_mask].reset_index(drop=True)
def read_fastq(config_params, species_config_params, folder, out_path, lane_id):
    """Extract index tags and barcodes from the FASTQ files in ``folder``.

    Every sequence line (line index ``% 4 == 1``) is checked: the slice at
    the configured common-primer position must be within
    ``common_primer_tolerance`` Hamming distance of the configured primer.
    Matching reads contribute ``index_tag<TAB>barcode`` to the file named by
    ``get_barseq_filename(config_params, lane_id)``.

    Returns ``(total_counts, common_primer_count, barcodes, index_tags)``,
    where ``total_counts`` is the number of 4-line records summed over all
    files and the last two are sets of the distinct values seen.
    """
    common_primer_start = int(species_config_params['common_primer_start'])
    common_primer_end = common_primer_start + int(species_config_params['common_primer_length'])
    common_primer_seq = species_config_params['common_primer_sequence']
    common_primer_tolerance = int(config_params['common_primer_tolerance'])
    index_tag_start = int(species_config_params['index_tag_start'])
    index_tag_end = index_tag_start + int(species_config_params['index_tag_length'])
    barcode_start = int(species_config_params['genetic_barcode_start'])
    barcode_end = barcode_start + int(species_config_params['genetic_barcode_length'])

    out_filename = get_barseq_filename(config_params, lane_id)
    fastq_filenames = [os.path.join(folder, x) for x in os.listdir(folder)
                       if is_fastq_filename(x)]
    verbose = get_verbosity(config_params) >= 3

    total_counts = 0
    common_primer_count = 0
    barcodes = set()
    index_tags = set()

    with open(out_filename, 'wt') as of:
        for filename in fastq_filenames:
            f = cfo.get_compressed_file_handle(filename)
            lines_in_file = 0
            try:
                for line_count, line in enumerate(f):
                    lines_in_file = line_count + 1
                    if verbose:
                        print('{0} {1}'.format(line_count, line))
                    if line_count % 4 != 1:
                        continue
                    string = line.strip()
                    common_primer = string[common_primer_start:common_primer_end]
                    common_primer_dist = jf.hamming_distance(common_primer, common_primer_seq)
                    if verbose:
                        print(common_primer_dist)
                    if common_primer_dist <= common_primer_tolerance:
                        common_primer_count += 1
                        index_tag = string[index_tag_start:index_tag_end]
                        barcode = string[barcode_start:barcode_end]
                        index_tags.add(index_tag)
                        barcodes.add(barcode)
                        of.write('{0}\t{1}\n'.format(index_tag, barcode))
            finally:
                f.close()
            # Accumulate per file: the count used to come from the last
            # file's loop variable only, and was undefined with no input.
            total_counts += lines_in_file // 4

    return total_counts, common_primer_count, barcodes, index_tags
def comparacion_pares(self, texto1, texto2, tipo="levenshtein", norm=None):
    """
    Compare two input texts using the selected distance or similarity.

    :param texto1: First text to compare.
    :type texto1: str
    :param texto2: Second text to compare.
    :type texto2: str
    :param tipo: Comparison criterion. Defaults to `'levenshtein'`.
    :type tipo: {'damerau_levenshtein', 'levenshtein', 'hamming', \
        'jaro_winkler', 'jaro'}, optional
    :param norm: Normalizes the result by text length. With `norm = 1` \
        the shorter text's length is used, with `norm = 2` the longer \
        one's. Not applied when `tipo` contains "jaro".
    :type norm: {1,2}, optional
    :return: (float) Result of comparing `texto1` and `texto2`, or \
        `None` (after printing a message) for an unknown criterion.
    """
    tipo = tipo.lower()
    # Order matters: the keys are matched as substrings, and
    # "damerau_levenshtein" also contains "levenshtein".
    criterios = (
        ("damerau", "damerau_levenshtein_distance"),
        ("levenshtein", "levenshtein_distance"),
        ("hamming", "hamming_distance"),
        ("winkler", "jaro_winkler_similarity"),
        ("jaro", "jaro_similarity"),
    )
    nombre = next((func for clave, func in criterios if clave in tipo), None)
    if nombre is None:
        print("Por favor seleccione un criterio válido para comparar los strings.")
        return None

    salida = getattr(jellyfish, nombre)(texto1, texto2)
    if norm in [1, 2] and "jaro" not in tipo:
        elegir = min if norm == 1 else max
        salida /= elegir(len(texto1), len(texto2))
    return salida
def rmBadSeqs(fname, canonIndex):
    """Filter ``fname`` in place, dropping poorly supported sequences.

    Each line is tab-separated; field 0 is a sequence and field 3 is the
    number of coding possibilities. A line is kept when that number is
    greater than 1, or when the sequence (reduced to the positions in
    ``canonIndex``) is within Hamming distance 1 of the reduced form of any
    sequence whose number of possibilities is not 1.

    The kept lines are written to ``tmp.txt``, passed to
    ``normalizeFreq('tmp.txt', 1)`` and then moved over ``fname``.
    """
    import shutil

    def _canonical(seq):
        # Keep only the characters at the canonical positions.
        return ''.join(seq[i] for i in canonIndex)

    with open(fname, 'r') as fin:
        lines = fin.readlines()

    parsed = []
    for line in lines:
        fields = line.strip().split('\t')
        # NOTE(review): eval() on file content executes arbitrary
        # expressions; replace with int()/float() once the field format is
        # confirmed.
        parsed.append((line, fields, eval(fields[3])))

    # Canonical forms of all sequences that meet the cutoff.
    canonSeqs = set(_canonical(fields[0]) for _, fields, numPoss in parsed
                    if numPoss != 1)

    # Write the surviving lines to a temp file, then replace fname with it.
    with open('tmp.txt', 'w') as fout:
        for line, fields, numPoss in parsed:
            if numPoss > 1:
                fout.write(line)
                continue
            seq2 = _canonical(fields[0])
            if any(jellyfish.hamming_distance(seq2, canonSeq) <= 1
                   for canonSeq in canonSeqs):
                fout.write(line)

    normalizeFreq('tmp.txt', 1)
    # shutil.move avoids building a shell command from the file name.
    shutil.move('tmp.txt', fname)
def calc_distance(string1, string2, method):
    """Return the score between two strings for the chosen ``method``.

    ``method`` is one of ``"levenshtein"``, ``"damerau_levenshtein"``,
    ``"hamming_distance"``, ``"jaro_winkler"``, ``"cosine"`` or
    ``"jaccard"`` (the last over 1-grams).

    Raises ``ValueError`` for any other ``method``.
    """
    if method == "levenshtein":
        return jellyfish.levenshtein_distance(string1, string2)
    if method == "damerau_levenshtein":
        return damerau_levenshtein_distance(string1, string2)
    if method == "hamming_distance":
        return jellyfish.hamming_distance(string1, string2)
    if method == "jaro_winkler":
        return jellyfish.jaro_winkler(string1, string2)
    if method == "cosine":
        vector1 = text_to_vector(string1)
        vector2 = text_to_vector(string2)
        return get_cosine(vector1, vector2)
    if method == "jaccard":
        x_set = ngrams(string1, 1)
        y_set = ngrams(string2, 1)
        return jaccard_similarity(x_set, y_set)
    # Previously an unknown method hit `return distance` on an unbound local.
    raise ValueError("unknown method: {!r}".format(method))
def cal_str_similarity(str_1, str_2, option):
    """Return the similarity of two strings for the given ``option``.

    Token-based options (``JACC``, ``COS``, ``DICE``, ``OVERLAP``) work on
    the whitespace-split words; ``ES`` and ``HAMMING`` are
    ``1 - distance / max(len)`` on the raw strings; ``JARO`` and
    ``JARO-WINKLER`` return the jellyfish scores; ``START-WITH``,
    ``END-WITH`` and ``SAME`` return booleans (``str_2`` is tested against
    ``str_1``). Any other option returns ``None``.
    """
    multiset_1 = str_1.split()
    multiset_2 = str_2.split()

    def _shared_count():
        # Words of the first text that also occur in the second (with repeats).
        return len([word for word in multiset_1 if word in multiset_2])

    if option == 'JACC':
        return 1.0 - dist.jaccard(multiset_1, multiset_2)
    if option == 'COS':
        return _shared_count() * 1.0 / math.sqrt(len(multiset_1) * len(multiset_2))
    if option == 'DICE':
        return _shared_count() * 2.0 / (len(multiset_1) + len(multiset_2))
    if option == 'ES':
        longest = max(len(str_1), len(str_2))
        return 1.0 - jf.levenshtein_distance(str_1, str_2) * 1.0 / longest
    if option == 'HAMMING':
        longest = max(len(str_1), len(str_2))
        return 1.0 - jf.hamming_distance(str_1, str_2) * 1.0 / longest
    if option == 'JARO':
        return jf.jaro_distance(str_1, str_2)
    if option == 'JARO-WINKLER':
        return jf.jaro_winkler(str_1, str_2)
    if option == 'OVERLAP':
        return _shared_count() * 1.0 / max(len(multiset_1), len(multiset_2))
    if option == 'START-WITH':
        # entity string starts with mention string
        return str_2.startswith(str_1)
    if option == 'END-WITH':
        return str_2.endswith(str_1)
    if option == 'SAME':
        return str_1 == str_2
    return None
def string_comparison(self, text1, text2, choice='levenshtein_distance'):
    '''
    Compare two strings with the jellyfish function named by ``choice``.

    text1: String Input 1
    text2: String Input 2
    choice: 'levenshtein_distance' or 'damerau_levenshtein_distance' or
            'hamming_distance' or 'jaro_distance' or 'jaro_winkler' or
            'match_rating_comparison'

    Prints "Wrong Choice" and returns None for any other choice.
    '''
    # https://jellyfish.readthedocs.io/en/latest/comparison.html
    supported = (
        'levenshtein_distance',
        'damerau_levenshtein_distance',
        'hamming_distance',
        'jaro_distance',
        'jaro_winkler',
        'match_rating_comparison',
    )
    for name in supported:
        if choice == name:
            return getattr(jellyfish, name)(text1, text2)
    print("Wrong Choice")
    return None
def classify_seq(rec1, rec2, min_len, max_len, max_ham_dists, log_p_struct):
    """Build a consensus sequence from two overlapping reads.

    The start of ``rec1`` is compared with the end of the reverse
    complement of ``rec2`` for every candidate length, paired in order with
    ``max_ham_dists``. Exactly one length must fall below its Hamming
    cutoff, otherwise ``None`` is returned. A base is then chosen per
    position from the two reads and their phred qualities; ``None`` is
    returned if any position cannot be resolved.
    """
    bases = set('ACGT')

    # Store as strings
    seq1 = str(rec1.seq)
    seq2_rc = str(rec2.seq.reverse_complement())
    loc_max_len = min(max_len, len(seq1), len(seq2_rc))

    # Find aligning sequence, indels are not allowed, starts of reads included
    sig_lens = []
    for length, max_ham in zip(range(min_len, loc_max_len + 1), max_ham_dists):
        if hamming_distance(unicode(seq1[:length]), unicode(seq2_rc[-length:])) < max_ham:
            sig_lens.append(length)
    if len(sig_lens) != 1:
        return None

    seq2_len = sig_lens[0]
    seq2_match = seq2_rc[-seq2_len:]
    seq1_match = seq1[:seq2_len]

    # Get corresponding quality scores
    quals1 = rec1.letter_annotations['phred_quality'][:seq2_len]
    quals2 = rec2.letter_annotations['phred_quality'][::-1][-seq2_len:]

    # Build consensus sequence
    ML_bases = []
    for r1, q1, r2, q2 in zip(seq1_match, quals1, seq2_match, quals2):
        if r1 in bases and r1 == r2:
            chosen = r1
        elif set([r1, r2]) <= bases and q1 > 2 and q2 > 2:
            # Both reads give valid but different bases: keep the more likely one.
            r1_score = log_p_struct[r1][r1][q1] + log_p_struct[r1][r2][q2]
            r2_score = log_p_struct[r2][r1][q1] + log_p_struct[r2][r2][q2]
            chosen = r1 if r1_score > r2_score else r2
        elif r1 in bases and q1 > 2:
            chosen = r1
        elif r2 in bases and q2 > 2:
            chosen = r2
        else:
            return None
        ML_bases.append(chosen)
    return ''.join(ML_bases)
def processMatchesHammingDistance():
    """Record, for each row, its closest remaining row by Hamming distance.

    Rows are taken from the right end of the module-level ``inputList``.
    Each popped row is compared with the rows still queued and
    ``bestMatchPerColumn[row]`` is set to ``{'match': ..., 'score': ...}``
    (``''`` and ``-1`` when nothing is left to compare). Processing stops
    once at most one row remains queued, or when a popped row is ``None``.
    """
    pending = deque(inputList)
    current = pending.pop()
    while current is not None:
        closest_row, closest_score = '', -1
        for candidate in pending:
            score = jellyfish.hamming_distance(current, candidate)
            # -1 marks "no score yet"; otherwise keep the strictly smaller one.
            if closest_score == -1 or score < closest_score:
                closest_score, closest_row = score, candidate
        bestMatchPerColumn[current] = {
            'match': closest_row,
            'score': closest_score
        }
        if len(pending) <= 1:
            return
        current = pending.pop()
def orderByRel(jobs, kw, algo):
    """Score each job title against ``kw`` and return the jobs sorted by score.

    ``algo`` selects the measure written to ``job[5]``: 1 Levenshtein,
    2 Damerau-Levenshtein, 3 Hamming, 4 ``1 - Jaro``, 5 ``1 - Jaro-Winkler``.
    For any other value the existing ``job[5]`` is left untouched. ``jobs``
    is updated in place; a new list sorted ascending by ``job[5]`` is
    returned.
    """
    for job in jobs:
        if algo == 1:
            job[5] = levenshtein_distance(job[0].strip(), kw)
        elif algo == 2:
            job[5] = damerau_levenshtein_distance(job[0].strip(), kw)
        elif algo == 3:
            job[5] = hamming_distance(job[0].strip(), kw)
        elif algo == 4:
            job[5] = 1 - jaro_distance(job[0].strip(), kw)
        elif algo == 5:
            job[5] = 1 - jaro_winkler(job[0].strip(), kw)
    return sorted(jobs, key=lambda job: job[5])
def cluster_singlecell_aa(singlecell_annotation, lineages, thresh):
    """Find the lineages a single cell falls into, by amino-acid CDR3 distance.

    Parameters
    ----------
    singlecell_annotation : dict
        Annotation of the single cell; ``['v_gene']['gene']`` and
        ``['junc_nt']`` are read.
    lineages : dict
        Maps keys (indexed as ``key[0]`` = V gene, ``key[2]`` = CDR3
        nucleotide length as a string) to lists of annotations.
    thresh : float
        Maximum normalized Hamming distance for a lineage to be selected.

    Returns
    -------
    to_insert : np.array
        Keys of the candidate lineages whose closest member is within
        ``thresh`` of the single cell's translated CDR3.
    """
    v_gene = singlecell_annotation['v_gene']['gene']
    cdr3_nt = singlecell_annotation['junc_nt']
    cdr3_aa = translate(cdr3_nt)
    nt_len_label = str(len(cdr3_nt))
    aa_len = len(cdr3_aa)

    # Candidate lineages: same nucleotide CDR3 length and an allowed V gene.
    subset = [key for key in lineages
              if key[2] == nt_len_label and key[0] in v_to_include[v_gene]]

    min_distances = np.ones(len(subset))
    for idx, vjlc in enumerate(subset):
        members = lineages[vjlc]
        distances = np.zeros(len(members), dtype=np.float16)
        for pos, annotation in enumerate(members):
            distances[pos] = hamming_distance(cdr3_aa, translate(annotation['junc_nt']))
        distances /= aa_len
        min_distances[idx] = np.min(distances)

    return np.array(subset)[min_distances <= thresh]
def comparacion_pares(self, texto1, texto2, tipo='levenshtein', norm=None):
    """
    Compare two input texts using the selected distance or similarity.

    :param texto1: (str) First text to compare.
    :param texto2: (str) Second text to compare.
    :param tipo: (str) {'damerau_levenshtein', 'levenshtein', 'hamming', \
        'jaro_winkler', 'jaro'} Default: 'levenshtein'. Comparison criterion.
    :param norm: (int) {1, 2} Default: None. Normalizes the result by text \
        length: norm=1 divides by the shorter text's length, norm=2 by the \
        longer one's. Ignored when `tipo` contains 'jaro'.
    :return: (float or int) Result of the comparison, or None (after \
        printing a message) when the criterion is not recognized.
    """
    tipo = tipo.lower()
    # Substring keys are checked in this order; the first hit wins.
    opciones = [
        ('damerau', 'damerau_levenshtein_distance'),
        ('levenshtein', 'levenshtein_distance'),
        ('hamming', 'hamming_distance'),
        ('winkler', 'jaro_winkler_similarity'),
        ('jaro', 'jaro_similarity'),
    ]
    for clave, nombre_funcion in opciones:
        if clave in tipo:
            salida = getattr(jellyfish, nombre_funcion)(texto1, texto2)
            break
    else:
        print(
            'Por favor seleccione un criterio válido para comparar los strings.'
        )
        return None

    if norm in [1, 2] and 'jaro' not in tipo:
        largo_1, largo_2 = len(texto1), len(texto2)
        if norm == 1:
            salida /= min(largo_1, largo_2)
        else:
            salida /= max(largo_1, largo_2)
    return salida
def process_dewlap_dorsal_ventral():
    """Group the input files by tissue name and print the grouping.

    File names come from ``getFilenames(args.input_dir)``. A file is
    assigned to 'dewlap', 'dorsal' or 'ventral' when any "_"-separated part
    of its base name is within Hamming distance 2 of that tissue name,
    which tolerates misspellings.
    """
    args = get_args()
    filenames = getFilenames(args.input_dir)
    base_dir_name = os.path.split(args.input_dir)[-1]

    # SETUP DATASET
    col_headers = []
    data_set = []
    header_list = []

    # Sort files by tissue allowing for misspellings
    organized_by_tissue = {'dewlap': [], 'dorsal': [], 'ventral': []}
    for filename in filenames:
        name_parts = os.path.split(filename)[-1].split("_")
        for tissue in organized_by_tissue.keys():
            # NOTE(review): `sort` is not defined in this block; presumably
            # imported elsewhere in the file -- confirm.
            distances = sort([jellyfish.hamming_distance(tissue, part)
                              for part in name_parts])
            if distances[0] <= 2:
                organized_by_tissue[tissue].append(filename)
    print(organized_by_tissue)
def comparison(filename_txt, json_data, debug):
    """Fuzzy-match every value of ``json_data`` against a text file.

    ``<filename_txt>.txt`` is upper-cased and reduced to ``[A-Z0-9]``. Each
    value in ``json_data`` is normalized the same way and slid over the
    text as a window of its own length; the window with the highest
    Jaro-Winkler score is the match. Special cases: ``"USA"`` is expanded,
    the value at index 3 scores 100 when it occurs in ``find_date(...)``'s
    result, and the last value scores 0.0 unless it scored at least 100.

    Returns ``[allout, mean_score]`` where ``allout`` holds one
    ``[normalized_value, matched_window, score]`` entry per value.
    """
    #-----txt---------------------------------------
    with open(filename_txt + '.txt', 'r') as myfile:
        data_txt = myfile.read().upper().replace("\n", " ")
    if debug:
        print(data_txt)
    data_txt = ''.join(re.findall("[A-Z0-9]", data_txt))
    if debug:
        print(data_txt)
    date_txt = find_date(filename_txt + '.txt')
    if debug:
        print('text date', date_txt)
    allout = []
    if debug:
        print('json_data', json_data)

    #-----json--------------------------------------
    for text in json_data:
        date_json = json_data[3]  # 'DateOfRegistry'
        # state: could become a dictionary state -> state code
        if text == "USA":
            text = "UNITEDSTATESOFAMERICA"

        # makemodel: only used for debug output
        if text == json_data[5]:
            list_model = text.split(" ")
            if debug:
                print('list_model1', list_model)
            # Strip surrounding parentheses and move those words to the end.
            # startswith/endswith are safe on empty tokens, and a new list is
            # built instead of mutating the one being iterated.
            wrapped = [w for w in list_model if w.startswith("(") and w.endswith(")")]
            plain = [w for w in list_model if w not in wrapped]
            list_model = plain + [w[1:-1] for w in wrapped]
            if debug:
                print('list_model2', list_model)

        # Remove spaces, commas and other non-alphanumeric characters.
        newtext = ''.join(re.findall("[A-Z0-9]", text.upper()))
        if debug:
            print('newtext(upper,npspace)', newtext)

        # Slide a window the size of the search word over the text and score
        # each window. The +1 includes the final window, which was skipped.
        arr = []
        for i in range(0, len(data_txt) - len(newtext) + 1):
            window = data_txt[i:i + len(newtext)]
            d = jellyfish.levenshtein_distance(newtext, window)
            d2 = jellyfish.hamming_distance(newtext, window)
            d3 = jellyfish.jaro_winkler(newtext, window)
            arr.append([window, d, d2, d3])

        if arr:
            p = pd.DataFrame(arr, columns=["word", "lev", "hamming", "jarowinkler"])
            # The best window is the one with the highest Jaro-Winkler score.
            m = p["jarowinkler"].idxmax()
            # Output that window and its score as a percentage.
            out = [p.iloc[m, 0], 100 * p.iloc[m, 3]]
            if debug:
                print(p)
                print(p.iloc[m])
        else:
            # The text is shorter than the search word: nothing can match.
            out = ['', 0.0]

        # date of registry
        if text == json_data[3]:
            if date_json in date_txt:
                out = [date_json, 100]
        # yearofbuilt
        if text == json_data[-1]:
            if out[1] < 100:
                out[1] = 0.0

        out.insert(0, newtext)
        allout.append(out)

    # Overall score: the mean of the per-value scores (column 2).
    out = [allout, pd.DataFrame(allout).loc[:, 2].mean()]
    return out
def word_similarity(
        word_to_compare='Vignir',
        list_of_words=["Heigigr", "Beðurni"],
        return_top_n=20,
        use_cut_off=False,
        cut_off=0.5,
        sim_measure='Levenshtein',  # SequenceMatcher, Jaro-Winkler, Hamming
        min_characters=2,  # None for no restriction
        filter_non_capital_letters=True):
    """Rank the words in a list by similarity to a given word.

    Args:
        word_to_compare (str) - word compared with each value in the list
        list_of_words (list) - strings to compare against (never modified)
        return_top_n (int) - keep only the n most similar rows; a value
            <= 0 returns an empty frame
        use_cut_off (bool) - whether to drop rows with similarity <= cut_off
        cut_off (float) - cut-off value
        sim_measure (str) - 'Levenshtein', 'SequenceMatcher', 'Jaro-Winkler'
            or 'Hamming'; the two distances are negated so that larger
            always means more similar
        min_characters (int or None) - keep only words longer than this
        filter_non_capital_letters (bool) - keep only title-cased words

    Returns:
        pandas.DataFrame with the columns ``word_to_compare``,
        ``word_to_compare_against``, ``similarity`` and
        ``similarity_measure``, sorted by descending similarity.

    Raises:
        ValueError: if ``sim_measure`` is not one of the supported names.
    """
    columns = ['word_to_compare', 'word_to_compare_against', 'similarity',
               'similarity_measure']
    supported = ('Levenshtein', 'SequenceMatcher', 'Jaro-Winkler', 'Hamming')
    if sim_measure not in supported:
        # Previously this surfaced later as a KeyError on the missing column.
        raise ValueError("unknown sim_measure: {!r}".format(sim_measure))

    def _similarity(word):
        if sim_measure == 'Levenshtein':
            return jellyfish.levenshtein_distance(word_to_compare, word) * -1
        if sim_measure == 'SequenceMatcher':
            return SequenceMatcher(None, word_to_compare, word).ratio()
        if sim_measure == 'Jaro-Winkler':
            return jellyfish.jaro_winkler(word_to_compare, word)
        return jellyfish.hamming_distance(word_to_compare, word) * -1

    word_similarity_list = [{
        'word_to_compare': word_to_compare,
        'word_to_compare_against': word,
        'similarity': _similarity(word),
        'similarity_measure': sim_measure,
    } for word in list_of_words]

    if not word_similarity_list:
        # Nothing to rank; return an empty frame with the usual columns.
        return pd.DataFrame(columns=columns)

    df_word_similarity = pd.DataFrame(word_similarity_list, columns=columns)
    df_word_similarity = df_word_similarity.sort_values(by='similarity',
                                                        ascending=False)

    # Return top results
    if return_top_n <= 0:
        return df_word_similarity[0:0]
    df_word_similarity = df_word_similarity[0:return_top_n]

    # Whether to use cutoff
    if use_cut_off:
        df_word_similarity = df_word_similarity[
            df_word_similarity.similarity > cut_off]

    # Filter min characters
    if min_characters is not None and min_characters > 0:
        df_word_similarity = df_word_similarity[
            df_word_similarity.word_to_compare_against.str.len() > min_characters]

    # Filter out words that are not title-cased
    if filter_non_capital_letters:
        df_word_similarity = df_word_similarity[
            df_word_similarity.word_to_compare_against.str.istitle()]

    return df_word_similarity
def output_calculation(id_1, id_2, id_1_column1_name, id_2_column1_name,
                       id_1_column2_name, id_2_column2_name):
    """Build one output row: both ids followed by the two column distances.

    Every element of the returned list is a string.
    """
    row = [str(id_1), str(id_2)]
    column_pairs = (
        (id_1_column1_name, id_2_column1_name),
        (id_1_column2_name, id_2_column2_name),
    )
    for left, right in column_pairs:
        # Change the algorithm here.
        row.append(str(jellyfish.hamming_distance(left, right)))
    return row
def percent_id(seq1, seq2):
    """Return the percent identity of two sequences after global alignment.

    The first two items of ``global_align(seq1, seq2)`` are compared; the
    result is ``(1 - mismatches / aligned_length) * 100``.
    """
    alignment = global_align(seq1, seq2)
    first, second = alignment[0], alignment[1]
    mismatch_fraction = hamming_distance(first, second) / float(len(first))
    return (1. - mismatch_fraction) * 100.
ng=ngram.NGram(pad_len=1,N=2) a = set(ng.ngrams(ng.pad(a))) b = set(ng.ngrams(ng.pad(b))) overlap = len(a & b) return overlap * 2.0/(len(a) + len(b)) def jaccard(a, b): ng=ngram.NGram(pad_len=1,N=2) a = set(ng.ngrams(ng.pad(a))) b = set(ng.ngrams(ng.pad(b))) return len(a & b) * 1.0 / len(a | b) similarity = { "jaro_winckler": lambda a,b: jellyfish.jaro_winkler(a, b), "hamming_distance": lambda a,b: 1.0 - float(jellyfish.hamming_distance(a, b)) / max(len(a), len(b)), "damreau_levenshtein":lambda a,b: 1.0 - float(jellyfish.damerau_levenshtein_distance(a, b)) / max(len(a), len(b)), "dice":dice, "jaccard":jaccard }[args.similarity] mean = { "arithmetic_mean":arithmeticMean, "arithemtic_weighted_mean":arithmeticWeightedMean, "geometric_mean":geometricMean, "geometric_weighted_mean": geometricWeightedMean }[args.mean] def open_tsv(filename): return csv.reader(open(filename, "r"), delimiter='\t')
match_tl = match_tl.search(word); if(len(match_tl)!=0): m5 = m5 + match_tl[0][1]; m1_tmp = 0; m2_tmp = 0; m3_tmp = 0; word = unicode(word,'utf8'); for txt in product_description: txt = unicode(txt,'utf8'); a = jf.levenshtein_distance(word,txt); if(a>=m1_tmp): m1_tmp = a; a = jf.damerau_levenshtein_distance(word,txt); if(a>=m2_tmp): m2_tmp = a; a = jf.hamming_distance(word,txt); if(a>=m3_tmp): m3_tmp = a; m1 = m1 + m1_tmp; m2 = m2 + m2_tmp; m3 = m3 + m3_tmp; m6_tmp = 0; m7_tmp = 0; m8_tmp = 0; #word = word.decode('utf-8'); for txt in product_title: txt = safe_unicode(txt); a = jf.levenshtein_distance(word,txt); if(a>=m6_tmp): m6_tmp = a; a = jf.damerau_levenshtein_distance(word,txt);