def hamming(ahash, phash):
    """Return the 10 most similar pics based on the Hamming distance."""
    result = find_aHash()
    dist_a = 9
    ahash_db = 0
    phash = str(convert_hex_to_bin(phash))
    # Find the stored aHash closest to the query aHash.
    for i in result:
        if distance.hamming(i, ahash) < dist_a:
            dist_a = distance.hamming(i, ahash)
            ahash_db = i
    result_p = find_pHash_a(ahash_db)
    print result_p
    resu = []
    # Rank the candidates by pHash distance to the query.
    for x in result_p:
        y = list(x)
        y[0] = str(convert_hex_to_bin(y[0]))
        y.append(distance.hamming(phash, y[0]))
        resu.append(y)
    resu = sorted(resu, key=lambda x: x[2])
    if len(resu) <= 10:
        return resu
    return resu[0:10]
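# A minimal sketch of the comparison used above: distance.hamming() counts
# positions that differ between two equal-length sequences, which is why the
# hex hashes are first expanded to fixed-width binary strings via
# convert_hex_to_bin().
import distance
assert distance.hamming("0110", "0011") == 2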
def _getLinkerIndices(self, readKmers):
    # Scan the k-mers for the first one within Hamming distance 1 of each linker.
    minLinker1Ed = 2
    minLinker2Ed = 2
    linker1Index = None
    linker2Index = None
    for i in range(len(readKmers)):
        kmer = readKmers[i]
        linker1Ed = distance.hamming(kmer, self.linker1)
        if linker1Ed < minLinker1Ed:
            minLinker1Ed = linker1Ed
            linker1Index = i
            break
    # Linker 2 can only start after linker 1 plus its 15 bp length.
    if linker1Index is not None:
        linker2Start = linker1Index + 15
    else:
        linker2Start = 0
    for i in range(linker2Start, len(readKmers)):
        kmer = readKmers[i]
        linker2Ed = distance.hamming(kmer, self.linker2)
        if linker2Ed < minLinker2Ed:
            minLinker2Ed = linker2Ed
            linker2Index = i
            break
    return linker1Index, linker2Index
def findSameHash(simhash):
    # Check table 1 against bits 0-15 of the hash.
    num = int(simhash[:16], 2)
    if len(hash1[num]) == 0:
        hash1[num].append(simhash)
    else:
        for i in range(len(hash1[num])):
            if distance.hamming(simhash, hash1[num][i]) < 3:
                return True
    # Check table 2 against bits 16-31 of the hash.
    num2 = int(simhash[16:32], 2)
    if len(hash2[num2]) == 0:
        hash2[num2].append(simhash)
    else:
        for i in range(len(hash2[num2])):
            if distance.hamming(simhash, hash2[num2][i]) < 3:
                return True
    # Check table 3 against bits 32-47 of the hash.
    num3 = int(simhash[32:48], 2)  # was simhash[32:47], a 15-bit slice one bit short
    if len(hash3[num3]) == 0:
        hash3[num3].append(simhash)
    else:
        for i in range(len(hash3[num3])):
            if distance.hamming(simhash, hash3[num3][i]) < 3:
                return True
    # Check table 4 against bits 48-63 of the hash.
    num4 = int(simhash[48:64], 2)
    if len(hash4[num4]) == 0:
        hash4[num4].append(simhash)
    else:
        for i in range(len(hash4[num4])):
            if distance.hamming(simhash, hash4[num4][i]) < 3:
                return True
    return False
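# A minimal setup sketch for findSameHash() above, assuming each table is a
# list of 2**16 empty buckets (one per possible 16-bit band value):
hash1 = [[] for _ in range(2 ** 16)]
hash2 = [[] for _ in range(2 ** 16)]
hash3 = [[] for _ in range(2 ** 16)]
hash4 = [[] for _ in range(2 ** 16)]
# The banding trick: two 64-bit simhashes within Hamming distance 3 must agree
# exactly on at least one of the four 16-bit bands (pigeonhole), so only the
# matching band's bucket needs to be scanned.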
def EdgeComp(e1, e2):
    # Comparator for edges: order primarily by the Hamming distance between the
    # endpoint profiles, then by tie-breaks on the lvs counts, then on node ids.
    u, v = e1
    x, y = e2
    leveluv = distance.hamming(profiles[u], profiles[v])
    levelxy = distance.hamming(profiles[x], profiles[y])
    if leveluv != levelxy:
        return leveluv - levelxy
    for k in range(maxlen):
        maxuv = max(lvs[u][k], lvs[v][k])
        maxxy = max(lvs[x][k], lvs[y][k])
        if maxuv != maxxy:
            return maxxy - maxuv
        minuv = min(lvs[u][k], lvs[v][k])
        minxy = min(lvs[x][k], lvs[y][k])
        if minuv != minxy:
            return minxy - minuv
    maxuv = max(u, v)
    maxxy = max(x, y)
    if maxuv != maxxy:
        return maxxy - maxuv
    minuv = min(u, v)
    minxy = min(x, y)
    if minuv != minxy:
        return minxy - minuv
    return 0  # fully equal: a comparator must return an int, not None
def filter_contigs(contigs, assembly_min_uniq=0.01):
    """
    given a list of contigs, removes similar contigs to leave the highest (of
    the similar) scoring contig only
    """
    filtered_contigs = {}
    # ordering: highest scoring, then longest, then alphanumeric
    for contig in sorted(contigs, key=lambda x: (-1 * x.score, -1 * len(x.seq), x.seq)):
        rseq = reverse_complement(contig.seq)
        if contig.seq in filtered_contigs or rseq in filtered_contigs:
            continue
        drop = False
        # drop all contigs that are more than 'x' percent similar to existing contigs
        for other_seq in filtered_contigs:
            kmer_length = min(len(other_seq), len(contig.seq))
            okmer_list = set(kmers(other_seq, kmer_length))
            for okmer, ckmer in itertools.product(okmer_list, set(kmers(contig.seq, kmer_length))):
                if distance.hamming(okmer, ckmer, normalized=True) < assembly_min_uniq:
                    drop = True
                    break
            if not drop:
                # also compare against the reverse complement
                for okmer, ckmer in itertools.product(okmer_list, set(kmers(rseq, kmer_length))):
                    if distance.hamming(okmer, ckmer, normalized=True) < assembly_min_uniq:
                        drop = True
                        break
            if drop:
                break
        if not drop:
            filtered_contigs[contig.seq] = contig
    return list(filtered_contigs.values())
def get_best_re(x, target_re):
    # Tally the candidates, keep those tied for the highest count, and break
    # ties by Hamming distance to the target.
    x_count = sorted(dict(Counter(x)).items(), key=itemgetter(1), reverse=True)
    x_count_max = [(re, a) for re, a in x_count if a == x_count[0][1]]
    if len(x_count_max) == 1:
        return (x_count_max[0][0], x_count_max[0][1],
                distance.hamming(x_count_max[0][0], target_re))
    x_ham = [(re, a, distance.hamming(re, target_re)) for re, a in x_count_max]
    return max(x_ham, key=itemgetter(2))
def blocksAreCorrect(self, block1, block2):
    # block1 must be within one mismatch of "ACG" and block2 within one of "GACT".
    if len(block2) < 4:
        return False
    block1Ed = distance.hamming(block1, "ACG")
    block2Ed = distance.hamming(block2, "GACT")
    if (block1Ed > 1) or (block2Ed > 1):
        return False
    return True
def groupbyHamm(seqrlist, Hamm_limit):
    """Takes a list of seq records and makes a copy of the list.

    Calculates the Hamming distance between all pairs of sequences,
    including reverse complements, and groups sequences together into a
    list if their Hamming distances from each other are <= Hamm_limit.
    When a group is found, those sequences are removed from subsequent
    consideration by adding them to a passlist.

    Outputs list_of_groups, a list of lists containing seq groups."""
    seqrcopy = list(seqrlist)
    list_of_groups = []
    rcidlist = []
    passlist = []
    for record1 in seqrcopy:
        if record1.id not in passlist:
            seqgroup = [record1]
            passlist.append(record1.id)
            fwrecord1 = record1.seq
            rvrecord1 = record1.reverse_complement()
            for record2 in seqrcopy:
                if record2.id not in passlist:
                    fwrecord2 = record2.seq
                    dH = distance.hamming(fwrecord1, fwrecord2)
                    dHr = distance.hamming(rvrecord1, fwrecord2)
                    if dH <= Hamm_limit:
                        seqgroup.append(record2)
                        passlist.append(record2.id)
                    elif dHr <= Hamm_limit:
                        # Matched via the reverse complement.
                        seqgroup.append(record2)
                        passlist.append(record2.id)
                        rcidlist.append(record2.id)
            list_of_groups.append(seqgroup)
    return list_of_groups, rcidlist
def pack(lines):
    s = ['', '', '', '']
    # ref is the reference, which is constantly updated (introduced because
    # matching a read against only the previous read roughly doubles the noise).
    ref = 'A' * readlen
    prev = 'A' * readlen
    # Number of A's, C's, T's, G's and N's seen at each position in ref.
    # Note: N is never used in the ref - we arbitrarily place an A if only
    # N's are seen at some position.
    count = [[1] * readlen, [0] * readlen, [0] * readlen,
             [0] * readlen, [0] * readlen]
    for current in lines:
        flag = 0
        for i in range(maxmatch):
            if hamming(current[:(readlen - i)], ref[i:]) <= thresh:
                # Encode against whichever of ref/prev matches better.
                if (hamming(current[:(readlen - i)], ref[i:])
                        <= hamming(current[:(readlen - i)], prev[i:])):
                    s[1] += 'r'
                    s[0] += current[(readlen - i):] + '\n'
                    prevj = 0
                    for j in range(readlen - i):
                        count[char2index(current[j])][i + j] += 1
                        if current[j] != ref[i + j]:
                            s[2] += current[j]
                            s[3] += "%02d" % (j - prevj)  # delta encoding
                            prevj = j
                else:
                    s[1] += 'p'
                    s[0] += current[(readlen - i):] + '\n'
                    prevj = 0
                    for j in range(readlen - i):
                        count[char2index(current[j])][i + j] += 1
                        if current[j] != prev[i + j]:
                            s[2] += current[j]
                            s[3] += "%02d" % (j - prevj)  # delta encoding
                            prevj = j
                count = [count[j][i:] + [0] * i for j in range(5)]
                for j in range(readlen - i, readlen):
                    count[char2index(current[j])][j] = 1
                ref = findmajority(count)
                s[2] += '\n'
                flag = 1
                break
        if flag == 0:
            # No shift matched: store the read verbatim and restart the counts.
            s[1] += '0'
            s[0] += current + '\n'
            count = [[0] * readlen for j in range(5)]
            for j in range(readlen):
                count[char2index(current[j])][j] = 1
            ref = findmajority(count)
        prev = current
    return s
def calculate_intra_hamming_distance_between_elements(self, files, length=391716):
    total = 0
    count = 0
    highest = 0
    highest_pct = 0
    lowest = length
    lowest_pct = 101  # sentinel: no pair compared yet
    distances = {}
    if isinstance(files, dict):
        for a1, b1 in itertools.combinations(files, 2):
            a = files[a1]
            b = files[b1]
            if len(a) != length and len(b) != length:
                continue
            dis = distance.hamming(a, b)
            pct = (dis / length) * 100
            total += pct
            count += 1
            name = a1 + "-" + b1
            distances[name] = pct
            if highest < dis:
                highest = dis
                highest_pct = pct
            if lowest > dis:
                lowest = dis
                lowest_pct = pct
    else:
        for a, b in itertools.combinations(files, 2):
            if len(a) != length and len(b) != length:
                continue
            dis = distance.hamming(a, b)
            pct = (dis / length) * 100
            total += pct
            count += 1
            if highest < dis:
                highest = dis
                highest_pct = pct
            if lowest > dis:
                lowest = dis
                lowest_pct = pct
    average = total / count if count > 0 else 0
    return [
        average, highest, lowest, highest_pct,
        lowest_pct if lowest_pct != 101 else 0,
        distances
    ]
def findOtherBlocks(self):
    print "Finding other blocks ..."
    for i in range(len(self.linkerIndices)):
        indexPair = self.linkerIndices[i]
        if not self.isInvalidIndexPair(indexPair):
            linker1Index = indexPair[0]
            linker2Index = indexPair[1]
            sequenceName = self.sequenceNames[i]
            sequence = self.sequences[i]
            placeHolder = self.sequencePlaceHolders[i]
            qualityScore = self.qualityScores[i]
            R2sequenceName = self.R2sequenceNames[i]
            R2sequence = self.R2sequences[i]
            R2placeHolder = self.R2sequencePlaceHolders[i]
            R2qualityScore = self.R2qualityScores[i]
            # Slice out the barcode, ACG, UMI and poly-T blocks around the
            # two 15 bp linkers.
            barCode1 = sequence[linker1Index - 6:linker1Index]
            barCode2 = sequence[linker1Index + 15:linker2Index]
            barCode3 = sequence[linker2Index + 15:linker2Index + 15 + 6]
            acgBlock = sequence[linker2Index + 15 + 6:linker2Index + 15 + 6 + 3]
            umiBlock = sequence[linker2Index + 15 + 6 + 3:linker2Index + 15 + 6 + 3 + 8]
            polyTBlock = sequence[linker2Index + 15 + 6 + 3 + 8:linker2Index + 15 + 6 + 3 + 8 + 4]
            barCode1QS = qualityScore[linker1Index - 6:linker1Index]
            barCode2QS = qualityScore[linker1Index + 15:linker2Index]
            barCode3QS = qualityScore[linker2Index + 15:linker2Index + 15 + 6]
            umiBlockQS = qualityScore[linker2Index + 15 + 6 + 3:linker2Index + 15 + 6 + 3 + 8]
            if self.barCodesAreCorrect(barCode1, barCode2, barCode3):
                # Rescue barcodes that are one mismatch away from a known barcode.
                for possibleBarCode in self.possibleBarCodes:
                    bc1Ed = distance.hamming(barCode1, possibleBarCode)
                    bc2Ed = distance.hamming(barCode2, possibleBarCode)
                    bc3Ed = distance.hamming(barCode3, possibleBarCode)
                    if bc1Ed == 1:
                        barCode1 = possibleBarCode
                    if bc2Ed == 1:
                        barCode2 = possibleBarCode
                    if bc3Ed == 1:
                        barCode3 = possibleBarCode
                if self.blocksAreCorrect(acgBlock, polyTBlock):
                    self.blockSequenceNames.append(sequenceName)
                    self.blocks.append([barCode1, barCode2, barCode3, acgBlock, umiBlock, polyTBlock])
                    self.blockPlaceHolders.append(placeHolder)
                    self.blockQualityScores.append([barCode1QS, barCode2QS, barCode3QS, umiBlockQS])
                    self.read2Lines.extend([R2sequenceName, R2sequence, R2placeHolder, R2qualityScore])
        else:
            self.filterIndices.append(i)
def find_sim():
    f = open(sys.argv[2])
    shopitem_dic = {}
    item_list = []  # renamed from `list` to avoid shadowing the builtin
    type_item_set = set()
    for line in f:
        dv = line.split()
        type = dv[0]
        shopid = dv[1]
        itemid = dv[2]
        shopitem_dic[itemid] = shopid
        if type == '':
            type_item_set.add(itemid)
    for line in open(sys.argv[3]):
        img, hash = line.split()
        if 'home' in img:
            itemid = img.split('/')[-1].split('_')[0]
        else:
            itemid = img.split('_')[0]
        if itemid in type_item_set:
            item_list.append((img, shopitem_dic.get(itemid), str(hash)))
    num = len(item_list)
    print num
    # Flag near-duplicate images belonging to different shops.
    for i in xrange(num):
        item1 = item_list[i]
        for j in xrange(num - i):
            item2 = item_list[i + j]
            if item1[1] == item2[1]:
                continue
            sim = distance.hamming(item1[2], item2[2])
            if sim < 4:
                print "-".join(item1), "-".join(item2), sim
def calculate_hamming_distance(self, files, length=1048576):
    total = 0
    count = 0
    highest = 0
    highest_pct = 0
    lowest = length
    lowest_pct = 100
    high = 0
    name = ""
    for c, d in itertools.combinations(files, 2):
        a = files[c]
        b = files[d]
        if len(a) != length and len(b) != length:
            continue
        dis = distance.hamming(a, b)
        pct = (dis / length) * 100
        total += pct
        count += 1
        if highest < dis:
            highest = dis
            highest_pct = pct
            high = count
        if lowest > dis:
            lowest = dis
            lowest_pct = pct
        print(str(c) + "-" + str(d) + " : " + str(dis) + ", " + str(pct) + "%")
    return [total / count, highest, lowest, highest_pct, lowest_pct, high, name]
def get_best_match(self, photo_path):
    """Try to match the photo with the nearest image in the database"""
    s = time.time()
    photo = cv2.imread(photo_path)
    # Get the 90-degree cropped view of the card
    framed = self.get_framed_card(photo)
    photo_array = Image.fromarray(framed)
    phash_photo = str(ih.phash(photo_array))
    # Get the 20 best results from phash matching
    distances = sorted([(distance.hamming(phash_photo, i[0]), i[2]) for i in self.phashes],
                       key=itemgetter(0))[:20]
    # Extract a descriptor from the framed card
    kp1, des_photo = self.orb.detectAndCompute(cv2.cvtColor(framed, cv2.COLOR_BGR2GRAY), None)
    # Get descriptors for the 20 best phash matches
    cards_list = self.database.get_descriptors([p[1] for p in distances])
    # Keep the card with the minimum ORB score between descriptors
    sc, cpath, best_match = min(
        [(self.orb_score(des_photo, des_im), cpath, (name, set_code, scryfall_id))
         for des_im, cpath, name, set_code, scryfall_id in cards_list],
        key=itemgetter(0))
    e = time.time()
    logging.info("===> Result : orb_score = {0} / time = {1}s / lang = {2} / best match = {3}".format(
        sc, e - s, "None", os.path.basename(cpath)))
    print("===> Result : orb_score = {0} / time = {1}s / best match = {2}".format(
        sc, e - s, os.path.basename(cpath)))
    return best_match
def mydist(args):
    # Return (index, distance) of the unique nearest dictionary entry, or
    # (-1, distance) when the two closest entries are tied.
    query, mydict, thresh = args[:]
    mydistance = [(idx, distance.hamming(query, mydict[idx])) for idx in range(len(mydict))]
    mydistance.sort(key=lambda x: x[1])
    if mydistance[0][1] != mydistance[1][1]:
        return mydistance[0]
    return (-1, mydistance[0][1])
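# A sketch of driving mydist() above (e.g. via multiprocessing.Pool.map); the
# strings here are illustrative. A (-1, ...) result signals a tie for nearest.
print(mydist(("0101", ["0100", "1111", "0000"], 1)))  # -> (0, 1): unique nearest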
def get_similar_photos(user, album):
    with getcursor() as cur:
        cur.execute(
            "SELECT phash, id FROM photos WHERE owner = %s AND album = %s AND phash IS NOT NULL ORDER BY phash",
            (user.id, album))
        rows = cur.fetchall()
    # TODO clusters should be cached based on rows as a key, i.e. if the photos
    # haven't changed, the clustering won't have changed
    clusters = []
    for phash, id in rows:
        found = False
        for cluster in clusters:
            for photo in cluster:
                if distance.hamming(photo['phash'], phash) <= 8:
                    # add photo to an existing cluster
                    cluster.append({"phash": phash, "id": id})
                    # stop looking once the first cluster is found
                    found = True
                    break
            if found:
                break
        if not found:
            # start a new cluster (guard was missing: every photo used to open
            # a new cluster even after joining an existing one)
            clusters.append([{"phash": phash, "id": id}])
    return list(filter(lambda x: len(x) > 1, clusters))
def find_ss(s):
    # Count the windows of s within Hamming distance 3 of ss.
    count = 0
    len_ss = len(ss)
    for i in range(len(s) - len_ss + 1):  # +1 so the final window is included
        if distance.hamming(s[i:i + len_ss], ss) <= 3:
            count += 1
    print(count)
def _repost_checker_proc(self, to_be_checked, records, hashsize, hd):
    results = [{
        'image_id': to_be_checked['image_id'],
        'older_images': []
    }]
    found_repost = None
    hash_size = 'hash' + hashsize
    for r in records:
        # Skip the image itself and anything posted by the same user.
        if to_be_checked['image_id'] == r['image_id'] or r['user'] == to_be_checked['user']:
            continue
        try:
            hamming_distance = hamming(to_be_checked[hash_size], r[hash_size])
        except ValueError:
            # hamming() raises ValueError on unequal-length hashes.
            continue
        if hamming_distance < hd:
            found_repost = True
            results[0]['older_images'].append(r)
    return results if found_repost else None
def gen_hamming_matrices(self):
    """
    Generate Hamming distance and similarity matrices for all sequences
    """
    # For all target sequences
    for target in tqdm(self.target_list, desc="Generating Hamming distance matrices"):
        # Sort the k-mers unique to the target and all other observed k-mers
        # in the set alphabetically
        unique_kmer_list = sorted(list(target.unique_kmers))
        other_kmer_list = sorted(list(self.all_kmers.difference(target.unique_kmers)))
        # Get the number of unique k-mers in the current target sequence and
        # the number of remaining observed k-mers
        target_unique = len(unique_kmer_list)
        other_kmers = len(other_kmer_list)
        # Generate a matrix of the appropriate size
        ham_array = np.zeros((target_unique, other_kmers))
        # Loop over both lists and fill in the Hamming distance matrix
        for i in range(target_unique):
            for j in range(other_kmers):
                ham_array[i, j] = hamming(unique_kmer_list[i], other_kmer_list[j])
        # Assign the lists as instance variables on the target sequence
        target.unique_kmer_list = unique_kmer_list
        target.other_kmer_list = other_kmer_list
        # Assign the Hamming distance and similarity score matrices
        target.hamming_dist_matrix = ham_array
        target.similarity_matrix = np.full_like(ham_array, self.k) - ham_array
def find_closest(output, input_patterns):
    # Return the index of the pattern with the smallest Hamming distance to output.
    distances = []
    for pattern in input_patterns:
        ham_dist = distance.hamming(output, pattern)
        distances.append(ham_dist)
    val, idx = min((val, idx) for (idx, val) in enumerate(distances))
    return idx
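# Illustrative call for find_closest() above, with made-up equal-length patterns:
idx = find_closest("1010", ["1111", "1011", "0000"])
print(idx)  # -> 1: "1011" differs from "1010" in a single position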
def find_highest(self, input_offer, bucket_num):
    """
    must call this before updating the round number
    :param bucket_num: the number of utility_buckets we are willing to look in;
        the best option from the highest_utility bucket is chosen
    :return: an offer that works best for us with the smallest hamming
        distance from the given offer
    """
    # Clamp bucket_num to the valid range.
    if bucket_num > len(self.utility_buckets) - 1:
        bucket_num = len(self.utility_buckets) - 1
    if bucket_num < 1:
        bucket_num = 1
    best_off = []
    hamming_best = float("inf")
    i = 0
    for key in self.utility_buckets:
        if i == bucket_num:
            break
        for offer in self.map_util_to_list[key]:
            tempdist = distance.hamming(offer, input_offer)
            if tempdist < hamming_best and offer != input_offer:
                hamming_best = tempdist
                best_off = offer
        i += 1
    return best_off
def calculate_hamming_distance_list(self, files, length=1048576):
    total = 0
    count = 0
    highest = 0
    highest_pct = 0
    lowest = length
    lowest_pct = 100
    high = 0
    for a, b in itertools.combinations(files, 2):
        if len(a) != length and len(b) != length:
            continue
        dis = distance.hamming(a, b)
        pct = (dis / length) * 100
        total += pct
        count += 1
        if highest < dis:
            highest = dis
            highest_pct = pct
            high = count
        if lowest > dis:
            lowest = dis
            lowest_pct = pct
        print(str(dis) + ", " + str(pct) + "%")
    return [total / count, highest, lowest, highest_pct, lowest_pct, high]
def find_putative_CS(start, end, kmer, genome, leader_core_sequence):
    # Scan genome[start:end] for a k-mer close to the leader core sequence (CS),
    # preferring candidates directly followed by "AAA".
    search_seq = genome[start:end]
    min_dist = kmer
    res_seq = ''
    res_index = 0
    find = 0
    gaac_list = []
    for i in range(0, len(search_seq) - kmer + 1):
        query_seq = search_seq[i:i + kmer]
        index = start + i + 1
        dist = distance.hamming(leader_core_sequence, query_seq)
        downflnk = search_seq[i + kmer:i + kmer + 4]
        # was `leaderCS`, an undefined name; the parameter is leader_core_sequence
        if leader_core_sequence[-4:] == query_seq[-4:] and downflnk == "AAA":
            find = 1
            return query_seq + "(AAA)", dist, index, find
        if downflnk == "AAA":
            find = 1
            gaac_list.append([query_seq + "(AAA)", dist, index, find])
        if dist <= min_dist or leader_core_sequence[-4:] == query_seq[-4:]:
            min_dist = dist
            res_seq = query_seq
            res_index = index
    if len(gaac_list) != 0:
        return sorted(gaac_list, key=lambda x: x[1])[0]
    if min_dist <= 2:
        find = 1
    return res_seq, min_dist, res_index, find
def detect_images(signatures):
    print('\n[+] Detecting images that meet similarity threshold of signatures ('
          + str(parsed_args.min_similarity_threshold) + '%)...')
    images = []
    for i in os.listdir(dir_image):
        images.append(i)
    with open(csv_file, mode='w') as csv_out:
        csv_writer = csv.writer(csv_out, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow([
            'Document_SHA256', 'Average_Hash', 'Signature_Name', 'Similarity'
        ])
        for i in images:
            image_path = os.path.join(dir_image, i)
            image_hash = str(imagehash.average_hash(Image.open(image_path)))
            for sig_hash, sig_name in signatures.items():
                # A 16-character average hash gives a maximum Hamming distance of 16.
                hamming_distance = distance.hamming(image_hash, sig_hash)
                image_similarity = 100 - ((hamming_distance / 16) * 100)
                if image_similarity >= min_similarity_threshold:
                    csv_writer.writerow([i, image_hash, sig_name, image_similarity])
                    print('[+] Document ' + i + ' matched ' + sig_name + ' ('
                          + str('%.0f' % image_similarity) + '% similarity).')
    print('[+] Saved results to ' + csv_file + '.')
    return True
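# A worked example of the similarity formula above for 16-character average
# hashes (the hash values here are hypothetical):
import distance
h_image = "ff00ff00ff00ff00"
h_sig = "ff00ff00ff00ff0f"
d = distance.hamming(h_image, h_sig)  # 1 differing character
print(100 - ((d / 16) * 100))         # -> 93.75 (% similarity)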
def generate_matrix(db, str_rel):
    # Build a symmetric normalized-Hamming-distance matrix over the rows of db,
    # reusing pairs that have already been computed.
    dic = {}
    for ind, item in enumerate(db.index):
        print str_rel, ind + 1, len(db.index)
        secdic = {}
        for secitem in db.index:
            if secitem in dic:
                secdic[secitem] = dic[secitem][item]
                continue
            a = db.loc[item].tolist()
            b = db.loc[secitem].tolist()
            secdic[secitem] = hamming(a, b, normalized=True)
        dic[item] = secdic
    pd_pre = []
    for key in sorted(dic.keys()):
        newlst = [dic[key][i] for i in sorted(dic.keys())]
        pd_pre.append(newlst)
    new_pd = pd.DataFrame(pd_pre, columns=sorted(dic.keys()), index=sorted(dic.keys()))
    new_pd.to_csv('/Users/virpatel/Desktop/pub_stuff/relevant_data/%s_dataframe.txt' % (str_rel),
                  sep='\t', encoding='utf-8')
    return dic
def generate_matrix(db, str_rel):
    # Variant of the function above where db is a dict of sequences rather
    # than a DataFrame.
    dic = {}
    for ind, item in enumerate(db.keys()):
        print str_rel, ind + 1, len(db.keys())
        secdic = {}
        for secitem in db.keys():
            if secitem in dic:
                secdic[secitem] = dic[secitem][item]
                continue
            a = db[item]
            b = db[secitem]
            secdic[secitem] = hamming(a, b, normalized=True)
        dic[item] = secdic
    pd_pre = []
    for key in sorted(dic.keys()):
        newlst = [dic[key][i] for i in sorted(dic.keys())]
        pd_pre.append(newlst)
    new_pd = pd.DataFrame(pd_pre, columns=sorted(dic.keys()), index=sorted(dic.keys()))
    print new_pd
    new_pd.to_csv('../relevant_data/%s_dataframe.txt' % (str_rel), sep='\t', encoding='utf-8')
    return new_pd
def find_sim(userpath, hashfunc=imagehash.average_hash):
    image_filenames = [
        os.path.join(userpath, path) for path in os.listdir(userpath)
        if is_image(path)
    ]
    f = open(sys.argv[3])
    shopitem_dic = {}
    item_list = []  # renamed from `list` to avoid shadowing the builtin
    for line in f:
        dv = line.split()
        shopid = dv[1]
        itemid = dv[2]
        shopitem_dic[itemid] = shopid
    for img in sorted(image_filenames):
        hash = test_dhash(Image.open(img))
        itemid = img.split('/')[-1].split('_')[0]
        item_list.append((img.split('/')[-1], shopitem_dic.get(itemid), str(hash)))
    num = len(item_list)
    # Flag near-duplicate images belonging to different shops.
    for i in xrange(num):
        item1 = item_list[i]
        for j in xrange(num - i):
            item2 = item_list[i + j]
            if item1[1] == item2[1]:
                continue
            sim = distance.hamming(item1[2], item2[2])
            if sim < 4:
                print "-".join(item1), "-".join(item2), sim
def just_sim(userpath):
    f = open(sys.argv[3])
    shopitem_dic = {}
    item_list = []  # renamed from `list` to avoid shadowing the builtin
    for line in f:
        dv = line.split()
        shopid = dv[1]
        itemid = dv[2]
        shopitem_dic[itemid] = shopid
    fr = open(userpath)
    for img in fr:
        # Hashes are precomputed here: each line is "<item> <hash>".
        item, hash = img.split()
        itemid = item.split('_')[0]
        item_list.append((item, shopitem_dic.get(itemid), str(hash)))
    num = len(item_list)
    for i in xrange(num):
        item1 = item_list[i]
        for j in xrange(num - i):
            item2 = item_list[i + j]
            if item1[1] == item2[1]:
                continue
            sim = distance.hamming(item1[2], item2[2])
            if sim < 4:
                print "-".join(item1), "-".join(item2), sim
def call_CDR3_end(VDJ_seq, CDR3_start):
    '''Find the end of the CDR3'''
    try:
        from distance import hamming
    except ImportError:
        from util import hamming
    CDR3_end_anchor_sequence = 'CTGGGG'
    minimum_match_distance = 1
    CDR3_end = -1
    for i in range(CDR3_start, len(VDJ_seq) - len(CDR3_end_anchor_sequence) + 1):
        seq1 = VDJ_seq[i:i + len(CDR3_end_anchor_sequence)]
        seq2 = CDR3_end_anchor_sequence
        if len(seq1) == len(seq2):
            d = hamming(seq1, seq2)
        else:
            print('expected two strings of the same length for hamming')
            d = 10
        if d <= minimum_match_distance:
            CDR3_end = i + 1
            minimum_match_distance = d
    return CDR3_end
def _final_meme_filter(self, searched_hash: Text, matches: List[ImageSearchMatch],
                       target_hamming) -> List[ImageSearchMatch]:
    results = []
    log.debug('MEME FILTER - Filtering %s matches', len(matches))
    if len(matches) == 0:
        return matches
    for match in matches:
        try:
            match_hash = self._get_meme_hash(match.post.url)
        except Exception as e:
            log.error('Failed to get meme hash for %s', match.post.id)
            continue
        h_distance = hamming(searched_hash, match_hash)
        if h_distance > target_hamming:
            log.info('Meme Hamming Filter Reject - Target: %s Actual: %s - %s',
                     target_hamming, h_distance, f'https://redd.it/{match.post.post_id}')
            continue
        log.debug('Match found: %s - H:%s', f'https://redd.it/{match.post.post_id}', h_distance)
        match.hamming_distance = h_distance
        match.hash_size = len(searched_hash)
        results.append(match)
    return results
def _get_adj_list_directional_adjacency(self, umis, counts, threshold,
                                        use_hamming=False, countRatio=1.5):
    '''identify all umis within the distance threshold and where the counts
    of the first umi are >= (countRatio * second umi counts) - 1'''
    if use_hamming:
        return {
            umi: [
                umi2 for umi2 in umis
                if hamming(umi, umi2) <= threshold
                and counts[umi] >= (counts[umi2] * countRatio) - 1
            ]
            for umi in umis
        }
    return {
        umi: [
            umi2 for umi2 in umis
            if edit_distance(umi, umi2) <= threshold
            and counts[umi] >= (counts[umi2] * countRatio) - 1
        ]
        for umi in umis
    }
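# A toy check of the directional-adjacency rule used above (threshold=1,
# countRatio=1.5): an abundant UMI may absorb a one-mismatch neighbour only
# when its count dominates the neighbour's.
counts = {"AAAA": 10, "AAAT": 4}
assert counts["AAAA"] >= (counts["AAAT"] * 1.5) - 1        # 10 >= 5.0: AAAT -> AAAA
assert not (counts["AAAT"] >= (counts["AAAA"] * 1.5) - 1)  # 4 < 14.0: not vice versa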
def buildcontig(reads):
    if len(reads) == 1:  # singleton read
        return [reads[0], [0]]
    # Number of A's, C's, T's, G's, N's seen at each position in the reference.
    count = [[0, 0, 0, 0, 0] for i in range(readlen)]
    pos = [0]
    for i in range(readlen):
        count[i][char2index[reads[0][i]]] = 1
    prevread = reads[0]
    for currentread in reads[1:]:
        flag = 0
        bestmatch = readlen
        for i in range(maxmatch):
            hammingdist = hamming(currentread[:(readlen - i)], prevread[i:])
            if hammingdist <= thresh:
                pos.append(i + pos[-1])
                count = count + [[0, 0, 0, 0, 0] for j in range(i)]
                for j in range(readlen):
                    count[pos[-1] + j][char2index[currentread[j]]] += 1
                flag = 1
                break
            if hammingdist < bestmatch:
                bestmatch = hammingdist
                bestmatchpos = i
        if flag == 0:
            # No match within thresh (this can happen because matchsort9 writes
            # an A into the ref wherever only N's were seen); fall back to the
            # best offset found.
            pos.append(bestmatchpos + pos[-1])
            count = count + [[0, 0, 0, 0, 0] for j in range(bestmatchpos)]
            for j in range(readlen):
                count[pos[-1] + j][char2index[currentread[j]]] += 1
        prevread = currentread
    ref = findmajority(count)
    return [ref, pos]
def correctBarcodes(self):
    print "Correcting barcodes ..."
    # Replace each barcode that is exactly one mismatch away from a known barcode.
    for i in range(len(self.blocks)):
        bc1 = self.blocks[i][0]
        bc2 = self.blocks[i][1]
        bc3 = self.blocks[i][2]
        for possibleBarCode in self.possibleBarCodes:
            bc1Ed = distance.hamming(bc1, possibleBarCode)
            bc2Ed = distance.hamming(bc2, possibleBarCode)
            bc3Ed = distance.hamming(bc3, possibleBarCode)
            if bc1Ed == 1:
                self.blocks[i][0] = possibleBarCode
            if bc2Ed == 1:
                self.blocks[i][1] = possibleBarCode
            if bc3Ed == 1:
                self.blocks[i][2] = possibleBarCode
def mapseq(seq, pri):
    # Return the first offset where pri aligns to seq with at most 3 mismatches.
    for offset in range(len(seq) - len(pri) + 1):  # +1 to include the final window
        qseq = seq[offset:offset + len(pri)]
        if len(qseq) < len(pri):
            break
        if hamming(qseq, pri) <= 3:
            return offset
    return 0
def get_distance(data, vec1):
    # Rank the 1-indexed entries of data by Hamming distance to vec1.
    distance_list = []
    for i in range(1, len(data) + 1):
        vec2 = data[i][0]
        distance_list.append((distance.hamming(vec1, vec2), i))
    distance_list.sort()
    return distance_list
def calculate_intra_hamming_distance(self, files, length=1048576):
    total = 0
    count = 0
    highest = 0
    highest_pct = 0
    lowest = length
    lowest_pct = 100
    keys = list(files.keys())
    a = files[keys[0]]  # compare everything against the first file
    distances = {}
    alphabet = 'B'
    for c in keys[1:]:
        b = files[c]
        if len(a) != length and len(b) != length:
            continue
        dis = distance.hamming(a, b)
        pct = (dis / length) * 100
        total += pct
        count += 1
        name = "A-" + alphabet
        distances[name] = pct
        if highest < dis:
            highest = dis
            highest_pct = pct
        if lowest > dis:
            lowest = dis
            lowest_pct = pct
        alphabet = chr(ord(alphabet) + 1)
    return [total / count, highest, lowest, highest_pct, lowest_pct, distances]
def dhash(image1, image2):
    # Hamming distance between the difference hashes of two image files.
    i1 = Image.open(image1)
    i2 = Image.open(image2)
    h1 = _dhash(i1)
    h2 = _dhash(i2)
    return distance.hamming(h1, h2)
def hamming_distance_calculator(one, two):
    t1 = anagramfunctions.stripped_string(one["tweet_text"])
    t2 = anagramfunctions.stripped_string(two["tweet_text"])
    comparitor.set_seqs(t1, t2)
    # Normalize the Hamming distance into a 0-1 similarity score.
    dist = 1.0 - float(distance.hamming(t1, t2)) / len(t1)
    if dist < 1:
        print(t1, t2, str(dist), str(comparitor.ratio()) + "\n\n", sep="\n")
def hamming(self, sig1, sig2):
    # Memoized, length-normalized Hamming distance between two signatures.
    assert len(sig1) == len(sig2)
    if sig1 == sig2:
        return 0
    if sig1 in self._hamming and sig2 in self._hamming[sig1]:
        return self._hamming[sig1][sig2]
    self._hamming[sig1][sig2] = self._hamming[sig2][sig1] = hamming(sig1, sig2) / self.length
    return self._hamming[sig1][sig2]
def primeiracamada(matriz, palavra, raio):
    # Reads the word, compares it with the address matrix, and returns the
    # rows that lie within the given radius.
    matrizretorno = []
    for linha in range(len(matriz)):
        dif = distance.hamming(matriz[linha], palavra)
        if dif <= raio:
            matrizretorno.append(matriz[linha])
    return matrizretorno
def gdm(pop):
    # Diversity measure: average normalized Hamming distance of each
    # individual to the individuals after it.
    s1 = 0
    for i in range(len(pop) - 1):
        s2 = 0
        for j in range(i + 1, len(pop)):
            s2 += distance.hamming(pop[i], pop[j], normalized=True)
        s1 += s2 / (len(pop) - i)
    measure = s1 / len(pop)
    return measure
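# Hypothetical usage of gdm() above on a toy population of equal-length
# bit strings; a population of clones scores 0.0.
pop = ["0000", "0101", "1111"]
print(gdm(pop))  # -> 0.25 for this population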
def hammingDistance(motifTable, t):
    '''
    Calculate the hamming distance between one motif and the others.
    motifTable is a list containing t motifs.
    The return value is the minimum total score.
    '''
    motif = motifTable[0]
    minDistance = 0
    for i in xrange(t):
        minDistance += distance.hamming(motif, motifTable[i])
    # Try every motif as the reference and keep the smallest total distance.
    for i in xrange(t):
        motif = motifTable[i]
        dis = 0
        for j in xrange(t):
            dis += distance.hamming(motif, motifTable[j])
        if dis < minDistance:
            minDistance = dis
    return minDistance
def search(a, b, count, min_dist):
    # Greedy recursive search: try every segment reversal of a, keep the
    # reversals closest to b, and recurse on each until a == b.
    min_list = []
    l = {}
    if distance.hamming(a, b) == 0:
        print count
        print a
        sys.exit()
        return
    count += 1
    for i in range(0, len(a)):
        for j in range(i, len(a)):
            # Build a copy of a with the segment between i and j reversed.
            if i == 0 and j == 0:
                a1 = a[j:i + 1:-1] + a[j:]
            elif i == 0 and j > 0:
                a1 = a[j::-1] + a[i + j + 1:]
            else:
                a1 = a[:i] + a[j + 1:i - 1:-1] + a[j + 2:]
            str1 = ""
            for q in a1:
                str1 += q
                str1 += " "
            l[str1] = distance.hamming(a1, b)
    l1 = sorted(l.iteritems(), key=operator.itemgetter(1))
    for i in l1:
        if i[1] == l1[0][1] and i[1] < min_dist:
            min_list.append(i[0].split())
    for i in l1:
        if i[1] == l1[0][1]:
            min_dist = i[1]
    for i in min_list:
        o.write(str(i) + "\t" + str(count) + "\n")
        my_count = count
        my_dist = min_dist
        search(i, b, my_count, my_dist)
    return
def find_similar_images(userpath="localS3Images", hashfunc=imagehash.dhash,
                        inputUrl="https://s3.amazonaws.com/treblalee.images/watches7.jpg",
                        inputFilePath=""):
    import os
    global cache

    def is_image(filename):
        f = filename.lower()
        return f.endswith(".png") or f.endswith(".jpg") or \
            f.endswith(".jpeg") or f.endswith(".bmp") or f.endswith(".gif")

    # get image url to detail page mapping
    imageToDetailPageMapping = {}
    mappingFile = open("imageUrlToDetailPageMapping.txt")
    for line in mappingFile:
        fields = line.strip("\n").split(",")
        imageUrl = str(urllib.unquote(fields[0]).decode('utf8'))
        detailPageUrl = str(urllib.unquote(fields[1]).decode('utf8'))
        imageToDetailPageMapping[imageUrl] = detailPageUrl

    # compute hash of input image
    try:
        if len(inputFilePath) > 0:
            image_file = inputFilePath
            inputAsString = inputFilePath
        else:
            fd = urllib.urlopen(inputUrl)
            image_file = io.BytesIO(fd.read())
            inputAsString = inputUrl
        inputHash = str(hashfunc(Image.open(image_file)))
    except:
        traceback.print_exc(file=sys.stdout)
        return json.dumps({}, sort_keys=True, indent=4, separators=(',', ': '))

    # compute hashes of all images in DB (currently just a directory)
    image_filenames = [os.path.join(userpath, path)
                       for path in os.listdir(userpath) if is_image(path)]
    simList = []
    for img in sorted(image_filenames):
        if img in cache:
            hash = cache[img]
        else:
            hash = str(hashfunc(Image.open(img)))
            cache[img] = hash
        dist = distance.hamming(inputHash, hash)
        print inputHash + " " + hash + " " + str(dist)
        if dist < 6 and dist > 0:
            imageUrl = img.replace('localS3Images/',
                                   'https://s3.amazonaws.com/treblalee.images/')
            detailPageUrl = imageToDetailPageMapping[imageUrl]
            pair = {"imageUrl": imageUrl, "detailPageUrl": detailPageUrl}
            simList.append(pair)

    result = {"input": inputAsString, "output": simList}
    return json.dumps(result, sort_keys=True, indent=4, separators=(',', ': '))
def check_left_shift_conflicts(self):
    # Checks whether indexes from the same library kit conflict after a left shift.
    for kit_type in self.indexes_by_kit:  # for each library kit type
        for index_type in self.indexes_by_kit[kit_type]:  # for each type of indexes
            for index_name, index_seq in self.indexes_by_kit[kit_type][index_type].iteritems():
                # Simulate a one-base left shift with a padding base appended.
                fake_index = index_seq[1:] + "A"
                for index_name_check, index_seq_check in self.indexes_by_kit[kit_type][index_type].iteritems():
                    hamming_dist = distance.hamming(index_seq_check, fake_index)
                    if hamming_dist <= 2:
                        print "{} {} {} {} {}".format(index_seq, index_seq_check,
                                                      fake_index, hamming_dist, kit_type)
def dis(kmers, ownKmers):
    # Sum, over every sequence, the distance of its best-matching k-mer window.
    hammingDistance = {}
    minDistance = 0
    for each in ownKmers:
        hammingDistance[each] = []
        for i in xrange(len(ownKmers[each])):
            hammingDistance[each].append(distance.hamming(kmers, ownKmers[each][i]))
    for each in hammingDistance:
        minDistance += min(hammingDistance[each])
    return minDistance
def func(a, b):
    # For window lengths 32-40, count how many windows of b fall within
    # Hamming distance 3 of each window of a, and return the largest count.
    c = 0  # was missing: `c` was read before assignment
    for j in range(32, 41):
        for i in range(len(a) - j):
            c1 = 0
            for k in range(len(b) - j):
                if distance.hamming(a[i:i + j], b[k:k + j]) <= 3:
                    c1 += 1
            if c1 > c:
                c = c1
    return c
def get_distances(self):
    # Upper-triangular pairwise Hamming distance matrix over the profiles.
    dm = [[0 for x in range(self.pl)] for x in range(self.n)]
    pm = self.pm
    for i in range(self.n):
        for j in range(i + 1, self.n):
            dm[i][j] = distance.hamming(pm[i], pm[j])
    self.dm = dm
    return self.dm
def force(p1, p2):
    """
    estimate force between p1 and p2
    p must be a Position
    """
    # TODO: remake it later
    dist = hamming(p1.entity, p2.entity)
    dist = cannot_be_zero(dist)  # avoid division by zero for identical entities
    f_abs = G * (p1.mass * p2.mass) / dist
    force = {item: p * f_abs for item, p in (p1.entity - p2.entity).items()}
    return MappingParticle.Velocity(force)
def family_homogenity_collapsed(human_mirlst, mirna2disease, mirna2age):
    family_avg_age = []
    family_avg_hamming = []
    family_percent_involved_dis = []
    all_mir_vector_df = pd.DataFrame()
    dislst = get_list_of_dictionary(mirna2disease)
    all_fam_mir = list(itertools.chain.from_iterable(human_mirlst.values()))
    # Build a disease-membership vector for every miRNA with disease data.
    for mir in all_fam_mir:
        if mir in mirna2disease:
            vec = generate_class_vector(dislst, mirna2disease[mir])
            tmp = pd.DataFrame([vec], index=[str(mir)], columns=dislst)
            all_mir_vector_df = all_mir_vector_df.append(tmp)
    for fam in human_mirlst:
        family_vector = []
        mirlst = [a for a in human_mirlst[fam] if a in mirna2disease]
        if len(mirlst) < 4:
            continue
        # Average pairwise normalized Hamming distance within the family.
        for mir in mirlst:
            for other_mir in mirlst:
                if mir == other_mir:
                    continue
                family_vector.append(hamming(all_mir_vector_df.loc[mir],
                                             all_mir_vector_df.loc[other_mir],
                                             normalized=True))
        family_avg_hamming.append(mean(family_vector))
        family_avg_age.append(round(mean([float(mirna2age[mirna]) for mirna in mirlst
                                          if mirna in mirna2age]), 1))
        family_percent_involved_dis.append(float(len(mirlst)) / float(len(human_mirlst[fam])))
    print spearmanr(family_percent_involved_dis, family_avg_hamming)
    fam_df = pd.DataFrame(zip(family_avg_age, family_avg_hamming, family_percent_involved_dis),
                          columns=['fam_age', 'fam_hamming', 'fam_per'])
    fam_df = fam_df.sort('fam_age', ascending=1)
    f = plt.gcf()
    f.set_size_inches(20, 10)
    sns.boxplot(x='fam_age', y='fam_hamming', data=fam_df)
    plt.xticks(range(0, len(list(set(family_avg_age)))),
               [str(a) for a in sorted(list(set(family_avg_age)))])
    plt.gca().set_ylim([0, .094])
    plt.ylabel('Average Family Disease Vector Hamming Distance (0-1)', fontsize=15)
    plt.xlabel('Average Family Age', fontsize=15)
    plt.subplots_adjust(bottom=0.20)
    plt.savefig('figures/family_disease_hamming_collapsed.pdf', bbox_inches='tight')
    plt.close()
def calculate_changes_in_fitness(self, population, number_of_trials):
    # Compare each individual's fitness with samples drawn from the dA model,
    # tracking fitness differences and Hamming distances per trial.
    original_fitnesses = ar(self.fitness_many(population))
    print original_fitnesses.shape
    sample = [self.sample_dA([i]) for i in population]
    sample_fitnesses = ar(self.fitness_many([j for j in sample]))
    print sample_fitnesses.shape
    print sample_fitnesses[0:10]
    differences = sample_fitnesses - original_fitnesses
    distances = [[distance.hamming(population[k], sample[k]) for k in range(len(sample))]]
    for i in range(number_of_trials):
        print "trial:", i
        new_sample = [self.sample_dA([j]) for j in population]
        new_sample_fitnesses = ar(self.fitness_many([j for j in new_sample]))
        new_difference = new_sample_fitnesses - original_fitnesses
        sample_fitnesses = np.vstack((sample_fitnesses, new_sample_fitnesses))
        differences = np.vstack((differences, new_difference))
        # was sample[k], which re-measured the first draw on every trial
        distances.append([distance.hamming(population[k], new_sample[k])
                          for k in range(len(new_sample))])
    return sample_fitnesses, differences, distances
def create_graph(graph, word_list):
    """
    :param graph: graph of words, with each word as parent and its children
        being the words which differ from the parent in one place
    :param word_list: list of all n-letter words sourced from a dictionary
    :return: created graph of words and their one-letter-spaced children
    """
    for word1 in word_list:
        neighbours_list = []
        for word2 in word_list:
            if distance.hamming(word1, word2) == 1:
                if word2 not in neighbours_list:
                    neighbours_list.append(word2)
        graph[word1] = set(neighbours_list)
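# Hypothetical usage of create_graph() above on a tiny word list; each word
# maps to the set of words exactly one substitution away.
graph = {}
create_graph(graph, ["cat", "cot", "cog", "dog"])
print(graph["cat"])  # -> {'cot'}
print(graph["cog"])  # -> {'cot', 'dog'}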
def get_statistics(self, population, sample, get_distances=False, original_fitnesses=False):
    if original_fitnesses:
        original_fitnesses = ar(self.fitness_many(population))
    sample_fitnesses = ar(self.fitness_many(sample))
    # `is False` checks the sentinel; `== False` would be an ambiguous
    # elementwise comparison once original_fitnesses is an ndarray.
    if original_fitnesses is False:
        original_fitnesses = sample_fitnesses
    if get_distances:
        differences = sample_fitnesses - original_fitnesses
        distances = [distance.hamming(population[k], sample[k]) for k in range(len(sample))]
    else:
        differences = []
        distances = []
    return original_fitnesses, sample_fitnesses, differences, distances
def collapse_by_hamming(x, maxham):
    # Greedily absorb sequences within maxham mismatches of a more abundant
    # sequence, most abundant first.
    x_cllps = deque(sorted(Counter(x).items(), key=itemgetter(1), reverse=True))
    while len(x_cllps) > 0:
        seq, count = x_cllps.popleft()
        uniq = 1
        for aseq, acount in list(x_cllps):
            if distance.hamming(seq, aseq) <= maxham:
                count += acount
                uniq += 1
                x_cllps.remove((aseq, acount))
        yield ((seq, str(len(seq)), str(count), str(uniq)), len(x_cllps))
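# An illustrative run of collapse_by_hamming() above: "AAAT" collapses into
# the more abundant "AAAA" at maxham=1.
for collapsed, remaining in collapse_by_hamming(["AAAA", "AAAA", "AAAT", "CCCC"], 1):
    print(collapsed)  # -> ('AAAA', '4', '3', '2') then ('CCCC', '4', '1', '1')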
def test(n):
    import time
    import distance
    from simhash import Simhash, SimhashIndex

    def gg():
        import random
        text = ''.join([random.choice('0123456789abcdef') for _ in range(36)])
        return text, Simhash(text)

    hashes = [gg() for _ in range(n)]
    d1, d2 = [], []
    test_string, test_hash = gg()

    # Brute force: Hamming distance against every stored string.
    start = time.time()
    for s, h in hashes:
        d1.append([distance.hamming(test_string, s), s])
    print time.time() - start

    # Indexed: only the candidates returned by the SimhashIndex.
    start = time.time()
    index = SimhashIndex(hashes, k=5)
    for st in index.get_near_dups(test_hash):
        d2.append([distance.hamming(test_string, st), st])
    print time.time() - start

    print len(d1), len(d2)
    for a, b in zip(sorted(d1)[:20], sorted(d2)):
        print a[1] == b[1], '\t', a, '\t', b
def best_match(my_bc5, my_bc3, bc5_list, bc3_list, bc5_max_mismatch, bc3_max_mismatch):
    ## Find the edit distance between the given 3' barcode and all barcodes within the list
    edit_dist3 = []
    for i in bc3_list:
        edit_dist3.append(distance.hamming(my_bc3, i))
    edit_dist3_min = min(edit_dist3)
    ## Find the edit distance between the given 5' barcode and all barcodes within the list
    edit_dist5 = []
    for i in bc5_list:
        edit_dist5.append(distance.hamming(my_bc5, i))
    edit_dist5_min = min(edit_dist5)
    ## Assign the index only if the number of mismatches is within the threshold
    ## AND exactly one barcode attains the lowest number of mismatches
    if (edit_dist3_min <= bc3_max_mismatch and edit_dist3.count(edit_dist3_min) == 1
            and edit_dist5_min <= bc5_max_mismatch and edit_dist5.count(edit_dist5_min) == 1):
        tmp_bc3 = bc3_list[edit_dist3.index(edit_dist3_min)]
        tmp_bc5 = bc5_list[edit_dist5.index(edit_dist5_min)]
    else:
        tmp_bc3 = ""
        tmp_bc5 = ""
    return (tmp_bc5, tmp_bc3)
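# A hedged example for best_match() above, with made-up barcodes: each end
# needs a unique best hit within its mismatch budget, otherwise ("", "").
bc5, bc3 = best_match("ACGT", "TTAA", ["ACGA", "CCCC"], ["TTAT", "GGGG"], 1, 1)
print(bc5, bc3)  # -> ACGA TTAT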
def CalcLVs(self):
    # Histogram, per profile, of its Hamming distances to all other profiles.
    maxlen = len(self.pm[0])
    nprof = len(self.pm)
    lvs = [[0 for x in range(maxlen)] for x in range(nprof)]
    for i in range(nprof):
        for j in range(i + 1, nprof):
            diff = distance.hamming(self.pm[i], self.pm[j])
            lvs[i][diff - 1] += 1
            lvs[j][diff - 1] += 1
    self.lvs = lvs
    return self.lvs
def call_CDR3_start(VDJ_seq):
    '''Find the start of the CDR3 via its anchor sequence.'''
    CDR3_start_anchor_sequence = 'TATTACTGT'
    minimum_match_distance = 2
    CDR3_start = -1
    for i in xrange(0, len(VDJ_seq) - len(CDR3_start_anchor_sequence) - 1):
        d = hamming(VDJ_seq[i:i + len(CDR3_start_anchor_sequence)],
                    CDR3_start_anchor_sequence)
        if d <= minimum_match_distance:
            CDR3_start = i + len(CDR3_start_anchor_sequence) + 1
            minimum_match_distance = d
    return CDR3_start
def call_CDR3_end(VDJ_seq, CDR3_start):
    CDR3_end_anchor_sequence = 'CTGGGG'
    minimum_match_distance = 1
    CDR3_end = -1
    for i in xrange(CDR3_start, len(VDJ_seq) - len(CDR3_end_anchor_sequence) + 1):
        try:
            d = hamming(VDJ_seq[i:i + len(CDR3_end_anchor_sequence)],
                        CDR3_end_anchor_sequence)
        except:
            print 'expected two strings of the same length for hamming'
            d = 10
        if d <= minimum_match_distance:
            CDR3_end = i + 1
            minimum_match_distance = d
    return CDR3_end