def correct_cells_whitelist(final_results, umis_per_cell, whitelist,
                            collapsing_threshold, ab_map):
    """
    Collapse observed cell barcodes onto a user-supplied whitelist.

    Args:
        final_results (dict): Dict of dict of Counters with mapping results.
        umis_per_cell (Counter): Counter of UMIs per cell.
        whitelist (set): The whitelist reference given by the user.
        collapsing_threshold (int): Max distance forwarded to the
            candidate search.
        ab_map (OrderedDict): Tags in an ordered dict.

    Returns:
        final_results (dict): Mapping results as returned by collapse_cells.
        umis_per_cell (Counter): UMI counts as returned by collapse_cells.
        corrected_barcodes (int): Correction count reported by
            collapse_cells.
    """
    # The tree is built once from the whitelist using Hamming distance.
    tree = pybktree.BKTree(Levenshtein.hamming, whitelist)
    print('Generated barcode tree from whitelist')

    observed = list(final_results.keys())
    print('Finding reference candidates')
    print('Processing {:,} cell barcodes'.format(len(observed)))

    # Candidate search is delegated to a single call (one process).
    mapping = find_true_to_false_map(
        barcode_tree=tree,
        cell_barcodes=observed,
        whitelist=whitelist,
        collapsing_threshold=collapsing_threshold)

    collapsed = collapse_cells(mapping, umis_per_cell, final_results, ab_map)
    umis_per_cell, final_results, corrected_barcodes = collapsed
    return (final_results, umis_per_cell, corrected_barcodes)
def main():
    """Report near-duplicate images in the current directory.

    Every file whose name ends with one of the known image extensions is
    hashed and looked up in a BK-tree of the hashes seen so far; each hit
    within ``distance`` is recorded as ``(filename, earlier_filename)``.
    The pairs are written as CSV rows to ``foundchanges.txt``.
    """
    distance = 2
    # Other candidates: imagehash.average_hash, imagehash.phash,
    # imagehash.whash
    hashFunction = imagehash.dhash
    # Image file extensions, more can be added if necessary
    ext = ('.jpg', '.jpeg', '.gif', '.png')

    tree = pybktree.BKTree(hdist, [])
    potentialDuplicates = []
    for filename in os.listdir('.'):
        if not filename.endswith(ext):
            continue
        with Image.open(filename) as img:
            hashval = hashFunction(img)
        # find() yields (distance, item) pairs and each item is the
        # (hash, filename) tuple stored below, so unpack down to the
        # filename instead of writing the whole tuple into the report.
        for _, (_, other_name) in tree.find((hashval, filename), distance):
            potentialDuplicates.append((filename, other_name))
        tree.add((hashval, filename))

    print(f'Found {len(potentialDuplicates)} potential duplicate images.')
    # newline='' lets the csv module control line endings itself.
    with open('foundchanges.txt', 'w', newline='') as out:
        csv.writer(out).writerows(potentialDuplicates)
def get_image_simhash_bktree():
    """Build a BK-tree of the ad creative image simhashes held in the DB.

    Each simhash contributes one ArchiveIDAndSimHash entry carrying the
    smallest archive id of its set. The tree uses get_num_bits_different
    as its distance function.

    Returns:
        The populated pybktree.BKTree.
    """
    with db_functions.get_ad_info_database_connection() as db_connection:
        simhash_to_archive_id_set = db_functions.AdsIfoDBInterface(
            db_connection).all_ad_creative_image_simhashes()
        total_sim_hashes = len(simhash_to_archive_id_set)
        logging.info('Got %d image simhashes to process.', total_sim_hashes)

        image_simhash_tree = pybktree.BKTree(get_num_bits_different)
        tree_construction_start_time = time.time()
        entries = simhash_to_archive_id_set.items()
        for added, (sim_hash, archive_id_set) in enumerate(entries, start=1):
            # One node per simhash, represented by its lowest archive id.
            node = ArchiveIDAndSimHash(sim_hash=sim_hash,
                                       archive_id=min(archive_id_set))
            image_simhash_tree.add(node)
            # Progress is reported every 1000 insertions.
            if added % 1000 == 0:
                logging.debug('Added %d/%d simhashes to BKtree.',
                              added, total_sim_hashes)
        logging.info('Constructed BKTree in %s seconds',
                     (time.time() - tree_construction_start_time))
        return image_simhash_tree
def __init__(self, V=None, model=None):
    """Load the vocabulary, word counts and error table, then wire them up.

    The vocabulary is indexed in a BK-tree (stored as ``self.trie``) and
    the error table plus counts are handed to a probMaker instance.
    ``V`` and ``model`` are stored on the instance unchanged.
    """
    vocab = self.load_vocab()
    self.vocab = vocab
    self.counts = self.load_counts()
    # BK-tree over the vocabulary, using the module's `distance` function.
    self.trie = pybktree.BKTree(distance, vocab)
    self.error_df = self.load_error_df()
    self.pm = probMaker(self.error_df, self.counts)
    self.V, self.model = V, model
def fit(self, words_list):
    """
    Fit the speller.

    Indexes ``words_list`` in a BK-tree that uses ``editdistance.eval`` as
    its metric, stores it on ``self.words_list``, prints the elapsed time
    and returns ``self``.
    """
    started = time.time()
    self.words_list = pybktree.BKTree(editdistance.eval, words_list)
    elapsed = time.time() - started
    print("Speller fitted in", elapsed)
    return self
def v_cut_detector(img_path, v_cut_path):
    """Find the region closest to the "v-cut" label and save it.

    The image is thresholded, dilated and closed; each resulting contour of
    acceptable area is cropped from the untouched copy and OCR'd. When a
    crop contains the text "v-cut", the crop whose key (x, y + h) is nearest
    to it (per ``manhattan_distance``, within 1000) is written to
    ``v_cut_path``. The annotated image is always written to ``v-cut.png``.

    Args:
        img_path: Path of the image to analyse.
        v_cut_path: Output path for the selected crop.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise ValueError('cannot read image: ' + str(img_path))
    o_img = img.copy()
    key_text_loc = ()

    dominate_color = get_dominant_color(img)
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # The threshold polarity follows the first channel of the dominant colour.
    if dominate_color[0] > 127:
        threshold_mode = cv2.THRESH_BINARY_INV
    else:
        threshold_mode = cv2.THRESH_BINARY
    _, bin_img = cv2.threshold(gray_img, 45, 255, threshold_mode)

    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (24, 6))
    dilate = cv2.dilate(bin_img, kernel, iterations=5)
    close_img = cv2.morphologyEx(dilate, cv2.MORPH_CLOSE, kernel)
    # OpenCV 3.x returns (image, contours, hierarchy) while 4.x returns
    # (contours, hierarchy); the contours are second-to-last in both.
    contours = cv2.findContours(close_img, cv2.RETR_TREE,
                                cv2.CHAIN_APPROX_SIMPLE)[-2]

    object_region = {}
    max_area = img.shape[0] * img.shape[1]
    key_list = []
    for cnt in contours:
        area = cv2.contourArea(cnt)
        # Skip specks and regions that cover most of the image.
        if area < max_area / 3000 or area > 3 * max_area / 4:
            continue
        # Bounding box of the contour as (x, y, width, height).
        x, y, w, h = cv2.boundingRect(cnt)
        cv2.rectangle(img, (x, y), (x + w, y + h), (255, 255, 0), 2)
        cut_img = o_img[y:y + h, x:x + w]
        key = (x, y + h)
        key_list.append(key)
        object_region[key] = cut_img
        text = pytesseract.image_to_string(cut_img, lang='eng')
        if 'v-cut' in text.lower():
            print(img_path)
            key_text_loc = key
            print(key_text_loc)

    bk_tree = pybktree.BKTree(manhattan_distance, key_list)
    if key_text_loc:
        v_cut_key = bk_tree.find(key_text_loc, 1000)
        print(v_cut_key)
        if len(v_cut_key) > 1:
            # Entry 0 is the label region itself; entry 1 is its nearest
            # neighbour.
            v_cut_img = object_region[v_cut_key[1][1]]
        else:
            print('no 1000' + img_path)
            v_cut_img = object_region[v_cut_key[0][1]]
        cv2.imwrite(v_cut_path, v_cut_img)
    cv2.imwrite('v-cut.png', img)
def getErrorCorrectMapping(cell_barcodes, whitelist, threshold=1):
    ''' Map whitelisted ("true") barcodes to the observed barcodes that
    can be corrected to them.

    An observed barcode is assigned only when exactly one whitelist
    barcode lies within ``threshold``; barcodes with no candidate or with
    several candidates are left out.

    Returns a defaultdict(set) keyed by whitelist barcode.'''

    # Unexpected results with the cythonised hamming distance, so it is
    # redefined in pure Python here.
    def hamming_distance(first, second):
        ''' number of mismatching positions; infinite for sequences of
        different length so they never fall within the threshold '''
        if len(first) != len(second):
            return np.inf
        return sum(a != b for a, b in zip(first, second))

    whitelist = {str(x) for x in whitelist}

    U.info('building bktree')
    tree2 = pybktree.BKTree(hamming_distance, whitelist)
    U.info('done building bktree')

    true_to_false = collections.defaultdict(set)
    for cell_barcode in cell_barcodes:
        # Already a whitelist barcode: nothing to correct.
        if cell_barcode in whitelist:
            continue

        # Whitelist members within the threshold, excluding exact matches.
        candidates = [
            white_cell
            for d, white_cell in tree2.find(cell_barcode, threshold)
            if d > 0
        ]

        # Zero candidates: unassignable. Several: ambiguous. Both dropped.
        if len(candidates) == 1:
            true_to_false[candidates[0]].add(cell_barcode)

    return true_to_false
def benchmark_pybktree(element_counts, repeat_count=10):
    """
    Benchmark pybktree construction and lookup on random uint64 values.

    Args:
        element_counts: Iterable of tree sizes to benchmark.
        repeat_count: Number of repetitions per measurement.

    Returns:
        A list with one flat row per element count:
            [element_count, creation_mean, creation_std,
             distance, lookup_mean, lookup_std, ...]
        where the (distance, mean, std) group repeats for the lookup
        distances 0, 1, 2, 4, 8 and 16. All times are in seconds.
    """
    def random_elements(count):
        return np.random.randint(np.iinfo(np.uint64).max,
                                 size=count, dtype=np.uint64)

    timings = []
    for element_count in tqdm.tqdm(element_counts):
        timing = [element_count]

        # Tree creation time. perf_counter is used because it is a
        # monotonic, high-resolution clock intended for interval timing.
        runtimes = []
        for _ in range(repeat_count):
            elements = random_elements(element_count)
            t0 = time.perf_counter()
            pybktree.BKTree(pybktree.hamming_distance, elements)
            t1 = time.perf_counter()
            runtimes.append(t1 - t0)
        timing += [np.mean(runtimes), np.std(runtimes)]

        # Lookup time of a single element, per search distance.
        for distance in [0, 1, 2, 4, 8, 16]:
            runtimes = []
            for _ in range(repeat_count):
                elements = random_elements(element_count)
                tree = pybktree.BKTree(pybktree.hamming_distance, elements)
                t0 = time.perf_counter()
                tree.find(item=np.uint64(0), n=distance)
                t1 = time.perf_counter()
                runtimes.append(t1 - t0)
            timing += [distance, np.mean(runtimes), np.std(runtimes)]

        timings.append(timing)
    return timings
def correct_cells_whitelist(final_results, umis_per_cell, whitelist,
                            collapsing_threshold):
    """
    Collapse observed cell barcodes onto a user-supplied whitelist.

    Args:
        final_results (dict): Dict of dict of Counters with mapping results.
        umis_per_cell (Counter): Counter of number of umis per cell.
        whitelist (set): The whitelist reference given by the user.
        collapsing_threshold (int): Max distance used when searching the
            whitelist for candidates.

    Returns:
        final_results (dict): Mapping results as returned by collapse_cells.
        umis_per_cell (Counter): UMI counts as returned by collapse_cells.
        corrected_barcodes (int): Correction count reported by
            collapse_cells.
    """
    barcode_tree = pybktree.BKTree(Levenshtein.hamming, whitelist)
    print('Generated barcode tree from whitelist')

    cell_barcodes = list(final_results.keys())
    print('Finding reference candidates')

    true_to_false = defaultdict(set)
    for cell_barcode in cell_barcodes:
        # Whitelisted barcodes need no correction.
        if cell_barcode in whitelist:
            continue

        # Whitelist members within the threshold, excluding exact matches.
        hits = barcode_tree.find(cell_barcode, collapsing_threshold)
        candidates = [white_cell for d, white_cell in hits if d > 0]

        # No candidate: cannot be rescued. Several: not uniquely
        # assignable. Either way the barcode is left out.
        if len(candidates) != 1:
            continue
        true_to_false[candidates[0]].add(cell_barcode)

    umis_per_cell, final_results, corrected_barcodes = collapse_cells(
        true_to_false=true_to_false,
        umis_per_cell=umis_per_cell,
        final_results=final_results)
    return (final_results, umis_per_cell, corrected_barcodes)
def load(self, idx_dir, force=False):
    """Load the index tree from ``idx_dir`` or start an empty one.

    Does nothing when the index is already loaded, unless ``force`` is
    true. The tree is read with dill from ``<idx_dir>/idx.pk`` when that
    file exists; otherwise an empty BK-tree is created. In both cases the
    tree's distance function is set to the edit distance between the
    ``'sequence'`` entries of two items.
    """
    if self.loaded and not force:
        return

    def sequence_distance(x, y):
        return editdistance.eval(x['sequence'], y['sequence'])

    self.idx_dir = idx_dir
    self.file_path = os.path.join(idx_dir, 'idx.pk')

    if not os.path.exists(self.file_path):
        self.tree = pybktree.BKTree(sequence_distance)
    else:
        # NOTE(review): dill.load executes whatever the file contains;
        # only load index files from a trusted location.
        with open(self.file_path, 'rb') as file:
            self.tree = dill.load(file)

    self.loaded = True
    # Re-attach the metric so a deserialised tree uses the same function.
    self.tree.distance_func = sequence_distance
def main():
    """Interactively list groups of images with similar dhashes.

    Reads a JSON mapping of path -> dhash from the file named by the
    DHASH_FILE environment variable, indexes the entries in a BK-tree and,
    for every image with more than one match within MATCH_DISTANCE
    (default 5), prints the image path and its matches, then waits for
    the user to press Enter.
    """
    dhash_json = pathlib.Path(os.getenv('DHASH_FILE')).resolve()
    dhash_tree = pybktree.BKTree(diff)

    with dhash_json.open() as f:
        data = json.load(f)
    for path, dhash in data.items():
        dhash_tree.add(Image(path, dhash))

    match_distance = int(os.getenv('MATCH_DISTANCE', 5))
    for image in dhash_tree:
        matches = dhash_tree.find(image, match_distance)
        # An image always matches itself, so one hit means no neighbours.
        if len(matches) <= 1:
            continue
        print(image.path)
        for match in matches:
            print(f'{match[0]} {match[1].path}')
        # Pause between groups.
        input()
def pare_matches_and_download(thumbnail_urls, person):
    """Filter a person's thumbnails and download the survivors.

    Runs only when the person's image directory does not exist yet and
    more than 10 thumbnail URLs were supplied. Thumbnails whose face
    embedding matches the first one are kept, near-duplicates (image hash
    within distance 3) are dropped, the remaining URLs are downloaded and
    the progress counter is advanced.

    NOTE(review): the nesting below was reconstructed from a flattened
    source; confirm that the download and the counter update belong
    inside the directory/size guard.
    """
    urls = set()
    directory = '../../common/images/' + person
    if not os.path.exists(directory) and len(thumbnail_urls) > 10:
        # Keep only images showing the same person as the first image,
        # which is assumed to be of the right person.
        try:
            identifier = face.Identifier(threshold=1.0)
            images = map(identifier.download_image, thumbnail_urls)
            urls_and_embeddings = identifier.detect_encode_all(
                images, thumbnail_urls, True)
            anchor_embedding = urls_and_embeddings[0].embedding
            for other in urls_and_embeddings:
                is_match, _ = identifier.compare_embedding(
                    anchor_embedding, other.embedding)
                if is_match:
                    urls.add(other.url)
            del identifier
        except Exception as e:
            print(e)

        # Remove duplicate images.
        image_hashes = [HASH_URL(url_to_img_hash(url), url) for url in urls]
        tree = pybktree.BKTree(image_distance, image_hashes)

        to_discard = []
        urls_to_keep = set()
        # Sorted traversal stores images in order of similarity, which
        # makes leftover duplicates easier to spot.
        for image_hash in sorted(tree):
            if image_hash in to_discard:
                continue
            # Everything within distance 3 of a kept image is discarded.
            for match in tree.find(image_hash, 3):
                if match[1].url != image_hash.url:
                    to_discard.append(match[1])
            urls_to_keep.add(image_hash.url)

        # Download the images
        download_urls(person, list(urls_to_keep))

        # Update counter
        try:
            increment()
            timer.update(int(counter.value))
        except Exception as e:
            print(e)
def find_threshold(db, threshold=1):
    """Group documents whose hashes lie within ``threshold`` of each other.

    Every document's hex ``'hash'`` is added to a BK-tree (Hamming
    distance). Each not-yet-grouped hash is then looked up; when the
    lookup returns more than one hit, all documents carrying any of the
    returned hashes are collected into one group.

    Args:
        db: Collection-like object exposing ``find()`` and
            ``find({'hash': ...})``.
        threshold: Maximum Hamming distance between hashes in a group.

    Returns:
        A list of dicts with keys ``'_id'`` (the hash the group was found
        from), ``'total'`` and ``'items'`` (the documents, each with
        ``'file_name'`` copied from ``'_id'``).
    """
    dups = []

    # Build a tree
    tree = pybktree.BKTree(pybktree.hamming_distance)
    cprint('Finding fuzzy duplicates, it might take a while...')
    cnt = 0
    for document in db.find():
        tree.add(int(document['hash'], 16))
        cnt += 1

    # Progress denominator. Clamped to 1 so that a collection holding a
    # single document does not raise ZeroDivisionError.
    progress_total = max(cnt - 1, 1)

    deduplicated = set()
    for scanned, document in enumerate(db.find()):
        cprint("\r%d%%" % (scanned * 100 / progress_total), end='')

        doc_hash = document['hash']
        if doc_hash in deduplicated:
            continue
        deduplicated.add(doc_hash)

        hash_len = len(doc_hash)
        similar = tree.find(int(doc_hash, 16), threshold)
        if len(similar) <= 1:
            continue

        similars = []
        # The set drops repeated (distance, hash) pairs, which occur when
        # the same hash was added to the tree more than once.
        for distance, item_hash in set(similar):
            # Back to zero-padded hex of the original width.
            item_hash = format(item_hash, '0' + str(hash_len) + 'x')
            if distance > 0:
                deduplicated.add(item_hash)
            for item in db.find({'hash': item_hash}):
                item['file_name'] = item['_id']
                similars.append(item)

        if similars:
            dups.append({
                '_id': doc_hash,
                'total': len(similars),
                'items': similars,
            })

    return dups
def build_dict_tree(dict_path):
    """Index the PNG images under ``dict_path`` by their dhash.

    Each file whose path ends in ``png`` is opened, converted to
    greyscale and hashed with ``dhash.dhash_int``; files with a falsy
    hash are skipped.

    Returns:
        (chr_name, hash_list, bk_tree): the file names without directory
        and 4-character extension, the matching hashes, and a BK-tree over
        those hashes using Hamming distance.
    """
    hash_list, chr_name = [], []
    for f_path in get_all_files(dict_path):
        if f_path[-3:] != 'png':
            continue
        grey = Image.open(f_path).convert('L')
        represent_hash = dhash.dhash_int(grey)
        if not represent_hash:
            continue
        hash_list.append(represent_hash)
        # Strip the directory part and the ".png" suffix.
        chr_name.append(f_path.split('/')[-1][:-4])

    bk_tree = pybktree.BKTree(pybktree.hamming_distance, hash_list)
    return chr_name, hash_list, bk_tree
def main():
    """Time near-duplicate detection with whash over 55 runs.

    Run ``i`` uses search distance ``i // 5``. Each run hashes every
    image in the current directory, records ``(filename, earlier_filename)``
    for every hit and writes the pairs to ``foundchanges<i>.txt``. The
    wall-clock time of every run is written to ``whash.txt`` as
    ``(run, seconds)`` rows.
    """
    # Other candidates: imagehash.average_hash, imagehash.dhash,
    # imagehash.phash
    hashFunction = imagehash.whash
    # Image file extensions, more can be added if necessary
    ext = ('.jpg', '.jpeg', '.gif', '.png')

    averageTime = []
    for i in range(55):
        start = time.time()
        distance = i // 5

        tree = pybktree.BKTree(hdist, [])
        potentialDuplicates = []
        for filename in os.listdir('.'):
            if not filename.endswith(ext):
                continue
            with Image.open(filename) as img:
                hashval = hashFunction(img)
            # Each hit is (distance, (hash, filename)); keep the filename.
            for pd in tree.find((hashval, filename), distance):
                potentialDuplicates.append((filename, pd[1][1]))
            tree.add((hashval, filename))

        print(i)
        f = 'foundchanges' + str(i) + '.txt'
        # newline='' lets the csv module control line endings itself.
        with open(f, 'w', newline='') as out:
            csv.writer(out, delimiter=',').writerows(potentialDuplicates)

        end = time.time()
        averageTime.append((i, end - start))

    with open('whash.txt', 'w', newline='') as out:
        csv.writer(out, delimiter=',').writerows(averageTime)
def get_most_common_true_sequences(read_counter, topN:int):
    """
    Pick the most abundant sequences while keeping "shadows" out.

    A very abundant true sequence (say ~100000 reads) produces error
    copies at roughly 1% (1000 reads), and those can themselves rank
    among the top entries. The candidates are therefore visited in order
    of decreasing frequency and one is accepted only if no previously
    accepted sequence lies within distance 2 of it.

    Returns the set of accepted sequences.
    """
    assert isinstance(read_counter, collections.Counter)
    from rnaseqtools.seqerrors.CB_errors import hamming_distance

    DISTANCE = 2
    bktree = pybktree.BKTree(hamming_distance)
    most_common = set()

    ranked = tqdm.tqdm(read_counter.most_common(topN),
                       desc='finding most common seqs')
    for seq, _freq in ranked:
        # Close to an already accepted sequence: treat it as a shadow.
        if bktree.find(seq, DISTANCE):
            continue
        bktree.add(seq)
        most_common.add(seq)
    return most_common
def image_grouping(images: Collection['Image'], sensitivity: Sensitivity) \
        -> Generator[Tuple[GroupIndex, Group], None, None]:
    '''Group similar images, reporting every change as it happens.

    A tuple with the group index and the image group is yielded whenever
    a new group is created or an image is added to an existing group.

    :param images: images to process,
    :param sensitivity: maximal difference between hashes of 2 images
        when they are considered similar,
    :yield: tuple with the group index and list with grouped similar
        images,
    :raise TypeError: any of the hashes is not integer
    '''
    try:
        bktree = pybktree.BKTree(Image.hamming, images)
    except TypeError:
        raise TypeError('Hashes must be integers')

    image_groups: List[Group] = []
    checked: Dict['Image', GroupIndex] = {}

    for image in images:
        distance, closest = _closest(bktree, image, sensitivity)
        if closest is None:
            continue

        # 'image' already has a group: 'closest' joins it
        if image in checked and closest not in checked:
            yield _add_img_to_existing_group(image, closest, checked,
                                             image_groups)

        # 'closest' already has a group: 'image' joins it
        if image not in checked and closest in checked:
            yield _add_img_to_existing_group(closest, image, checked,
                                             image_groups)

        # neither is grouped yet: start a new group holding both
        if not (image in checked or closest in checked):
            yield _add_new_group(image, closest, checked, image_groups,
                                 distance)
def get_multitree(voc_fd, lang_id):
    """Get a multitree for the given language.

    Reads tab-separated two-column lines from ``voc_fd`` and groups the
    lower-cased second column by its lower-cased first character.

    Returns a dict where each key is a letter and each value is a BK tree
    of the words that start with that letter.
    """
    dst = distance.Distance()
    ed = EditDistanceWrapper(lang_id, dst, phonemise)

    distractors = {}
    for line in voc_fd.readlines():
        # Exactly two columns are expected; only the second is used.
        _, w = line.strip("\n").split("\t")
        distractors.setdefault(w[0].lower(), []).append(w.lower())

    return {
        letter: pybktree.BKTree(ed.edit_distance, words)
        for letter, words in distractors.items()
    }
def dedupe_images(matched_urls: List[str], person: str) \
        -> Tuple[List[str], str]:
    """Drop near-duplicate images from ``matched_urls``.

    Every URL is hashed and indexed in a BK-tree. Walking the entries in
    sorted order, each entry not yet discarded is kept and everything
    else within distance 5 of it is discarded. The progress counter is
    advanced before returning.

    Returns the kept URLs together with ``person`` unchanged.
    """
    image_hashes = [IMAGE_HASH(url_to_img_hash(url), url)
                    for url in matched_urls]
    tree = pybktree.BKTree(image_distance, image_hashes)

    # Holds the tree entries (not bare URLs) that lost to a kept image.
    to_discard = []
    urls_to_keep = set()
    # Sorted traversal stores images in order of similarity, which makes
    # leftover duplicates easier to spot.
    for image_hash in sorted(tree):
        if image_hash in to_discard:
            continue
        for _, neighbour in tree.find(image_hash, 5):
            if neighbour.url != image_hash.url:
                to_discard.append(neighbour)
        urls_to_keep.add(image_hash.url)

    # Update counter
    increment()
    TIMER.update(int(COUNTER.value))

    return list(urls_to_keep), person
def create_PUG_umi_based(cb_records:list, ec_dict):
    """Build a directed graph linking records with neighbouring UMIs.

    Every record becomes a node. Two distinct records are connected when
    their UMIs are within distance 1 and the ``ec_dict`` entries of their
    ECs share at least one element. The edge points from the record whose
    COUNT exceeds ``2 * other.COUNT - 1`` to the other one; when neither
    count dominates, edges are added in both directions.

    Returns the resulting ``nx.DiGraph``.
    """
    # Records grouped by UMI.
    umi_dict = collections.defaultdict(list)
    for record in cb_records:
        umi_dict[record.UMI].append(record)

    # BK-tree over all distinct UMIs.
    tree = pybktree.BKTree(hamming_distance, list(umi_dict.keys()))

    nodes = set()
    edges = []
    for record in cb_records:
        nodes.add(record)
        for _, umi_neighbor in tree.find(record.UMI, 1):
            # One UMI can be carried by several records.
            for neighbor_record in umi_dict[umi_neighbor]:
                # Distance 0 also returns the record itself.
                if record == neighbor_record:
                    continue

                shared = (set(ec_dict[record.EC])
                          & set(ec_dict[neighbor_record.EC]))
                if not shared:
                    continue

                forward = (record, neighbor_record)
                backward = (neighbor_record, record)
                if record.COUNT > 2 * neighbor_record.COUNT - 1:
                    edges.append(forward)
                elif neighbor_record.COUNT > 2 * record.COUNT - 1:
                    edges.append(backward)
                else:
                    edges.extend((forward, backward))

    G = nx.DiGraph()
    G.add_nodes_from(nodes)
    G.add_edges_from(edges)
    return G
def find_su_number(self):
    """Return the size of the largest group of similar images in su_RAM/.

    After calling ``self.get_iso_object()``, every ``png`` file in
    ``su_RAM/`` is hashed with ``dhash.dhash_int`` (files with a falsy
    hash are skipped). For each hash the number of hashes within Hamming
    distance 3 is counted; the maximum count is returned. The paths that
    reach the maximum, and then all hashed paths, are printed.
    """
    self.get_iso_object()

    hash_list, img_name = [], []
    for img_f in os.listdir('su_RAM/'):
        img_path = 'su_RAM/' + img_f
        if img_f[-3:] != 'png':
            continue
        represent_hash = dhash.dhash_int(Image.open(img_path))
        if not represent_hash:
            continue
        hash_list.append(represent_hash)
        img_name.append(img_path)

    bk_tree = pybktree.BKTree(pybktree.hamming_distance, hash_list)
    # Neighbour count (including the image itself) for every hash.
    su_list = [len(bk_tree.find(hash_code, 3)) for hash_code in hash_list]

    su_number = max(su_list)
    for position, similar_number in enumerate(su_list):
        if similar_number == su_number:
            print(img_name[position])
    print(img_name)
    return su_number
# count_threads=2 # part_data=len(data)//count_threads # threads = [] # for i in range(1,count_threads+1): # print(i*part_data) # if i*part_data-part_data<0: # start=0 # else: # start=i*part_data-part_data # # end=i*part_data # t = threading.Thread(target=worker, args=(start,end,data,hash)) # threads.append(t) # t.start() # print(list(data)) tree = pybktree.BKTree(fuzzy_distance, np_data) print(tree.find(last_foto, 10)) print(tree) # for foto in data: # tree.add(convert_base(last_foto,from_base=16,to_base=16)) # print(foto) # print(hash) # print(imagehash.hex_to_hash(foto)-hash) # hash=imagehash.hex_to_hash(foto) print("--- %s seconds main ---" % (time.time() - start_time)) name = "not_aneta"
class PHashStore:
    """Store of integer perceptual hashes backed by a BK-tree.

    Hashes are compared with ``pybktree.hamming_distance``. The class also
    provides two 64/128-bit hash functions (``phash`` and ``dhash``) for
    image objects exposing ``transform``, ``make_blob``, ``size`` and
    ``close``.
    """

    def __init__(self):
        # Each store owns its tree. A class-level tree would be shared by
        # every instance, so loading one store would fill all the others.
        self.tree = pybktree.BKTree(pybktree.hamming_distance, [])

    def add(self, phash):
        """Add ``phash`` unless an identical hash is already stored."""
        if not self.exists(phash):
            self.tree.add(phash)

    def find(self, phash, distance=15):
        """Return the tree's matches for ``phash`` within ``distance``."""
        return self.tree.find(phash, distance)

    def exists(self, phash):
        """Return True when ``phash`` itself is stored (distance 0)."""
        return len(self.find(phash, 0)) > 0

    def load(self, io):
        """Add every hash from the JSON list read from ``io``."""
        data = json.load(io)
        for r in data:
            self.add(r)

    def dump(self):
        """Return the stored hashes as a sorted JSON list."""
        return json.dumps(sorted(self.tree))

    def phash_for(self, image, algorithm='dhash'):
        """Hash ``image`` with ``phash`` when asked, otherwise ``dhash``."""
        if algorithm == 'phash':
            return self.phash(image)
        else:
            return self.dhash(image)

    def phash(self, image):
        """Return a 64-bit DCT-based hash of ``image`` as a Python int.

        The image is reduced to 32x32, transformed with an orthonormal
        DCT, and the top-left 8x8 coefficients are compared against the
        average of all but the first coefficient. A bit is set where the
        coefficient is <= that average; the first coefficient is the most
        significant bit.
        """
        r = self.__ndarray_for(image, size="32x32!").astype(np.float64)
        h = fft.dctn(r, norm="ortho")[0:8, 0:8]
        avg = np.average(h.reshape(64, )[1:])
        mask = (h <= avg)
        # Fold the bits with Python integers. A dot product with
        # 2**np.arange(64) overflows int64 at 2**63 and produces negative
        # hashes, which breaks the bit-count based Hamming distance.
        value = 0
        for flag in mask.reshape(64, ):
            value = (value << 1) | int(flag)
        return value

    def dhash(self, image):
        """Return a 128-bit difference hash of ``image``.

        The first 64 bits compare each pixel of the 9x9 greyscale image
        with its left neighbour, the last 64 bits with its upper
        neighbour (rows/columns 1..8). Returns -1 on IndexError and -2 on
        ValueError raised while comparing.
        """
        r = self.__ndarray_for(image)
        h = 0
        try:
            for i in range(1, 9):
                for j in range(1, 9):
                    h = h << 1 | (1 if r[i][j] >= r[i][j - 1] else 0)
            for i in range(1, 9):
                for j in range(1, 9):
                    h = h << 1 | (1 if r[j][i] >= r[j - 1][i] else 0)
            return h
        # The sentinel return values are kept for callers; the debugger
        # breakpoints that used to sit here blocked non-interactive runs.
        except IndexError:
            return -1
        except ValueError:
            return -2

    def __ndarray_for(self, image, size="9x9!"):
        """Convert ``image`` to an 8-bit greyscale array and close it.

        The image is modified in place (alpha off, gray format, resized to
        ``size``) and its blob is reshaped to ``image.size``.
        """
        image.alpha_channel = False
        image.format = 'gray'
        image.type = 'grayscale'
        image.depth = 8
        image.transform(resize=size)
        result = np.asarray(bytearray(image.make_blob()),
                            dtype=np.uint8).reshape(image.size)
        image.close()
        return result

    def hamming2(self, s1, s2):
        """Count differing positions of two equal-length sequences."""
        assert len(s1) == len(s2)
        return sum(c1 != c2 for c1, c2 in zip(s1, s2))
def makeBkTree(func, addr="../data/DICT.txt"):
    """Build a BK-tree from the entries read at ``addr``.

    ``func`` is used as the tree's distance function; the entries come
    from ``__readFilesAsList(addr)``.
    """
    entries = __readFilesAsList(addr)
    return pybktree.BKTree(func, entries)
def makeBkTree(func, addr):
    """Build a BK-tree from the entries read at ``addr``.

    ``func`` is used as the tree's distance function; the entries come
    from ``__readFilesAsList(addr)``.
    """
    entries = __readFilesAsList(addr)
    return pybktree.BKTree(func, entries)
else: word2wiki_entity[word] = [entity] entity_word_set = set(word2wiki_entity) print("entity_word_set DONE") entity_totally_match = all_entity & entity_in_wiki # entity totally matched in wiki with open("enwiki_match.txt", "w") as f: # add totally matched entity to file for entity in entity_totally_match: num = entity2vec[entity] f.write("{},,,{},,,{},,,Total_Match\n".format(entity.lower(), entity, num)) Levenshtein_tree = pybktree.BKTree(distance, entity_in_wiki) print("Levenshtein_bktree Done") Word_tree = pybktree.BKTree(distance, entity_word_set) print("Word_tree Done") for entity_to_be_match in (all_entity - entity_totally_match): candidates = [] with open("enwiki_match.txt", "a") as f: for long_entity in entity_totally_match: if partof(entity_to_be_match, long_entity): candidates.append(long_entity) if len(candidates)!=0: entity_matched = process.extractOne(entity_to_be_match, candidates) num = entity2vec[entity_matched[0]] f.write("{},,,{},,,{},,,Abbreviation\n".format(entity_to_be_match.lower(), entity_matched, num))
import pybktree
import pandas as pd
import time
import geopandas
from shapely.geometry import Point

# Small timing script: load the postcode CSV, build a demo BK-tree and
# select the rows whose `pcds` value contains "SO15".
t1 = time.time()
df = pd.read_csv("/home/bigdata/Downloads/Data/miniNSPL.csv")

tree = pybktree.BKTree(pybktree.hamming_distance,
                       [0, 4, 5, 14, 65, 4, 76, 4, 35, 63, 23])

# print(...) with a single argument produces the same output on Python 2
# and Python 3, whereas the bare print statement is a syntax error on 3.
print(sorted(tree))
print(df.head())
print(df.shape)

Southampton = df[df.pcds.str.contains("SO15")]
print(Southampton.shape)

t2 = time.time()
print(t2 - t1)
else: threshold = int( input('Enter threshold (e.g. \'40\' means the dhashes are \ 40% different and 60% similar): ')) hashDict = {} hashList = [] files = os.listdir(filePath) for file in files: image = Image.open(filePath + '/' + file) imageDhash = dhash.dhash_int(image) hashDict[imageDhash] = file hashList.append(imageDhash) f = csv.writer(open('dhashNearMatches.csv', 'w')) f.writerow(['percentage'] + ['dhash1'] + ['dhash2']) completeNearMatches = [] tree = pybktree.BKTree(pybktree.hamming_distance, hashList) for hash in hashList: nearMatches = tree.find(hash, threshold) for nearMatch in nearMatches: if hashDict[hash] != hashDict[nearMatch[1]]: print(nearMatch[0], hashDict[hash], hashDict[nearMatch[1]]) hashTuple = (nearMatch[0], hashDict[hash], hashDict[nearMatch[1]]) hashTupleReversed = (nearMatch[0], hashDict[nearMatch[1]], hashDict[hash]) if hashTupleReversed not in completeNearMatches: completeNearMatches.append(hashTuple) for hashTuple in completeNearMatches: f.writerow([hashTuple[0]] + [hashTuple[1]] + [hashTuple[2]])
prob_factor = 1 / sum(occurrences) for pred in predecessors: successor[pred] *= prob_factor # sort inverse lookup for successor in words_inverse.keys(): pred_and_probs = words_inverse[successor].items() pred_and_probs = sorted(pred_and_probs, key=lambda x: x[1], reverse=True) words_inverse[successor] = dict() for (pred, probability) in pred_and_probs: words_inverse[successor][pred] = probability print("Normalizing word frequencies...") for word in words.values(): successors = word.keys() occurrences = word.values() prob_factor = 1 / sum(occurrences) for successor in successors: word[successor] *= prob_factor print("Building BKTree...") tree = pybktree.BKTree(editdistance.eval) [tree.add(word) for word in words.keys()] print("Dumping to file...") model = dict() model['words'] = words model['words_inverse'] = words_inverse model['tree'] = tree dill.dump(model, open(f"{config.MODEL}/model.dill", 'wb'))
def __init__(self, match_threshold=lambda s: 1 + 0.3 * len(s)):
    """Create an empty matcher.

    ``match_threshold`` is a callable taking a query string; it is stored
    as ``self.get_match_threshold``. The default is ``1 + 0.3 * len(s)``.
    The match tree starts empty and uses Damerau-Levenshtein distance.
    """
    self.get_match_threshold = match_threshold
    self.max_query_len = 0
    self.match_map = {}
    self.match_tree = pybktree.BKTree(
        jellyfish.damerau_levenshtein_distance)