def encode_images(self, image_dir=None, rglob=False):
    """
    Generate hashes for all images in a given directory of images.

    Args:
        image_dir: Path to the image directory.
        rglob: If True, recurse into subdirectories when collecting images;
            otherwise only the top level of the directory is scanned.

    Returns:
        dictionary: A dictionary that contains a mapping of filenames and
        corresponding 64 character hash string such as
        {'Image1.jpg': 'hash_string1', 'Image2.jpg': 'hash_string2', ...}

    Raises:
        ValueError: If image_dir is missing or is not an existing directory.

    Example:
    ```
    from imagededup.methods import <hash-method>
    myencoder = <hash-method>()
    mapping = myencoder.encode_images('path/to/directory')
    ```
    """
    # Guard the None default explicitly: os.path.isdir(None) raises TypeError,
    # but callers are promised a ValueError for bad input.
    if image_dir is None or not os.path.isdir(image_dir):
        raise ValueError('Please provide a valid directory path!')

    image_dir = Path(image_dir)

    # Single traversal path for both modes; skip hidden files (dot-prefixed).
    candidates = image_dir.rglob('*') if rglob else image_dir.glob('*')
    files = [
        i.absolute()
        for i in candidates
        if not i.name.startswith('.') and i.is_file()
    ]

    logger.info('Start: Calculating hashes...')

    hashes = parallelise(self.encode_image, files, self.verbose)
    hash_initial_dict = dict(zip([f.name for f in files], hashes))
    # Drop None entries (returned when there was some problem with an image file)
    hash_dict = {k: v for k, v in hash_initial_dict.items() if v}

    logger.info('End: Calculating hashes!')
    return hash_dict
def get_cosine_similarity(
    X: np.ndarray, chunk_size: int = 1000, threshold: int = 10000
) -> np.ndarray:
    """
    Compute the pairwise cosine-similarity matrix for the rows of X.

    Args:
        X: 2-D feature matrix, one row per item.
        chunk_size: Number of rows per chunk when computing in chunks.
        threshold: Maximum row count for which the similarity matrix is
            computed in one shot; above it, work is split into chunks and
            parallelised to bound peak memory.

    Returns:
        np.ndarray: Square (n_rows x n_rows) cosine-similarity matrix.
    """
    n_rows = X.shape[0]

    if n_rows <= threshold:
        print('Small feature matrix for calculating cosine similarities...')
        return cosine_similarity(X)

    print('Large feature matrix thus calculating cosine similarities in chunks...')
    # Row ranges [start, end) covering the whole matrix; last chunk may be short.
    start_idxs = list(range(0, n_rows, chunk_size))
    end_idxs = start_idxs[1:] + [n_rows]

    # The original enumerated the zip but never used the index; pair directly.
    cos_sim = parallelise(
        cosine_similarity_chunk,
        [(X, idxs) for idxs in zip(start_idxs, end_idxs)],
    )
    return np.vstack(cos_sim)
def _get_query_results(
        self, search_method_object: Union[BruteForce, BKTree]) -> None:
    """
    Get result for the query using specified search object. Populate the global query_results_map.

    Args:
        search_method_object: BruteForce or BKTree object to get results for the query.
    """
    # Materialize the key order once; it is reused to re-associate results below.
    query_names = list(self.queries.keys())
    n_queries = len(self.queries)

    args = list(
        zip(
            query_names,
            list(self.queries.values()),
            [search_method_object] * n_queries,
            [self.threshold] * n_queries,
        ))
    result_map_list = parallelise(self._searcher, args)
    # parallelise preserves input order, so zip with the same key list.
    result_map = dict(zip(query_names, result_map_list))

    # Sort each query's matches by ascending distance (second tuple element).
    # sorted() already returns a fresh list, so no extra copy is needed.
    self.query_results_map = {
        k: sorted(v, key=lambda tup: tup[1])
        for k, v in result_map.items()
    }  # {'filename.jpg': [('dup1.jpg', 3)], 'filename2.jpg': [('dup2.jpg', 10)]}
def run_parallel(encoder, files):
    """
    Hash the given files in parallel using the supplied encoder.

    Args:
        encoder: Hashing-method instance exposing `encode_image` and `verbose`.
        files: Iterable of image file paths to encode.

    Returns:
        dict: Mapping of each input file to its hash string. Entries whose
        hash is None (problematic image files) are dropped, consistent with
        `encode_images`.
    """
    from imagededup.utils.general_utils import parallelise

    hashes = parallelise(encoder.encode_image, files, encoder.verbose)
    # Filter out failed encodings so callers see the same shape of result
    # that encode_images produces.
    return {f: h for f, h in zip(files, hashes) if h}