def test_get_files_to_remove():
    """get_files_to_remove should flag exactly one file from each duplicate pair."""
    from collections import OrderedDict

    # Two duplicate pairs ('1','2') and ('3','4') plus one file with no duplicates.
    duplicate_map = OrderedDict(
        [
            ('1', ['2']),
            ('2', ['1', '3']),
            ('3', ['4']),
            ('4', ['3']),
            ('5', []),
        ]
    )

    removable = general_utils.get_files_to_remove(duplicate_map)

    # One survivor per pair: '1' and '3' are kept, '2' and '4' are marked for removal.
    assert set(removable) == {'2', '4'}
def find_duplicates_to_remove(
    self,
    image_dir: PurePath = None,
    encoding_map: Dict[str, np.ndarray] = None,
    min_similarity_threshold: float = 0.9,
    outfile: Optional[str] = None,
) -> List:
    """
    Give out a list of image file names to remove based on the similarity threshold. Does not
    remove the mentioned files.

    Args:
        image_dir: Path to the directory containing all the images.
        encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of
                      filenames and corresponding CNN encodings (numpy arrays keyed by file name).
        min_similarity_threshold: Optional, threshold value (must be float between -1.0 and 1.0).
                                  Default is 0.9.
        outfile: Optional, name of the file to save the results, must be a json. Default is None.

    Returns:
        duplicates: List of image file names that should be removed.

    Raises:
        ValueError: If neither image_dir nor encoding_map is provided.

    Example:
    ```
    from imagededup.methods import CNN
    myencoder = CNN()
    duplicates = myencoder.find_duplicates_to_remove(image_dir='path/to/images/directory',
    min_similarity_threshold=0.85)

    OR

    from imagededup.methods import CNN
    myencoder = CNN()
    duplicates = myencoder.find_duplicates_to_remove(encoding_map=<mapping filename to cnn encodings>,
    min_similarity_threshold=0.85, outfile='results.json')
    ```
    """
    if not image_dir and not encoding_map:
        # Previously this condition fell through and the method silently returned None;
        # fail loudly so the caller knows no input was supplied.
        raise ValueError('Provide either an image directory or an encoding map!')

    duplicates = self.find_duplicates(
        image_dir=image_dir,
        encoding_map=encoding_map,
        min_similarity_threshold=min_similarity_threshold,
        scores=False,
    )
    files_to_remove = get_files_to_remove(duplicates)
    if outfile:
        save_json(files_to_remove, outfile)
    return files_to_remove
def find_duplicates_to_remove(
    self,
    image_dir: PosixPath = None,
    encoding_map: Dict[str, str] = None,
    max_distance_threshold: int = 10,
    outfile: Optional[str] = None,
) -> List:
    """
    Give out a list of image file names to remove based on the hamming distance threshold.
    Does not remove the mentioned files.

    Args:
        image_dir: Path to the directory containing all the images.
        encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of
                      filenames and corresponding hashes (hash strings keyed by file name).
        max_distance_threshold: Optional, hamming distance between two images below which retrieved
                                duplicates are valid. (must be an int between 0 and 64). Default is 10.
        outfile: Optional, name of the file to save the results.

    Returns:
        duplicates: List of image file names that are found to be duplicates of some other file in
                    the directory.

    Raises:
        ValueError: If neither image_dir nor encoding_map is provided.

    Example:
    ```
    from imagededup.methods import <hash-method>
    myencoder = <hash-method>()
    duplicates = myencoder.find_duplicates_to_remove(image_dir='path/to/images/directory',
    max_distance_threshold=15)

    OR

    from imagededup.methods import <hash-method>
    myencoder = <hash-method>()
    duplicates = myencoder.find_duplicates_to_remove(encoding_map=<mapping filename to hashes>,
    max_distance_threshold=15, outfile='results.json')
    ```
    """
    if not image_dir and not encoding_map:
        # Guard consistent with the CNN variant: without either input, find_duplicates
        # has nothing to work on — fail loudly instead of propagating None downstream.
        raise ValueError('Provide either an image directory or an encoding map!')

    result = self.find_duplicates(
        image_dir=image_dir,
        encoding_map=encoding_map,
        max_distance_threshold=max_distance_threshold,
        scores=False,
    )
    files_to_remove = get_files_to_remove(result)
    if outfile:
        save_json(files_to_remove, outfile)
    return files_to_remove