def merge_near_duplicates(near_duplicate_objects):
    """
    Merge the image dictionaries of adjacent near-duplicate results.

    Every adjacent pair of entries (0 and 1, 1 and 2, ...) is wrapped in two
    NearDuplicate objects, merged with
    NearDuplicate.merge_near_duplicate_dictionaries, and the result of each
    pairwise merge is folded into one dictionary with dict.update, so on a
    key collision the value from the later pair wins.

    Args:
        near_duplicate_objects: a list of tuples. Each tuple is a
            (SimhashIndex, image_dictionary), where image_dictionary is an
            object which contains simhash keys and image/filename values.
            None or an empty list is accepted.

    Returns:
        a dictionary containing simhash keys and image/filename values.
        An empty dictionary is returned for None or empty input. When
        there is exactly one entry, its image_dictionary is returned as-is
        (the same object, not a copy).
    """
    if not near_duplicate_objects:
        return {}
    if len(near_duplicate_objects) == 1:
        # Nothing to merge with: hand back the lone (index, image_dictionary)
        # entry's dictionary untouched.
        return near_duplicate_objects[0][1]

    final_dict = {}
    adjacent_pairs = zip(near_duplicate_objects, near_duplicate_objects[1:])
    for (left_index, left_images), (right_index, right_images) in adjacent_pairs:
        # Build empty NearDuplicate shells and attach the already-computed
        # state, instead of re-running deduplication on the files.
        left_nd, right_nd = NearDuplicate([]), NearDuplicate([])
        left_nd.image_dictionary, right_nd.image_dictionary = left_images, right_images
        left_nd.simhash_index, right_nd.simhash_index = left_index, right_index
        final_dict.update(left_nd.merge_near_duplicate_dictionaries(right_nd))
    return final_dict
def near_deduplicate_images(file_array, bit_distance, metadata=None):
    """
    Run near-duplicate detection over a list of files.

    Builds a NearDuplicate from the arguments, calls its
    deduplicate_images() method, and returns the state it exposes afterwards.

    Args:
        file_array: a list of file names, passed to NearDuplicate as its
            first positional argument.
        bit_distance: forwarded to NearDuplicate as ``k``.
        metadata: forwarded to NearDuplicate as ``metadata_dictionary``;
            defaults to None.

    Returns:
        a tuple (simhash_index, image_dictionary) read from the
        NearDuplicate object after deduplication.
    """
    deduplicator = NearDuplicate(
        file_array,
        k=bit_distance,
        metadata_dictionary=metadata,
    )
    deduplicator.deduplicate_images()
    simhash_index = deduplicator.simhash_index
    image_dictionary = deduplicator.image_dictionary
    return simhash_index, image_dictionary