from collections import Counter

from imagededup.methods import PHash


def get_duplicate_imgs_tuple(img_path):
    phasher = PHash()
    encodings = phasher.encode_images(image_dir=img_path)
    duplicates = phasher.find_duplicates(encoding_map=encodings)
    # Collect pairs of duplicate images --> duplicate_imgs_tuple
    duplicate_imgs_tuple = []
    img_checked = []
    for img, imgs in duplicates.items():
        if len(imgs) == 1 and img not in img_checked:
            temp = [img] + imgs
            duplicate_imgs_tuple.append(temp)
            # deduplication: mark both images as already handled
            for each in temp:
                img_checked.append(each)
    # Transformation: strip the extension and keep the numeric file stem
    for i in range(len(duplicate_imgs_tuple)):
        duplicate_imgs_tuple[i] = [int(each.split('.')[0])
                                   for each in duplicate_imgs_tuple[i]]
    # Post-processing:
    # Previous tests found that phash might put three pictures into two
    # similar pairs, so a round of selection was added.
    flatten_tuple = []
    for pair in duplicate_imgs_tuple:
        for img_id in pair:
            flatten_tuple.append(img_id)
    counter_dict = Counter(flatten_tuple)  # count occurrences to spot images shared between pairs
    error_keys = [key for key, value in counter_dict.items() if value > 1]
    # Keep only pairs that contain none of the shared images
    treated_dup_imgs_tuple = [each for each in duplicate_imgs_tuple
                              if len(set(each + error_keys)) - len(error_keys) == len(each)]
    return treated_dup_imgs_tuple
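# For intuition, a tiny standalone check of the pair filter above; the integer
# file stems are made up for illustration. Pairs that share an image are both
# discarded, and only fully disjoint pairs survive.
from collections import Counter

pairs = [[1, 2], [2, 3], [4, 5]]  # hypothetical numeric file stems
counts = Counter(x for pair in pairs for x in pair)
error_keys = [k for k, v in counts.items() if v > 1]  # [2]: image 2 appears in two pairs
kept = [p for p in pairs if len(set(p + error_keys)) - len(error_keys) == len(p)]
print(kept)  # [[4, 5]]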
import json
import os

from imagededup.methods import PHash
from tqdm import tqdm


def main():
    # os.walk yields BASE_DIR itself as its first entry, so this list
    # already covers the root directory and every subdirectory.
    base_dirs = [item[0] for item in os.walk(BASE_DIR)]
    phasher = PHash()
    all_encodings = {}
    for pic_dir in tqdm(base_dirs):
        # Generate encodings for all images in an image directory
        encodings = phasher.encode_images(image_dir=pic_dir)
        # Prefix each key with its directory (encode_images defaults to
        # key = <filename>; the full path is wanted as the key).
        full_path_encodings = {f'{pic_dir}/{k}': v for k, v in encodings.items()}
        all_encodings.update(full_path_encodings)
    # Save encodings to disk
    with open(f"{BASE_DIR}/pictures_encodings.json", 'w') as f:
        json.dump(all_encodings, f)
    # Find duplicates using the generated encodings
    duplicates = phasher.find_duplicates(
        encoding_map=all_encodings,
        outfile=f"{BASE_DIR}/pictures_duplicates.json")
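# PHash encodings are plain hex strings, so the JSON written above round-trips
# cleanly. A minimal sketch of reusing the cache on a later run, assuming the
# same BASE_DIR layout:
import json

from imagededup.methods import PHash

with open(f"{BASE_DIR}/pictures_encodings.json") as f:
    cached_encodings = json.load(f)  # full path -> hex hash string
duplicates = PHash().find_duplicates(encoding_map=cached_encodings)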
def remove_duplicates(data_dir, image_names):
    phasher = PHash()
    hashed_images = dict()
    unique_images = set()
    # Group images by their perceptual hash string
    for img in tqdm(image_names):
        img_hash = phasher.encode_image(data_dir + img)
        hashed_images.setdefault(img_hash, []).append(img)
    # Keep one representative image per hash group
    for phash, imgs in hashed_images.items():
        unique_images.add(imgs[0])
    return unique_images
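# Note that grouping on identical hash strings only collapses exact phash
# collisions; near-duplicates a few bits apart are missed. A sketch of the
# distance-based alternative, assuming the same data_dir holds the images:
from imagededup.methods import PHash

phasher = PHash()
encodings = phasher.encode_images(image_dir=data_dir)
# Pairs within a Hamming distance of 4 are reported as duplicates as well
near_duplicates = phasher.find_duplicates(encoding_map=encodings,
                                          max_distance_threshold=4)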
def test_imagededup(img_dir):
    # Note: not well suited for exact-match deduplication
    from imagededup.methods import PHash, DHash
    phasher = PHash()
    # Generate encodings for all images in an image directory
    encodings = phasher.encode_images(image_dir=img_dir)
    # Find duplicates using the generated encodings
    duplicates = phasher.find_duplicates(encoding_map=encodings,
                                         scores=True,
                                         max_distance_threshold=0)
    for key in sorted(duplicates.keys()):
        if len(duplicates[key]) > 0:
            print(key, ':', duplicates[key])
def __init__(self, songPath: str):
    self.songPath = songPath
    path, ext = os.path.splitext(self.songPath)
    self.head, self.fileName = os.path.split(self.songPath)
    if ext == ".wav":
        self.sampleRate, self.Data = wavfile.read(self.songPath)
        self.wavfile = self.songPath
    elif ext == ".mp3":
        # Convert mp3 to wav via pydub so scipy can read it
        self.Sound = AudioSegment.from_mp3(self.songPath)
        self.wavfile = 'database/' + os.path.splitext(self.fileName)[0] + '.wav'
        self.Sound.export(self.wavfile, format="wav")
        self.sampleRate, self.Data = wavfile.read(self.wavfile)
    # Downmix stereo to mono
    if len(self.Data.shape) == 2 and self.Data.shape[1] == 2:
        self.Data = np.mean(self.Data, axis=1)
    # Keep only the first 60 seconds of samples
    self.TimeOfSampling = 1 / self.sampleRate
    self.NumberOfSample = int(60 / self.TimeOfSampling)
    self.Data = self.Data[0:self.NumberOfSample]
    self.HashFuncs = [AHash(), WHash(), PHash(), DHash()]
    self.HashFileNames = ["AHash", "WHash", "PHash", "DHash"]
    self.imageArray = None
    self.spectrogram(self.Data, self.sampleRate)
    self.SpectrogramFeatures()
    self.hashFunction()
def get_duplicates(im_paths, method="cnn", encodings=None, deduper=None, **kwargs):
    from imagededup.methods import PHash, CNN
    if encodings is None:
        encodings, deduper = get_encodings(im_paths, method)
    if method == "cnn":
        deduper = deduper or CNN()
        assert isinstance(deduper, CNN)
        thresh = "min_similarity_threshold"
        if thresh not in kwargs:
            kwargs[thresh] = 0.9
    elif method == "phash":
        deduper = deduper or PHash()
        assert isinstance(deduper, PHash)
        thresh = "max_distance_threshold"
        if thresh not in kwargs:
            kwargs[thresh] = 10
    else:
        raise ValueError(f"Method {method} unknown")
    duplicates = deduper.find_duplicates(encoding_map=encodings, scores=True, **kwargs)
    return duplicates, encodings, deduper
def get_encodings(im_paths, method="cnn"):
    import imagededup.utils.data_generator as data_generator
    from imagededup.methods import PHash, CNN
    if method == "cnn":
        # Monkey-patch the data generator so CNN.encode_images iterates over
        # im_paths instead of globbing a real directory.
        data_generator.DataGenerator = get_data_gen(im_paths)
        deduper = CNN()
        encodings = deduper.encode_images('/')
    elif method == "phash":
        deduper = PHash()
        encodings = run_parallel(deduper, im_paths)
    else:
        raise ValueError(f"Method {method} unknown")
    return encodings, deduper
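# A hypothetical call tying the two helpers together, assuming im_paths is a
# list of image file paths. Reusing the returned encodings and deduper lets a
# second pass try a different threshold without re-hashing anything:
duplicates, encodings, deduper = get_duplicates(im_paths, method="phash")
strict, _, _ = get_duplicates(im_paths, method="phash", encodings=encodings,
                              deduper=deduper, max_distance_threshold=2)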
def remove_duplicates():
    phasher = PHash()
    # Generate encodings for all images in an image directory
    encodings = phasher.encode_images(image_dir=image_dir)
    # Find duplicates using the generated encodings
    duplicates = phasher.find_duplicates(encoding_map=encodings)
    # Remove duplicates, keeping one image per duplicate group
    duplicate_keys = [key for key in duplicates.keys() if len(duplicates[key]) != 0]
    removed = 0
    for key in duplicates.keys():
        if key in duplicate_keys:
            for dup in duplicates[key]:
                try:
                    os.remove(os.path.join(image_dir, dup))
                    # Drop the deleted file from the pending list so its own
                    # duplicate group is not processed again.
                    if dup in duplicate_keys:
                        duplicate_keys.remove(dup)
                    removed += 1
                except OSError:
                    pass
    print(f'Removed {removed} duplicates')
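# imagededup also ships a one-step helper that returns a flat list of files
# judged safe to delete, which sidesteps the manual bookkeeping above; a
# minimal sketch reusing the encodings from this snippet:
to_delete = phasher.find_duplicates_to_remove(encoding_map=encodings)
for fname in to_delete:
    os.remove(os.path.join(image_dir, fname))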
#print("Caught signal from %r, data %r" % (sender, kw)) status.text(kw['message']) return 'received!' @send_progress.connect def receive_progress(sender, **kw): #print("Caught signal from %r, data %r" % (sender, kw)) progress_bar.progress(int(round(kw['percent']))) #status.text(kw['message']) return 'received!' if option == "pHash": try: phasher = PHash() duplicates = phasher.find_duplicates(image_dir, max_distance_threshold=max_distance, \ scores=True) except Exception as e: duplicates = {} st.write(str(e)) elif option == "CNN": try: cnn_encoder = CNN() duplicates = cnn_encoder.find_duplicates(image_dir=image_dir, scores=True) except Exception as e: duplicates = {} st.write(str(e)) elif option == "aHash": try:
def SpectrogramFeatures(self):
    # Compute the spectral bandwidth of the signal, then perceptually hash
    # the resulting feature matrix as if it were an image
    featuresData = librosa.feature.spectral_bandwidth(y=np.array(self.Data, dtype=np.float32))
    featuresData = PHash().encode_image(image_array=featuresData)
    FileNameOfspectroFeatures = os.path.splitext(self.fileName)[0] + ".SpectroFeatures"
    self.saveFeaturesData(FileNameOfspectroFeatures, featuresData)
ap.add_argument("-f", "--folder", required=True, help="path to directory of images") args, unknown = ap.parse_known_args() folder = args.folder if os.path.isabs(folder): directory = folder else: directory = os.path.sep.join(["data", args.folder]) deleted = directory + "_duplicates" if not os.path.exists(deleted): os.makedirs(deleted) phasher = PHash() duplicates = phasher.find_duplicates_to_remove(image_dir=directory, max_distance_threshold=5) print("[INFO] removing {} duplicates".format(str(len(duplicates)))) for duplicate in duplicates: pathFrom = os.path.join(directory, duplicate) pathTo = os.path.join(deleted, duplicate) os.rename(pathFrom, pathTo) noExtension = os.path.splitext(duplicate)[0] labelFile = os.path.join(directory, noExtension) + ".txt" if os.path.exists(labelFile): txtPath = os.path.join(deleted, noExtension) + ".txt"
import os

from imagededup.methods import PHash

image_dir = 'I://net hat//001'

if __name__ == '__main__':
    phasher = PHash()
    duplicates_list = phasher.find_duplicates_to_remove(image_dir)
    # Delete every file flagged for removal
    for duplicate in duplicates_list:
        os.remove(os.path.join(image_dir, duplicate))
import os
from shutil import copyfile

import cv2
from imagededup.methods import PHash
from tqdm import tqdm

# Copy only the images that OpenCV can actually decode
os.mkdir('clean_data')
files = os.listdir('data')
for i in tqdm(range(len(files))):
    try:
        im = cv2.imread('data/' + files[i], cv2.IMREAD_GRAYSCALE)
        im.shape  # raises AttributeError when imread returned None
        copyfile('data/' + files[i], 'clean_data/' + files[i])
    except Exception:
        pass

phasher = PHash()
encodings = phasher.encode_images(image_dir='clean_data/')
duplicates = phasher.find_duplicates(encoding_map=encodings)
# Walk the duplicate map, keeping one representative per duplicate group
data = []
dkeys = list(duplicates.keys())
ind_key = 0
while True:
    print(len(dkeys))
    data.append(dkeys[ind_key])
    if len(duplicates[dkeys[ind_key]]) > 0:
        for j in range(len(duplicates[dkeys[ind_key]])):
            try:
                dkeys.remove(duplicates[dkeys[ind_key]][j])
from imagededup.methods import PHash
import os
import json

root = '/media/palm/62C0955EC09538ED/ptt/full_sized'
duplicates = []
for cls in os.listdir(root)[1:]:
    phasher = PHash()
    # Generate encodings for all images in an image directory
    encodings = phasher.encode_images(image_dir=os.path.join(root, cls))
    # Find duplicates using the generated encodings
    duplicate = phasher.find_duplicates(encoding_map=encodings, max_distance_threshold=1)
    with open('/home/palm/PycharmProjects/ptt/datastuffs/dups/' + cls + '.json', 'w') as write:
        json.dump([duplicate, encodings], write)
    duplicates.append(duplicate)
PHash: http://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html
"""
import os

import matplotlib.pylab as plt

from imagededup.methods import PHash
from imagededup.utils import plot_duplicates

PATH = r'E:\DATASET\pku-autonomous-driving'
test_img_dir = os.path.join(PATH, 'test_images')

# Find similar images.
# __Note:__ `max_distance_threshold` is the Hamming-distance cutoff below which
# two images are considered similar; the higher the value, the more tolerant
# the matching is of differences.
#
# Below we list the first 15 images found having similar content according to
# imagededup. To get the full list, display the content of the variable
# `duplicates`.
phasher = PHash()
duplicates = phasher.find_duplicates(image_dir=test_img_dir, scores=True,
                                     max_distance_threshold=3)
print('There are', len([x for x in duplicates if duplicates[x] != []]),
      'images with similar images over', len(duplicates), 'images.')
# There are 429 images with similar images over 2021 images.

plt.figure(figsize=(20, 20))
plot_duplicates(image_dir=test_img_dir, duplicate_map=duplicates,
                filename='ID_5bf531cf3.jpg')
from pyqtgraph import PlotWidget
import pyqtgraph as pg
from PyQt5 import QtCore, QtGui, QtWidgets, QtMultimedia
from PyQt5.QtWidgets import QDialog, QApplication, QPushButton, QVBoxLayout, QTableWidgetItem
import logging
import numpy as np
import matplotlib.pyplot as plt
from numpy import genfromtxt
import pandas as pd
import librosa
import librosa.display
# from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from PIL import Image
from imagededup.methods import PHash

phasher = PHash()

import os
import csv
from math import floor
from pydub import AudioSegment

# Create the working directories if they do not exist yet
count = 0
temp2 = []
for filename in os.listdir("./"):
    count += 1
    temp2 = temp2 + [filename]
if "tempDir" not in temp2:
    os.mkdir('tempDir')
if "back" not in temp2:
    os.mkdir('back')
parser.add_argument('-p', '--path', type=str, required=True,
                    help="path of folder, for duplicate trashing")
args = parser.parse_args()

def remove(path):
    """ param <path> could either be relative or absolute. """
    if os.path.isfile(path) or os.path.islink(path):
        os.remove(path)  # remove the file
    elif os.path.isdir(path):
        shutil.rmtree(path)  # remove dir and all it contains
    else:
        raise ValueError("file {} is not a file or dir.".format(path))

if __name__ == "__main__":
    path = args.path
    del_list = []
    phasher = PHash()
    encodings = phasher.encode_images(image_dir=path)
    duplicates = phasher.find_duplicates(encoding_map=encodings)
    # Collect every duplicate of a file that is not itself being deleted
    for k, v in duplicates.items():
        if len(v) and (k not in del_list):
            for fname in v:
                del_list.append(fname)
    print('Deleting Duplicates :\n{0}'.format(del_list))
    for dl in del_list:
        remove(os.path.join(path, dl))
from imagededup.methods import PHash

phasher = PHash()
# Generate encodings for all images in an image directory
encodings = phasher.encode_images(image_dir='../Testout')
# Find duplicates using the generated encodings
duplicates = phasher.find_duplicates(encoding_map=encodings)

# from imagededup.utils import plot_duplicates
# # plot duplicates obtained for a given file using the duplicates dictionary
# plot_duplicates(image_dir='path/to/image/directory',
#                 duplicate_map=duplicates,
#                 filename='ukbench00120.jpg')
cache_group.add_argument('--save',
                         help='save encoding map (phash of images) as pkl',
                         action='store_true')
cache_group.add_argument('--load',
                         help='load encoding map (phash of images) from pkl',
                         type=str)
args = parser.parse_args()

dist_thresh = int(args.thresh)
assert 0 <= dist_thresh <= 64
root_dir = Path(args.directory)
assert root_dir.is_dir()
out_dir = root_dir.parent / 'Dups_thresh{}'.format(dist_thresh)

phasher = PHash()
if args.load is not None and Path(args.load).is_file():
    import pickle
    encoding_map = pickle.load(open(args.load, 'rb'))
    print(f'Encoding map loaded from pickle file: {args.load}!')
else:
    tic = time.perf_counter()
    encoding_map = phasher.encode_images(image_dir=root_dir, rglob=True)
    toc = time.perf_counter()
    print(f'encoding duration: {toc-tic:.3f}s')
    if args.save:
        import pickle
        pickle_file = f"{root_dir.stem}_encoding_map.pkl"
        pickle.dump(encoding_map, open(pickle_file, "wb"))
        print(f'Encoding map dumped as pickle at: {pickle_file}')
from imagededup.methods import PHash

phasher = PHash()
encodings = phasher.encode_images(image_dir='path/to/image/directory')
duplicates = phasher.find_duplicates(encoding_map=encodings)
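# Passing scores=True makes each entry carry its Hamming distance, which is
# useful for tuning the threshold; a minimal sketch continuing from the
# snippet above:
duplicates_scored = phasher.find_duplicates(encoding_map=encodings, scores=True)
for fname, matches in duplicates_scored.items():
    # Each value is a list of (filename, hamming_distance) tuples
    for match, distance in matches:
        print(fname, '->', match, 'distance:', distance)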
    for item in dup_items:
        image2 = item[0].split('_')[0]
        if image1 != image2 and (
                (image1, image2) not in duplicates_list) and (
                (image2, image1) not in duplicates_list):
            duplicates_list.append((image1, image2))
            scores.append(item[1])
    duplicates_df = pd.DataFrame(duplicates_list, columns=['image1', 'image2'])
    duplicates_df['score'] = scores
    return duplicates_df

# The max_distance_threshold parameter of phash.find_duplicates() specifies
# the Hamming distance below which retrieved duplicates are considered valid.
# We'll start with a max_distance_threshold of 8.

# In[5]:

phash = PHash()
encodings = phash.encode_images(
    image_dir='../input/hpa-single-cell-image-classification/train')
encodings_public = phash.encode_images(image_dir='../input/publichpa_1024')
encodings.update(encodings_public)
duplicates = phash.find_duplicates(encoding_map=encodings, scores=True,
                                   max_distance_threshold=8)
duplicates_df = convert_dict_to_df(duplicates)
duplicates_df.to_csv('../input/duplicates.csv', index=False)