def detect_duplicate_images(imgpath_list):
    '''
    Group image paths by a hash of their decoded pixel data.

    Every path in ``imgpath_list`` is loaded with ``imread`` and hashed with
    ``helpers.hashstr`` using the module-level ``DUPLICATE_HASH_PRECISION``.
    Paths whose images produce the same hash end up in the same list.

    When ``'--strict'`` is present in ``sys.argv`` every group with more than
    one member is re-read and compared element-wise against its first image
    to rule out hash collisions.

    Input:
        imgpath_list: sequence of image paths accepted by ``imread``

    Returns:
        dict mapping each hash value to the list of paths (in input order)
        that produced it; lists longer than one hold duplicates.

    Raises:
        Exception: in strict mode, when two images share a hash but differ.
            ``DUPLICATE_HASH_PRECISION`` is increased by 8 before raising so
            that a retry hashes with a different precision.
    '''
    import sys
    global DUPLICATE_HASH_PRECISION
    nImg = len(imgpath_list)
    lbl = 'checking duplicate'
    duplicates = {}
    mark_progress, end_progress = helpers.progress_func(nImg, lbl=lbl)
    for count, gpath in enumerate(imgpath_list):
        mark_progress(count)
        img = imread(gpath)
        img_hash = helpers.hashstr(img, DUPLICATE_HASH_PRECISION)
        # setdefault keeps the result a plain dict (no defaultdict surprises
        # for callers that probe missing keys).
        duplicates.setdefault(img_hash, []).append(gpath)
    # The progress display only tracks the hashing loop; close it here so it
    # is also finished when the strict check below raises.
    end_progress()
    if '--strict' in sys.argv:
        # Be very safe: Check for collisions
        # .values() works on both Python 2 and 3 (iteritems is Python 2 only).
        for gpath_list in duplicates.values():
            if len(gpath_list) < 2:
                # A single image cannot collide with anything.
                continue
            img1 = imread(gpath_list[0])
            # Skip index 0: comparing the reference image with itself only
            # costs an extra read.
            for gpath in gpath_list[1:]:
                img2 = imread(gpath)
                # array_equal is False for mismatched shapes, whereas
                # ``img1 == img2`` would broadcast or raise.
                if not np.array_equal(img1, img2):
                    DUPLICATE_HASH_PRECISION += 8
                    raise Exception("hash collision. try again")
    return duplicates
def _gen(fpath_list):
    '''
    Exif generator: lazily yield ``read_exif(fpath, **kwargs)`` per path.

    ``kwargs`` is not a parameter of this function; it is looked up in a
    surrounding scope that is not visible from here.

    Input:
        fpath_list: sized sequence of file paths

    Yields:
        whatever ``read_exif`` returns for each path, in input order
    '''
    num_paths = len(fpath_list)
    mark_progress, end_progress = helpers.progress_func(
        num_paths, '[io] Load Image EXIF', 16)
    for index, image_path in enumerate(fpath_list):
        mark_progress(index)
        exif = read_exif(image_path, **kwargs)
        yield exif
    # Only reached once the consumer has exhausted the generator.
    end_progress()
def _compute_in_serial(task_list, task_lbl='', verbose=True): # Serialize Tasks result_list = [] nTasks = len(task_list) if verbose: mark_progress, end_prog = helpers.progress_func(nTasks, lbl=task_lbl) # Compute each task for count, (fn, args) in enumerate(task_list): mark_progress(count) result = fn(*args) result_list.append(result) end_prog() else: # Compute each task for (fn, args) in iter(task_list): result = fn(*args) result_list.append(result) print('[parallel] ... done') return result_list
def _compute_in_serial(task_list, task_lbl='', verbose=True): # Serialize Tasks result_list = [] nTasks = len(task_list) if verbose: mark_progress, end_prog = util.progress_func(nTasks, lbl=task_lbl) # Compute each task for count, (fn, args) in enumerate(task_list): mark_progress(count) #sys.stdout.flush() result = fn(*args) result_list.append(result) end_prog() else: # Compute each task for (fn, args) in iter(task_list): result = fn(*args) result_list.append(result) print('[parallel] ... done') return result_list
def _compute_in_parallel(task_list, num_procs, task_lbl='', verbose=True):
    '''
    Run tasks in ``num_procs`` worker processes and collect their results.

    Input:
        task_list: [ (fn, args), ... ]
        num_procs: number of worker processes to start; each one runs
                   ``_worker(task_queue, done_queue)``
        task_lbl:  label handed to the progress display (verbose only)
        verbose:   True  -> report progress via ``helpers.progress_func``
                   False -> stay quiet and print one completion notice

    Returns:
        list with one entry per task, in the order the results arrive on
        the done queue (which need not match the order of ``task_list``).

    NOTE: the call blocks until ``len(task_list)`` results have been
    received, so at least one worker must be running when tasks are given.
    '''
    task_queue = multiprocessing.Queue()
    done_queue = multiprocessing.Queue()
    nTasks = len(task_list)
    # queue tasks
    for task in task_list:
        task_queue.put(task)
    # start processes
    proc_list = []
    # range (not xrange) so the function runs on both Python 2 and 3
    for i in range(num_procs):
        printDBG('[parallel] creating process %r' % (i, ))
        proc = multiprocessing.Process(target=_worker, args=(task_queue, done_queue))
        proc.start()
        # Keep the process handle itself (the list used to append itself).
        proc_list.append(proc)
    # wait for results
    printDBG('[parallel] waiting for results')
    sys.stdout.flush()
    result_list = []
    if verbose:
        mark_progress, end_prog = helpers.progress_func(nTasks, lbl=task_lbl, spacing=num_procs)
        for count in range(nTasks):
            mark_progress(count)
            printDBG('[parallel] done_queue.get()')
            result_list.append(done_queue.get())
        end_prog()
    else:
        # Results must be kept here too; otherwise a quiet run returns [].
        for _ in range(nTasks):
            result_list.append(done_queue.get())
        print('[parallel] ... done')
    printDBG('[parallel] stopping children')
    # stop children processes: one 'STOP' sentinel per worker
    # (presumably _worker exits when it reads it -- _worker is not visible here)
    for _ in range(num_procs):
        task_queue.put('STOP')
    # Every result has been consumed already, so joining cannot block on a
    # worker that is still flushing data into done_queue.
    for proc in proc_list:
        proc.join()
    return result_list
def _compute_in_parallel(task_list, num_procs, task_lbl='', verbose=True):
    '''
    Run tasks in ``num_procs`` daemon worker processes and collect results.

    Input:
        task_list: [ (fn, args), ... ]
        num_procs: number of worker processes to start; each one runs
                   ``_worker(task_queue, done_queue)``
        task_lbl:  label handed to the progress display (verbose only)
        verbose:   True  -> report progress via ``util.progress_func``
                   False -> stay quiet and print one completion notice

    Returns:
        list with one entry per task, in the order the results arrive on
        the done queue (which need not match the order of ``task_list``).

    NOTE: the call blocks until ``len(task_list)`` results have been
    received, so at least one worker must be running when tasks are given.
    '''
    task_queue = multiprocessing.Queue()
    done_queue = multiprocessing.Queue()
    nTasks = len(task_list)
    # queue tasks
    for task in task_list:
        task_queue.put(task)
    # start processes
    proc_list = []
    # range (not xrange) so the function runs on both Python 2 and 3
    for i in range(num_procs):
        printDBG('[parallel] creating process %r' % (i,))
        proc = multiprocessing.Process(target=_worker, args=(task_queue, done_queue))
        # Daemon workers are terminated when the parent process exits.
        proc.daemon = True
        proc.start()
        proc_list.append(proc)
    # wait for results
    printDBG('[parallel] waiting for results')
    sys.stdout.flush()
    result_list = []
    if verbose:
        mark_progress, end_prog = util.progress_func(nTasks, lbl=task_lbl, spacing=num_procs)
        for count in range(nTasks):
            mark_progress(count)
            printDBG('[parallel] done_queue.get()')
            result_list.append(done_queue.get())
        end_prog()
    else:
        # Results must be kept here too; otherwise a quiet run returns [].
        for _ in range(nTasks):
            result_list.append(done_queue.get())
        print('[parallel] ... done')
    printDBG('[parallel] stopping children')
    # stop children processes: one 'STOP' sentinel per worker
    # (presumably _worker exits when it reads it -- _worker is not visible here)
    for _ in range(num_procs):
        task_queue.put('STOP')
    # All results are already consumed, so the workers have nothing left to
    # flush and the joins return once each worker has seen its sentinel.
    for proc in proc_list:
        proc.join()
    return result_list