def process(input_dir, out_dir_clean, out_dir_discard, res: str):
    """Resize every image in input_dir into out_dir_clean as JPEG; images
    rejected by resize_or_discard are saved to out_dir_discard instead.
    Files already present in either output dir are skipped (resumable)."""
    os.makedirs(out_dir_clean, exist_ok=True)
    os.makedirs(out_dir_discard, exist_ok=True)
    images_cleaned = set(os.listdir(out_dir_clean))
    images_discarded = set(os.listdir(out_dir_discard))
    images_dl = os.listdir(input_dir)
    N = len(images_dl) // NUM_TASKS
    clean = 0
    discarded = 0
    res = get_res(res)  # parse the resolution string once, before the loop
    for i, imfile in job_enumerate(images_dl):
        if imfile in images_cleaned:
            clean += 1
            continue
        if imfile in images_discarded:
            discarded += 1
            continue
        im = Image.open(join(input_dir, imfile))
        im2 = resize_or_discard(im, res, should_clean=True)
        if im2 is not None:
            fn, ext = os.path.splitext(imfile)
            im2.save(join(out_dir_clean, fn + '.jpg'), quality=QUALITY)
            clean += 1
        else:
            im.save(join(out_dir_discard, imfile))
            discarded += 1
        print(f'\r{os.path.basename(input_dir)} -> {os.path.basename(out_dir_clean)} // '
              f'Resized: {clean}/{N}; Discarded: {discarded}/{N}', end='')
    # Done
    print(f'\n{os.path.basename(input_dir)} -> {os.path.basename(out_dir_clean)} // '
          f'Resized: {clean}/{N}; Discarded: {discarded}/{N}')
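

# NUM_TASKS / job_enumerate above come from the task_array helpers used
# throughout this repo. The sketch below only illustrates the round-robin
# partitioning they are used for here; the name and signature are
# hypothetical, not the actual task_array implementation.
def _sketch_job_enumerate(iterable, num_tasks=1, task_id=0):
    """Yield (index, element) for every num_tasks-th element, offset by
    task_id, so that all tasks of an array job together cover the list."""
    for i, x in enumerate(iterable):
        if i % num_tasks == task_id:
            yield i, x
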
def process_all_in(self, input_dir, filter_imgs_dir):
    images_dl = iter_images(input_dir)  # generator of paths
    # files this job should compress
    files_of_job = [p for _, p in job_enumerate(images_dl)]
    # files that were compressed already by somebody (i.e. this job earlier)
    processed_already = self.images_cleaned | self.images_discarded
    # resulting files to be compressed
    files_of_job = [p for p in files_of_job
                    if get_fn(p) not in processed_already]
    if filter_imgs_dir:
        ps_orig = cached_listdir_imgs(filter_imgs_dir, discard_shitty=True).ps
        fns_to_use = set(map(get_fn, ps_orig))
        print('Filtering with', len(fns_to_use), 'filenames. Before:', len(files_of_job))
        files_of_job = [p for p in files_of_job if get_fn(p) in fns_to_use]
        print('Filtered, now', len(files_of_job))
    N = len(files_of_job)
    if N == 0:
        print('Everything processed / nothing to process.')
        return
    num_process = 2 if NUM_TASKS > 1 else int(os.environ.get('MAX_PROCESS', 16))
    print(f'Processing {N} images using {num_process} processes in {NUM_TASKS} tasks...')
    start = time.time()
    predicted_time = None
    with multiprocessing.Pool(processes=num_process) as pool:
        for i, clean in enumerate(pool.imap_unordered(self.process, files_of_job)):
            if i > 0 and i % 100 == 0:
                time_per_img = (time.time() - start) / (i + 1)
                time_remaining = time_per_img * (N - i)
                if not predicted_time:
                    predicted_time = time_remaining
                print(f'\r{time_per_img:.2e} s/img | {i / N * 100:.1f}% | '
                      f'{time_remaining / 60:.1f} min remaining',
                      end='', flush=True)
    if predicted_time:
        print(f'Actual time: {(time.time() - start) / 60:.1f} // '
              f'predicted {predicted_time / 60:.1f}')
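

# get_fn is used above as the key for de-duplication against the
# already-processed sets. A minimal sketch of such a helper (assumption: it
# strips directory and extension, matching how files are compared in the
# compress() script below; the real helper may differ):
def _sketch_get_fn(p):
    """Return the file name of `p` without directory or extension."""
    return os.path.splitext(os.path.basename(p))[0]
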
def _create_pickle(root_p, pkl_p, distributed_create, create_without_shitty,
                   num_folder_levels=0):
    print(f'Globbing {root_p}...')
    ps = list(iter_images(root_p, num_folder_levels))
    print(f'Found {len(ps)} files!')
    if distributed_create:
        print('--distributed_create, filtering files...')
        ps = [p for _, p in task_array.job_enumerate(ps)]
        print(f'{len(ps)} files left!')
    if create_without_shitty:
        print('--create_without_shitty given')
    database = []
    start = time.time()
    shitty_reasons = defaultdict(int)
    num_processes = (int(os.environ.get('MAX_PROCESS', 16))
                     if not distributed_create else 8)
    h = _ProcessHelper(create_without_shitty)
    with multiprocessing.Pool(processes=num_processes) as pool:
        for i, res in enumerate(pool.imap_unordered(h.process, ps)):
            if res is None:
                continue
            database.append(res)
            _, _, shitty_reason = res
            if shitty_reason:
                shitty_reasons[shitty_reason] += 1
            if i > 0 and i % 100 == 0:
                time_per_img = (time.time() - start) / (i + 1)
                time_remaining = time_per_img * (len(ps) - i)
                info = (f'\r{time_per_img:.2e} s/img | {i / len(ps) * 100:.1f}% | '
                        f'{time_remaining / 60:.1f} min remaining')
                if shitty_reasons and i % 1000 == 0:
                    info += ' ' + '|'.join(
                        f'{reason}:{count}' for reason, count in
                        sorted(shitty_reasons.items(), key=operator.itemgetter(1)))
                print(info, end='', flush=True)
    info = '|'.join(f'{reason}:{count}' for reason, count in
                    sorted(shitty_reasons.items(), key=operator.itemgetter(1)))
    print('\nshitty_reasons', info)
    print('Processed all')
    if len(database) == 0:
        raise ValueError(f'No images found in {root_p}!')
    print(f'\nGlobbed {len(database)} images, storing in {pkl_p}!')
    with open(pkl_p, 'wb') as fout:
        pickle.dump(database, fout)
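

# For reference, the database written by _create_pickle can be read back with
# a plain pickle.load. A minimal sketch; the per-entry tuple layout is only
# known from the `_, _, shitty_reason = res` unpacking above, everything else
# about the entries is an assumption:
def _sketch_load_pickle(pkl_p):
    """Load the list of per-image records stored by _create_pickle."""
    with open(pkl_p, 'rb') as fin:
        return pickle.load(fin)
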
def process(input_dir, out_dir, res: str, should_clean):
    """Resize images from input_dir into <input_dir>_<res>_clean (or _all)
    as PNG; rejected images go to the corresponding discard dir. Files that
    already exist in either output dir are skipped (resumable)."""
    inp_base, inp_name = os.path.split(input_dir.rstrip(os.path.sep))
    if out_dir is None:
        out_dir = inp_base
    output_dir = os.path.join(
        out_dir, inp_name + '_' + res + ('_clean' if should_clean else '_all'))
    os.makedirs(output_dir, exist_ok=True)
    # if not should_clean, we do not discard because of HSV, but we still
    # might because of size!
    discard_dir = os.path.join(
        out_dir, inp_name + '_' + res + ('_discard' if should_clean else '_discard_size'))
    os.makedirs(discard_dir, exist_ok=True)
    images_dl = os.listdir(input_dir)
    N = len(images_dl) // NUM_TASKS
    images_done = set(os.listdir(output_dir)) | set(os.listdir(discard_dir))
    clean = 0
    discarded = 0
    skipped = 0
    res = get_res(res)  # parse the resolution string once, before the loop
    for i, imfile in job_enumerate(images_dl):
        if imfile in images_done:
            skipped += 1
            continue
        im = Image.open(join(input_dir, imfile))
        im2 = resize_or_discard(im, res, should_clean=should_clean)
        if im2 is not None:
            fn, ext = os.path.splitext(imfile)
            im2.save(join(output_dir, fn + '.png'))
            clean += 1
        else:
            im.save(join(discard_dir, imfile))
            discarded += 1
        print('\r{} -> {} // Skipped {}/{}, Resized {}/{}, Discarded {}/{}'.format(
            os.path.basename(input_dir), os.path.basename(output_dir),
            skipped, N, clean, N, discarded, N),
            end='')
    # Done
    print('\n{} -> {} // Skipped {}/{}, Resized {}/{}, Discarded {}/{} // Completed'.format(
        input_dir, output_dir, skipped, N, clean, N, discarded, N))
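

# resize_or_discard (used above) returns a resized PIL image or None for
# images that should be discarded. The sketch below only illustrates the
# size-based part of that contract (resize so the shorter side equals `res`,
# discard images that are already smaller); the HSV-based cleaning mentioned
# above is not reproduced here, and the real helper may differ.
def _sketch_resize_by_shorter_side(im, res):
    w, h = im.size
    if min(w, h) < res:
        return None  # too small, the caller moves it to the discard dir
    scale = res / min(w, h)
    return im.resize((round(w * scale), round(h * scale)), Image.LANCZOS)
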
def process_all_in(self, input_dir):
    images_dl = iter_images(input_dir)  # generator of paths
    # files this job should compress
    files_of_job = [p for _, p in job_enumerate(images_dl)]
    # files that were compressed already by somebody (i.e. this job earlier)
    processed_already = self.images_cleaned | self.images_discarded
    # resulting files to be compressed
    files_of_job = [p for p in files_of_job
                    if get_fn(p) not in processed_already]
    N = len(files_of_job)
    if N == 0:
        print('Everything processed / nothing to process.')
        return
    num_process = 2 if NUM_TASKS > 1 else _NUM_PROCESSES
    print(f'Processing {N} images using {num_process} processes in {NUM_TASKS} tasks...')
    start = time.time()
    predicted_time = None
    with multiprocessing.Pool(processes=num_process) as pool:
        for i, clean in enumerate(pool.imap_unordered(self.process, files_of_job)):
            if i > 0 and i % 100 == 0:
                time_per_img = (time.time() - start) / (i + 1)
                time_remaining = time_per_img * (N - i)
                if not predicted_time:
                    predicted_time = time_remaining
                print(f'\r{time_per_img:.2e} s/img | '
                      f'{i / N * 100:.1f}% | '
                      f'{time_remaining / 60:.1f} min remaining',
                      end='', flush=True)
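

# Note on num_process above: when the script runs as a task array
# (NUM_TASKS > 1), only 2 worker processes are used per task, presumably to
# avoid oversubscribing nodes that run several tasks at once. _NUM_PROCESSES
# itself is defined elsewhere in this module; following the pattern of the
# other scripts here, it is probably something like the following (this is an
# assumption, not the actual definition):
#   _NUM_PROCESSES = int(os.environ.get('MAX_PROCESS', 16))
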
def compress(compressor: Compressor, indir, discard_shitty):
    ds = cached_listdir_imgs.cached_listdir_imgs(
        indir, min_size=None, discard_shitty=discard_shitty)
    imgs = ds.ps
    compressor.optimal_qs = get_optimal_qs.read(indir)
    if compressor.optimal_qs:
        print('Optimal Qs:', len(compressor.optimal_qs))
    assert len(imgs) > 0, f'no matches for {indir}'
    num_imgs_to_process = len(imgs) // task_array.NUM_TASKS
    images_of_job = [p for _, p in task_array.job_enumerate(imgs)]
    N_orig = len(images_of_job)
    images_of_job = [
        p for p in images_of_job
        if os.path.splitext(os.path.basename(p))[0] not in compressor.files_that_exist
    ]
    N = len(images_of_job)
    start = time.time()
    num_process = 2 if task_array.NUM_TASKS > 1 else int(os.environ.get('MAX_PROCESS', 16))
    print(f'{task_array.JOB_ID}:',
          f'Compressing {N}/{N_orig} images ({ds.id}) using {num_process} processes',
          f'in {task_array.NUM_TASKS} tasks...')
    with multiprocessing.Pool(processes=num_process) as pool:
        for i, _ in enumerate(pool.imap_unordered(compressor.compress, images_of_job)):
            if i > 0 and i % 5 == 0:
                time_per_img = (time.time() - start) / (i + 1)
                time_remaining = time_per_img * (N - i)
                print(f'\r{time_per_img * num_process:.2e} s/img | '
                      f'{i / N * 100:.1f}% | {time_remaining / 60:.1f} min remaining',
                      end='', flush=True)
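

# pool.imap_unordered sends `compressor.compress` to the worker processes, so
# Compressor instances must be picklable (defined at module level, with
# picklable attributes). A minimal hypothetical stub showing only the
# interface assumed by compress() above (files_that_exist, optimal_qs, and a
# compress(path) method); the real Compressor class is defined elsewhere and
# does the actual work:
class _SketchCompressor:
    def __init__(self, out_dir):
        self.out_dir = out_dir
        self.files_that_exist = set()  # file names (no extension) already compressed
        self.optimal_qs = None

    def compress(self, img_p):
        # Placeholder: a real implementation would write a compressed version
        # of img_p into self.out_dir.
        return img_p
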
def create_curves_for_images(root_dir, out_dir, grid, mode):
    times = []
    # make sure we exclude here! otherwise, task_array.job_enumerate sees a
    # different number of files in every job!!
    all_img_ps = _get_image_paths(root_dir, exclude={'tmp'})
    assert_exc(len(all_img_ps) > 0, 'No images found', ValueError)
    non_pngs = [p for p in all_img_ps if not p.endswith('.png')]
    assert_exc(len(non_pngs) == 0,
               f'Only .pngs are supported by this code! Found {len(non_pngs)} others.')
    measure_over_interval = {
        'bpg': bpg_measure_over_interval,
        'balle': balle_measure_over_interval,
        # 'bpgslow': bpg_measure_over_interval_slow,
        'jp2k': jp2k_measure_over_interval,
        'jp': jp_measure_over_interval,
        'webp': webp_measure_over_interval
    }[mode]
    for i, img_p in task_array.job_enumerate(all_img_ps):
        print('>>>', task_array.TASK_ID, 'compresses', os.path.basename(img_p))
        img_name = os.path.splitext(os.path.basename(img_p))[0]
        s = time.time()
        mf = measures_file_p(out_dir, img_name)
        if complete_measures_file_exists(mf, num_ops=len(grid)):
            print(f'Found output for {img_name}, skipping...')
            continue
        # need to create measures file
        with open(mf, 'w+') as f:
            measure_over_interval(img_p, f, grid)
        times.append(time.time() - s)
        avg_time = np.mean(times[-15:])
        print('Time left: {:.2f}min'.format(avg_time * (len(all_img_ps) - i) / 60))
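

# complete_measures_file_exists (used above) decides whether an image can be
# skipped. A minimal sketch under the assumption that a measures file is
# complete when it exists and has at least one line per grid point; the real
# helper may check more than that:
def _sketch_complete_measures_file_exists(mf, num_ops):
    """Return True if the measures file `mf` exists and already contains
    num_ops measurement lines."""
    if not os.path.isfile(mf):
        return False
    with open(mf) as f:
        return sum(1 for _ in f) >= num_ops
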