def run():
    """Entry point: resize/preprocess an ImageNet-style tree with worker processes.

    Scans ``IMAGENET_DIR`` for images (only the ``val`` split when ``DEBUG``),
    splits the classes into ``num_process`` chunks, runs ``process`` over them
    in parallel (single call in debug mode), then reports saved/ignored/error
    counts and writes every error message to an ``errors.txt`` file next to
    the output dataset directory.

    Relies on module-level config: RESIZE, IMAGENET_DIR, VISIBLE_SUBDIRS,
    DEBUG, REBUILD_METADATA, IMG_EXTENSIONS, MAX_N_PROCESS, NEW_DIR_POSTFIX.
    """
    print("Image preprocessing with multi-workers.")
    print(f"RESIZE: {RESIZE}")
    print(f"IMAGENET_DIR: {IMAGENET_DIR}")
    print(f"VISIBLE_SUBDIRS: {VISIBLE_SUBDIRS}")
    print(f"DEBUG_MODE: {'ON' if DEBUG else 'OFF'}")

    # Quick sanity check that the progress bar renders in this terminal.
    print("Warming up tqdm.")
    for _ in tqdm(range(10)):
        time.sleep(0.1)

    print(f'\nScanning dirs..')
    meta = Metadata.load_or_make(
        # meta_dir=path.join(root, os.pardir),
        remake=REBUILD_METADATA,
        data_dir=IMAGENET_DIR,
        # In debug mode restrict the scan to the (smaller) validation split.
        visible_subdirs=['val'] if DEBUG else VISIBLE_SUBDIRS,
        extensions=IMG_EXTENSIONS,
    )
    n_classes = len(meta.idx_to_samples)
    n_images = sum(len(v) for v in meta.idx_to_samples.values())
    print(f'Done. {n_classes} classes and {n_images} images are found.')

    num_process = 1 if DEBUG else min(mp.cpu_count(), MAX_N_PROCESS)
    # num_process = mp.cpu_count()
    chunks = chunkify_classes(meta, num_process)

    if DEBUG:
        # Single in-process call: easier to step through with a debugger.
        print('Start single processing for debugging.')
        results = [process(chunks[0])]
    else:
        print(f'Start {num_process} processes.')
        pool = mp.Pool(processes=num_process)
        try:
            results = pool.map(process, chunks)
        finally:
            pool.close()  # no more tasks
            pool.join()   # wait for workers to wrap up current tasks

    print("Preprocessing completed.")
    print('\n' * num_process)  # move past the workers' tqdm bars

    # Aggregate the per-worker (saved, passed, error_msgs) triples.
    saved_total = 0
    passed_total = 0
    error_msgs_total = []
    for saved, passed, error_msgs in results:
        saved_total += saved
        passed_total += passed
        error_msgs_total.extend(error_msgs)
    print(f"[!] {saved_total} saved file(s) / "
          f"{passed_total} ignored (already exist) file(s) / "
          f"{len(error_msgs_total)} error(s).")

    # Log errors into the output dataset dir:
    #   <parent of IMAGENET_DIR>/<dataset>_<NEW_DIR_POSTFIX>_<h>_<w>/errors.txt
    logfile_name = 'errors.txt'
    base = path.normpath(path.join(IMAGENET_DIR, os.pardir))
    dataset = path.basename(IMAGENET_DIR)
    dataset = "_".join([dataset, NEW_DIR_POSTFIX, *map(str, RESIZE)])
    logfile_path = path.join(base, dataset, logfile_name)
    with open(logfile_path, 'w') as f:
        for i, error_msg in enumerate(error_msgs_total):
            f.write(f'[Error {i}] {error_msg}\n')
    print(f"Error messages logged in {logfile_path}. "
          "Top 10 lines are as follows:")
    # Preview the first 10 logged lines in pure Python instead of
    # `os.system('head ...')` — portable and no shell interpolation of the path.
    for i, error_msg in enumerate(error_msgs_total[:10]):
        print(f'[Error {i}] {error_msg}')