def main():
    args = parse_arguments()

    if args.exclude:
        exclude_names = read_image_names(args.exclude)
    else:
        exclude_names = set()

    if args.refine:
        # Refine mode: only revisit images that already have an annotation.
        annotations = read_annotations(args.annotation_path)
        include_names = frozenset(annotations.keys())
    else:
        annotations = {}
        include_names = UniverseSet()

    image_names = []
    for image_path in find_images(args.images_folder):
        image_name = image_path.relative_to(args.images_root.resolve())
        if image_name in exclude_names or image_name not in include_names:
            continue
        image_names.append(image_name)

    print(f'{len(image_names)} images to classify')

    images_not_found = frozenset(include_names) - frozenset(image_names)
    if images_not_found:
        print(f'Warning: images could not be found: {list(images_not_found)}')

    selector = ImageSelector(args.images_root, image_names, args.annotation_path,
                             annotations)
    selector.run()
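# `UniverseSet` is presumably defined or imported elsewhere in the repo. A minimal
# sketch of the behaviour the code above relies on: every membership test succeeds,
# and iterating it yields nothing, so frozenset(include_names) is empty and the
# "not found" warning is skipped when no annotation file restricts the set. The
# actual implementation may differ.
class UniverseSet:
    """Set-like object that reports every element as a member."""

    def __contains__(self, item):
        return True

    def __iter__(self):
        # Nothing concrete to enumerate; frozenset(UniverseSet()) is empty.
        return iter(())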
def main():
    args = parse_arguments()
    images_dir = args.images_dir

    model = get_model()
    image_paths = list(find_images(images_dir))
    embeddings = compute_embeddings(model, image_paths)

    df = pd.DataFrame({
        'image_paths': [
            str(path.relative_to(args.data_root)) for path in image_paths
        ],
        'embeddings': list(embeddings),
    })
    table = pa.Table.from_pandas(df)
    pq.write_table(table, args.output)
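# `get_model` and `compute_embeddings` are not shown above. A hedged sketch of what
# they are assumed to do: load a pretrained feature extractor and map each image to
# one embedding vector. MobileNetV2, the 224x224 resize and the batch size are
# illustrative assumptions, not necessarily the real choices; TensorFlow is assumed
# to be imported as `tf`, as in the duplicate-browser script below.
def get_model():
    return tf.keras.applications.MobileNetV2(include_top=False, pooling='avg')


def compute_embeddings(model, image_paths, batch_size=32):
    embeddings = []
    for start in range(0, len(image_paths), batch_size):
        batch = []
        for path in image_paths[start:start + batch_size]:
            image = tf.io.decode_image(tf.io.read_file(str(path)),
                                       channels=3, expand_animations=False)
            image = tf.image.resize(image, (224, 224))
            batch.append(tf.keras.applications.mobilenet_v2.preprocess_input(image))
        embeddings.append(model(tf.stack(batch), training=False))
    # One row per image; the rows become the 'embeddings' list column above.
    return tf.concat(embeddings, axis=0).numpy()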
def main():
    args = parse_arguments()

    print('Reading datasets...')
    dataset_keys = set()
    for dataset_csv_path in find_dataset_csvs(args.data_root / 'datasets'):
        input_df = pd.read_csv(dataset_csv_path, header=None)
        dataset_keys.update(input_df[0])

    print('Checking metadata...')
    metadata_keys = set()
    for metadata_csv_path in find_metadata_csvs(args.data_root / 'source/metadata'):
        df = pd.read_csv(metadata_csv_path, header=None)
        metadata_keys.update(df[0])

    not_in_metadata = dataset_keys - metadata_keys
    if not_in_metadata:
        print('WARNING: Keys in dataset csvs, but not in metadata csvs:')
        for key in not_in_metadata:
            print(f'  {key}')
        print()

    print('Checking object files...')
    object_keys = frozenset(
        str(image_path.relative_to(args.data_root))
        for image_path in find_images(args.data_root / 'source/'))

    print('Checking tabular files...')
    table_keys = set()
    for table_path in find_table_files(args.data_root / 'source/'):
        df = load_dataframe(table_path)
        table_keys.update(df[0])

    missing_keys = (dataset_keys | metadata_keys) - (object_keys | table_keys)
    if missing_keys:
        print('ERROR: Keys in csvs, but not among object files:')
        for key in missing_keys:
            print(f'  {key}')
        print()

    print('Done')
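# `find_dataset_csvs`, `find_metadata_csvs`, `find_table_files` and `load_dataframe`
# are helpers defined elsewhere. A plausible sketch, assuming the finders are simple
# recursive globs and that "tabular files" are either headerless CSVs or parquet
# files whose first column holds the key; the real helpers may filter differently.
def find_dataset_csvs(root):
    yield from sorted(Path(root).rglob('*.csv'))


def load_dataframe(table_path):
    if table_path.suffix == '.parquet':
        df = pq.read_table(table_path).to_pandas()
        df.columns = range(len(df.columns))  # integer labels, mirroring header=None
        return df
    return pd.read_csv(table_path, header=None)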
def main():
    args = parse_arguments()

    # Resume from an existing blacklist so repeated runs only append new names.
    if Path(args.output_file).exists():
        blacklist = pd.read_csv(args.output_file, header=None, names=['image_name'])
        blacklisted = set(blacklist.image_name)
    else:
        blacklisted = set()

    images_dir = Path(args.images_dir)
    image_paths = list(find_images(images_dir))
    random.shuffle(image_paths)

    # Group images by content hash to find byte-identical duplicates.
    hash_map = defaultdict(list)  # hash -> List[image_name]
    for image_path in image_paths:
        image_name = image_path.relative_to(args.data_root)
        with open(image_path, 'rb') as infile:
            image_data = infile.read()
        image_hash = hashlib.blake2b(image_data).hexdigest()
        hash_map[image_hash].append(image_name)

    for image_hash, hash_paths in hash_map.items():
        if len(hash_paths) <= 1:
            continue
        print(f'Hash {image_hash} is shared by multiple images:')
        for hash_path in hash_paths:
            print(f'  {hash_path}')
        print('Blacklisting all except the first one', end='\n\n')
        # Store names as str so they compare equal to entries read back from the CSV.
        blacklisted.update(str(path) for path in hash_paths[1:])

    blacklist_series = pd.Series(list(blacklisted))
    # header=False matches the header=None used when the file is read back above.
    blacklist_series.to_csv(args.output_file, index=False, header=False)
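# `parse_arguments` lives elsewhere; a minimal sketch of the interface this script
# needs, with argument names inferred from the attributes used above (defaults and
# help texts are assumptions).
def parse_arguments():
    import argparse

    parser = argparse.ArgumentParser(
        description='Blacklist byte-identical duplicate images.')
    parser.add_argument('--images-dir', type=Path, required=True,
                        help='Directory scanned for images.')
    parser.add_argument('--data-root', type=Path, required=True,
                        help='Root that image names are stored relative to.')
    parser.add_argument('--output-file', type=Path, required=True,
                        help='CSV blacklist to create or append to.')
    return parser.parse_args()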
def main():  # pylint: disable=too-many-locals
    args = parse_arguments()

    if Path(args.output_file).exists():
        blacklist = pd.read_csv(args.output_file, header=None, names=['image_name'])
        blacklisted = set(blacklist.image_name)
    else:
        blacklisted = set()

    if args.images_dir:
        images_dir = args.images_dir
        model = get_model()
        image_paths = list(find_images(images_dir))
        embeddings = compute_embeddings(model, image_paths)
        image_names = [
            path.relative_to(args.data_root) for path in image_paths
        ]
    elif args.embeddings_file:
        data_df = pq.read_pandas(args.embeddings_file).to_pandas()
        data_df = data_df[~data_df.image_paths.isin(blacklisted)]
        image_names = list(data_df.image_paths)
        embeddings = tf.constant(list(data_df.embeddings))
    else:
        raise RuntimeError('Must pass either images dir or embeddings file')

    distances = row_pairwise_distances(embeddings)

    # Don't show images as similar to themselves
    diag_len = tf.size(tf.linalg.diag_part(distances))
    distances = tf.linalg.set_diag(distances,
                                   tf.broadcast_to(np.inf, [diag_len]))

    pct_to_show = 0.25
    number_to_show = int(len(image_names)**2 * pct_to_show * 2)
    lowest_distances, lowest_indexes = tf.math.top_k(
        tf.reshape(-distances, [-1]),
        min(3 * len(image_names), number_to_show),
    )
    lowest_distances = -lowest_distances

    image_pairs = []
    for i, distance in zip(lowest_indexes, lowest_distances):
        img1, img2 = np.unravel_index(i, distances.shape)  # pylint: disable=unbalanced-tuple-unpacking
        if img1 > img2:
            # Only show upper triangle of distance matrix
            continue
        image_pairs.append(
            ImagePair(
                image1_path=Path(image_names[img1]),
                image2_path=Path(image_names[img2]),
                distance=distance,
            ))

    duplicate_browser = DuplicateBrowser(image_pairs, args.data_root,
                                         args.output_file)
    duplicate_browser.run()
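# `row_pairwise_distances` is defined elsewhere; a sketch of the assumed behaviour:
# an [N, N] tensor of Euclidean distances between embedding rows, computed via the
# ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2 expansion. The real implementation may use
# a different metric.
def row_pairwise_distances(embeddings):
    embeddings = tf.cast(embeddings, tf.float32)
    squared_norms = tf.reduce_sum(tf.square(embeddings), axis=1, keepdims=True)  # [N, 1]
    squared_distances = (squared_norms
                         - 2.0 * tf.matmul(embeddings, embeddings, transpose_b=True)
                         + tf.transpose(squared_norms))
    # Floating-point error can push some entries slightly below zero; clamp before sqrt.
    return tf.sqrt(tf.maximum(squared_distances, 0.0))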