Example #1
def main():
    args = parse_arguments()

    if args.exclude:
        exclude_names = read_image_names(args.exclude)
    else:
        exclude_names = set()

    if args.refine:
        annotations = read_annotations(args.annotation_path)
        include_names = frozenset(annotations.keys())
    else:
        annotations = {}
        include_names = UniverseSet()

    image_names = []
    for image_path in find_images(args.images_folder):
        image_name = image_path.relative_to(args.images_root.resolve())

        if image_name in exclude_names or image_name not in include_names:
            continue

        image_names.append(image_name)
    print(f'{len(image_names)} images to classify')

    images_not_found = frozenset(include_names) - frozenset(image_names)
    if images_not_found:
        print(f'Warning: images cannot be found: {list(images_not_found)}')

    selector = ImageSelector(args.images_root, image_names,
                             args.annotation_path, annotations)
    selector.run()
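
UniverseSet is project-specific and not shown here. A minimal sketch of what it presumably does, assuming membership tests always succeed and iteration yields nothing (so the frozenset() call in the "images not found" check stays empty when --refine is not passed):

class UniverseSet:
    """Hypothetical stand-in: a set-like object that 'contains' everything."""

    def __contains__(self, item):
        return True

    def __iter__(self):
        # Assumed empty iteration so frozenset(UniverseSet()) == frozenset().
        return iter(())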
Example #2
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq


def main():
    args = parse_arguments()
    images_dir = args.images_dir

    model = get_model()

    image_paths = list(find_images(images_dir))
    embeddings = compute_embeddings(model, image_paths)

    df = pd.DataFrame({
        'image_paths': [
            str(path.relative_to(args.data_root)) for path in image_paths
        ],
        'embeddings': list(embeddings),
    })
    table = pa.Table.from_pandas(df)
    pq.write_table(table, args.output)
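
The output parquet file can be loaded back into pandas later; Example #5 below does this with pq.read_pandas. A minimal usage sketch (the file name is a placeholder for whatever was passed as args.output):

# Load the embeddings table written above back into a DataFrame.
data_df = pq.read_table('embeddings.parquet').to_pandas()
print(data_df.image_paths.head())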
Example #3
import pandas as pd


def main():
    args = parse_arguments()

    print('Reading datasets...')
    dataset_keys = set()
    for dataset_csv_path in find_dataset_csvs(args.data_root / 'datasets'):
        input_df = pd.read_csv(dataset_csv_path, header=None)
        dataset_keys.update(input_df[0])

    print('Checking metadata...')
    metadata_keys = set()
    for metadata_csv_path in find_metadata_csvs(args.data_root /
                                                'source/metadata'):
        df = pd.read_csv(metadata_csv_path, header=None)
        metadata_keys.update(df[0])

    not_in_metadata = dataset_keys - metadata_keys
    if not_in_metadata:
        print('WARNING: Keys in dataset csvs, but not in metadata csvs:')
        for key in not_in_metadata:
            print(f'  {key}')
        print()

    print('Checking object files...')
    object_keys = frozenset(
        str(image_path.relative_to(args.data_root))
        for image_path in find_images(args.data_root / 'source/'))

    print('Checking tabular files...')
    table_keys = set()
    for table_path in find_table_files(args.data_root / 'source/'):
        df = load_dataframe(table_path)
        table_keys.update(df[0])

    missing_keys = (dataset_keys | metadata_keys) - (object_keys | table_keys)
    if missing_keys:
        print('ERROR: Keys in csvs, but not among object or tabular files:')
        for key in missing_keys:
            print(f'  {key}')
        print()

    print('Done')
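
The find_dataset_csvs, find_metadata_csvs and find_table_files helpers are not shown. A plausible minimal sketch, assuming they simply glob recursively under the given directory:

from pathlib import Path

def find_dataset_csvs(root):
    # Assumed behavior: yield every .csv file under root, in a stable order.
    yield from sorted(Path(root).rglob('*.csv'))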
Example #4
import hashlib
import random
from collections import defaultdict
from pathlib import Path

import pandas as pd


def main():
    args = parse_arguments()

    if Path(args.output_file).exists():
        blacklist = pd.read_csv(args.output_file,
                                header=None,
                                names=['image_name'])
        blacklisted = set(blacklist.image_name)
    else:
        blacklisted = set()

    images_dir = Path(args.images_dir)
    image_paths = list(find_images(images_dir))
    # Shuffle so that which duplicate in each group survives is arbitrary.
    random.shuffle(image_paths)

    hash_map = defaultdict(list)  # hash -> List[image_path]
    for image_path in image_paths:
        image_name = image_path.relative_to(args.data_root)
        with open(image_path, 'rb') as infile:
            image_data = infile.read()

        image_hash = hashlib.blake2b(image_data).hexdigest()
        hash_map[image_hash].append(image_name)

    for image_hash, hash_paths in hash_map.items():
        if len(hash_paths) <= 1:
            continue

        print(f'Hash {image_hash} is shared by multiple images:')
        for hash_path in hash_paths:
            print(f'  {hash_path}')
        print('Blacklisting all except the first one', end='\n\n')
        blacklisted.update(hash_paths[1:])

    blacklist_series = pd.Series(list(blacklisted))
    # Write without a header so the file round-trips with the header=None read above.
    blacklist_series.to_csv(args.output_file, index=False, header=False)
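
Reading each image fully into memory is fine for small files; for very large ones, a chunked variant of the hashing step avoids that. A sketch under that assumption:

import hashlib

def hash_file(path, chunk_size=1 << 20):
    # Hash a file in fixed-size chunks instead of loading it whole.
    hasher = hashlib.blake2b()
    with open(path, 'rb') as infile:
        for chunk in iter(lambda: infile.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()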
Example #5
from pathlib import Path

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import tensorflow as tf


def main():  # pylint: disable=too-many-locals
    args = parse_arguments()

    if Path(args.output_file).exists():
        blacklist = pd.read_csv(args.output_file,
                                header=None,
                                names=['image_name'])
        blacklisted = set(blacklist.image_name)
    else:
        blacklisted = set()

    if args.images_dir:
        images_dir = args.images_dir

        model = get_model()

        image_paths = list(find_images(images_dir))
        embeddings = compute_embeddings(model, image_paths)
        image_names = [
            path.relative_to(args.data_root) for path in image_paths
        ]
    elif args.embeddings_file:
        data_df = pq.read_pandas(args.embeddings_file).to_pandas()

        data_df = data_df[~data_df.image_paths.isin(blacklisted)]

        image_names = list(data_df.image_paths)
        embeddings = tf.constant(list(data_df.embeddings))
    else:
        raise RuntimeError('Must pass either images dir or embeddings file')

    distances = row_pairwise_distances(embeddings)

    # Don't show images as similar to themselves
    diag_len = tf.size(tf.linalg.diag_part(distances))
    distances = tf.linalg.set_diag(distances,
                                   tf.broadcast_to(np.inf, [diag_len]))

    pct_to_show = 0.25
    number_to_show = int(len(image_names)**2 * pct_to_show * 2)
    lowest_distances, lowest_indexes = tf.math.top_k(
        tf.reshape(-distances, [-1]),
        min(3 * len(image_names), number_to_show),
    )
    lowest_distances = -lowest_distances

    image_pairs = []
    for i, distance in zip(lowest_indexes, lowest_distances):
        img1, img2 = np.unravel_index(i, distances.shape)  # pylint: disable=unbalanced-tuple-unpacking
        if img1 > img2:
            # Only show upper triangle of distance matrix
            continue

        image_pairs.append(
            ImagePair(
                image1_path=Path(image_names[img1]),
                image2_path=Path(image_names[img2]),
                distance=distance,
            ))

    duplicate_browser = DuplicateBrowser(image_pairs, args.data_root,
                                         args.output_file)
    duplicate_browser.run()
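
row_pairwise_distances is project-specific and not shown. A minimal sketch, assuming it returns the square matrix of Euclidean distances between the embedding rows:

import tensorflow as tf

def row_pairwise_distances(embeddings):
    # ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2, clipped at zero to absorb
    # floating-point error before the square root.
    squared_norms = tf.reduce_sum(tf.square(embeddings), axis=1, keepdims=True)
    squared = (squared_norms
               - 2.0 * tf.matmul(embeddings, embeddings, transpose_b=True)
               + tf.transpose(squared_norms))
    return tf.sqrt(tf.maximum(squared, 0.0))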