예제 #1
0
def main():
    """Flatten every ImageNet validation tarball via pywren and report failures.

    One pywren task is launched per validation tarball name. Each future is
    then awaited in turn; the wnid paired with a failing future is collected
    and printed. Finally ``pywren.get_all_results`` is called on the futures.
    """
    imgnt = imagenet.ImageNetData()
    wnids = list(imgnt.train_imgs_by_wnid.keys())

    # The training-set names and helper are not used by the run below; they
    # are kept so the map call can be pointed at the training tarballs.
    train_tarball_names = get_tarball_names(wnids, 'imagenet-train/')
    val_tarball_names = get_tarball_names(wnids, 'imagenet-validation/val-')

    def flatten_train_tarball(tarball_name):
        return flatten_tarball(tarball_name,
                               prefix="imagenet-train-individual/")

    def flatten_val_tarball(tarball_name):
        return flatten_tarball(tarball_name,
                               prefix="imagenet-validation-individual/")

    pwex = pywren.default_executor()
    futures = pwex.map(flatten_val_tarball, val_tarball_names)
    failed_wnids = []
    # NOTE(review): pairing futures with wnids assumes get_tarball_names
    # returns exactly one name per wnid, in the same order -- confirm.
    for future, wnid in zip(futures, wnids):
        try:
            future.result()
        except Exception:
            # Catch task failures only; a bare ``except`` would also swallow
            # KeyboardInterrupt/SystemExit and make the loop uninterruptible.
            failed_wnids.append(wnid)
            print('wnid failed', wnid)
    print(failed_wnids)
    pywren.get_all_results(futures)
예제 #2
0
def generate_top_k_wnids_json():
    """Write the result of ``get_all_top_k_wnids`` for all training wnids.

    The wnids are the keys of ``ImageNetData().train_imgs_by_wnid``; the
    result (computed with ``top_k = 21``) is dumped as indented JSON to
    ``../data/metadata/wnid_to_most_similar_wnids.json``.
    """
    num_neighbors = 21
    all_wnids = list(imagenet.ImageNetData().train_imgs_by_wnid.keys())
    similar_wnids = get_all_top_k_wnids(all_wnids, num_neighbors)
    output_path = '../data/metadata/wnid_to_most_similar_wnids.json'
    with open(output_path, 'w') as out_file:
        json.dump(similar_wnids, out_file, indent=2)
예제 #3
0
def featurize_test_images(bucket, prefix, batch_size):
    """Featurize and upload every test image that has no feature file yet.

    For each filename in ``ImageNetData().test_filenames`` the key
    ``<prefix>/<filename>.npy`` is looked up in ``bucket``; images whose key
    is missing are loaded, resized to ``FEATURIZE_SIZE`` and queued. Whenever
    the queue reaches ``batch_size`` it is handed to
    ``featurize_and_upload_batch`` and the elapsed time is printed. A final
    partial batch is flushed after the loop.

    Args:
        bucket: Bucket passed to ``utils.key_exists`` and to the upload helper.
        prefix: Key prefix under which the ``.npy`` files live.
        batch_size: Number of images to accumulate before featurizing.
    """
    imgnt = imagenet.ImageNetData()
    to_featurize = []
    to_featurize_keys = []
    client = utils.get_s3_client()
    start = timer()
    num_batches = 0
    for k in imgnt.test_filenames:
        key_name = os.path.join(prefix, f"{k}.npy")
        if utils.key_exists(bucket, key_name):
            # Features already uploaded for this image.
            continue
        img = imgnt.load_image(k,
                               size='scaled_256',
                               force_rgb=True,
                               verbose=False)
        img = skimage.transform.resize(img,
                                       FEATURIZE_SIZE,
                                       preserve_range=True)
        to_featurize.append(img)
        to_featurize_keys.append(k)
        if len(to_featurize) >= batch_size:
            num_batches += 1
            featurize_and_upload_batch(to_featurize, to_featurize_keys,
                                       batch_size, bucket, prefix, client)
            end = timer()
            # The timing covers loading the images as well as featurizing.
            print('processing batch {} (size {}) took {} seconds'.format(
                num_batches, len(to_featurize), end - start))
            start = timer()
            to_featurize = []
            to_featurize_keys = []
    if to_featurize:
        # Flush the trailing partial batch.
        featurize_and_upload_batch(to_featurize, to_featurize_keys, batch_size,
                                   bucket, prefix, client)
예제 #4
0
def above_threshold(dataset_size, selection_frequency_threshold,
                    min_num_annotations, seed, output_filename, starting_from,
                    wnid_thresholds_filename):
    """Sample a dataset with ``sample_above_threshold`` and write it as JSON.

    Args:
        dataset_size: Total number of images requested.
        selection_frequency_threshold: Forwarded to the sampler.
        min_num_annotations: Forwarded to the sampler.
        seed: Forwarded to the sampler.
        output_filename: File name (under ``../data/datasets`` relative to this
            file) to write; must not exist yet.
        starting_from: Optional file name of a previous dataset in the same
            directory; its parsed JSON is forwarded to the sampler. ``None``
            means no previous dataset.
        wnid_thresholds_filename: Optional path to a JSON file whose parsed
            content is forwarded as ``wnid_thresholds``.

    When the sampler reports failure, a per-wnid diagnostic is printed for
    every wnid with too few candidates. The result is written in either case.
    """
    output_filepath = pathlib.Path(
        __file__).parent / '../data/datasets' / output_filename
    output_filepath = output_filepath.resolve()
    assert not output_filepath.is_file()

    if starting_from is not None:
        starting_from_filepath = pathlib.Path(
            __file__).parent / '../data/datasets' / starting_from
        starting_from_filepath = starting_from_filepath.resolve()
        assert starting_from_filepath.is_file()
        with open(starting_from_filepath, 'r') as f:
            starting_from_loaded = json.load(f)
    else:
        # Without this branch the name was unbound and the sampler call below
        # raised NameError whenever no previous dataset was given.
        starting_from_loaded = None

    if wnid_thresholds_filename is not None:
        wnid_thresholds_filepath = pathlib.Path(wnid_thresholds_filename)
        wnid_thresholds_filepath = wnid_thresholds_filepath.resolve()
        assert wnid_thresholds_filepath.is_file()
        with open(wnid_thresholds_filepath, 'r') as f:
            wnid_thresholds = json.load(f)
    else:
        wnid_thresholds = None

    review_targets = {'l2': 1.2e8, 'dssim': 0.2205, 'fc7': 1.32e4}

    success, result, sampling_candidates, exclusions, carried_over_from_prev = dataset_sampling.sample_above_threshold(
        dataset_size=dataset_size,
        selection_frequency_threshold=selection_frequency_threshold,
        min_num_annotations=min_num_annotations,
        near_duplicate_review_targets=review_targets,
        seed=seed,
        starting_from=starting_from_loaded,
        wnid_thresholds=wnid_thresholds)

    if not success:
        imgnet = imagenet.ImageNetData()
        num_per_class = dataset_size // 1000
        print('Failed to sample a valid dataset.')
        print(
            'The following wnids have fewer than {} candidates above threshold {} with at least {} annotations'
            .format(num_per_class, selection_frequency_threshold,
                    min_num_annotations))
        for wnid, cur_candidates in sampling_candidates.items():
            # A wnid falls short when its candidates cannot fill the slots
            # left over after the images carried over from the previous set.
            if len(cur_candidates) < num_per_class - len(
                    carried_over_from_prev[wnid]):
                print(
                    '    {}: {} sampling candidates, plus {} carried over from the previous dataset  ({})'
                    .format(wnid, len(cur_candidates),
                            len(carried_over_from_prev[wnid]),
                            ', '.join(imgnet.class_info_by_wnid[wnid].synset)))
                for reason, excluded_candidates in exclusions[wnid].items():
                    print('        {}: {} candidates'.format(
                        reason, len(excluded_candidates)))
    result['output_filename'] = output_filename
    with open(output_filepath, 'w') as f:
        json.dump(result, f, indent=2)
    print('Wrote dataset to {}'.format(output_filepath))
예제 #5
0
파일: mturk.py 프로젝트: wuxf-ml/ImageNetV2
def generate_hit_html(hit_data,
                      html_template_path,
                      html_style_path,
                      add_question_header=True):
    """Render the HTML page for every HIT in ``hit_data``.

    Args:
        hit_data: Iterable of dicts; the keys ``"wnid"``, ``"images_all"`` and
            ``"uuid"`` are read from each one.
        html_template_path: Path of a template formatted with the fields
            ``image_data``, ``synset``, ``gloss`` and ``wiki``.
        html_style_path: Path of a file appended verbatim after the body.
        add_question_header: When true, wrap each page in ``QUESTION_HEADER``
            and ``QUESTION_FOOTER``.

    Returns:
        Dict mapping each HIT's ``"uuid"`` to its HTML string.

    The previous version branched on membership in ``hit["images_to_label"]``
    but both branches were identical, so every image is now handled by a
    single code path and that key is no longer consulted.
    """
    imagenet_data = imagenet.ImageNetData()
    with open(html_template_path, "r") as f:
        html_text = f.read()
    with open(html_style_path, "r") as f:
        style_text = f.read()
    htmls = {}
    for hit in hit_data:
        out_html = ''
        if add_question_header:
            out_html += QUESTION_HEADER
        class_info = imagenet_data.class_info_by_wnid[hit["wnid"]]
        gloss = class_info.gloss
        wikipedia_page = ", ".join(
            '<a href="{0}">{1}</a>'.format(x, x)
            for x in class_info.wikipedia_pages)
        synset = " or ".join(class_info.synset)

        image_snippets = []
        for i, image in enumerate(hit["images_all"]):
            encrypted_image = utils.encrypt_string_with_magic(image)
            # Sanity check: the encryption must round-trip to the image id.
            image_decrypted = utils.decrypt_string_with_magic(encrypted_image)
            assert image_decrypted == image
            s3_link = "https://s3-us-west-2.amazonaws.com/imagenet2datav2/encrypted/{0}".format(
                quote(encrypted_image)) + ".jpg"
            image_snippets.append(
                HTML_TEMPLATE.format(img=encrypted_image,
                                     url=s3_link,
                                     checkboxnum=i))
        # Each image snippet is followed by a newline.
        image_html = "".join(snippet + "\n" for snippet in image_snippets)

        out_html += html_text.format(image_data=image_html,
                                     synset=synset,
                                     gloss=gloss,
                                     wiki=wikipedia_page)
        out_html += style_text
        if add_question_header:
            out_html += QUESTION_FOOTER
        htmls[hit["uuid"]] = out_html
    return htmls
예제 #6
0
def featurize_candidates(bucket, prefix, batch_size, source_filename):
    """Compute and upload VGG16 features for candidates that lack them.

    Candidate ids are read from ``../data/metadata/fc7_candidates.json``. For
    each id the key ``<prefix>/<id>.npy`` is looked up in ``bucket``; images
    whose key is missing are loaded and resized to ``FEATURIZE_SIZE``. All
    collected images are featurized in one ``featurize.vgg16_features`` call
    and every feature vector is uploaded as a ``.npy`` blob.

    Args:
        bucket: Bucket passed to ``utils.key_exists``.
        prefix: Key prefix under which the ``.npy`` files live.
        batch_size: Upper bound on the batch size used for featurization.
        source_filename: Not used by this function.
    """
    # NOTE(review): imgnt, mturk and client are only needed by the disabled
    # get_histogram_sampling_ndc_candidates path; the constructors are kept in
    # case callers rely on their side effects -- confirm before removing.
    imgnt = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(verbose=False)
    filenames_to_ignore = [
        '2018-08-06_17:33_vaishaal.json',
        '2018-08-17_17:24_vaishaal.json',
        'vaishaal_hits_submitted_2018-08-17-18:28:33-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-18:50:38-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-19:28:24-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-19:56:28-PDT.json',
        'vaishaal_hits_submitted_2018-08-25-09:47:26-PDT.json']
    mturk = mturk_data.MTurkData(live=True, load_assignments=True, source_filenames_to_ignore=filenames_to_ignore, verbose=False)
    to_featurize = []
    to_featurize_keys = []
    client = utils.get_s3_client()
    num_loaded = 0
    start = timer()
    with open('../data/metadata/fc7_candidates.json', 'r') as f:
        candidate_list = json.load(f)
    for k in candidate_list:
        key_name = os.path.join(prefix, str(k) + ".npy")
        if utils.key_exists(bucket, key_name):
            # Features already uploaded for this candidate.
            continue
        img = cds.load_image(k, size='original', verbose=False)
        img = skimage.transform.resize(img, FEATURIZE_SIZE, preserve_range=True)
        to_featurize.append(img)
        to_featurize_keys.append(k)
        num_loaded += 1
        print('Got candidate {}'.format(num_loaded))
    end = timer()
    print(f"Took {end-start} seconds to get remaining candidates.")
    print('Beginning featurization of {} items'.format(len(to_featurize_keys)))
    featurize_start = timer()
    if len(to_featurize) > 0:
        to_featurize = np.stack(to_featurize, axis=0)
        print(f"input shape {to_featurize.shape}")
        batch_size = min(len(to_featurize), batch_size)
        features = featurize.vgg16_features(to_featurize, batch_size=batch_size)
        print(f"features shape {features.shape}")
        for key, feature in zip(to_featurize_keys, features):
            # Build the key exactly like the existence check above (str(k)),
            # so non-string ids do not fail after the expensive featurization.
            key_name = os.path.join(prefix, str(key) + ".npy")
            bio = io.BytesIO()
            np.save(bio, feature)
            print("writing key {0}".format(key_name))
            # NOTE(review): `bucket` is not passed to the upload helper;
            # presumably it writes to a default bucket -- confirm it matches.
            utils.put_s3_object_bytes_with_backoff(bio.getvalue(), key_name)
    featurize_end = timer()
    # Previously this line re-printed the candidate-loading time.
    print(f"Took {featurize_end-featurize_start} seconds to featurize and upload.")
예제 #7
0
def get_similarity_sorted_wnids(q_wnid):
    """Rank every training wnid by WordNet path similarity to ``q_wnid``.

    Returns a list of ``(wnid, similarity)`` tuples ordered from the highest
    similarity to the lowest.
    """

    def _synset_for(wnid):
        # A wnid is a part-of-speech letter followed by the numeric offset.
        return wn.synset_from_pos_and_offset(wnid[0], int(wnid[1:]))

    query_synset = _synset_for(q_wnid)
    all_wnids = list(imagenet.ImageNetData().train_imgs_by_wnid.keys())
    similarity_by_wnid = {
        wnid: query_synset.path_similarity(_synset_for(wnid))
        for wnid in all_wnids
    }
    return sorted(similarity_by_wnid.items(),
                  key=lambda entry: entry[1],
                  reverse=True)
예제 #8
0
def main():
    """Derive extra search terms for unprocessed wnids from their hypernyms.

    For every wnid in ``../data/metadata/unprocessed_wnids.json`` the words of
    its gloss are intersected with the lemma names of a WordNet hypernym. Any
    overlap is recorded as additional search terms for that wnid. Results are
    written to ``wnid_to_parent_3.json`` and
    ``unprocessed_wnids_with_additional_search_terms.json`` under
    ``../data/metadata``.
    """
    with open('../data/metadata/unprocessed_wnids.json', 'r') as f:
        bad_wnids = json.load(f)

    imgnt = imagenet.ImageNetData()

    wnid_to_parent = {}
    wnids_with_additional_search_terms = []
    for wnid in bad_wnids:
        wnid_to_parent.setdefault(wnid, [])
        synset = imgnt.class_info_by_wnid[wnid].synset
        gloss = imgnt.class_info_by_wnid[wnid].gloss
        cur_synset = wn.synset_from_pos_and_offset(wnid[0], int(wnid[1:]))
        gloss_list = gloss.split()

        # Reset per wnid: previously these were unbound (NameError) for a
        # first synset without hypernyms, or silently reused the values of
        # the preceding wnid.
        parent_list = []
        inherited_hypernym_list = []
        # NOTE(review): only the lemma names of the LAST hypernym (and of the
        # last grand-hypernym seen) survive these loops; behavior is kept
        # as-is -- confirm whether all parents were meant to be combined.
        for parent in cur_synset.hypernyms():
            for inherited_parent in parent.hypernyms():
                inherited_hypernym_list = inherited_parent.lemma_names()
            parent_list = parent.lemma_names()
        intersect = intersection(gloss_list, parent_list)
        if len(intersect) > 0:
            wnid_to_parent[wnid].extend(intersect)
            wnid_to_parent[wnid] = list(set(wnid_to_parent[wnid]))
            wnids_with_additional_search_terms.append(wnid)

        print('Wnid: ', wnid)
        print('Synset: ', synset)
        print('Gloss: ', gloss)
        print('Parent : ', parent_list)
        print('Parents parent: ', inherited_hypernym_list)
        print('Intersection', intersect)
        print()

    with open('../data/metadata/wnid_to_parent_3.json', 'w') as f:
        json.dump(wnid_to_parent, f, indent=2)
    with open(
            '../data/metadata/unprocessed_wnids_with_additional_search_terms.json',
            'w') as f:
        json.dump(list(set(wnids_with_additional_search_terms)), f, indent=2)
예제 #9
0
def sample_val_dummy(dataset_size, seed):
    """Sample an equal number of validation images for each of 1000 classes.

    Args:
        dataset_size: Total number of images; must be a multiple of 1000.
        seed: Seed for the ``random.Random`` instance used for sampling.

    Returns:
        Dict with the sampling metadata and, under ``'image_filenames'``, the
        list of ``(image, wnid)`` tuples grouped by wnid in sorted order.
    """
    num_classes = 1000
    assert dataset_size % num_classes == 0
    per_class = dataset_size // num_classes
    rng = random.Random(seed)
    imgnet = imagenet.ImageNetData()

    sampled_pairs = []
    for wnid in sorted(imgnet.class_info_by_wnid.keys()):
        # Sort the pool first so the draw depends only on the seed.
        pool = sorted(imgnet.val_imgs_by_wnid[wnid])
        pairs = [(img, wnid) for img in rng.sample(pool, per_class)]
        pairs.sort()
        sampled_pairs += pairs
    assert len(sampled_pairs) == dataset_size

    return {
        'sampling_function': 'sample_val_dummy',
        'time_string': get_time_string(),
        'username': getpass.getuser(),
        'seed': seed,
        'image_filenames': sampled_pairs,
    }
예제 #10
0
def sample_val_annotated(dataset_size, min_num_annotations, seed):
    """Sample validation images that have enough MTurk annotations.

    For each wnid, only validation images whose assignment count for that
    wnid is at least ``min_num_annotations`` are eligible. An equal number is
    drawn per class and the combined list is shuffled.

    Args:
        dataset_size: Total number of images; must be a multiple of 1000.
        min_num_annotations: Minimum assignment count for an image/wnid pair.
        seed: Seed for the ``random.Random`` instance used throughout.

    Returns:
        Dict with the sampling metadata and, under ``'image_filenames'``, the
        shuffled list of ``(image, wnid)`` tuples.
    """
    num_classes = 1000
    assert dataset_size % num_classes == 0
    per_class = dataset_size // num_classes
    rng = random.Random(seed)
    imgnet = imagenet.ImageNetData()
    mturk = mturk_data.MTurkData(
        live=True,
        load_assignments=True,
        source_filenames_to_ignore=mturk_data.main_collection_filenames_to_ignore)

    def _has_enough_annotations(img, wnid):
        counts = mturk.image_num_assignments
        if img not in counts:
            return False
        if wnid not in counts[img]:
            return False
        return counts[img][wnid] >= min_num_annotations

    sampled_pairs = []
    for wnid in sorted(imgnet.class_info_by_wnid.keys()):
        eligible = sorted(img for img in imgnet.val_imgs_by_wnid[wnid]
                          if _has_enough_annotations(img, wnid))
        assert len(eligible) >= per_class
        pairs = [(img, wnid) for img in rng.sample(eligible, per_class)]
        pairs.sort()
        sampled_pairs += pairs

    rng.shuffle(sampled_pairs)
    assert len(sampled_pairs) == dataset_size

    return {
        'sampling_function': 'sample_val_annotated',
        'min_num_annotations': min_num_annotations,
        'time_string': get_time_string(),
        'username': getpass.getuser(),
        'seed': seed,
        'image_filenames': sampled_pairs,
    }
예제 #11
0
def download_images(datasets, include_val):
    """Load the ``scaled_500`` image bytes for the given datasets.

    Args:
        datasets: Comma-separated dataset names; each one is read from
            ``../data/datasets/<name>.json`` relative to this file.
        include_val: When true, additionally load every validation image.

    Images are requested one wnid at a time through
    ``ImageLoader.load_image_bytes_batch``; the returned bytes are discarded.
    """
    imgnet = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(exclude_blacklisted_candidates=False)
    loader = image_loader.ImageLoader(imgnet, cds)

    all_wnids = sorted(imgnet.class_info_by_wnid.keys())
    assert len(all_wnids) == 1000

    def _fetch(image_ids):
        loader.load_image_bytes_batch(image_ids,
                                      size='scaled_500',
                                      verbose=False)

    datasets_dir = pathlib.Path(__file__).parent / '../data/datasets'
    for dataset in datasets.split(','):
        print(f'Downloading images for dataset {dataset} ...')

        dataset_filepath = (datasets_dir / (dataset + '.json')).resolve()
        assert dataset_filepath.is_file()
        with open(dataset_filepath, 'r') as f:
            data = json.load(f)

        # Group the dataset's images by class so they are fetched per wnid.
        images_by_wnid = {wnid: [] for wnid in all_wnids}
        for img, wnid in data['image_filenames']:
            images_by_wnid[wnid].append(img)
        for cur_wnid in tqdm.tqdm(all_wnids):
            _fetch(images_by_wnid[cur_wnid])

    if not include_val:
        return
    print('Downloading all validation images ...')
    for cur_wnid in tqdm.tqdm(all_wnids):
        _fetch(imgnet.val_imgs_by_wnid[cur_wnid])
import json
import pathlib

import click
import tqdm

import candidate_data
import image_loader
import imagenet

# Module-level script: everything down to the __main__ guard runs at import
# time, before the guard itself is evaluated.
imgnet = imagenet.ImageNetData()
# Blacklisted candidates are explicitly not excluded here.
cds = candidate_data.CandidateData(exclude_blacklisted_candidates=False)
loader = image_loader.ImageLoader(imgnet, cds)

# Sorted list of all class wnids; the script requires exactly 1000 of them.
all_wnids = list(sorted(list(imgnet.class_info_by_wnid.keys())))
assert len(all_wnids) == 1000

print('Downloading all candidate images ...')
for cur_wnid in tqdm.tqdm(all_wnids):
    # Reduce each candidate record to its 'id_ours' value and request the
    # scaled_500 image bytes for the whole class in one batch call; the
    # return value is discarded.
    images_to_download = cds.candidates_by_wnid[cur_wnid]
    images_to_download = [x['id_ours'] for x in images_to_download]
    loader.load_image_bytes_batch(images_to_download,
                                  size='scaled_500',
                                  verbose=False)

# NOTE(review): download_images is not defined in this snippet and is called
# without arguments; presumably it is a CLI command (click is imported)
# declared elsewhere -- confirm.
if __name__ == "__main__":
    download_images()
예제 #13
0
def main(args):
    """Compute missing near-duplicate distances and merge them into the results.

    Loads the pickled nearest-neighbor results from ``args.input_filename``,
    selects the candidates still lacking distances, then processes the
    metrics in ``args.metrics``: first every ``'l2'``/``'fc7'`` entry, then
    every ``'dssim'`` entry. After each metric the results are saved through
    ``save_ndc_result``.

    Returns:
        Dict mapping each metric in ``args.metrics`` to its entry in the
        counter returned by ``select_test_candidates``.
    """
    imgnt = imagenet.ImageNetData(verbose=False)
    cds = candidate_data.CandidateData(verbose=False)
    ignored_sources = [
        '2018-08-06_17:33_vaishaal.json',
        '2018-08-17_17:24_vaishaal.json',
        'vaishaal_hits_submitted_2018-08-17-18:28:33-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-18:50:38-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-19:28:24-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-19:56:28-PDT.json',
        'vaishaal_hits_submitted_2018-08-25-09:47:26-PDT.json',
    ]
    mturk = mturk_data.MTurkData(live=True,
                                 load_assignments=True,
                                 source_filenames_to_ignore=ignored_sources,
                                 verbose=False)

    # NOTE(review): pickle.load executes arbitrary code from the input file;
    # only use with trusted result files.
    with open(args.input_filename, 'rb') as fp:
        nn_results = pickle.load(fp)
    print('Current nearest neighbor statistics')
    print_nn_stats(nn_results)
    print()

    metric_to_cd, cd_to_dist_counter = select_test_candidates(
        imgnt, cds, mturk, nn_results, args)

    print('Remaining candidate distances to compute')
    for metric_name, remaining in cd_to_dist_counter.items():
        print('{} cds left for metric {}'.format(remaining, metric_name))
    print()
    print('Computing neighbors for ')
    for metric_name in distance_metrics:
        print('{} candidates in metric {}.'.format(
            len(metric_to_cd[metric_name]), metric_name))
    print()

    def _warn_on_size_mismatch(result, candidates):
        # A mismatch is reported but deliberately not treated as fatal.
        if len(result) != len(candidates):
            print('WARNING: len(result) {} len(candidates) {}'.format(
                len(result), len(candidates)))

    # First pass: metrics computed against all references.
    for metric in args.metrics:
        if metric not in ('l2', 'fc7'):
            continue
        candidates = metric_to_cd[metric]
        result, _ = compute_distances_for_all_references(
            candidates, metric, imgnt, cds, mturk, args)
        _warn_on_size_mismatch(result, candidates)
        nn_results = save_ndc_result(result, nn_results, metric, args)

    # Second pass: dssim is computed against per-wnid references.
    for metric in args.metrics:
        if metric != 'dssim':
            continue
        print('Computing distances for dssim')
        candidates = metric_to_cd[metric]
        result = compute_distances_for_wnid_references(
            candidates, metric, imgnt, cds, mturk, args)
        _warn_on_size_mismatch(result, candidates)
        print('Saving results')
        save_start = timer()
        nn_results = save_ndc_result(result, nn_results, metric, args)
        save_end = timer()
        print('Saving the results took {} seconds'.format(save_end -
                                                          save_start))

    return {metric: cd_to_dist_counter[metric] for metric in args.metrics}
예제 #14
0
def compute_nearest_neighbors(distance_measures, candidate_filenames,
                              reference_filenames, top_k, window_size, cache,
                              cache_root):
    """Compute top-k nearest references per candidate, with optional caching.

    Args:
        distance_measures: Iterable of ``'l2'``, ``'dssim'`` and/or ``'fc7'``.
        candidate_filenames: Images to find neighbors for.
        reference_filenames: Images to compare against.
        top_k: Forwarded to ``compute_top_k``.
        window_size: Forwarded to ``compute_dssim_distances``.
        cache: When true, reuse/store the pickled result under
            ``<cache_root>/<hash of the arguments>``.
        cache_root: Key prefix for cached results.

    Returns:
        ``(result, timing_info)`` where ``timing_info`` holds the load and
        compute timestamps and a ``'cached'`` flag.

    Raises:
        ValueError: If an unknown distance measure is requested.
    """
    cache_key = compute_hash(distance_measures, candidate_filenames,
                             reference_filenames, top_k, window_size)
    full_key = f"{cache_root}/{cache_key}"

    if cache and utils.key_exists(BUCKET, full_key):
        load_start = timer()
        payload = utils.get_s3_object_bytes_with_backoff(full_key)[0]
        # NOTE(review): unpickling executes arbitrary code; the cached objects
        # must only ever be written by trusted jobs.
        cached_result = pickle.loads(payload)
        load_end = timer()
        # Nothing is computed on a cache hit, so both marks coincide.
        compute_start = compute_end = timer()
        return cached_result, {
            'load_start': load_start,
            'load_end': load_end,
            'compute_start': compute_start,
            'compute_end': compute_end,
            'cached': True,
        }

    local_cache = '/tmp/imagenet2_cache'
    imgnt = imagenet.ImageNetData(cache_on_local_disk=True,
                                  verbose=False,
                                  cache_root_path=local_cache)
    cds = candidate_data.CandidateData(cache_on_local_disk=True,
                                       load_metadata_from_s3=True,
                                       verbose=False,
                                       cache_root_path=local_cache)
    loader = image_loader.ImageLoader(imgnt,
                                      cds,
                                      cache_on_local_disk=True,
                                      num_tries=4,
                                      cache_root_path=local_cache)

    load_start = timer()
    # Pixel data is only needed by the image-space measures.
    if 'l2' in distance_measures or 'dssim' in distance_measures:
        image_kwargs = dict(size='scaled_256', force_rgb=True, verbose=False)
        candidate_images = loader.load_image_batch(candidate_filenames,
                                                   **image_kwargs)
        reference_images = loader.load_image_batch(reference_filenames,
                                                   **image_kwargs)
    if 'fc7' in distance_measures:
        candidate_features = loader.load_features_batch(candidate_filenames,
                                                        verbose=False)
        reference_features = loader.load_features_batch(reference_filenames,
                                                        verbose=False)
    load_end = timer()

    compute_start = timer()
    distances = {}
    for measure in distance_measures:
        if measure == 'l2':
            distances['l2'] = compute_l2_distances(candidate_images,
                                                   reference_images, 196608)
        elif measure == 'dssim':
            distances['dssim'] = compute_dssim_distances(
                candidate_images, reference_images, window_size)
        elif measure == 'fc7':
            distances['fc7'] = compute_l2_distances(candidate_features,
                                                    reference_features, 4096)
        else:
            raise ValueError('Unknown distance measure')
    compute_end = timer()

    timing_info = {
        'load_start': load_start,
        'load_end': load_end,
        'compute_start': compute_start,
        'compute_end': compute_end,
        'cached': False,
    }

    res = compute_top_k(distances, top_k)
    if cache:
        utils.put_s3_object_bytes_with_backoff(pickle.dumps(res), full_key)

    return res, timing_info
def carry_over_reviews(dataset_filename, starting_from):
    """Create the review file for a dataset from its predecessor's review.

    Args:
        dataset_filename: ``.json`` file name of the new dataset under
            ``../data/datasets`` (relative to this file).
        starting_from: ``.json`` file name of the previous dataset in the same
            directory; its ``<name>_review.json`` must exist under
            ``../data/dataset_reviews``.

    A wnid keeps its previous ``'reviewed'`` value only if it was not marked
    problematic before and ``images_are_same`` holds for its old and new image
    lists; otherwise it is reset to unreviewed. ``'problematic'`` is always
    reset to ``False``. The new review is written to
    ``../data/dataset_reviews/<dataset>_review.json``, which must not exist.
    """
    base_dir = pathlib.Path(__file__).parent

    def _load_existing_json(path):
        path = path.resolve()
        assert path.is_file()
        with open(path, 'r') as f:
            return json.load(f)

    dataset = _load_existing_json(
        base_dir / '../data/datasets' / dataset_filename)
    prev_dataset = _load_existing_json(
        base_dir / '../data/datasets' / starting_from)
    assert dataset['starting_from'] == prev_dataset['output_filename']

    assert starting_from.endswith('.json')
    prev_review = _load_existing_json(
        base_dir / '../data/dataset_reviews' /
        (starting_from[:-5] + '_review.json'))

    imgnet = imagenet.ImageNetData()
    all_wnids = sorted(imgnet.class_info_by_wnid.keys())
    assert len(all_wnids) == 1000

    # Group both datasets' images by wnid for the per-class comparison.
    prev_images_by_wnid = {wnid: [] for wnid in all_wnids}
    images_by_wnid = {wnid: [] for wnid in all_wnids}
    for img, wnid in dataset['image_filenames']:
        images_by_wnid[wnid].append(img)
    for img, wnid in prev_dataset['image_filenames']:
        prev_images_by_wnid[wnid].append(img)

    new_review = {}
    for wnid in all_wnids:
        unchanged = (not prev_review[wnid]['problematic']
                     and images_are_same(images_by_wnid[wnid],
                                         prev_images_by_wnid[wnid]))
        new_review[wnid] = {
            'problematic': False,
            'reviewed': prev_review[wnid]['reviewed'] if unchanged else False,
        }

    assert dataset_filename.endswith('.json')
    new_review_filepath = (base_dir / '../data/dataset_reviews' /
                           (dataset_filename[:-5] + '_review.json')).resolve()
    assert not new_review_filepath.is_file()
    with open(new_review_filepath, 'w') as f:
        json.dump(new_review, f, indent=2, sort_keys=True)
    print('Wrote new review data to {}'.format(new_review_filepath))

    num_reviewed = sum(1 for entry in new_review.values()
                       if entry['reviewed'])
    num_problematic = sum(1 for entry in new_review.values()
                          if entry['problematic'])
    print('    {} reviewed wnids'.format(num_reviewed))
    print('    {} problematic wnids'.format(num_problematic))
예제 #16
0
def best(dataset_size, min_num_annotations, seed, output_filename,
         starting_from):
    """Sample a dataset with ``sample_best``, print statistics and write it.

    Args:
        dataset_size: Total number of images requested.
        min_num_annotations: Forwarded to the sampler.
        seed: Forwarded to the sampler.
        output_filename: File name under ``../data/datasets`` (relative to
            this file) to write; must not exist yet.
        starting_from: Optional file name of a previous dataset in the same
            directory, or ``None``.

    After sampling, the overall average selection frequency and the 20 wnids
    with the smallest minimum / average selection frequency are printed.
    """
    datasets_dir = pathlib.Path(__file__).parent / '../data/datasets'
    output_filepath = (datasets_dir / output_filename).resolve()
    assert not output_filepath.is_file()

    starting_from_loaded = None
    if starting_from is not None:
        starting_from_filepath = (datasets_dir / starting_from).resolve()
        assert starting_from_filepath.is_file()
        with open(starting_from_filepath, 'r') as f:
            starting_from_loaded = json.load(f)

    imgnet = imagenet.ImageNetData()
    mturk = mturk_data.MTurkData(
        live=True,
        load_assignments=True,
        source_filenames_to_ignore=mturk_data.main_collection_filenames_to_ignore)

    review_targets = {'l2': 1.2e8, 'dssim': 0.2205, 'fc7': 1.32e4}

    (success, result, sampling_candidates, exclusions,
     carried_over_from_prev) = dataset_sampling.sample_best(
         dataset_size=dataset_size,
         min_num_annotations=min_num_annotations,
         near_duplicate_review_targets=review_targets,
         seed=seed,
         starting_from=starting_from_loaded)

    def _synset_label(wnid):
        return ', '.join(imgnet.class_info_by_wnid[wnid].synset)

    if not success:
        num_per_class = dataset_size // 1000
        print('Failed to sample a valid dataset.')
        print(
            'The following wnids have fewer than {} candidates with at least {} annotations'
            .format(num_per_class, min_num_annotations))
        for wnid, cur_candidates in sampling_candidates.items():
            num_carried = len(carried_over_from_prev[wnid])
            # Skip wnids whose candidates can fill the remaining slots.
            if len(cur_candidates) >= num_per_class - num_carried:
                continue
            print(
                '    {}: {} sampling candidates, plus {} carried over from the previous dataset  ({})'
                .format(wnid, len(cur_candidates),
                        len(carried_over_from_prev[wnid]),
                        _synset_label(wnid)))
            for reason, excluded_candidates in exclusions[wnid].items():
                print('        {}: {} candidates'.format(
                    reason, len(excluded_candidates)))

    sampled = result['image_filenames']

    # Accumulate manually (left to right) rather than with sum(), so the
    # floating-point result is identical to a plain running total.
    total_frequency = 0.0
    for img, wnid in sampled:
        total_frequency += mturk.image_fraction_selected[img][wnid]
    avg_selection_frequency = total_frequency / len(sampled)
    print(f'\nAverage selection frequency: {avg_selection_frequency:.2}')

    all_wnids = sorted(imgnet.class_info_by_wnid.keys())
    frequencies_by_wnid = {wnid: [] for wnid in all_wnids}
    for img, wnid in sampled:
        frequencies_by_wnid[wnid].append(
            mturk.image_fraction_selected[img][wnid])
    min_frequency_by_wnid = {
        wnid: min(frequencies_by_wnid[wnid])
        for wnid in all_wnids
    }
    avg_frequency_by_wnid = {
        wnid: statistics.mean(frequencies_by_wnid[wnid])
        for wnid in all_wnids
    }

    show_worst_k = 20

    def _print_worst(frequency_by_wnid):
        # Lowest frequency first; ties are broken by wnid.
        ranked = sorted(frequency_by_wnid.items(),
                        key=lambda entry: (entry[1], entry[0]))
        for wnid, sel_freq in ranked[:show_worst_k]:
            synset = _synset_label(wnid)
            print(f'    {wnid}: {sel_freq:.3f}    ({synset})')

    print('\nwnids with the smallest minimum selection frequencies:')
    _print_worst(min_frequency_by_wnid)
    print('\nwnids with the smallest average selection frequencies:')
    _print_worst(avg_frequency_by_wnid)

    result['output_filename'] = output_filename
    with open(output_filepath, 'w') as f:
        json.dump(result, f, indent=2)
    print('\nWrote dataset to {}'.format(output_filepath))
예제 #17
0
def main(args):
    """Run Flickr searches for a list of wnids and dump the results to JSON.

    Reads the Flickr API key and secret (the first two entries of the JSON
    file at ``args.flickr_api_key_filename``) and the wnid list (the JSON
    file at ``args.wnids``), runs ``flickr_search_synset`` either serially or
    through pywren, and writes everything collected to a timestamped file
    under ``../data/search_results/``.

    In parallel mode the per-wnid image counts are read from and written
    back to ``../data/metadata/flickr_<min>_<max>.json`` after every chunk;
    wnids already present in that file are not searched again. The file must
    already exist because it is opened for reading first.

    Args:
        args: Parsed command-line options. This function reads
            ``flickr_api_key_filename``, ``wnids``, ``parallel`` and, in
            parallel mode, ``num_serial_tasks``, ``min_date_uploaded`` and
            ``max_date_uploaded``. The whole object is also forwarded to
            ``flickr_search_synset``.
    """
    imgnt = imagenet.ImageNetData()
    with open(args.flickr_api_key_filename, 'r') as f:
        flickr_api_keys = json.load(f)
    api_key = flickr_api_keys[0]
    api_secret = flickr_api_keys[1]

    with open(args.wnids, 'r') as f:
        wnids = json.load(f)
    print('processing {} wnids'.format(len(wnids)))

    # Accumulated across the whole run. Previously the parallel branch reset
    # this list for every chunk, so only the last chunk was written out.
    all_results = []
    if not args.parallel:
        for wnid in wnids:
            print("Flickr search for wnid {}".format(wnid))
            res = flickr_search_synset(imgnt, [wnid], api_key, api_secret,
                                       args)
            # NOTE(review): the parallel branch indexes the return value as
            # (results, num_images), while this branch extends with the whole
            # return value -- confirm what flickr_search_synset returns here.
            all_results += res
    else:
        pywren_config = wc.default()
        pywren_config["runtime"]["s3_bucket"] = "imagenet2datav2"
        pywren_config["runtime"][
            "s3_key"] = "pywren.runtime/pywren_runtime-3.6-imagenet2.tar.gz"
        pwex = pywren.default_executor(config=pywren_config)

        def search_wnids(wnid_list):
            return flickr_search_synset(imgnt, wnid_list, api_key,
                                        api_secret, args)

        # The second argument is ceil(len(wnids) / num_serial_tasks);
        # presumably utils.chunks treats it as the chunk size -- confirm.
        pywren_args = list(
            utils.chunks(wnids,
                         int(np.ceil(len(wnids) / args.num_serial_tasks))))
        metadata_path = ('../data/metadata/flickr_' + args.min_date_uploaded +
                         '_' + args.max_date_uploaded + '.json')
        with open(metadata_path, 'r') as fp:
            num_images_per_wnid = json.load(fp)

        for ii, lst in enumerate(pywren_args):
            print("Map {} over {} wnids ".format(ii, len(lst)))
            # Skip wnids that already have a recorded image count.
            unfinished_wnids = [
                wnid for wnid in lst if wnid not in num_images_per_wnid
            ]
            print("Executing pywren call for {} wnids".format(
                len(unfinished_wnids)))
            futures = pwex.map(search_wnids, [[x] for x in unfinished_wnids])
            pywren.wait(futures)
            # Fetch each result once; index 0 is the list of search results
            # and index 1 the image count for that wnid.
            outcomes = [future.result() for future in futures]
            for wnid, outcome in zip(unfinished_wnids, outcomes):
                num_images_per_wnid[wnid] = outcome[1]
                all_results += outcome[0]
            # Checkpoint the counts after every chunk.
            with open(metadata_path, 'w') as fp:
                json.dump(num_images_per_wnid, fp, indent=2)
    print('Got {} results'.format(len(all_results)))
    current_date = datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S')
    out_file = ('../data/search_results/' + current_date + '_' +
                getpass.getuser() + '.json')
    with open(out_file, 'w') as fp:
        json.dump(all_results, fp, indent=2)
                print(f"expected {true_answer}, got {result}")
                print(v)
                assert true_answer == result
                print("Passed NDC for metric {0} for test {1}".format(
                    m, prefix))
                if (exact):
                    if (not np.isclose(v[0][1], 0)):
                        print(m, val, k, v)
                    assert np.isclose(v[0][1], 0)
        return res

    return test


if __name__ == "__main__":
    im_data = imagenet.ImageNetData()
    #image_names = im_data.get_all_val_image_names() + im_data.get_all_train_image_names() + im_data
    #image_names = im_data.get_all_val_image_names() + im_data.get_all_train_image_names() + im_data
    references = im_data.get_all_val_image_names()[:100]
    custom_test = make_test(references,
                            mod_fn=lambda x: x + np.random.randn(*x.shape),
                            metrics=['fc7'],
                            size='scaled_256',
                            num_extra_images=10,
                            exact=False)
    custom_test(top_k=10,
                extra_pairs=[("n02085936_7394.JPEG", "n02085936_10397.JPEG")])
    custom_test(top_k=10)

    references = im_data.get_all_train_image_names()[:100]
    custom_test = make_test(references,
예제 #19
0
import json
import os
import urllib.request

import imagenet
import utils

imgnet = imagenet.ImageNetData(load_class_info=False)


def lookup_wnid(wnid):
    """Query the image-net.org ``wordnet.synset.getwords`` API for a wnid.

    Args:
        wnid: Identifier substituted into the ``wnid`` query parameter.

    Returns:
        The response body decoded with the default codec, stripped of
        surrounding whitespace and split on newlines (a list of strings).
    """
    url = 'http://www.image-net.org/api/text/wordnet.synset.getwords?wnid={0}'.format(
        wnid)
    # Use the response as a context manager so the connection is closed
    # deterministically instead of whenever the object is garbage collected.
    with urllib.request.urlopen(url) as response:
        body = response.read()
    return body.decode().strip().split('\n')


# Fetch 'metadata/gloss.txt' through the S3 helper (requested with
# cache_on_local_disk=False) and decode it as UTF-8 text.
gloss_bytes = utils.get_s3_file_bytes('metadata/gloss.txt',
                                      cache_on_local_disk=False)
gloss_string = gloss_bytes.decode('utf-8')
gloss_lines = gloss_string.split('\n')
# Build a wnid -> gloss mapping: the first 9 characters of each line are the
# key and everything from index 10 onward is the value (the character at
# index 9 is presumably a separator -- confirm against the file format).
# NOTE(review): if the file ends with a newline, split('\n') yields a final
# empty line, which inserts an '' -> '' entry.
gloss = {}
for line in gloss_lines:
    wnid = line[:9]
    cur_gloss = line[10:]
    gloss[wnid] = cur_gloss

# Accumulator and sorted wnid list used by the loop that follows.
tmpci2 = []
wnids = sorted(imgnet.train_imgs_by_wnid.keys())

for ii, wnid in enumerate(wnids):
    cur_dict = {}
예제 #20
0
def _log_trainable_parameters(pt_model):
    """Write the number of trainable parameters of ``pt_model`` via tqdm."""
    tqdm.write('    Number of trainable parameters: {}'.format(
        sum(p.numel() for p in pt_model.parameters() if p.requires_grad)))


def _save_predictions(npy_out_filepath, predictions):
    """Save ``predictions`` as .npy, backing up and comparing any old file."""
    # exist_ok avoids the check-then-create race of the original code.
    os.makedirs(os.path.dirname(npy_out_filepath), exist_ok=True)
    if os.path.exists(npy_out_filepath):
        old_preds = np.load(npy_out_filepath)
        # Keep the previous predictions under a timestamped name.
        np.save(f'{npy_out_filepath}.{int(time.time())}', old_preds)
        print('checking old preds is same as new preds')
        if not np.allclose(old_preds, predictions):
            print('old preds != new preds')
        else:
            print('old preds == new_preds!')
    np.save(npy_out_filepath, predictions)
    tqdm.write('    Saved predictions to {}'.format(npy_out_filepath))


def _evaluate_and_save(pt_model, data_loader, dataset_filename, model):
    """Evaluate ``pt_model`` on ``data_loader``, report and save predictions."""
    predictions, top1_acc, top5_acc, total_time, num_images = eval_utils.evaluate_model(
        pt_model, data_loader, show_progress_bar=True)
    tqdm.write('    Evaluated {} images'.format(num_images))
    tqdm.write('    Top-1 accuracy: {:.2f}'.format(100.0 * top1_acc))
    tqdm.write('    Top-5 accuracy: {:.2f}'.format(100.0 * top5_acc))
    tqdm.write(
        '    Total time: {:.1f}  (average time per image: {:.2f} ms)'.format(
            total_time, 1000.0 * total_time / num_images))
    npy_out_filepath = pathlib.Path(
        __file__).parent / '../data/predictions' / dataset_filename / (
            model + '.npy')
    _save_predictions(npy_out_filepath.resolve(), predictions)


def eval(dataset, models, batch_size):
    """Evaluate pretrained models on a dataset and store their predictions.

    The dataset description is read from
    ``../data/datasets/<dataset>.json`` (relative to this file). For each
    model the predictions are written to
    ``../data/predictions/<dataset>/<model>.npy``. An existing predictions
    file is first copied to a timestamped backup and compared to the new
    predictions.

    Args:
        dataset: Dataset name, i.e. the JSON filename without ``.json``.
        models: ``'all'`` for every entry of ``all_models``, otherwise a
            comma-separated string of model names, each of which must be in
            ``all_models``.
        batch_size: Batch size passed to the data loaders.
    """
    dataset_filename = dataset
    if models == 'all':
        models = all_models
    else:
        models = models.split(',')
    for model in models:
        assert model in all_models

    dataset_filepath = pathlib.Path(__file__).parent / '../data/datasets' / (
        dataset_filename + '.json')
    print('Reading dataset from {} ...'.format(dataset_filepath))
    with open(dataset_filepath, 'r') as f:
        dataset_json = json.load(f)
    # Each entry of 'image_filenames' is a sequence whose first item is the
    # image filename.
    cur_imgs = [x[0] for x in dataset_json['image_filenames']]

    imgnet = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(load_metadata_from_s3=False,
                                       exclude_blacklisted_candidates=False)
    loader = image_loader.ImageLoader(imgnet, cds)

    # The returned bytes are not used below; presumably this call only
    # pre-downloads the images -- TODO confirm.
    pbar = tqdm(total=len(cur_imgs), desc='Dataset download')
    loader.load_image_bytes_batch(cur_imgs,
                                  size='scaled_500',
                                  verbose=False,
                                  download_callback=pbar.update)
    pbar.close()

    for model in tqdm(models, desc='Model evaluations'):
        if model not in extra_models:
            tqdm.write('Evaluating {}'.format(model))
            resize_size = 256
            center_crop_size = 224
            if model == 'inception_v3':
                resize_size = 299
                center_crop_size = 299
            data_loader = eval_utils.get_data_loader(
                cur_imgs,
                imgnet,
                cds,
                image_size='scaled_500',
                resize_size=resize_size,
                center_crop_size=center_crop_size,
                batch_size=batch_size)
            pt_model = getattr(torchvision.models, model)(pretrained=True)
            if torch.cuda.is_available():
                pt_model = pt_model.cuda()
            pt_model.eval()
            _log_trainable_parameters(pt_model)
        else:
            tqdm.write('Evaluating extra model {}'.format(model))
            if model in {"dpn68b", "dpn92", "dpn107"}:
                pt_model = pretrainedmodels.__dict__[model](
                    num_classes=1000, pretrained='imagenet+5k')
            else:
                pt_model = pretrainedmodels.__dict__[model](
                    num_classes=1000, pretrained='imagenet')
            tf_img = pretrained_utils.TransformImage(pt_model)
            _log_trainable_parameters(pt_model)

            # Named separately so the ``dataset`` parameter is not rebound.
            model_dataset = eval_utils.ImageLoaderDataset(cur_imgs,
                                                          imgnet,
                                                          cds,
                                                          'scaled_500',
                                                          transform=tf_img)
            data_loader = torch.utils.data.DataLoader(model_dataset,
                                                      batch_size=batch_size,
                                                      shuffle=False,
                                                      num_workers=0,
                                                      pin_memory=True)
            if torch.cuda.is_available():
                pt_model = pt_model.cuda()
            pt_model.eval()

        _evaluate_and_save(pt_model, data_loader, dataset_filename, model)
예제 #21
0
파일: mturk.py 프로젝트: wuxf-ml/ImageNetV2
def _generate_hits(candidates,
                   images_per_hit=25,
                   pos_control=0,
                   neg_control=0,
                   seed=0):
    """Build the list of dictionaries that fully specify the HITs.

    Candidates are grouped by wnid and split into lines of
    ``images_per_hit - pos_control - neg_control`` images. A short last line
    (when the class has more than one line) is padded with randomly chosen
    candidates from the earlier lines of the same class. Every line becomes
    one HIT dictionary, completed with validation images of the same wnid
    (positive controls) and of the wnid at index 1 of the class's entry in
    ``wnid_to_most_similar_wnids.json`` (negative controls).

    Args:
        candidates: Iterable of candidate ids, used as keys into
            ``CandidateData().all_candidates``.
        images_per_hit: Total number of images per HIT.
        pos_control: Number of positive control images per HIT.
        neg_control: Number of negative control images per HIT.
        seed: Seed passed to ``np.random.seed``.

    Returns:
        A list with one dictionary per HIT.

    Raises:
        AssertionError: If a wnid has no entry in the similar-wnid file, or
            (after all HITs were built) if any class has no wikipedia page.
    """
    cand_store = candidate_data.CandidateData()
    imgnet_store = imagenet.ImageNetData()
    with open("../data/metadata/wnid_to_most_similar_wnids.json") as f:
        similar_wnids = json.loads(f.read())
    by_wnid = defaultdict(list)
    np.random.seed(seed)
    hits = []
    print("Num Candidates ", len(candidates))
    for cand_id in candidates:
        entry = cand_store.all_candidates[cand_id]
        by_wnid[entry["wnid"]].append(entry)

    # Number of images per HIT that are actual candidates to label.
    slots_per_hit = images_per_hit - pos_control - neg_control
    missing_wiki = False
    for wnid, class_candidates in by_wnid.items():
        if len(imgnet_store.class_info_by_wnid[wnid].wikipedia_pages) == 0:
            print(f"no wikipedia page for {wnid}")
            missing_wiki = True

        hit_lines = list(utils.chunks(class_candidates, slots_per_hit))
        last_line = hit_lines[-1]
        tail_len = len(last_line)
        if tail_len != len(hit_lines[0]) and tail_len < slots_per_hit:
            # Fill the short tail with candidates drawn (without
            # replacement) from the part of the class before the tail.
            pad_idxs = np.random.choice(len(class_candidates) - tail_len,
                                        slots_per_hit - tail_len,
                                        replace=False)
            last_line.extend(class_candidates[i] for i in pad_idxs)

        for hit_line in hit_lines:
            hit_data = {
                "wnid": wnid,
                # lists of image ids, filled in below
                "images_to_label": [],
                "images_pos_control": [],
                "images_neg_control": [],
                "images_all": [],
                "user": getpass.getuser(),
                "uuid": str(uuid.uuid4()),
                "time": str(datetime.now(tzlocal())),
                "submitted": False,
                "hit_id": '',
                "hit_type_id": '',
            }

            val_by_wnid = imgnet_store.val_imgs_by_wnid
            pos_pool = val_by_wnid[wnid]
            # Slots left unfilled by candidates are split between extra
            # positive (rounded up) and extra negative (rounded down) images.
            free_slots = slots_per_hit - len(hit_line)
            pos_extra = int(np.ceil(free_slots / 2))
            neg_extra = int(np.floor(free_slots / 2))
            if free_slots == 0:
                assert pos_extra == 0
                assert neg_extra == 0

            pos_idxs = np.random.choice(len(pos_pool),
                                        pos_control + pos_extra,
                                        replace=False)
            assert wnid in similar_wnids
            neg_pool = val_by_wnid[similar_wnids[wnid][1]]
            neg_idxs = np.random.choice(len(neg_pool),
                                        neg_control + neg_extra,
                                        replace=False)

            labelled_ids = [image['id_ours'] for image in hit_line]
            # NOTE(review): the original author marked the positive-control
            # handling with "right now this won't work".
            pos_controls = [pos_pool[i] for i in pos_idxs]
            neg_controls = [neg_pool[i] for i in neg_idxs]

            hit_data["images_to_label"] = labelled_ids
            hit_data["images_pos_control"] = pos_controls
            hit_data["images_neg_control"] = neg_controls
            hit_data["images_all"] = labelled_ids + pos_controls + neg_controls
            np.random.shuffle(hit_data["images_all"])
            hits.append(hit_data)

    assert not missing_wiki
    return hits
예제 #22
0
def generate_opposite_class_json():
    """Write the result of ``get_all_negative_wnids`` for all training wnids.

    The wnids are the keys of ``ImageNetData().train_imgs_by_wnid``; the
    result is dumped as indented JSON to
    ``../data/metadata/wnid_to_farthest_wnid.json``.
    """
    train_wnids = [*imagenet.ImageNetData().train_imgs_by_wnid.keys()]
    farthest_by_wnid = get_all_negative_wnids(train_wnids)
    with open('../data/metadata/wnid_to_farthest_wnid.json', 'w') as out:
        json.dump(farthest_by_wnid, out, indent=2)
    def test(top_k=5, seed=586724699, extra_pairs=None):
        """Upload test candidates and assert the near-duplicate checker finds them.

        Relies on names from the enclosing scope (not visible here):
        ``references``, ``control_images``, ``num_extra_images``, ``prefix``,
        ``size``, ``exact``, ``mod_fn``, ``metrics`` and ``bucket``.

        Steps: sample images from ``references``, turn the first
        ``control_images`` of them into candidates via ``make_test_img``,
        upload each candidate (and, for the 'fc7' metric, its VGG16 features)
        to S3, write the candidate metadata to
        ``../data/search_results/test_<prefix>_results.json``, then run
        ``near_duplicate_checker.get_near_duplicates`` and assert that the
        top match of every candidate is the image it was derived from.

        Args:
            top_k: Forwarded to ``get_near_duplicates``.
            seed: Seed passed to ``np.random.seed``.
            extra_pairs: Optional iterable of ``(candidate, reference)``
                pairs added to the check as-is. Defaults to no pairs.

        Returns:
            The first value returned by ``get_near_duplicates``.

        Raises:
            AssertionError: If a top match differs from the expected image,
                or, in exact mode, if its distance is not close to 0.
        """
        # Avoid a mutable default argument; None means "no extra pairs".
        if extra_pairs is None:
            extra_pairs = []
        im_data = imagenet.ImageNetData()
        np.random.seed(seed)
        images = np.random.choice(references,
                                  control_images + num_extra_images,
                                  replace=False)
        extra_images = images[control_images:]
        images = images[:control_images]
        img_info = []
        client = utils.get_s3_client()
        # Maps a candidate id (or reference name) to its expected top match.
        true_dict = {}
        to_featurize = []
        to_featurize_keys = []
        for im_name in images:
            im_meta, img = make_test_img(im_data,
                                         im_name,
                                         prefix=prefix,
                                         size=size,
                                         exact=exact)
            true_dict[im_meta['id_ours']] = im_name
            img_info.append(im_meta)
            img_orig = img
            if not exact:
                img = mod_fn(img)
                img = resize(img, (256, 256), preserve_range=True)
            else:
                im_bytes = img
                img = imageio.imread(img)
            if 'fc7' in metrics:
                key_name = os.path.join("imagenet2candidates_featurized",
                                        f"{im_meta['id_ours']}.npy")
                # NOTE(review): features are computed from img_orig, i.e. the
                # value returned by make_test_img before mod_fn / imread is
                # applied -- confirm this is intended, in particular when
                # exact is true and img_orig has not been decoded.
                im_resize = resize(img_orig, (224, 224), preserve_range=True)
                to_featurize.append(im_resize.astype('float32'))
                to_featurize_keys.append(key_name)
            bio = io.BytesIO()
            if not exact:
                imageio.imwrite(uri=bio, im=img, format="jpg", quality=100)
                bstream = bio.getvalue()

            else:
                print("Exact bytes..")
                bstream = im_bytes
            key = "imagenet2candidates_scaled/{0}.jpg".format(
                im_meta['id_ours'])
            print("uploading.. to {0}".format(key))
            client.put_object(Bucket=bucket, Key=key, Body=bstream)
        if len(to_featurize) > 0:
            to_featurize = np.stack(to_featurize, axis=0)
            batch_size = min(len(to_featurize), 32)
            features = featurize.vgg16_features(to_featurize,
                                                batch_size=batch_size,
                                                use_gpu=False)
            for key_name, feature in zip(to_featurize_keys, features):
                bio = io.BytesIO()
                np.save(bio, feature)
                print("writing features key {0}".format(key_name))
                bstream = bio.getvalue()
                print("feature hash ", hashlib.sha1(bstream).hexdigest())
                client.put_object(Key=key_name, Bucket=bucket, Body=bstream)

        with open(
                "../data/search_results/test_{0}_results.json".format(prefix),
                "w+") as f:
            f.write(json.dumps(img_info))
        candidates = [x['id_ours'] for x in img_info]
        extra_images = list(extra_images)
        print("extra pairs", extra_pairs)
        print("len extra_images", len(extra_images))
        for e, v in extra_pairs:
            true_dict[e] = v
            candidates.append(e)
            extra_images.append(v)
            print("len after append extra_images", len(extra_images))

        # Reference images are expected to match themselves.
        for e in extra_images:
            true_dict[e] = e

        for e in images:
            true_dict[e] = e

        reference_names = list(images) + list(extra_images)
        print(
            f"running near duplicate check on {candidates} vs {reference_names}"
        )
        print(f"num references {len(references)}")
        res, t_info = near_duplicate_checker.get_near_duplicates(
            candidates,
            reference_names,
            top_k=top_k,
            dssim_window_size=35,
            use_pywren=False,
            ref_chunk_size=100,
            cd_chunk_size=100,
            distance_metrics=metrics)
        for m, val in res.items():
            for k, v in val.items():
                true_answer = true_dict[k]
                # v[0] is the top match: index 0 is the matched name and
                # index 1 the value compared against 0 in exact mode.
                result = v[0][0]
                if (true_answer != result):
                    print(m, val, k, v)
                print(f"expected {true_answer}, got {result}")
                print(v)
                assert true_answer == result
                print("Passed NDC for metric {0} for test {1}".format(
                    m, prefix))
                if (exact):
                    if (not np.isclose(v[0][1], 0)):
                        print(m, val, k, v)
                    assert np.isclose(v[0][1], 0)
        return res
예제 #24
0
def wnid_histogram(dataset_size, min_num_annotations_candidates,
                   min_num_annotations_val, min_num_val_images_per_wnid, seed,
                   output_filename, starting_from, allow_upward_sampling):
    """Sample a dataset with ``sample_wnid_histogram`` and write it to JSON.

    Calls ``dataset_sampling.sample_wnid_histogram`` with fixed histogram
    bins and near-duplicate review targets, prints a diagnostic report
    (bins with too few images, upward-sampled images, wnids that are still
    short) and writes the result to
    ``../data/datasets/<output_filename>`` relative to this file. The result
    is written even when sampling was not successful.

    Args:
        dataset_size: Requested number of images; forwarded to the sampler
            and divided by 1000 to get the per-class target in the report.
        min_num_annotations_candidates: Forwarded to the sampler.
        min_num_annotations_val: Forwarded to the sampler.
        min_num_val_images_per_wnid: Forwarded to the sampler.
        seed: Forwarded to the sampler.
        output_filename: Name of the output file; must not exist yet.
        starting_from: Optional filename of an existing dataset under
            ``../data/datasets`` whose JSON content is passed to the sampler
            as ``starting_from``; ``None`` to start from scratch.
        allow_upward_sampling: Forwarded to the sampler; also enables the
            upward-sampling section of the report.

    Raises:
        AssertionError: If the output file already exists or the
            ``starting_from`` file does not.
    """
    datasets_dir = pathlib.Path(__file__).parent / '../data/datasets'
    output_filepath = (datasets_dir / output_filename).resolve()
    assert not output_filepath.is_file()

    if starting_from is not None:
        starting_from_filepath = (datasets_dir / starting_from).resolve()
        assert starting_from_filepath.is_file()
        with open(starting_from_filepath, 'r') as f:
            starting_from_loaded = json.load(f)
    else:
        starting_from_loaded = None

    review_targets = {'l2': 1.2e8, 'dssim': 0.2205, 'fc7': 1.32e4}
    histogram_bins = [0.2, 0.4, 0.6, 0.8]
    # One more bin than there are boundaries.
    num_bins = len(histogram_bins) + 1
    success, result, results_metadata = dataset_sampling.sample_wnid_histogram(
        dataset_size=dataset_size,
        histogram_bins=histogram_bins,
        min_num_annotations_candidates=min_num_annotations_candidates,
        min_num_annotations_val=min_num_annotations_val,
        min_num_val_images_per_wnid=min_num_val_images_per_wnid,
        near_duplicate_review_targets=review_targets,
        seed=seed,
        starting_from=starting_from_loaded,
        allow_upward_sampling=allow_upward_sampling)
    imgnet = imagenet.ImageNetData()
    all_wnids = sorted(imgnet.class_info_by_wnid.keys())
    if not success:
        total_num_problematic_bins = 0
        print(
            f'Failed to sample a valid dataset  ({len(result["image_filenames"])} instead of {dataset_size} images).'
        )
        print(
            'The following wnid bins have insufficient images (before potential upward sampling):'
        )
        for wnid in all_wnids:
            cur_histogram = results_metadata['wnid_histograms'][wnid]
            cur_sampling_candidates = results_metadata['sampling_candidates'][
                wnid]
            cur_carried_over_from_prev = results_metadata[
                'carried_over_from_prev'][wnid]
            cur_exclusions = results_metadata['exclusions'][wnid]

            # A bin is problematic when its target exceeds the number of
            # sampling candidates plus images carried over.
            problematic_bins = [
                cur_bin for cur_bin in range(num_bins)
                if cur_histogram[cur_bin] >
                len(cur_sampling_candidates[cur_bin]) +
                len(cur_carried_over_from_prev[cur_bin])
            ]
            total_num_problematic_bins += len(problematic_bins)
            if len(problematic_bins) > 0:
                print('wnid {} ({})'.format(
                    wnid, ', '.join(imgnet.class_info_by_wnid[wnid].synset)))
                for cur_bin in problematic_bins:
                    cur_low, cur_high = dataset_sampling.get_bin_boundaries(
                        histogram_bins, cur_bin)
                    cur_valid = len(cur_sampling_candidates[cur_bin]) + len(
                        cur_carried_over_from_prev[cur_bin])
                    print(
                        '    bin ({} {}): target {}, currently have {}'.format(
                            cur_low, cur_high, cur_histogram[cur_bin],
                            cur_valid))
                    print('        {} sampling candidates'.format(
                        len(cur_sampling_candidates[cur_bin])))
                    print(
                        '        {} carried over from previous dataset'.format(
                            len(cur_carried_over_from_prev[cur_bin])))
                    for reason, excluded_candidates in cur_exclusions[
                            cur_bin].items():
                        print('        {}: {} excluded candidates'.format(
                            reason, len(excluded_candidates)))
                print()
        print(
            '{} problematic bins in total'.format(total_num_problematic_bins))

    if allow_upward_sampling:
        num_upward_sampled = 0
        print('\nUpward sampled the following images:')
        for wnid in all_wnids:
            upward_sampled_for_wnid = results_metadata['upward_sampled'][wnid]
            has_upsampled_bins = any(
                len(upward_sampled_for_wnid[cur_bin]) > 0
                for cur_bin in range(num_bins))
            if has_upsampled_bins:
                print('wnid {} ({})'.format(
                    wnid, ', '.join(imgnet.class_info_by_wnid[wnid].synset)))
                for cur_bin in range(num_bins):
                    cur_upward_sampled = upward_sampled_for_wnid[cur_bin]
                    for cid, to_bin in cur_upward_sampled:
                        original_low, original_high = dataset_sampling.get_bin_boundaries(
                            histogram_bins, cur_bin)
                        to_low, to_high = dataset_sampling.get_bin_boundaries(
                            histogram_bins, to_bin)
                        print(
                            f'    sampled {cid} belonging to bin ({original_low} {original_high}) from bin ({to_low} {to_high}) instead'
                        )
                        num_upward_sampled += 1
                print()
        print(f'\nUpwarded sampled {num_upward_sampled} images in total')
        if not success:
            print(
                'The following wnid have insufficient images even after upward sampling:'
            )
            num_per_class = dataset_size // 1000
            images_by_wnid = {}
            for img, wnid in result['image_filenames']:
                images_by_wnid.setdefault(wnid, []).append(img)
            for wnid in all_wnids:
                # A wnid with no sampled image has no entry at all; count it
                # as 0 instead of raising KeyError.
                num_sampled = len(images_by_wnid.get(wnid, []))
                if num_sampled < num_per_class:
                    print('    wnid {}: {} / {} images  ({})'.format(
                        wnid, num_sampled, num_per_class,
                        ', '.join(imgnet.class_info_by_wnid[wnid].synset)))

    result['output_filename'] = output_filename
    with open(output_filepath, 'w') as f:
        json.dump(result, f, indent=2)
    print('Wrote dataset to {}'.format(output_filepath))
예제 #25
0
def sample_wnid_histogram(*,
                          dataset_size,
                          histogram_bins,
                          min_num_annotations_candidates,
                          min_num_annotations_val,
                          min_num_val_images_per_wnid,
                          near_duplicate_review_targets,
                          seed,
                          starting_from=None,
                          allow_upward_sampling=False):
    """Sample a dataset whose per-wnid selection-frequency histogram matches a target.

    For every wnid, the per-bin target counts come from
    ``compute_wnid_histograms``. Eligible images from the previous dataset
    (``starting_from``) are carried over first, the rest of each bin is filled
    by seeded random sampling from the eligible candidates of that bin, and,
    if ``allow_upward_sampling`` is set, any remaining shortfall in a bin is
    filled from the nearest higher bin that still has unused candidates.

    Args:
        dataset_size: Total number of images; must be a multiple of 1000.
        histogram_bins: Bin description handed to ``get_histogram_bin``;
            there are ``len(histogram_bins) + 1`` bins.
        min_num_annotations_candidates: Minimum number of MTurk assignments a
            candidate needs for the wnid to be eligible.
        min_num_annotations_val: Forwarded to ``compute_wnid_histograms``.
        min_num_val_images_per_wnid: Forwarded to ``compute_wnid_histograms``.
        near_duplicate_review_targets: Mapping with exactly one entry per
            ``near_duplicate_data.metric_names``; a candidate counts as
            reviewed only if its review threshold is strictly greater than
            the target for every metric.
        seed: Seed for the ``random.Random`` instance driving all sampling.
        starting_from: Optional previous dataset (its ``'output_filename'``
            is recorded in the result); forwarded to
            ``get_prev_dataset_by_wnid``.
        allow_upward_sampling: Whether a bin that cannot be filled from its
            own candidates may draw from higher bins.

    Returns:
        A tuple ``(success, result, result_metadata)``. ``success`` is False
        if the histograms could not be computed successfully or some bin
        could not be filled; ``result`` is the JSON-serializable dataset
        description; ``result_metadata`` holds the histograms, candidates,
        exclusions, carried-over images and upward-sampled images.
    """
    num_classes = 1000
    assert dataset_size % num_classes == 0
    for metric in near_duplicate_data.metric_names:
        assert metric in near_duplicate_review_targets
    assert len(near_duplicate_review_targets) == len(
        near_duplicate_data.metric_names)
    num_per_class = dataset_size // num_classes
    num_bins = len(histogram_bins) + 1
    rng = random.Random(seed)
    imgnet = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(load_metadata_from_s3=False,
                                       exclude_blacklisted_candidates=False)
    mturk = mturk_data.MTurkData(live=True,
                                 load_assignments=True,
                                 source_filenames_to_ignore=mturk_data.
                                 main_collection_filenames_to_ignore)
    ndc = near_duplicate_data.NearDuplicateData(imgnet=imgnet,
                                                candidates=cds,
                                                mturk_data=mturk,
                                                load_review_thresholds=True)
    all_wnids = list(sorted(list(imgnet.class_info_by_wnid.keys())))

    success = True
    histograms_success, wnid_histograms, usable_val_imgs_by_wnid = compute_wnid_histograms(
        imgnet=imgnet,
        mturk=mturk,
        min_num_annotations_val=min_num_annotations_val,
        min_num_val_images_per_wnid=min_num_val_images_per_wnid,
        histogram_bins=histogram_bins,
        num_per_class=num_per_class)
    if not histograms_success:
        success = False
    prev_dataset_by_wnid = get_prev_dataset_by_wnid(starting_from,
                                                    dataset_size, all_wnids,
                                                    cds)

    def is_cid_ok(cid, wnid):
        """Return (eligible, exclusion_reason) for candidate `cid` under `wnid`."""
        if cid in cds.blacklist:
            return False, 'blacklisted'
        if cid not in mturk.image_num_assignments:
            return False, 'few_assignments'
        if wnid not in mturk.image_num_assignments[cid]:
            return False, 'few_assignments'
        if mturk.image_num_assignments[cid][
                wnid] < min_num_annotations_candidates:
            return False, 'few_assignments'
        if ndc.is_near_duplicate[cid]:
            return False, 'near_duplicate'
        sufficiently_reviewed = True
        if cid not in ndc.review_threshold:
            sufficiently_reviewed = False
        else:
            for metric in near_duplicate_data.metric_names:
                if metric not in ndc.review_threshold[cid]:
                    sufficiently_reviewed = False
                elif ndc.review_threshold[cid][
                        metric] <= near_duplicate_review_targets[metric]:
                    sufficiently_reviewed = False
        if not sufficiently_reviewed:
            return False, 'unreviewed'
        return True, None

    dataset_images = []
    sampling_candidates = {}
    exclusions = {}
    carried_over_from_prev = {}
    upward_sampled = {}
    for wnid in all_wnids:
        cur_target = wnid_histograms[wnid]
        exclusions[wnid] = {}
        for x in range(num_bins):
            exclusions[wnid][x] = OrderedDict([('blacklisted', []),
                                               ('few_assignments', []),
                                               ('below_threshold', []),
                                               ('near_duplicate', []),
                                               ('unreviewed', [])])
        carried_over_from_prev[wnid] = {x: [] for x in range(num_bins)}
        sampled_images_by_bin = {x: [] for x in range(num_bins)}

        # Step 1: carry over eligible images from the previous dataset,
        # subsampling a bin if the previous dataset exceeds its target.
        prev_by_bin = {x: [] for x in range(num_bins)}
        for cid in prev_dataset_by_wnid[wnid]:
            cur_freq = mturk.image_fraction_selected[cid][wnid]
            cur_bin = get_histogram_bin(cur_freq, histogram_bins)
            cur_ok, cur_reason = is_cid_ok(cid, wnid)
            if cur_ok:
                prev_by_bin[cur_bin].append(cid)
            else:
                exclusions[wnid][cur_bin][cur_reason].append(cid)
        for cur_bin in range(num_bins):
            if len(prev_by_bin[cur_bin]) <= cur_target[cur_bin]:
                sampled_images_by_bin[cur_bin].extend(prev_by_bin[cur_bin])
                carried_over_from_prev[wnid][cur_bin].extend(
                    prev_by_bin[cur_bin])
            else:
                cur_sample = rng.sample(prev_by_bin[cur_bin],
                                        cur_target[cur_bin])
                sampled_images_by_bin[cur_bin].extend(cur_sample)
                carried_over_from_prev[wnid][cur_bin].extend(cur_sample)

        # Ids already placed in the dataset via carry-over; built once so the
        # per-candidate check below is a single set lookup.
        carried_over_ids = set()
        for tmp_bin in range(num_bins):
            carried_over_ids.update(carried_over_from_prev[wnid][tmp_bin])

        # Step 2: collect the remaining eligible candidates per bin.
        sample_candidates_by_bin = {x: [] for x in range(num_bins)}
        unmodified_sample_candidates_by_bin = {x: [] for x in range(num_bins)}
        for cand in cds.candidates_by_wnid[wnid]:
            cid = cand['id_ours']
            if cid in mturk.image_fraction_selected and wnid in mturk.image_fraction_selected[
                    cid]:
                cur_freq = mturk.image_fraction_selected[cid][wnid]
            else:
                # Candidates without a recorded selection frequency are
                # binned as frequency 0.
                cur_freq = 0.0
            cur_bin = get_histogram_bin(cur_freq, histogram_bins)
            cur_ok, cur_reason = is_cid_ok(cid, wnid)
            if cur_ok:
                if cid not in carried_over_ids:
                    sample_candidates_by_bin[cur_bin].append(cid)
            else:
                exclusions[wnid][cur_bin][cur_reason].append(cid)

        # Step 3: fill each bin from its own candidates.
        for cur_bin in range(num_bins):
            # Sort so that the seeded sampling does not depend on the order
            # in which candidates were listed.
            sample_candidates_by_bin[cur_bin] = list(
                sorted(sample_candidates_by_bin[cur_bin]))
            unmodified_sample_candidates_by_bin[cur_bin] = copy.deepcopy(
                sample_candidates_by_bin[cur_bin])
            num_remaining_to_sample = cur_target[cur_bin] - len(
                sampled_images_by_bin[cur_bin])
            if num_remaining_to_sample > len(
                    sample_candidates_by_bin[cur_bin]):
                if not allow_upward_sampling:
                    success = False
                # Not enough candidates: take all of them.
                cur_sample = sample_candidates_by_bin[cur_bin]
                sample_candidates_by_bin[cur_bin] = []
            else:
                cur_sample = rng.sample(sample_candidates_by_bin[cur_bin],
                                        num_remaining_to_sample)
                # Keep the leftover pool sorted: iterating a set of ids has a
                # hash-dependent order, which would make later draws from
                # this pool irreproducible for a fixed seed.
                sample_candidates_by_bin[cur_bin] = sorted(
                    set(sample_candidates_by_bin[cur_bin]) - set(cur_sample))
            sampled_images_by_bin[cur_bin].extend(cur_sample)

        # Step 4 (optional): fill remaining shortfalls from higher bins.
        if allow_upward_sampling:
            upward_sampled[wnid] = []
            prev_ids = set(prev_dataset_by_wnid[wnid])
            for cur_bin in range(num_bins):
                cur_upward_sampled = []
                num_remaining_to_sample = cur_target[cur_bin] - len(
                    sampled_images_by_bin[cur_bin])
                if num_remaining_to_sample > 0:
                    assert len(sample_candidates_by_bin[cur_bin]) == 0
                for _ in range(num_remaining_to_sample):
                    found_bin = False
                    for next_bin in range(cur_bin + 1, num_bins):
                        if len(sample_candidates_by_bin[next_bin]) > 0:
                            # Prefer images that were in the previous dataset.
                            sample_candidates_from_prev = set(
                                sample_candidates_by_bin[next_bin]) & prev_ids
                            if len(sample_candidates_from_prev) > 0:
                                # Pick the smallest id rather than an
                                # arbitrary set element so the choice is
                                # deterministic.
                                cur_sample = [min(sample_candidates_from_prev)]
                                print(
                                    f'    upward sampled {cur_sample[0]} from the prev dataset'
                                )
                            else:
                                cur_sample = rng.sample(
                                    sample_candidates_by_bin[next_bin], 1)
                                print(
                                    f'    upward sampled {cur_sample[0]} randomly'
                                )
                            assert len(cur_sample) == 1
                            sampled_images_by_bin[cur_bin].extend(cur_sample)
                            sample_candidates_by_bin[next_bin] = sorted(
                                set(sample_candidates_by_bin[next_bin]) -
                                set(cur_sample))
                            cur_upward_sampled.append(
                                (cur_sample[0], next_bin))
                            found_bin = True
                            break
                    if not found_bin:
                        success = False
                upward_sampled[wnid].append(cur_upward_sampled)
        for cur_bin in range(num_bins):
            dataset_images.extend([x, wnid]
                                  for x in sampled_images_by_bin[cur_bin])
        sampling_candidates[wnid] = unmodified_sample_candidates_by_bin

    rng.shuffle(dataset_images)
    if len(dataset_images) > dataset_size:
        print(len(dataset_images), dataset_size)
    assert len(dataset_images) <= dataset_size
    if success:
        assert len(dataset_images) == dataset_size

    result = {}
    result['sampling_function'] = 'sample_wnid_histogram'
    result['target_size'] = dataset_size
    result['histogram_bins'] = histogram_bins
    result['min_num_annotations_candidates'] = min_num_annotations_candidates
    result['min_num_annotations_val'] = min_num_annotations_val
    result['min_num_val_images_per_wnid'] = min_num_val_images_per_wnid
    result['near_duplicate_review_targets'] = near_duplicate_review_targets
    result['time_string'] = get_time_string()
    result['username'] = getpass.getuser()
    result['seed'] = seed
    result['image_filenames'] = dataset_images
    result['is_valid'] = success
    result['allow_upward_sampling'] = allow_upward_sampling
    if starting_from is not None:
        result['starting_from'] = starting_from['output_filename']

    result_metadata = {}
    result_metadata['wnid_histograms'] = wnid_histograms
    result_metadata['usable_val_imgs_by_wnid'] = usable_val_imgs_by_wnid
    result_metadata['sampling_candidates'] = sampling_candidates
    result_metadata['exclusions'] = exclusions
    result_metadata['carried_over_from_prev'] = carried_over_from_prev
    result_metadata['upward_sampled'] = upward_sampled
    return success, result, result_metadata
예제 #26
0
def sample_above_threshold(*,
                           dataset_size,
                           selection_frequency_threshold,
                           min_num_annotations,
                           near_duplicate_review_targets,
                           seed,
                           starting_from=None,
                           wnid_thresholds=None):
    """Sample a dataset from candidates whose selection frequency meets a threshold.

    For every wnid, eligible images from the previous dataset
    (``starting_from``) are carried over and the remainder of the
    ``dataset_size // 1000`` images per class is drawn by seeded random
    sampling from the other eligible candidates.

    Args:
        dataset_size: Total number of images; must be a multiple of 1000.
        selection_frequency_threshold: Default minimum value of
            ``mturk.image_fraction_selected[cid][wnid]`` for eligibility.
        min_num_annotations: Minimum number of MTurk assignments a candidate
            needs for the wnid to be eligible.
        near_duplicate_review_targets: Mapping with exactly one entry per
            ``near_duplicate_data.metric_names``; a candidate counts as
            reviewed only if its review threshold is strictly greater than
            the target for every metric.
        seed: Seed for the ``random.Random`` instance driving all sampling.
        starting_from: Optional previous dataset (its ``'output_filename'``
            is recorded in the result); forwarded to
            ``get_prev_dataset_by_wnid``.
        wnid_thresholds: Optional mapping from wnid to a threshold that
            overrides ``selection_frequency_threshold`` for that wnid. Every
            key must be a known wnid.

    Returns:
        A tuple ``(success, result, sampling_candidates, exclusions,
        carried_over_from_prev)``. ``success`` is False if some wnid has too
        few eligible images; ``result`` is the JSON-serializable dataset
        description; the remaining three are per-wnid bookkeeping dicts.
    """
    num_classes = 1000
    assert dataset_size % num_classes == 0
    for metric in near_duplicate_data.metric_names:
        assert metric in near_duplicate_review_targets
    assert len(near_duplicate_review_targets) == len(
        near_duplicate_data.metric_names)
    num_per_class = dataset_size // num_classes
    rng = random.Random(seed)
    imgnet = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(load_metadata_from_s3=False,
                                       exclude_blacklisted_candidates=False)
    mturk = mturk_data.MTurkData(live=True,
                                 load_assignments=True,
                                 source_filenames_to_ignore=mturk_data.
                                 main_collection_filenames_to_ignore)
    ndc = near_duplicate_data.NearDuplicateData(imgnet=imgnet,
                                                candidates=cds,
                                                mturk_data=mturk,
                                                load_review_thresholds=True)

    def is_cid_ok(cid, wnid):
        """Return (eligible, exclusion_reason) for candidate `cid` under `wnid`."""
        if cid in cds.blacklist:
            return False, 'blacklisted'
        if cid not in mturk.image_num_assignments:
            return False, 'few_assignments'
        if wnid not in mturk.image_num_assignments[cid]:
            return False, 'few_assignments'
        if mturk.image_num_assignments[cid][wnid] < min_num_annotations:
            return False, 'few_assignments'
        # wnid_thresholds defaults to None; guard the membership test so the
        # default call does not raise TypeError.
        if wnid_thresholds is not None and wnid in wnid_thresholds:
            cur_threshold = wnid_thresholds[wnid]
        else:
            cur_threshold = selection_frequency_threshold
        if mturk.image_fraction_selected[cid][wnid] < cur_threshold:
            return False, 'below_threshold'
        if ndc.is_near_duplicate[cid]:
            return False, 'near_duplicate'
        sufficiently_reviewed = True
        for metric in near_duplicate_data.metric_names:
            if cid not in ndc.review_threshold or metric not in ndc.review_threshold[
                    cid]:
                sufficiently_reviewed = False
            elif ndc.review_threshold[cid][
                    metric] <= near_duplicate_review_targets[metric]:
                sufficiently_reviewed = False
        if not sufficiently_reviewed:
            return False, 'unreviewed'
        return True, None

    all_wnids = list(sorted(list(imgnet.class_info_by_wnid.keys())))
    if wnid_thresholds is not None:
        for wnid in wnid_thresholds.keys():
            assert wnid in all_wnids

    prev_dataset_by_wnid = get_prev_dataset_by_wnid(starting_from,
                                                    dataset_size, all_wnids,
                                                    cds)

    dataset_images = []
    sampling_candidates = {}
    exclusions = {}
    success = True
    carried_over_from_prev = {}
    for wnid in all_wnids:
        sampling_candidates[wnid] = []
        exclusions[wnid] = OrderedDict([('blacklisted', []),
                                        ('few_assignments', []),
                                        ('below_threshold', []),
                                        ('near_duplicate', []),
                                        ('unreviewed', [])])
        # Images from the previous dataset that are still eligible are kept.
        carried_over_from_prev[wnid] = []
        if wnid in prev_dataset_by_wnid:
            for cid in prev_dataset_by_wnid[wnid]:
                if is_cid_ok(cid, wnid)[0]:
                    carried_over_from_prev[wnid].append(cid)
        for cand in cds.candidates_by_wnid[wnid]:
            cid = cand['id_ours']
            cur_ok, cur_reason = is_cid_ok(cid, wnid)
            if cur_ok:
                if cid not in carried_over_from_prev[wnid]:
                    sampling_candidates[wnid].append(cid)
            else:
                exclusions[wnid][cur_reason].append(cid)
        # Sort so that the seeded sampling does not depend on the order in
        # which candidates were listed.
        sampling_candidates[wnid] = list(sorted(sampling_candidates[wnid]))
        remaining_to_sample = num_per_class - len(carried_over_from_prev[wnid])
        if len(sampling_candidates[wnid]) < remaining_to_sample:
            # Not enough candidates: take everything and mark the dataset
            # as invalid.
            success = False
            tmp_images = [(x, wnid) for x in carried_over_from_prev[wnid]
                          ] + [(x, wnid) for x in sampling_candidates[wnid]]
            dataset_images.extend(tmp_images)
        else:
            new_images = rng.sample(sampling_candidates[wnid],
                                    remaining_to_sample)
            tmp_images = [(x, wnid) for x in carried_over_from_prev[wnid]
                          ] + [(x, wnid) for x in new_images]
            dataset_images.extend(tmp_images)

    rng.shuffle(dataset_images)
    if success:
        assert len(dataset_images) == dataset_size

    result = {}
    result['sampling_function'] = 'sample_above_threshold'
    result['target_size'] = dataset_size
    result['selection_frequency_threshold'] = selection_frequency_threshold
    result['min_num_annotations'] = min_num_annotations
    result['near_duplicate_review_targets'] = near_duplicate_review_targets
    result['time_string'] = get_time_string()
    result['username'] = getpass.getuser()
    result['seed'] = seed
    result['image_filenames'] = dataset_images
    result['is_valid'] = success
    if starting_from is not None:
        result['starting_from'] = starting_from['output_filename']
    if wnid_thresholds is not None:
        result['wnid_thresholds'] = wnid_thresholds
    return success, result, sampling_candidates, exclusions, carried_over_from_prev