def main():
    initialize_dataset(args.dataset)
    build_symbolic_dataset = get_symbolic_dataset_builder(args.dataset)
    dataset = build_symbolic_dataset(args)

    if args.nr_vis is None:
        args.nr_vis = min(100, len(dataset))

    if args.random:
        # numpy-style API (size/replace kwargs); assumes `import numpy.random as random`.
        indices = random.choice(len(dataset), size=args.nr_vis, replace=False)
    else:
        indices = list(range(args.nr_vis))

    vis = HTMLTableVisualizer(args.data_vis_dir,
                              'Dataset: ' + args.dataset.upper())
    vis.begin_html()
    with vis.table('Metainfo', [
            HTMLTableColumnDesc('k', 'Key', 'text', {}, None),
            HTMLTableColumnDesc('v', 'Value', 'code', {}, None)
    ]):
        for k, v in args.__dict__.items():
            vis.row(k=k, v=v)

    with vis.table('Visualize', [
            HTMLTableColumnDesc('id', 'QuestionID', 'text', {}, None),
            HTMLTableColumnDesc('image', 'Image', 'figure', {'width': '100%'},
                                None),
            HTMLTableColumnDesc(
                'qa', 'QA', 'text', css=None, td_css={'width': '30%'}),
            HTMLTableColumnDesc(
                'p', 'Program', 'code', css=None, td_css={'width': '30%'})
    ]):
        for i in tqdm(indices):
            feed_dict = GView(dataset[i])
            image_filename = osp.join(args.data_image_root,
                                      feed_dict.image_filename)
            image = Image.open(image_filename)

            if 'objects' in feed_dict:
                fig, ax = vis_bboxes(image,
                                     feed_dict.objects,
                                     'object',
                                     add_text=False)
            else:
                fig, ax = vis_bboxes(image, [], 'object', add_text=False)
            _ = ax.set_title('object bounding box annotations')

            QA_string = """
                <p><b>Q</b>: {}</p>
                <p><b>A</b>: {}</p>
            """.format(feed_dict.question_raw, feed_dict.answer)
            P_string = '\n'.join([repr(x) for x in feed_dict.program_seq])

            vis.row(id=i, image=fig, qa=QA_string, p=P_string)
            plt.close()
    vis.end_html()

    logger.info(
        'Happy Holiday! You can find your result at "http://monday.csail.mit.edu/xiuming'
        + osp.realpath(args.data_vis_dir) + '".')
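
# A minimal argparse sketch (illustrative only) covering the flags the
# visualization main() above reads; the real script defines its arguments
# elsewhere, so the option names and defaults here are assumptions.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--dataset', required=True)
parser.add_argument('--nr-vis', type=int, default=None)       # rows to render
parser.add_argument('--random', action='store_true')          # sample rows randomly
parser.add_argument('--data-vis-dir', default='visualizations')
parser.add_argument('--data-image-root', default='data/images')
args = parser.parse_args()
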
def main():
    logger.critical('Loading the dataset.')
    data = io.load(args.caption)
    # Step 1: filter out images.
    images = {c['image_id'] for c in data['annotations']}
    # Step 2: build a reverse mapping for images.
    id2image = {i['id']: i for i in data['images']}
    images = [id2image[i] for i in images]

    import torchvision.transforms as T
    image_transform = T.Compose([
        T.Resize((args.image_size, args.image_size)),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    dataset = COCOImageDataset(images, args.image_root, image_transform)

    logger.critical('Building the model.')

    model = FeatureExtractor()
    if args.use_gpu:
        model.cuda()
        if args.gpu_parallel:
            from jactorch.parallel import JacDataParallel
            model = JacDataParallel(model, device_ids=args.gpus).cuda()
        cudnn.benchmark = True

    model.eval()
    dataloader = dataset.make_dataloader(args.batch_size,
                                         shuffle=False,
                                         drop_last=False,
                                         nr_workers=args.data_workers)
    output_file = io.open_h5(args.output, 'w')
    writer = AsyncWriter(output_file, total_size=len(dataset))

    for feed_dict in tqdm(dataloader,
                          total=len(dataloader),
                          desc='Extracting features'):
        if args.use_gpu:
            feed_dict = async_copy_to(feed_dict, 0)

        with torch.no_grad():
            output_dict = model(feed_dict)

        writer.feed(output_dict)

    writer.join()
    output_file.close()

    io.dump(args.output_images_json, images)
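
# A rough sketch of reading the extracted features back. The HDF5 dataset
# names depend on whatever keys FeatureExtractor puts in output_dict, so
# 'features' below is a placeholder; inspect list(f.keys()) first.
import json
import h5py

def load_extracted_features(h5_path, images_json_path):
    with open(images_json_path) as fp:
        images = json.load(fp)          # the list dumped via io.dump above
    f = h5py.File(h5_path, 'r')
    print('Available HDF5 datasets:', list(f.keys()))
    # features = f['features'][...]     # placeholder key, see note above
    return images, f
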
def gen_vocab(dataset):
    all_words = set()
    for i in tqdm(range(len(dataset)), desc='Building the vocab'):
        metainfo = dataset.get_metainfo(i)
        for w in metainfo['question_tokenized']:
            all_words.add(w)

    import jaclearn.embedding.constant as const
    vocab = Vocab()
    vocab.add(const.EBD_ALL_ZEROS)
    for w in sorted(all_words):
        vocab.add(w)
    for w in [const.EBD_UNKNOWN, const.EBD_BOS, const.EBD_EOS]:
        vocab.add(w)
    for w in gdef.extra_embeddings:
        vocab.add(w)

    return vocab
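
# For reference, a dependency-free sketch of the same idea as gen_vocab:
# collect every question token, then assign stable indices with a few special
# symbols reserved up front. The token names ('<pad>', '<unk>', ...) are
# simplifications, not the jaclearn constants used above.
def gen_vocab_plain(token_lists):
    words = sorted({w for tokens in token_lists for w in tokens})
    specials = ['<pad>', '<unk>', '<bos>', '<eos>']
    return {w: i for i, w in enumerate(specials + words)}
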
def save_images(processed):
    # processed is a list of dicts, one per example: images, question, answers.
    vis = HTMLTableVisualizer(args.data_vis_dir,
                              'Dataset: ' + args.dataset.upper())
    vis.begin_html()

    indices = range(len(processed))

    # if qa is None:
    #     with vis.table('Visualize', [
    #         HTMLTableColumnDesc('scene', 'Scene', 'figure', {'width': '50%'},None),
    #         HTMLTableColumnDesc('object', 'Attention', 'figure', {'width': '50%'},None),
    #     ]):
    #         for i in tqdm(indices):
    #             scene_image = images[i][0]
    #             object_image = images[i][1]

    #             scene_fig, ax = vis_bboxes(scene_image, [], 'object', add_text=False)
    #             object_fig, ax = vis_bboxes(object_image, [], 'object', add_text=False)

    #             vis.row(scene=scene_fig, object=object_fig)
    #             plt.close()
    #     vis.end_html()

    # else:
    with vis.table('Visualize', [
            HTMLTableColumnDesc('scene', 'Scene', 'figure', {'width': '80%'},
                                None),
            HTMLTableColumnDesc('object', 'Attention', 'figure',
                                {'width': '80%'}, None),
            HTMLTableColumnDesc('accurate',
                                'Accuracy',
                                'text',
                                css=None,
                                td_css={'width': '30%'}),
            HTMLTableColumnDesc(
                'qa', 'QA', 'text', css=None, td_css={'width': '30%'}),
    ]):
        for i in tqdm(indices):
            d = processed[i]
            scene_image = d['original_image']
            object_image = d['attention_image']
            question = d['question']
            guessed_answer = d['guessed_answer']
            true_answer = d['correct_answer']
            model_correct = d['correct']
            relational = d['relational']

            scene_fig, ax = vis_bboxes(scene_image, [],
                                       'object',
                                       add_text=False)
            object_fig, ax = vis_bboxes(object_image, [],
                                        'object',
                                        add_text=False)

            accurate_string = """
                    <p><b>Model is correct</b>: {}</p>
                    <p><b>Question is relational</b>: {}</p>
                """.format(model_correct, relational)

            QA_string = """
                    <p><b>Q</b>: {}</p>
                    <p><b>Guessed answer</b>: {}</p>
                    <p><b>True answer</b>: {}</p>
                """.format(question, guessed_answer, true_answer)

            vis.row(scene=scene_fig,
                    object=object_fig,
                    accurate=accurate_string,
                    qa=QA_string)
            plt.close()
    vis.end_html()
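
# The loop above expects each element of `processed` to be a dict with the
# keys accessed there; the entry below is a hypothetical example showing the
# intended shapes (the two image values are whatever vis_bboxes accepts,
# e.g. PIL images or numpy arrays).
example_entry = {
    'original_image': None,      # full scene image (placeholder)
    'attention_image': None,     # scene with the model's attention overlaid
    'question': 'What color is the cube left of the sphere?',
    'guessed_answer': 'red',
    'correct_answer': 'blue',
    'correct': False,            # whether the model answered correctly
    'relational': True,          # whether the question involves object relations
}
# save_images([example_entry])   # would render one HTML row (needs real images)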