def main():
    initialize_dataset(args.dataset)
    build_symbolic_dataset = get_symbolic_dataset_builder(args.dataset)
    dataset = build_symbolic_dataset(args)

    if args.nr_vis is None:
        args.nr_vis = min(100, len(dataset))

    if args.random:
        # NB: the stdlib random.choice does not accept size=/replace=; this needs numpy.
        indices = np.random.choice(len(dataset), size=args.nr_vis, replace=False)
    else:
        indices = list(range(args.nr_vis))

    vis = HTMLTableVisualizer(args.data_vis_dir, 'Dataset: ' + args.dataset.upper())
    vis.begin_html()

    with vis.table('Metainfo', [
        HTMLTableColumnDesc('k', 'Key', 'text', {}, None),
        HTMLTableColumnDesc('v', 'Value', 'code', {}, None)
    ]):
        for k, v in args.__dict__.items():
            vis.row(k=k, v=v)

    with vis.table('Visualize', [
        HTMLTableColumnDesc('id', 'QuestionID', 'text', {}, None),
        HTMLTableColumnDesc('image', 'Image', 'figure', {'width': '100%'}, None),
        HTMLTableColumnDesc('qa', 'QA', 'text', css=None, td_css={'width': '30%'}),
        HTMLTableColumnDesc('p', 'Program', 'code', css=None, td_css={'width': '30%'})
    ]):
        for i in tqdm(indices):
            feed_dict = GView(dataset[i])
            image_filename = osp.join(args.data_image_root, feed_dict.image_filename)
            image = Image.open(image_filename)

            if 'objects' in feed_dict:
                fig, ax = vis_bboxes(image, feed_dict.objects, 'object', add_text=False)
            else:
                fig, ax = vis_bboxes(image, [], 'object', add_text=False)
            _ = ax.set_title('object bounding box annotations')

            QA_string = """
                <p><b>Q</b>: {}</p>
                <p><b>A</b>: {}</p>
            """.format(feed_dict.question_raw, feed_dict.answer)
            P_string = '\n'.join([repr(x) for x in feed_dict.program_seq])

            vis.row(id=i, image=fig, qa=QA_string, p=P_string)
            plt.close(fig)  # close the figure explicitly so figures do not accumulate

    vis.end_html()
    logger.info('Happy Holiday! You can find your result at "http://monday.csail.mit.edu/xiuming' +
                osp.realpath(args.data_vis_dir) + '".')
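
# For reference, a minimal stand-in for `vis_bboxes` as used above. This is a
# hedged sketch, not the project's actual implementation: it assumes each box
# is an (x, y, w, h) tuple in pixel coordinates, and only illustrates the
# technique (render the image on a matplotlib axis, overlay one Rectangle per box).
import matplotlib.pyplot as plt
import matplotlib.patches as patches

def vis_bboxes_sketch(image, boxes, label, add_text=False):
    fig, ax = plt.subplots()
    ax.imshow(image)
    for j, (x, y, w, h) in enumerate(boxes):
        ax.add_patch(patches.Rectangle((x, y), w, h, fill=False, edgecolor='red', linewidth=2))
        if add_text:
            ax.text(x, y, '{} {}'.format(label, j), color='red')
    ax.axis('off')
    return fig, ax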
def main():
    logger.critical('Loading the dataset.')
    data = io.load(args.caption)

    # Step 1: collect the ids of all images that have at least one annotation.
    images = {c['image_id'] for c in data['annotations']}
    # Step 2: build a reverse mapping from image id to the full image record.
    id2image = {i['id']: i for i in data['images']}
    images = [id2image[i] for i in images]

    import torchvision.transforms as T
    image_transform = T.Compose([
        T.Resize((args.image_size, args.image_size)),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])  # ImageNet statistics
    ])
    dataset = COCOImageDataset(images, args.image_root, image_transform)

    logger.critical('Building the model.')
    model = FeatureExtractor()
    if args.use_gpu:
        model.cuda()
        if args.gpu_parallel:
            from jactorch.parallel import JacDataParallel
            model = JacDataParallel(model, device_ids=args.gpus).cuda()
        cudnn.benchmark = True
    model.eval()

    dataloader = dataset.make_dataloader(args.batch_size, shuffle=False, drop_last=False,
                                         nr_workers=args.data_workers)
    output_file = io.open_h5(args.output, 'w')
    writer = AsyncWriter(output_file, total_size=len(dataset))

    for feed_dict in tqdm(dataloader, total=len(dataloader), desc='Extracting features'):
        if args.use_gpu:
            feed_dict = async_copy_to(feed_dict, 0)
        with torch.no_grad():
            output_dict = model(feed_dict)
        writer.feed(output_dict)

    writer.join()
    output_file.close()

    io.dump(args.output_images_json, images)
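
# A minimal read-back sketch for the HDF5 file written above. The dataset key
# ('features') and its layout are assumptions -- they depend on what
# FeatureExtractor returns and how AsyncWriter lays out output_dict -- so treat
# this only as an illustration of consuming the extracted features.
import h5py

with h5py.File('features.h5', 'r') as f:   # hypothetical path for args.output
    print(list(f.keys()))                  # inspect the keys actually written
    feats = f['features']                  # assumed key; shape: (nr_images, ...)
    first = feats[0]                       # rows align with the order of `images`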
def gen_vocab(dataset):
    all_words = set()
    for i in tqdm(range(len(dataset)), desc='Building the vocab'):
        metainfo = dataset.get_metainfo(i)
        for w in metainfo['question_tokenized']:
            all_words.add(w)

    import jaclearn.embedding.constant as const
    vocab = Vocab()
    vocab.add(const.EBD_ALL_ZEROS)  # padding token, guaranteed to sit at index 0
    for w in sorted(all_words):
        vocab.add(w)
    for w in [const.EBD_UNKNOWN, const.EBD_BOS, const.EBD_EOS]:
        vocab.add(w)
    for w in gdef.extra_embeddings:
        vocab.add(w)
    return vocab
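
# The index layout gen_vocab produces, sketched with a plain dict instead of
# jaclearn's Vocab (whose lookup API is not shown here): the all-zeros padding
# token lands at index 0, dataset words follow in sorted order, and the
# UNKNOWN/BOS/EOS markers plus any gdef.extra_embeddings come last.
def gen_vocab_layout_sketch(all_words, extra_embeddings=()):
    tokens = ['<all_zeros>']                  # stand-in for const.EBD_ALL_ZEROS
    tokens += sorted(all_words)
    tokens += ['<unk>', '<bos>', '<eos>']     # stand-ins for the jaclearn constants
    tokens += list(extra_embeddings)
    return {w: idx for idx, w in enumerate(tokens)}

# e.g. gen_vocab_layout_sketch({'red', 'cube'}) ->
#      {'<all_zeros>': 0, 'cube': 1, 'red': 2, '<unk>': 3, '<bos>': 4, '<eos>': 5}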
def save_images(processed):
    # `processed` is a list of dicts, one per example (see the keys unpacked below).
    vis = HTMLTableVisualizer(args.data_vis_dir, 'Dataset: ' + args.dataset.upper())
    vis.begin_html()
    indices = range(len(processed))

    with vis.table('Visualize', [
        HTMLTableColumnDesc('scene', 'Scene', 'figure', {'width': '80%'}, None),
        HTMLTableColumnDesc('object', 'Attention', 'figure', {'width': '80%'}, None),
        HTMLTableColumnDesc('accurate', 'Accuracy', 'text', css=None, td_css={'width': '30%'}),
        HTMLTableColumnDesc('qa', 'QA', 'text', css=None, td_css={'width': '30%'}),
    ]):
        for i in tqdm(indices):
            d = processed[i]
            scene_image = d['original_image']
            object_image = d['attention_image']
            question = d['question']
            guessed_answer = d['guessed_answer']
            true_answer = d['correct_answer']
            model_correct = d['correct']
            relational = d['relational']

            scene_fig, ax = vis_bboxes(scene_image, [], 'object', add_text=False)
            object_fig, ax = vis_bboxes(object_image, [], 'object', add_text=False)

            accurate_string = """
                <p><b>Model is correct</b>: {}</p>
                <p><b>Question is relational</b>: {}</p>
            """.format(model_correct, relational)
            QA_string = """
                <p><b>Q</b>: {}</p>
                <p><b>Guessed answer</b>: {}</p>
                <p><b>True answer</b>: {}</p>
            """.format(question, guessed_answer, true_answer)

            vis.row(scene=scene_fig, object=object_fig, accurate=accurate_string, qa=QA_string)
            plt.close(scene_fig)   # close both figures so they do not accumulate in memory
            plt.close(object_fig)

    vis.end_html()
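
# The shape of one entry of `processed`, reconstructed from the keys
# save_images reads above; the value types and sample values are illustrative.
example_entry = {
    'original_image': None,        # PIL image / array of the input scene
    'attention_image': None,       # PIL image / array with the attention overlay
    'question': 'What color is the cube?',
    'guessed_answer': 'red',
    'correct_answer': 'red',
    'correct': True,               # whether the model's answer matched
    'relational': False,           # whether the question involves object relations
}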