def __init__(self, split, transform=None, word_encoder=None, is_train=True, caption_max_len=35):
    """
    :param split: split, one of 'train', 'val', 'test'
    :param transform: image transform pipeline
    :param word_encoder: WordEncoder instance; a new one is created if None
    :param is_train: whether the dataset is used for training
    :param caption_max_len: maximum caption length in tokens
    """
    TextureDescriptionData.__init__(self, phid_format=None)
    self.transform = transform
    self.is_train = is_train
    self.caption_max_len = caption_max_len
    self.split = split
    assert self.split in ('train', 'val', 'test')
    self.word_encoder = word_encoder
    if self.word_encoder is None:
        self.word_encoder = WordEncoder()
    # enumerate (image, description) index pairs across the split
    self.img_desc_ids = list()
    for img_i, img_name in enumerate(self.img_splits[split]):
        desc_num = len(self.img_data_dict[img_name]['descriptions'])
        self.img_desc_ids += [(img_i, desc_i) for desc_i in range(desc_num)]
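# A tiny self-contained illustration (hypothetical counts, not from the real
# data) of the (img_i, desc_i) index list built above: every description of
# every image in the split becomes one sample.
_demo_desc_nums = [3, 2]  # image 0 has 3 descriptions, image 1 has 2
_demo_pairs = [(img_i, desc_i)
               for img_i, n in enumerate(_demo_desc_nums)
               for desc_i in range(n)]
assert _demo_pairs == [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1)]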
def analyze_caption(dataset=None, wc=None, cm_range=(0, 1)):
    metric_name = 'caption_Meteor'
    if dataset is None:
        dataset = TextureDescriptionData(phid_format='set')
    with open('output/show_attend_tell/results/pred_v2_last_beam5_test.json', 'r') as f:
        pred_captions = json.load(f)
    analyzer = SubsetAnalyzer(metric_name)
    # scorer = Bleu(4)
    scorer = Meteor()
    for img_i, img_name in tqdm(enumerate(dataset.img_splits['test']), total=len(dataset.img_splits['test']),
                                desc='analyzing %s over images' % metric_name):
        img_data = dataset.img_data_dict[img_name]
        phids = img_data['phrase_ids']
        phrases = [dataset.phid_to_phrase(i) for i in phids]
        gt_captions = img_data['descriptions']
        pred_caption = pred_captions[img_name][0]
        score, _ = scorer.compute_score({0: gt_captions}, {0: [pred_caption]})
        # bleu4 = score[3]
        meteor = score
        analyzer.update(value=meteor, img_names=[img_name], phrases=phrases)
    wc = analyzer.report('visualizations/subset/' + metric_name, wc=wc, cm_range=cm_range)
    return wc
def analyze_phrase_retrieval(dataset=None, wc=None, models=('cls', 'tri'), cm_range=(0, 1)):
    metric_name = 'phrase_retrieval_ap'
    if dataset is None:
        dataset = TextureDescriptionData(phid_format='set')
    gt_matrix = dataset.get_img_phrase_match_matrices('test')
    for m in models:
        neg_distances = np.load(model_preds[m])
        # neg_distances = np.load('output/triplet_match/c34_bert_l2_s_lr0.00001/eval_visualize_test/pred_scores.npy')
        match_scores = neg_distances
        analyzer = SubsetAnalyzer(metric_name)
        for img_i, img_name in tqdm(enumerate(dataset.img_splits['test']), total=len(dataset.img_splits['test']),
                                    desc='analyzing %s with %s' % (metric_name, m)):
            img_data = dataset.img_data_dict[img_name]
            phid_set = img_data['phrase_ids']
            phrases = [dataset.phid_to_phrase(i) for i in phid_set]
            phrase_idx_sorted = np.argsort(-match_scores[img_i, :])
            i2p_correct = gt_matrix[img_i, phrase_idx_sorted]
            ap = average_precision(i2p_correct)
            analyzer.update(value=ap, img_names=[img_name], phrases=phrases)
        wc = analyzer.report('visualizations/subset/%s__%s' % (metric_name, m), wc=None, cm_range=cm_range)
    return wc
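# A minimal, self-contained sketch of what average_precision above is assumed
# to compute: given a binary relevance vector already sorted by predicted
# score, AP averages precision@k over the ranks k of the relevant items.
# Illustrative re-implementation, not necessarily the repo's exact function.
import numpy as np

def average_precision_sketch(correct_sorted):
    correct_sorted = np.asarray(correct_sorted, dtype=float)
    if correct_sorted.sum() == 0:
        return 0.0
    cum_hits = np.cumsum(correct_sorted)                         # relevant items within top-k
    prec_at_k = cum_hits / (np.arange(correct_sorted.size) + 1)  # precision@k at every rank
    return float((prec_at_k * correct_sorted).sum() / correct_sorted.sum())

# e.g. average_precision_sketch([1, 0, 1, 0]) == (1/1 + 2/3) / 2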
def analyze_image_retrieval(dataset=None, wc=None, models=('cls', 'tri'), cm_range=(0, 1)):
    metric_name = 'image_retrieval_ap'
    if dataset is None:
        dataset = TextureDescriptionData(phid_format='set')
    gt_matrix = dataset.get_img_phrase_match_matrices('test')  # img_num x phrase_num
    for m in models:
        pred_scores = np.load(model_preds[m])
        # pred_scores = np.load('output/naive_classify/v1_35_ft2,4_fc512_tuneTrue/eval_visualize_test/pred_scores.npy')
        match_scores = pred_scores
        analyzer = SubsetAnalyzer(metric_name)
        for ph_i, ph in tqdm(enumerate(dataset.phrases), total=len(dataset.phrases),
                             desc='analyzing %s with %s' % (metric_name, m)):
            gt_img_names = [dataset.img_splits['test'][i] for i in range(gt_matrix.shape[0]) if gt_matrix[i, ph_i]]
            img_idx_sorted = np.argsort(-match_scores[:, ph_i])
            p2i_correct = gt_matrix[img_idx_sorted, ph_i]
            ap = average_precision(p2i_correct)
            analyzer.update(value=ap, img_names=gt_img_names, phrases=[ph])
        wc = analyzer.report('visualizations/subset/%s__%s' % (metric_name, m), wc=None, cm_range=cm_range)
    return wc
def __init__(self, split='train', lang_input='phrase', neg_img=True, neg_lang=True):
    data.Dataset.__init__(self)
    TextureDescriptionData.__init__(self, phid_format='str')
    self.split = split
    self.lang_input = lang_input
    self.neg_img = neg_img
    self.neg_lang = neg_lang
    self.img_transform = build_transforms(is_train=False)
    self.pos_pairs = list()
    for img_i, img_name in enumerate(self.img_splits[self.split]):
        img_data = self.img_data_dict[img_name]
        if self.lang_input == 'phrase':
            self.pos_pairs += [(img_i, ph) for ph in img_data['phrase_ids']]
        elif self.lang_input == 'description':
            self.pos_pairs += [(img_i, desc_idx) for desc_idx in range(len(img_data['descriptions']))]
        else:
            raise NotImplementedError
    return
def __init__(self, split='train', is_train=True, cached_resnet_feats=None):
    data.Dataset.__init__(self)
    TextureDescriptionData.__init__(self, phid_format='set')
    self.split = split
    self.is_train = is_train
    self.cached_resnet_feats = cached_resnet_feats
    self.use_cache = self.cached_resnet_feats is not None and len(self.cached_resnet_feats) > 0
    self.transform = None
    if not self.use_cache:
        self.transform = build_transforms(is_train)
    print('PhraseClassifyDataset initialized.')
def __init__(self, model=None, img_transform=None, device='cuda',
             trained_path='output/triplet_match/c34_bert_l2_s_lr0.00001',
             model_file='BEST_checkpoint.pth', split_to_phrases=False, dataset=None):
    if model is None:
        model, device = load_model(trained_path, model_file)
    model.eval()
    self.model = model
    self.device = device
    if img_transform is None:
        img_transform = build_transforms(is_train=False)
    self.img_transform = img_transform
    self.split_to_phrases = split_to_phrases
    if dataset is None:
        dataset = TextureDescriptionData(phid_format=None)
    self.dataset = dataset
    self.ph_vec_dict = None
    if self.split_to_phrases:
        ph_vecs = get_phrase_vecs(self.model, self.dataset)
        self.ph_vec_dict = {dataset.phrases[i]: ph_vecs[i] for i in range(len(dataset.phrases))}
    self.img_vecs = dict()
    return
def compare_top_k(split, top_k=5):
    dataset = TextureDescriptionData()
    gt_captions = dict()
    for img_name in dataset.img_splits[split]:
        img_data = dataset.img_data_dict[img_name]
        gt_captions[img_name] = img_data['descriptions']
        # gt_captions[img_name] = list()
        # for desc in img_data['descriptions']:
        #     cap = ' '.join(WordEncoder.tokenize(desc))
        #     gt_captions[img_name].append(cap)
    for model_type in ('tri', 'cls'):
        if model_type == 'tri':  # use '==' for string comparison, not 'is'
            model, _ = tri_load_model()
        else:
            model, _ = cls_load_model(dataset=dataset)
        # for top_k in range(1, 11):
        print('**** %s : top %d ****' % (model_type, top_k))
        predictions = top_k_caption(top_k, model_type=model_type, model=model, dataset=dataset, split=split)
        print(list(predictions.items())[0])
        compute_metrics(gt_captions, predictions)
    return
def retrieve_with_desc_eval(pred_fn, dataset=None, split='val'):
    """
    INPUT:
    pred_fn: prediction function; input (desc, split='test'), output scores over images
    dataset: instance of TextureDescriptionData
    split: default is 'val'. The scores returned by pred_fn should cover all imgs in this split
    """
    if dataset is None:
        dataset = TextureDescriptionData(phid_format='set')
    rrs = list()
    for img_i, img_name in tqdm(enumerate(dataset.img_splits[split]), total=len(dataset.img_splits[split]),
                                desc='computing mrr in retrieve_with_desc'):
        img_data = dataset.img_data_dict[img_name]
        for desc in img_data['descriptions']:
            pred_scores = pred_fn(desc, split=split)
            if np.all(pred_scores == 0):
                rrs.append(0)
            else:
                # rank of the ground-truth image = 1 + number of images scored strictly higher
                r = 1
                for s in pred_scores:
                    if s > pred_scores[img_i]:
                        r += 1
                rrs.append(1.0 / r)
    mrr = np.mean(rrs)
    print('mean reciprocal rank on %s: %f' % (split, mrr))
    return mrr
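# Vectorized equivalent of the rank loop above (illustrative; assumes
# pred_scores is a 1-D numpy array and gt_idx indexes the ground-truth image):
# the rank is 1 plus the number of images scored strictly higher.
import numpy as np

def reciprocal_rank_sketch(pred_scores, gt_idx):
    rank = 1 + int(np.sum(pred_scores > pred_scores[gt_idx]))
    return 1.0 / rank

# e.g. reciprocal_rank_sketch(np.array([0.2, 0.9, 0.5]), gt_idx=2) == 0.5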
def analyze_image_retrieval_desc(dataset=None, wc=None, cm_range=(0, 1)):
    metric_name = 'image_retrieval_desc_mrr'
    analyzer_cache_path = 'output/triplet_match/da3_bert_lr0.00001/subset_analyze_img_ret_desc.pkl'
    if os.path.exists(analyzer_cache_path):
        # if False:
        with open(analyzer_cache_path, 'rb') as f:
            analyzer = pickle.load(f)
    else:
        if dataset is None:
            dataset = TextureDescriptionData(phid_format=None)
        tri_desc_retriever = RetrieveImgFromDesc(dataset=dataset, split_to_phrases=False,
                                                 trained_path='output/triplet_match/da3_bert_lr0.00001')
        analyzer = SubsetAnalyzer(metric_name)
        for img_i, img_name in tqdm(enumerate(dataset.img_splits['test']), total=len(dataset.img_splits['test']),
                                    desc='analyzing %s over images' % metric_name):
            img_data = dataset.img_data_dict[img_name]
            for desc in img_data['descriptions']:
                pred_scores = tri_desc_retriever(desc, split='test')
                if np.all(pred_scores == 0):
                    v = 0
                else:
                    r = 1
                    for s in pred_scores:
                        if s > pred_scores[img_i]:
                            r += 1
                    v = 1.0 / r
                phrases = dataset.description_to_phrases(desc)
                analyzer.update(value=v, img_names=[img_name], phrases=phrases, desc=desc)
        with open(analyzer_cache_path, 'wb') as f:
            pickle.dump(analyzer, f)
    wc = analyzer.report('visualizations/subset/' + metric_name, wc=wc, cm_range=cm_range)
    return wc
def retrieve_img_desc_compare(split='test'):
    dataset = TextureDescriptionData()
    cls_retriever = ClsRetrieveImgFromDesc(dataset=dataset)
    tri_retriever = TriRetrieveImgFromDesc(dataset=dataset, split_to_phrases=True)
    tri_desc_retriever = TriRetrieveImgFromDesc(dataset=dataset, split_to_phrases=False,
                                                trained_path='output/triplet_match/da3_bert_lr0.00001')
    for fn in [tri_retriever, tri_desc_retriever, cls_retriever]:
        retrieve_with_desc_eval(pred_fn=fn, dataset=dataset, split=split)
    return
def top_k_caption(top_k=5, model_type='cls', model=None, dataset=None, split='val'):
    if dataset is None:
        print('top_k_caption load dataset')
        dataset = TextureDescriptionData(phid_format=None)
    if model_type == 'cls':
        captioner = ClsCaption(dataset=dataset, model=model)
    elif model_type == 'tri':
        captioner = TriCaption(dataset=dataset, model=model)
    else:
        raise NotImplementedError
    predictions = dict()
    for img_name in tqdm(dataset.img_splits[split], desc='captioning %s top %d on %s' % (model_type, top_k, split)):
        img = dataset.load_img(img_name)
        caption = captioner(img, top_k)
        predictions[img_name] = [caption]
    return predictions
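# Hedged usage sketch for top_k_caption (the json dump mirrors the cached
# prediction files loaded elsewhere in these scripts; the output path here is
# illustrative):
#   predictions = top_k_caption(top_k=5, model_type='cls', split='val')
#   with open('caption_top5_val.json', 'w') as f:
#       json.dump(predictions, f)
# Each value is a one-element list holding the phrase-based caption string.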
def __init__(self, model=None, img_transform=None, dataset=None, device='cuda'):
    if model is None:
        model, device = load_model()
    self.model = model
    self.device = device
    if img_transform is None:
        img_transform = build_transforms(is_train=False)
    self.img_transform = img_transform
    if dataset is None:
        dataset = TextureDescriptionData(phid_format=None)
    self.dataset = dataset
    self.ph_vecs = get_phrase_vecs(self.model, self.dataset)
    return
def retrieve_img(input_cases, exp_name='fore_color'):
    syn_dataset = SyntheticData()
    syn_imgs = syn_dataset.get_all_imgs()
    texture_dataset = TextureDescriptionData()

    cls_retriever = ClsRetrieveImgFromDesc(dataset=texture_dataset)
    cls_img_ph_scores = cls_retriever.get_img_ph_scores(imgs=syn_imgs)
    cls_fn = lambda desc: cls_retriever(desc, img_ph_scores=cls_img_ph_scores)
    cls_results = retrieve_img_eval(cls_fn, input_cases)
    del cls_retriever, cls_img_ph_scores, cls_fn
    print('classifier_retrieve done. acc_all: %.4f; acc_hard: %.4f'
          % (float(np.mean(cls_results[1])), float(np.mean(cls_results[2]))))

    tri_retriever = TriRetrieveImgFromDesc(dataset=texture_dataset, split_to_phrases=True)
    tri_img_vecs = tri_retriever.get_img_vecs(imgs=syn_imgs)
    tri_fn = lambda desc: tri_retriever(desc, img_vecs=tri_img_vecs)
    tri_results = retrieve_img_eval(tri_fn, input_cases)
    del tri_retriever, tri_img_vecs, tri_fn
    print('triplet_retrieve done. acc_all: %.4f; acc_hard: %.4f'
          % (float(np.mean(tri_results[1])), float(np.mean(tri_results[2]))))

    tri_desc_retriever = TriRetrieveImgFromDesc(dataset=texture_dataset, split_to_phrases=False,
                                                trained_path='output/triplet_match/da3_bert_lr0.00001')
    tri_desc_img_vecs = tri_desc_retriever.get_img_vecs(imgs=syn_imgs)
    tri_desc_fn = lambda desc: tri_desc_retriever(desc, img_vecs=tri_desc_img_vecs)
    tri_desc_results = retrieve_img_eval(tri_desc_fn, input_cases)
    del tri_desc_retriever, tri_desc_img_vecs, tri_desc_fn
    print('triplet_desc_retrieve done. acc_all: %.4f; acc_hard: %.4f'
          % (float(np.mean(tri_desc_results[1])), float(np.mean(tri_desc_results[2]))))

    results = {'input_cases': input_cases, 'cls_results': cls_results,
               'tri_results': tri_results, 'tri_desc_results': tri_desc_results}
    np.save('applications/synthetic_imgs/visualizations/results/retrieve_img_%s.npy' % exp_name,
            results, allow_pickle=True)
    return results
def load_model(trained_path='output/naive_classify/v1_35_ft2,4_fc512_tuneTrue',
               model_file='epoch075.pth', dataset=None):
    cfg.merge_from_file(os.path.join(trained_path, 'train.yml'))
    if dataset is None:
        dataset = TextureDescriptionData(phid_format=None)
    model: PhraseClassifier = PhraseClassifier(class_num=len(dataset.phrases), pretrained_backbone=True,
                                               fc_dims=cfg.MODEL.FC_DIMS, use_feats=cfg.MODEL.BACKBONE_FEATS)
    model_path = os.path.join(trained_path, 'checkpoints', model_file)
    model.load_state_dict(torch.load(model_path))
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    return model, device
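# Minimal usage sketch for load_model (default paths as above; img_tensor is
# a hypothetical preprocessed image batch, not defined in this file):
#   model, device = load_model()
#   model.eval()
#   with torch.no_grad():
#       phrase_scores = model(img_tensor.to(device))  # one score per phrase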
def __init__(self, model=None, img_transform=None, device='cuda', dataset=None):
    if dataset is None:
        dataset = TextureDescriptionData(phid_format=None)
    self.dataset = dataset
    if model is None:
        model, device = load_model(dataset=self.dataset)
    model.eval()
    self.model = model
    self.device = device
    if img_transform is None:
        img_transform = build_transforms(is_train=False)
    self.img_transform = img_transform
    self.img_ph_scores = dict()
    return
def caption_pred():
    texture_dataset = TextureDescriptionData(phid_format=None)
    syn_dataset = SyntheticData()
    pred_cls = dict()
    pred_tri = dict()
    cls_captioner = ClsCaption(dataset=texture_dataset)
    tri_captioner = TriCaption(dataset=texture_dataset)
    for i in tqdm(range(len(syn_dataset)), desc='captioning top5'):
        img = syn_dataset.get_img(*syn_dataset.unravel_index(i))
        pred_cls[i] = [cls_captioner(img, top_k=5)]
        pred_tri[i] = [tri_captioner(img, top_k=5)]
    pred_sat_t = sat_caption(beam_size=5, img_dataset=syn_dataset, split=None)
    pred_sat = dict()
    for k, v in pred_sat_t.items():
        k = int(k.data)
        pred_sat[k] = [v[0]]
        if k == 0:
            print(k, pred_sat[k])
    pred_dicts = [pred_cls, pred_tri, pred_sat]
    np.save('applications/synthetic_imgs/visualizations/results/caption.npy', pred_dicts)
    print('pred captions saved.')
    return pred_dicts
def main(ph_num=100):
    cub_dataset = CUBDataset(split=None, val_ratio=0)
    classes = cub_dataset.class_names
    # model_name = 'LogisticRegression_newton-cg_multinomial_C1'
    # _, model = classify(cub_dataset=cub_dataset, feat_mode='ph_cls', model_name=model_name, norm=False,
    #                     val_ratio=0, ks=[ph_num])
    # weights = model.coef_  # classes x phrases
    # np.save('applications/fine_grained_classification/classify_ph_weights_%d.npy' % ph_num, weights)
    weights = np.load('applications/fine_grained_classification/classify_ph_weights_%d.npy' % ph_num)
    texture_dataset = TextureDescriptionData()
    phrases = texture_dataset.phrases

    # mean_v = np.mean(weights)
    std_v = np.std(weights)
    min_v = -1 * std_v
    max_v = std_v
    pos_std = np.std(weights[weights > 0])  # std over positive weights (np.std was missing)
    neg_std = np.std(weights[weights < 0])
    print('ready')

    folder = 'applications/fine_grained_classification/ph_clouds_%d_np' % ph_num
    if not os.path.exists(folder):
        os.makedirs(folder)
    # for cls_i, cls in enumerate(classes):
    for cls_i in [17, 55, 188]:
        cls = classes[cls_i]
        ph_weights = weights[cls_i, :]
        ph_freq_dict = {ph: abs(w) for ph, w in zip(phrases, ph_weights)}
        pos_dict = {ph: w for ph, w in zip(phrases, ph_weights) if w > 0}
        neg_dict = {ph: -w for ph, w in zip(phrases, ph_weights) if w < 0}

        def color_fn(phrase, *args, **kwargs):
            ph_i = texture_dataset.phrase_to_phid(phrase)
            # v = (ph_weights[ph_i] - min_v) / (max_v - min_v)
            w = ph_weights[ph_i]
            if w > 0:
                v = w / pos_std + 0.5
            else:
                v = w / neg_std + 0.5
            cmap = cm.get_cmap('coolwarm')
            rgba = cmap(v, bytes=True)
            return rgba

        red_fn = lambda *args, **kwargs: "red"
        blue_fn = lambda *args, **kwargs: "blue"

        # wc = WordCloud(background_color="white", color_func=color_fn, prefer_horizontal=0.9,
        #                height=600, width=1200, min_font_size=5, margin=4, max_words=500,
        #                font_path='visualizations/DIN Alternate Bold.ttf')
        # wc.generate_from_frequencies(ph_freq_dict)
        # wc_path = os.path.join(folder, '%s.jpg' % cls)
        # print(wc_path)
        # wc.to_file(wc_path)

        wc = WordCloud(background_color="white", color_func=red_fn, prefer_horizontal=0.9,
                       height=600, width=1200, min_font_size=4, margin=2, max_words=500,
                       font_path='visualizations/DIN Alternate Bold.ttf')
        wc.generate_from_frequencies(pos_dict)
        wc_path = os.path.join(folder, '%s_pos.jpg' % cls)
        print(wc_path)
        wc.to_file(wc_path)

        wc = WordCloud(background_color="white", color_func=blue_fn, prefer_horizontal=0.9,
                       height=600, width=1200, min_font_size=4, margin=2, max_words=500,
                       font_path='visualizations/DIN Alternate Bold.ttf')
        wc.generate_from_frequencies(neg_dict)
        wc_path = os.path.join(folder, '%s_neg.jpg' % cls)
        print(wc_path)
        wc.to_file(wc_path)
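# Self-contained sketch of the signed-weight-to-color mapping that color_fn
# above implements: a weight is scaled by the std of its sign group, shifted
# to center at 0.5, and looked up in the diverging 'coolwarm' colormap, so
# positive weights render warm/red and negative ones cool/blue. The default
# std values here are placeholders.
from matplotlib import cm as _cm

def weight_to_rgba(w, pos_std=1.0, neg_std=1.0):
    v = w / pos_std + 0.5 if w > 0 else w / neg_std + 0.5
    return _cm.get_cmap('coolwarm')(v, bytes=True)  # RGBA tuple, 0-255

# weight_to_rgba(0.5) -> reddish; weight_to_rgba(-0.5) -> bluish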
def main_eval():
    # load configs
    parser = argparse.ArgumentParser(description="Triplet (phrase) retrieval evaluation")
    parser.add_argument('-p', '--trained_path', help="path to trained model (where there is cfg file)",
                        default='output/triplet_match/c34_bert_l2_s_lr0.00001')
    parser.add_argument('-m', '--model_file', help='file name of the cached model',
                        default='BEST_checkpoint.pth')
    parser.add_argument('-o', '--opts', default=None, nargs=argparse.REMAINDER,
                        help="e.g. EVAL_SPLIT test")
    args = parser.parse_args()
    cfg.merge_from_file(os.path.join(args.trained_path, 'train.yml'))
    if args.opts is not None:
        cfg.merge_from_list(args.opts)

    # set random seed
    torch.manual_seed(cfg.RAND_SEED)
    np.random.seed(cfg.RAND_SEED)
    random.seed(cfg.RAND_SEED)

    dataset = TextureDescriptionData(phid_format=None)
    img_dataset = ImgOnlyDataset(split=cfg.EVAL_SPLIT, transform=build_transforms(is_train=False),
                                 texture_dataset=dataset)
    img_dataloader = DataLoader(img_dataset, batch_size=1, shuffle=False)
    phrase_dataset = PhraseOnlyDataset(texture_dataset=dataset)
    phrase_dataloader = DataLoader(phrase_dataset, batch_size=32, shuffle=False)

    model: TripletMatch = TripletMatch(vec_dim=cfg.MODEL.VEC_DIM, neg_margin=cfg.LOSS.MARGIN,
                                       distance=cfg.MODEL.DISTANCE, img_feats=cfg.MODEL.IMG_FEATS,
                                       lang_encoder_method=cfg.MODEL.LANG_ENCODER)
    model_path = os.path.join(args.trained_path, 'checkpoints', args.model_file)
    model.load_state_dict(torch.load(model_path))
    device = torch.device(cfg.DEVICE)
    model.to(device)

    do_eval(model, img_dataloader, phrase_dataloader, device, split=cfg.EVAL_SPLIT,
            visualize_path=os.path.join(args.trained_path, 'eval_visualize_%s' % cfg.EVAL_SPLIT),
            add_to_summary_name=model_path + ':' + cfg.EVAL_SPLIT)
def retrieve_img_visualize(split='test', top_k=10, mode='desc', input_descs=None, visualize_count=100):
    dataset = TextureDescriptionData()
    cls_retriever = ClsRetrieveImgFromDesc(dataset=dataset)
    tri_retriever = TriRetrieveImgFromDesc(dataset=dataset, split_to_phrases=True)
    tri_desc_retriever = TriRetrieveImgFromDesc(dataset=dataset, split_to_phrases=False,
                                                trained_path='output/triplet_match/da3_bert_lr0.00001')
    pred_fns = [cls_retriever, tri_retriever, tri_desc_retriever]

    img_pref = 'https://www.robots.ox.ac.uk/~vgg/data/dtd/thumbs/'
    html_str = '''<!DOCTYPE html>
<html lang="en">
<head>
<title>Image Retrieval Comparison</title>
<link rel="stylesheet" href="retrieve_img_style.css">
</head>
<body>
<h1>Image Retrieval Comparison</h1>
<span class="model">Classifier vs. Triplet(phrase) vs. Triplet(description)</span><br>
<hr>
'''

    if input_descs is not None:
        html_path = 'visualizations/retrieve_img_predefined.html'
        gt_img_names = None
    elif mode == 'desc':  # use '==' for string comparison, not 'is'
        html_path = 'visualizations/retrieve_img_desc.html'
        gt_img_names = dataset.img_splits[split]
        if len(gt_img_names) > visualize_count:
            gt_img_names = np.random.choice(gt_img_names, visualize_count, replace=False)
        input_descs = [np.random.choice(dataset.img_data_dict[img_name]['descriptions'])
                       for img_name in gt_img_names]
    elif mode == 'phrase':
        html_path = 'visualizations/retrieve_img_phrase.html'
        gt_img_names = None
        input_descs = dataset.phrases
        if len(input_descs) > visualize_count:
            input_descs = np.random.choice(input_descs, visualize_count, replace=False)
    else:
        raise NotImplementedError

    for input_i in tqdm(range(len(input_descs)), desc='generating html'):
        desc = input_descs[input_i]
        html_str += '<span class="desc">%s</span><br>\n' % desc
        if gt_img_names is not None:
            img_name = gt_img_names[input_i]
            html_str += 'gt image: {img_name} <img src="{img_pref}{img_name}" alt="{img_name}"><br>\n'\
                .format(img_pref=img_pref, img_name=img_name)
        for pred_fn in pred_fns:
            img_scores = pred_fn(desc, split=split)
            sorted_img_idxs = np.argsort(img_scores * -1.0)
            top_k_idxs = sorted_img_idxs[:top_k]
            for img_idx in top_k_idxs:
                img_name = dataset.img_splits[split][img_idx]
                html_str += '<img src="{img_pref}{img_name}" alt="{img_name}">\n'\
                    .format(img_pref=img_pref, img_name=img_name)
            html_str += '<br>\n'
        html_str += '<hr>\n'
    html_str += '</body>\n</html>'
    with open(html_path, 'w') as f:
        f.write(html_str)
    return
def retrieve_eval(match_scores, dataset=None, split='val', mode='img2phrase', visualize_path=None,
                  max_visualize_num=100, add_to_summary_name=None, verbose=True):
    """
    INPUT:
    match_scores: [img_num x phrase_num], match_scores[i,j] shows how well img_i and phrase_j match
    dataset: instance of TextureDescriptionData
    split: default is 'val'. match_scores should cover all imgs in this split
    visualize_path: if None, no visualization
    max_visualize_num: if <= 0, visualize all in this split
    """
    if dataset is None:
        dataset = TextureDescriptionData(phid_format='set')
    gt_matrix = dataset.get_img_phrase_match_matrices(split)
    img_num = gt_matrix.shape[0]
    phrase_num = gt_matrix.shape[1]

    if mode in i2p_mode_names:
        # each row is the prediction for one image: phrases sorted by pred scores;
        # values indicate whether the phrase is correct
        i2p_correct = np.zeros_like(gt_matrix, dtype=bool)  # img_num x phrase_num
        i2p_phrase_idxs = np.zeros_like(i2p_correct, dtype=int)
        for img_i in range(img_num):
            phrase_idx_sorted = np.argsort(-match_scores[img_i, :])
            i2p_phrase_idxs[img_i] = phrase_idx_sorted
            i2p_correct[img_i] = gt_matrix[img_i, phrase_idx_sorted]
        retrieve_binary_lists = i2p_correct
        retrieve_idxs = i2p_phrase_idxs
        # gt_count = dataset.get_gt_phrase_count(split)
    elif mode in p2i_mode_names:
        # each row is the prediction for one phrase: images sorted by pred scores;
        # values indicate whether the image is correct
        p2i_correct = np.zeros_like(gt_matrix, dtype=bool).transpose()  # phrase_num x img_num
        p2i_img_idxs = np.zeros_like(p2i_correct, dtype=int)
        for pi in range(phrase_num):
            img_idx_sorted = np.argsort(-match_scores[:, pi])
            p2i_img_idxs[pi] = img_idx_sorted
            p2i_correct[pi] = gt_matrix[img_idx_sorted, pi]
        retrieve_binary_lists = p2i_correct
        retrieve_idxs = p2i_img_idxs
        # gt_count = np.sum(gt_matrix, axis=0)
    else:
        raise NotImplementedError

    # calculate metrics
    metrics = dict()
    metrics['mean_reciprocal_rank'] = mean_reciprocal_rank(retrieve_binary_lists)
    metrics['r_precision'] = r_precision(retrieve_binary_lists)
    metrics['mean_average_precision'] = mean_average_precision(retrieve_binary_lists)
    for k in [5, 10, 20, 50, 100]:
        metrics['precision_at_%03d' % k] = mean_precision_at_k(retrieve_binary_lists, k)
        metrics['recall_at_%03d' % k] = mean_recall_at_k(retrieve_binary_lists, k, gt_count=None)

    # print metrics
    if verbose:
        print('## retrieve_eval {mode} on {split} ##'.format(mode=mode, split=split))
        for m, v in sorted(metrics.items(), key=lambda mv: mv[0]):
            print('%s: %.4f' % (m, v))

    # add to summary
    if mode in i2p_mode_names:
        log_to_summary(add_to_summary_name, i2p_metrics=metrics)
    elif mode in p2i_mode_names:
        log_to_summary(add_to_summary_name, p2i_metrics=metrics)

    # visualization
    if visualize_path is not None:
        if max_visualize_num <= 0 or max_visualize_num > len(retrieve_binary_lists):
            max_visualize_num = len(retrieve_binary_lists)
        if not os.path.exists(visualize_path):
            os.makedirs(visualize_path)
        precisions = list()
        recalls = list()
        for k in range(1, 101):
            precisions.append(mean_precision_at_k(retrieve_binary_lists, k))
            recalls.append(mean_recall_at_k(retrieve_binary_lists, k, gt_count=None))
        # plot pr curve and topk-recall curve
        plot_precision_recall_curves(mode, precisions, recalls, visualize_path)
        # generate html file
        generate_html(dataset, split, mode, visualize_path, max_visualize_num, img_num, phrase_num,
                      gt_matrix, match_scores, metrics, retrieve_idxs)
    return metrics
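# Illustrative, self-contained versions of two of the ranking metrics used in
# retrieve_eval (assumed semantics; not necessarily the repo's exact
# implementations). Each row of binary_lists is a relevance list already
# sorted by predicted score.
import numpy as np

def mean_precision_at_k_sketch(binary_lists, k):
    # fraction of relevant items within the top k, averaged over queries
    return float(np.asarray(binary_lists, dtype=float)[:, :k].mean())

def mean_reciprocal_rank_sketch(binary_lists):
    # 1 / rank of the first relevant item (0 if none), averaged over queries
    rrs = []
    for row in np.asarray(binary_lists, dtype=bool):
        hits = np.flatnonzero(row)
        rrs.append(1.0 / (hits[0] + 1) if hits.size else 0.0)
    return float(np.mean(rrs))

# e.g. mean_precision_at_k_sketch([[1, 0, 1], [0, 1, 0]], k=2) == 0.5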
def eval_caption(split, dataset=None, pred_captions=None, pred_captions_fpath=None, html_path=None,
                 visualize_count=100):
    if pred_captions is None:
        with open(pred_captions_fpath, 'r') as f:
            pred_captions = json.load(f)
    assert type(pred_captions) == dict
    if dataset is None:
        dataset = TextureDescriptionData(phid_format=None)
    gt_captions = dict()
    for img_name in dataset.img_splits[split]:
        img_data = dataset.img_data_dict[img_name]
        gt_captions[img_name] = img_data['descriptions']
        # gt_captions[img_name] = list()
        # for desc in img_data['descriptions']:
        #     cap = ' '.join(WordEncoder.tokenize(desc))
        #     gt_captions[img_name].append(cap)

    pred_k_metrics_list = list()
    pred_per_img = len(list(pred_captions.values())[0])
    for pred_k in range(pred_per_img):
        print('Metrics on %d-th predicted caption:' % (pred_k + 1))
        tic = time.time()
        pred_caps_k = {img_name: [caps[pred_k]] for img_name, caps in pred_captions.items()}
        metrics_k = compute_metrics(gt_captions, pred_caps_k)
        pred_k_metrics_list.append(metrics_k)
        toc = time.time()
        print('time cost: %.1f s' % (toc - tic))

    pred_k_metrics_dict = dict()
    for metric in pred_k_metrics_list[0].keys():
        pred_k_metrics_dict[metric] = [metric_dict[metric] for metric_dict in pred_k_metrics_list]

    if html_path is not None:
        html_str = '<html><body>\n'
        html_str += '<h1>Captioning metrics</h1>\n'
        for pred_k in range(len(pred_k_metrics_list)):
            html_str += '<b>Metrics on %d-th predicted captions:</b><br>\n' % (pred_k + 1)
            for k, v in pred_k_metrics_list[pred_k].items():
                mean = np.mean(pred_k_metrics_dict[k][:pred_k + 1])
                html_str += '%s: %f (mean of top %d: %f)<br>\n' % (k, v, pred_k + 1, mean)
        img_names = dataset.img_splits[split]
        html_str += '<table>\n'
        for img_i, img_name in enumerate(img_names):
            html_str += '<tr style="border-bottom:1px solid black; border-collapse: collapse;">'
            html_str += '<td><img src=https://maxwell.cs.umass.edu/mtimm/images/%s width=300></td>\n' % img_name
            # pred caps
            pred_caps = pred_captions[img_name]
            pred_str = '<b>Predicted captions:</b><br><br>\n'
            for ci, cap in enumerate(pred_caps):
                pred_str += '({ci}) {cap}<br>\n'.format(ci=ci, cap=cap)
            html_str += '<td>' + pred_str + '</td>\n'
            # gt descriptions
            descriptions = dataset.img_data_dict[img_name]['descriptions']
            desc_str = '<b>Ground-truth descriptions:</b><br><br>\n'
            for di, desc in enumerate(descriptions):
                desc_str += '({di}) {desc}<br>\n'.format(di=di, desc=desc)
            html_str += '<td>' + desc_str + '</td>\n'
            html_str += '</tr>\n'
            if img_i >= visualize_count:
                break
        html_str += '</table></body></html>'
        with open(html_path, 'w') as f:
            f.write(html_str)
    return pred_k_metrics_list
def compare_visualize(split='test', html_path='visualizations/caption.html', visualize_count=100):
    dataset = TextureDescriptionData()
    word_encoder = WordEncoder()
    # cls_predictions = top_k_caption(top_k=5, model_type='cls', dataset=dataset, split=split)
    # with open('output/naive_classify/v1_35_ft2,4_fc512_tuneTrue/caption_top5_%s.json' % split, 'w') as f:
    #     json.dump(cls_predictions, f)
    # tri_predictions = top_k_caption(top_k=5, model_type='tri', dataset=dataset, split=split)
    # with open('output/triplet_match/c34_bert_l2_s_lr0.00001/caption_top5_%s.json' % split, 'w') as f:
    #     json.dump(tri_predictions, f)
    cls_predictions = json.load(open('output/naive_classify/v1_35_ft2,4_fc512_tuneTrue/caption_top5_%s.json' % split))
    tri_predictions = json.load(open('output/triplet_match/c34_bert_l2_s_lr0.00001/caption_top5_%s.json' % split))
    sat_predictions = json.load(open('output/show_attend_tell/results/pred_v2_last_beam5_%s.json' % split))
    pred_dicts = [cls_predictions, tri_predictions, sat_predictions]

    img_pref = 'https://www.robots.ox.ac.uk/~vgg/data/dtd/thumbs/'
    html_str = '''<!DOCTYPE html>
<html lang="en">
<head>
<title>Caption visualize</title>
<link rel="stylesheet" href="caption_style.css">
</head>
<body>
<table>
<col class="column-one">
<col class="column-two">
<col class="column-three">
<tr>
<th style="text-align: center">Image</th>
<th>Predicted captions</th>
<th>Ground-truth descriptions</th>
</tr>
'''

    for img_i, img_name in enumerate(dataset.img_splits[split]):
        gt_descs = dataset.img_data_dict[img_name]['descriptions']
        gt_desc_str = '|'.join(gt_descs)
        gt_html_str = ''
        for ci, cap in enumerate(gt_descs):
            gt_html_str += '[%d] %s<br>\n' % (ci + 1, cap)

        pred_caps = [pred_dict[img_name][0] for pred_dict in pred_dicts]
        # highlight predicted tokens that also appear in a ground-truth description
        for ci, cap in enumerate(pred_caps):
            tokens = WordEncoder.tokenize(cap)
            for ti, t in enumerate(tokens):
                if t in gt_desc_str and len(t) > 1:
                    tokens[ti] = '<span class="correct">%s</span>' % t
            pred_caps[ci] = word_encoder.detokenize(tokens)

        html_str += '''
<tr>
<td><img src={img_pref}{img_name} alt="{img_name}"></td>
<td>
<span class="pred_name">Classifier top 5:</span><br> {pred0}<br>
<span class="pred_name">Triplet top 5:</span><br> {pred1}<br>
<span class="pred_name">Show-attend-tell:</span><br> {pred2}<br>
</td>
<td> {gt} </td>
</tr>
'''.format(img_pref=img_pref, img_name=img_name, pred0=pred_caps[0], pred1=pred_caps[1],
           pred2=pred_caps[2], gt=gt_html_str)
        if img_i >= visualize_count:
            break

    html_str += '</table>\n</body>\n</html>'
    with open(html_path, 'w') as f:
        f.write(html_str)
    return
if __name__ == '__main__':
    dataset = TextureDescriptionData(phid_format='set')
    wc = None
    # analyze_phrase_retrieval(dataset, cm_range=(0.25, 0.35))
    # wc = analyze_phrase_retrieval(dataset)
    # analyze_image_retrieval(dataset, cm_range=(0, 0.5))
    analyze_image_retrieval_desc(dataset, cm_range=(0, 0.2))
    # analyze_caption(dataset, cm_range=(0.17, 0.24))