Example #1
    def __init__(self,
                 split,
                 transform=None,
                 word_encoder=None,
                 is_train=True,
                 caption_max_len=35):
        """
        :param split: split, one of 'train', 'val', 'test'
        :param transform: image transform pipeline
        """
        TextureDescriptionData.__init__(self, phid_format=None)
        self.transform = transform
        self.is_train = is_train
        self.caption_max_len = caption_max_len
        self.split = split
        assert self.split in ('train', 'val', 'test')

        self.word_encoder = word_encoder
        if self.word_encoder is None:
            self.word_encoder = WordEncoder()

        self.img_desc_ids = list()
        for img_i, img_name in enumerate(self.img_splits[split]):
            desc_num = len(self.img_data_dict[img_name]['descriptions'])
            self.img_desc_ids += [(img_i, desc_i)
                                  for desc_i in range(desc_num)]
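The flat (img_i, desc_i) list built above pairs every image in the split with each of its descriptions. Purely as an illustration (a sketch, not necessarily the project's actual implementation), a matching __getitem__/__len__ for such a dataset could read:

    def __getitem__(self, index):
        # Map the flat index back to one image and one of its descriptions.
        img_i, desc_i = self.img_desc_ids[index]
        img_name = self.img_splits[self.split][img_i]
        img = self.load_img(img_name)  # load_img is provided by TextureDescriptionData
        if self.transform is not None:
            img = self.transform(img)
        desc = self.img_data_dict[img_name]['descriptions'][desc_i]
        return img, desc

    def __len__(self):
        return len(self.img_desc_ids)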
Example #2
def analyze_caption(dataset=None, wc=None, cm_range=(0, 1)):
    metric_name = 'caption_Meteor'
    if dataset is None:
        dataset = TextureDescriptionData(phid_format='set')
    with open('output/show_attend_tell/results/pred_v2_last_beam5_test.json',
              'r') as f:
        pred_captions = json.load(f)
    analyzer = SubsetAnalyzer(metric_name)
    # scorer = Bleu(4)
    scorer = Meteor()

    for img_i, img_name in tqdm(enumerate(dataset.img_splits['test']),
                                total=len(dataset.img_splits['test']),
                                desc='analyzing %s over images' % metric_name):
        img_data = dataset.img_data_dict[img_name]
        phids = img_data['phrase_ids']
        phrases = [dataset.phid_to_phrase(i) for i in phids]
        gt_captions = img_data['descriptions']
        pred_caption = pred_captions[img_name][0]
        score, _ = scorer.compute_score({0: gt_captions}, {0: [pred_caption]})
        # bleu4 = score[3]
        meteor = score
        analyzer.update(value=meteor, img_names=[img_name], phrases=phrases)

    wc = analyzer.report('visualizations/subset/' + metric_name,
                         wc=wc,
                         cm_range=cm_range)
    return wc
Example #3
def analyze_phrase_retrieval(dataset=None,
                             wc=None,
                             models=('cls', 'tri'),
                             cm_range=(0, 1)):
    metric_name = 'phrase_retrieval_ap'
    if dataset is None:
        dataset = TextureDescriptionData(phid_format='set')
    gt_matrix = dataset.get_img_phrase_match_matrices('test')
    for m in models:
        neg_distances = np.load(model_preds[m])
        # neg_distances = np.load('output/triplet_match/c34_bert_l2_s_lr0.00001/eval_visualize_test/pred_scores.npy')
        match_scores = neg_distances

        analyzer = SubsetAnalyzer(metric_name)

        for img_i, img_name in tqdm(enumerate(dataset.img_splits['test']),
                                    total=len(dataset.img_splits['test']),
                                    desc='analyzing %s with %s' %
                                    (metric_name, m)):
            img_data = dataset.img_data_dict[img_name]
            phid_set = img_data['phrase_ids']
            phrases = [dataset.phid_to_phrase(i) for i in phid_set]

            phrase_idx_sorted = np.argsort(-match_scores[img_i, :])
            i2p_correct = gt_matrix[img_i, phrase_idx_sorted]
            ap = average_precision(i2p_correct)
            analyzer.update(value=ap, img_names=[img_name], phrases=phrases)

        wc = analyzer.report('visualizations/subset/%s__%s' % (metric_name, m),
                             wc=None,
                             cm_range=cm_range)
    return wc
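Example #3 above sorts all phrases by the model's match score for each test image and hands the resulting binary correctness vector to average_precision. Assuming that helper follows the standard AP definition over a ranked binary list, a minimal reference sketch is:

import numpy as np

def average_precision(binary_relevance):
    # Mean of precision@k taken at every rank k where a relevant item occurs.
    rel = np.asarray(binary_relevance, dtype=float)
    if rel.sum() == 0:
        return 0.0
    cum_hits = np.cumsum(rel)
    ranks = np.arange(1, len(rel) + 1)
    return float(((cum_hits / ranks)[rel > 0]).mean())

# e.g. average_precision([1, 0, 1, 0]) == (1/1 + 2/3) / 2 ≈ 0.833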
Example #4
def analyze_image_retrieval(dataset=None,
                            wc=None,
                            models=('cls', 'tri'),
                            cm_range=(0, 1)):
    metric_name = 'image_retrieval_ap'
    if dataset is None:
        dataset = TextureDescriptionData(phid_format='set')
    gt_matrix = dataset.get_img_phrase_match_matrices(
        'test')  # img_num x phrase_num
    for m in models:
        pred_scores = np.load(model_preds[m])
        # pred_scores = np.load('output/naive_classify/v1_35_ft2,4_fc512_tuneTrue/eval_visualize_test/pred_scores.npy')
        match_scores = pred_scores

        analyzer = SubsetAnalyzer(metric_name)

        for ph_i, ph in tqdm(enumerate(dataset.phrases),
                             total=len(dataset.phrases),
                             desc='analyzing %s with %s' % (metric_name, m)):
            gt_img_names = [
                dataset.img_splits['test'][i]
                for i in range(gt_matrix.shape[0]) if gt_matrix[i, ph_i]
            ]

            img_idx_sorted = np.argsort(-match_scores[:, ph_i])
            p2i_correct = gt_matrix[img_idx_sorted, ph_i]
            ap = average_precision(p2i_correct)
            analyzer.update(value=ap, img_names=gt_img_names, phrases=[ph])

        wc = analyzer.report('visualizations/subset/%s__%s' % (metric_name, m),
                             wc=None,
                             cm_range=cm_range)
    return wc
Example #5
    def __init__(self,
                 split='train',
                 lang_input='phrase',
                 neg_img=True,
                 neg_lang=True):
        data.Dataset.__init__(self)
        TextureDescriptionData.__init__(self, phid_format='str')
        self.split = split
        self.lang_input = lang_input
        self.neg_img = neg_img
        self.neg_lang = neg_lang
        self.img_transform = build_transforms(is_train=False)

        self.pos_pairs = list()
        for img_i, img_name in enumerate(self.img_splits[self.split]):
            img_data = self.img_data_dict[img_name]
            if self.lang_input == 'phrase':
                self.pos_pairs += [(img_i, ph)
                                   for ph in img_data['phrase_ids']]
            elif self.lang_input == 'description':
                self.pos_pairs += [
                    (img_i, desc_idx)
                    for desc_idx in range(len(img_data['descriptions']))
                ]
            else:
                raise NotImplementedError
        return
Example #6
    def __init__(self, split='train', is_train=True, cached_resnet_feats=None):
        data.Dataset.__init__(self)
        TextureDescriptionData.__init__(self, phid_format='set')

        self.split = split
        self.is_train = is_train
        self.cached_resnet_feats = cached_resnet_feats
        self.use_cache = self.cached_resnet_feats is not None and len(
            self.cached_resnet_feats) > 0
        self.transform = None
        if not self.use_cache:
            self.transform = build_transforms(is_train)
        print('PhraseClassifyDataset initialized.')
Example #7
    def __init__(self, model=None, img_transform=None, device='cuda',
                 trained_path='output/triplet_match/c34_bert_l2_s_lr0.00001', model_file='BEST_checkpoint.pth',
                 split_to_phrases=False, dataset=None):
        if model is None:
            model, device = load_model(trained_path, model_file)
            model.eval()
        self.model = model
        self.device = device

        if img_transform is None:
            img_transform = build_transforms(is_train=False)
        self.img_transform = img_transform

        self.split_to_phrases = split_to_phrases
        if dataset is None:
            dataset = TextureDescriptionData(phid_format=None)
        self.dataset = dataset

        self.ph_vec_dict = None
        if self.split_to_phrases:
            ph_vecs = get_phrase_vecs(self.model, self.dataset)
            self.ph_vec_dict = {dataset.phrases[i]: ph_vecs[i] for i in range(len(dataset.phrases))}

        self.img_vecs = dict()
        return
Example #8
def compare_top_k(split, top_k=5):
    dataset = TextureDescriptionData()
    gt_captions = dict()
    for img_name in dataset.img_splits[split]:
        img_data = dataset.img_data_dict[img_name]
        gt_captions[img_name] = img_data['descriptions']
        # gt_captions[img_name] = list()
        # for desc in img_data['descriptions']:
        #     cap = ' '.join(WordEncoder.tokenize(desc))
        #     gt_captions[img_name].append(cap)

    for model_type in ('tri', 'cls'):
        if model_type == 'tri':
            model, _ = tri_load_model()
        else:
            model, _ = cls_load_model(dataset=dataset)

        # for top_k in range(1, 11):
        print('**** %s : top %d ****' % (model_type, top_k))
        predictions = top_k_caption(top_k,
                                    model_type=model_type,
                                    model=model,
                                    dataset=dataset,
                                    split=split)
        print(list(predictions.items())[0])
        compute_metrics(gt_captions, predictions)
    return
Example #9
def retrieve_with_desc_eval(pred_fn, dataset=None, split='val'):
    """
    INPUT:
    pred_fn: prediction function, input (desc, split), output scores over images in the split
    dataset: instance of TextureDescriptionData
    split: default is 'val'. pred_fn should return scores over all imgs in this split
    """

    if dataset is None:
        dataset = TextureDescriptionData(phid_format='set')

    rrs = list()
    for img_i, img_name in tqdm(enumerate(dataset.img_splits[split]), total=len(dataset.img_splits[split]),
                                desc='computing mrr in retrieve_with_desc'):
        img_data = dataset.img_data_dict[img_name]
        for desc in img_data['descriptions']:
            pred_scores = pred_fn(desc, split=split)
            if np.all(pred_scores == 0):
                rrs.append(0)
            else:
                r = 1
                for s in pred_scores:
                    if s > pred_scores[img_i]:
                        r += 1
                rrs.append(1.0 / r)
    mrr = np.mean(rrs)
    print('mean reciprocal rank on %s: %f' % (split, mrr))
    return mrr
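In the loop above, the rank of the ground-truth image is 1 plus the number of images that score strictly higher, so ties do not hurt the rank. The same computation in vectorized form (a sketch under that same tie-handling assumption):

import numpy as np

def reciprocal_rank(pred_scores, gt_index):
    # Rank = 1 + number of images scoring strictly higher than the ground truth.
    scores = np.asarray(pred_scores)
    rank = 1 + int(np.sum(scores > scores[gt_index]))
    return 1.0 / rank

# e.g. reciprocal_rank([0.2, 0.9, 0.5], gt_index=2) == 0.5  (rank 2)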
Example #10
def analyze_image_retrieval_desc(dataset=None, wc=None, cm_range=(0, 1)):
    metric_name = 'image_retrieval_desc_mrr'
    analyzer_cache_path = 'output/triplet_match/da3_bert_lr0.00001/subset_analyze_img_ret_desc.pkl'
    if os.path.exists(analyzer_cache_path):
        # if False:
        with open(analyzer_cache_path, 'rb') as f:
            analyzer = pickle.load(f)
    else:
        if dataset is None:
            dataset = TextureDescriptionData(phid_format=None)

        tri_desc_retriever = RetrieveImgFromDesc(
            dataset=dataset,
            split_to_phrases=False,
            trained_path='output/triplet_match/da3_bert_lr0.00001')
        analyzer = SubsetAnalyzer(metric_name)

        for img_i, img_name in tqdm(enumerate(dataset.img_splits['test']),
                                    total=len(dataset.img_splits['test']),
                                    desc='analyzing %s over images' %
                                    metric_name):
            img_data = dataset.img_data_dict[img_name]
            for desc in img_data['descriptions']:
                pred_scores = tri_desc_retriever(desc, split='test')
                if np.all(pred_scores == 0):
                    v = 0
                else:
                    r = 1
                    for s in pred_scores:
                        if s > pred_scores[img_i]:
                            r += 1
                    v = 1.0 / r
                phrases = dataset.description_to_phrases(desc)
                analyzer.update(value=v,
                                img_names=[img_name],
                                phrases=phrases,
                                desc=desc)
        with open(analyzer_cache_path, 'wb') as f:
            pickle.dump(analyzer, f)

    wc = analyzer.report('visualizations/subset/' + metric_name,
                         wc=wc,
                         cm_range=cm_range)
    return wc
Example #11
def retrieve_img_desc_compare(split='test'):
    dataset = TextureDescriptionData()
    cls_retriever = ClsRetrieveImgFromDesc(dataset=dataset)
    tri_retriever = TriRetrieveImgFromDesc(dataset=dataset,
                                           split_to_phrases=True)
    tri_desc_retriever = TriRetrieveImgFromDesc(
        dataset=dataset,
        split_to_phrases=False,
        trained_path='output/triplet_match/da3_bert_lr0.00001')
    for fn in [tri_retriever, tri_desc_retriever, cls_retriever]:
        retrieve_with_desc_eval(pred_fn=fn, dataset=dataset, split=split)
    return
Example #12
def top_k_caption(top_k=5,
                  model_type='cls',
                  model=None,
                  dataset=None,
                  split='val'):
    if dataset is None:
        print('top_k_caption load dataset')
        dataset = TextureDescriptionData(phid_format=None)
    if model_type == 'cls':
        captioner = ClsCaption(dataset=dataset, model=model)
    elif model_type == 'tri':
        captioner = TriCaption(dataset=dataset, model=model)
    else:
        raise NotImplementedError

    predictions = dict()
    for img_name in tqdm(dataset.img_splits[split],
                         desc='captioning %s top %d on %s' %
                         (model_type, top_k, split)):
        img = dataset.load_img(img_name)
        caption = captioner(img, top_k)
        predictions[img_name] = [caption]
    return predictions
Example #13
    def __init__(self, model=None, img_transform=None, dataset=None, device='cuda'):
        if model is None:
            model, device = load_model()
        self.model = model
        self.device = device

        if img_transform is None:
            img_transform = build_transforms(is_train=False)
        self.img_transform = img_transform

        if dataset is None:
            dataset = TextureDescriptionData(phid_format=None)
        self.dataset = dataset

        self.ph_vecs = get_phrase_vecs(self.model, self.dataset)
        return
Example #14
def retrieve_img(input_cases, exp_name='fore_color'):
    syn_dataset = SyntheticData()
    syn_imgs = syn_dataset.get_all_imgs()
    texture_dataset = TextureDescriptionData()

    cls_retriever = ClsRetrieveImgFromDesc(dataset=texture_dataset)
    cls_img_ph_scores = cls_retriever.get_img_ph_scores(imgs=syn_imgs)
    cls_fn = lambda desc: cls_retriever(desc, img_ph_scores=cls_img_ph_scores)
    cls_results = retrieve_img_eval(cls_fn, input_cases)
    del cls_retriever, cls_img_ph_scores, cls_fn
    print('classifier_retrieve done. acc_all: %.4f; acc_hard: %.4f' %
          (float(np.mean(cls_results[1])), float(np.mean(cls_results[2]))))

    tri_retriever = TriRetrieveImgFromDesc(dataset=texture_dataset,
                                           split_to_phrases=True)
    tri_img_vecs = tri_retriever.get_img_vecs(imgs=syn_imgs)
    tri_fn = lambda desc: tri_retriever(desc, img_vecs=tri_img_vecs)
    tri_results = retrieve_img_eval(tri_fn, input_cases)
    del tri_retriever, tri_img_vecs, tri_fn
    print('triplet_retrieve done. acc_all: %.4f; acc_hard: %.4f' %
          (float(np.mean(tri_results[1])), float(np.mean(tri_results[2]))))

    tri_desc_retriever = TriRetrieveImgFromDesc(
        dataset=texture_dataset,
        split_to_phrases=False,
        trained_path='output/triplet_match/da3_bert_lr0.00001')
    tri_desc_img_vecs = tri_desc_retriever.get_img_vecs(imgs=syn_imgs)
    tri_desc_fn = lambda desc: tri_desc_retriever(desc,
                                                  img_vecs=tri_desc_img_vecs)
    tri_desc_results = retrieve_img_eval(tri_desc_fn, input_cases)
    del tri_desc_retriever, tri_desc_img_vecs, tri_desc_fn
    print('triplet_desc_retrieve done. acc_all: %.4f; acc_hard: %.4f' % (float(
        np.mean(tri_desc_results[1])), float(np.mean(tri_desc_results[2]))))

    results = {
        'input_cases': input_cases,
        'cls_results': cls_results,
        'tri_results': tri_results,
        'tri_desc_results': tri_desc_results
    }
    np.save(
        'applications/synthetic_imgs/visualizations/results/retrieve_img_%s.npy'
        % exp_name,
        results,
        allow_pickle=True)

    return results
Example #15
def load_model(trained_path='output/naive_classify/v1_35_ft2,4_fc512_tuneTrue',
               model_file='epoch075.pth',
               dataset=None):
    cfg.merge_from_file(os.path.join(trained_path, 'train.yml'))
    if dataset is None:
        dataset = TextureDescriptionData(phid_format=None)
    model: PhraseClassifier = PhraseClassifier(
        class_num=len(dataset.phrases),
        pretrained_backbone=True,
        fc_dims=cfg.MODEL.FC_DIMS,
        use_feats=cfg.MODEL.BACKBONE_FEATS)

    model_path = os.path.join(trained_path, 'checkpoints', model_file)
    model.load_state_dict(torch.load(model_path))

    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    return model, device
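Example #16 below calls this loader and then switches the model to eval mode. A hedged usage sketch (the forward call and input shape below are assumptions; the snippets here only show load_model() followed by model.eval()):

import torch

model, device = load_model()  # defaults to the trained classifier checkpoint
model.eval()
with torch.no_grad():
    dummy_batch = torch.zeros(1, 3, 224, 224, device=device)  # placeholder image batch
    phrase_scores = model(dummy_batch)  # assumed: one score per phrase in dataset.phrases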
Example #16
    def __init__(self,
                 model=None,
                 img_transform=None,
                 device='cuda',
                 dataset=None):
        if dataset is None:
            dataset = TextureDescriptionData(phid_format=None)
        self.dataset = dataset

        if model is None:
            model, device = load_model(dataset=self.dataset)
            model.eval()
        self.model = model
        self.device = device

        if img_transform is None:
            img_transform = build_transforms(is_train=False)
        self.img_transform = img_transform

        self.img_ph_scores = dict()
        return
Example #17
def caption_pred():
    texture_dataset = TextureDescriptionData(phid_format=None)
    syn_dataset = SyntheticData()

    pred_cls = dict()
    pred_tri = dict()
    cls_captioner = ClsCaption(dataset=texture_dataset)
    tri_captioner = TriCaption(dataset=texture_dataset)
    for i in tqdm(range(len(syn_dataset)), desc='captioning top5'):
        img = syn_dataset.get_img(*syn_dataset.unravel_index(i))
        pred_cls[i] = [cls_captioner(img, top_k=5)]
        pred_tri[i] = [tri_captioner(img, top_k=5)]

    pred_sat_t = sat_caption(beam_size=5, img_dataset=syn_dataset, split=None)
    pred_sat = dict()
    for k, v in pred_sat_t.items():
        k = int(k.data)
        pred_sat[k] = [v[0]]
        if k == 0:
            print(k, pred_sat[k])
    pred_dicts = [pred_cls, pred_tri, pred_sat]
    np.save('applications/synthetic_imgs/visualizations/results/caption.npy', pred_dicts)
    print('pred captions saved.')
    return pred_dicts
Example #18
def main(ph_num=100):
    cub_dataset = CUBDataset(split=None, val_ratio=0)
    classes = cub_dataset.class_names
    # model_name = 'LogisticRegression_newton-cg_multinomial_C1'
    # _, model = classify(cub_dataset=cub_dataset, feat_mode='ph_cls', model_name=model_name, norm=False,
    #                     val_ratio=0, ks=[ph_num])
    # weights = model.coef_  # classes x phrases
    # np.save('applications/fine_grained_classification/classify_ph_weights_%d.npy' % ph_num, weights)
    weights = np.load(
        'applications/fine_grained_classification/classify_ph_weights_%d.npy' %
        ph_num)

    texture_dataset = TextureDescriptionData()
    phrases = texture_dataset.phrases

    # mean_v = np.mean(weights)
    std_v = np.std(weights)
    min_v = -1 * std_v
    max_v = std_v

    pos_std = np.std(weights[weights > 0])
    neg_std = np.std(weights[weights < 0])

    print('ready')
    folder = 'applications/fine_grained_classification/ph_clouds_%d_np' % ph_num
    if not os.path.exists(folder):
        os.makedirs(folder)
    # for cls_i, cls in enumerate(classes):
    for cls_i in [17, 55, 188]:
        cls = classes[cls_i]
        ph_weights = weights[cls_i, :]
        ph_freq_dict = {ph: abs(w) for ph, w in zip(phrases, ph_weights)}
        pos_dict = {ph: w for ph, w in zip(phrases, ph_weights) if w > 0}
        neg_dict = {ph: -w for ph, w in zip(phrases, ph_weights) if w < 0}

        def color_fn(phrase, *args, **kwargs):
            ph_i = texture_dataset.phrase_to_phid(phrase)
            # v = (ph_weights[ph_i] - min_v) / (max_v - min_v)
            w = ph_weights[ph_i]
            if w > 0:
                v = w / pos_std + 0.5
            else:
                v = w / neg_std + 0.5
            cmap = cm.get_cmap('coolwarm')
            rgba = cmap(v, bytes=True)
            return rgba

        red_fn = lambda *args, **kwargs: "red"
        blue_fn = lambda *args, **kwargs: "blue"
        # wc = WordCloud(background_color="white", color_func=color_fn, prefer_horizontal=0.9,
        #                height=600, width=1200, min_font_size=5, margin=4, max_words=500,
        #                font_path='visualizations/DIN Alternate Bold.ttf')
        # wc.generate_from_frequencies(ph_freq_dict)
        # wc_path = os.path.join(folder, '%s.jpg' % cls)
        # print(wc_path)
        # wc.to_file(wc_path)
        wc = WordCloud(background_color="white",
                       color_func=red_fn,
                       prefer_horizontal=0.9,
                       height=600,
                       width=1200,
                       min_font_size=4,
                       margin=2,
                       max_words=500,
                       font_path='visualizations/DIN Alternate Bold.ttf')
        wc.generate_from_frequencies(pos_dict)
        wc_path = os.path.join(folder, '%s_pos.jpg' % cls)
        print(wc_path)
        wc.to_file(wc_path)

        wc = WordCloud(background_color="white",
                       color_func=blue_fn,
                       prefer_horizontal=0.9,
                       height=600,
                       width=1200,
                       min_font_size=4,
                       margin=2,
                       max_words=500,
                       font_path='visualizations/DIN Alternate Bold.ttf')
        wc.generate_from_frequencies(neg_dict)
        wc_path = os.path.join(folder, '%s_neg.jpg' % cls)
        print(wc_path)
        wc.to_file(wc_path)
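The commented-out color_fn above colors each phrase by its signed classifier weight on a diverging colormap, instead of the flat red and blue clouds the loop actually writes. A self-contained sketch of that idea (the centring, scaling and clipping here are choices of the sketch, not what the script saves):

import numpy as np
from matplotlib import cm

def make_signed_color_fn(phrases, ph_weights, cmap_name='coolwarm'):
    # Returns a wordcloud color_func mapping positive weights toward warm colors
    # and negative weights toward cool ones.
    ph_weights = np.asarray(ph_weights, dtype=float)
    weight_of = dict(zip(phrases, ph_weights))
    pos_std = ph_weights[ph_weights > 0].std() if np.any(ph_weights > 0) else 1.0
    neg_std = ph_weights[ph_weights < 0].std() if np.any(ph_weights < 0) else 1.0
    cmap = cm.get_cmap(cmap_name)

    def color_fn(word, *args, **kwargs):
        w = weight_of[word]
        v = 0.5 + 0.5 * (w / pos_std if w > 0 else w / neg_std)
        return cmap(float(np.clip(v, 0.0, 1.0)), bytes=True)

    return color_fn

Such a function can be passed as color_func= to the same WordCloud constructor used in the loop above, together with generate_from_frequencies(ph_freq_dict).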
Example #19
def main_eval():
    # load configs
    parser = argparse.ArgumentParser(
        description="Triplet (phrase) retrieval evaluation")
    parser.add_argument('-p',
                        '--trained_path',
                        help="path to trained model (where there is cfg file)",
                        default='output/triplet_match/c34_bert_l2_s_lr0.00001')
    parser.add_argument('-m',
                        '--model_file',
                        help='file name of the cached model ',
                        default='BEST_checkpoint.pth')
    parser.add_argument('-o',
                        '--opts',
                        default=None,
                        nargs=argparse.REMAINDER,
                        help="e.g. EVAL_SPLIT test")
    args = parser.parse_args()

    cfg.merge_from_file(os.path.join(args.trained_path, 'train.yml'))
    if args.opts is not None:
        cfg.merge_from_list(args.opts)

    # set random seed
    torch.manual_seed(cfg.RAND_SEED)
    np.random.seed(cfg.RAND_SEED)
    random.seed(cfg.RAND_SEED)

    dataset = TextureDescriptionData(phid_format=None)
    img_dataset = ImgOnlyDataset(split=cfg.EVAL_SPLIT,
                                 transform=build_transforms(is_train=False),
                                 texture_dataset=dataset)
    img_dataloader = DataLoader(img_dataset, batch_size=1, shuffle=False)

    phrase_dataset = PhraseOnlyDataset(texture_dataset=dataset)
    phrase_dataloader = DataLoader(phrase_dataset,
                                   batch_size=32,
                                   shuffle=False)

    model: TripletMatch = TripletMatch(
        vec_dim=cfg.MODEL.VEC_DIM,
        neg_margin=cfg.LOSS.MARGIN,
        distance=cfg.MODEL.DISTANCE,
        img_feats=cfg.MODEL.IMG_FEATS,
        lang_encoder_method=cfg.MODEL.LANG_ENCODER)

    model_path = os.path.join(args.trained_path, 'checkpoints',
                              args.model_file)
    model.load_state_dict(torch.load(model_path))

    device = torch.device(cfg.DEVICE)
    model.to(device)

    do_eval(model,
            img_dataloader,
            phrase_dataloader,
            device,
            split=cfg.EVAL_SPLIT,
            visualize_path=os.path.join(args.trained_path,
                                        'eval_visualize_%s' % cfg.EVAL_SPLIT),
            add_to_summary_name=model_path + ':' + cfg.EVAL_SPLIT)
Example #20
def retrieve_img_visualize(split='test',
                           top_k=10,
                           mode='desc',
                           input_descs=None,
                           visualize_count=100):
    dataset = TextureDescriptionData()
    cls_retriever = ClsRetrieveImgFromDesc(dataset=dataset)
    tri_retriever = TriRetrieveImgFromDesc(dataset=dataset,
                                           split_to_phrases=True)
    tri_desc_retriever = TriRetrieveImgFromDesc(
        dataset=dataset,
        split_to_phrases=False,
        trained_path='output/triplet_match/da3_bert_lr0.00001')
    pred_fns = [cls_retriever, tri_retriever, tri_desc_retriever]

    img_pref = 'https://www.robots.ox.ac.uk/~vgg/data/dtd/thumbs/'

    html_str = '''<!DOCTYPE html>
<html lang="en">
<head>
    <title>Image Retrieval Comparison</title>
    <link rel="stylesheet" href="retrieve_img_style.css">
</head>
<body>
<h1>Image Retrieval Comparison</h1>
<span class="model">Classifier vs. Triplet(phrase) vs. Triplet(description)</span><br>
<hr>
'''
    if input_descs is not None:
        html_path = 'visualizations/retrieve_img_predefined.html'
        gt_img_names = None
    elif mode == 'desc':
        html_path = 'visualizations/retrieve_img_desc.html'
        gt_img_names = dataset.img_splits[split]
        if len(gt_img_names) > visualize_count:
            gt_img_names = np.random.choice(gt_img_names,
                                            visualize_count,
                                            replace=False)
        input_descs = [
            np.random.choice(dataset.img_data_dict[img_name]['descriptions'])
            for img_name in gt_img_names
        ]
    elif mode == 'phrase':
        html_path = 'visualizations/retrieve_img_phrase.html'
        gt_img_names = None
        input_descs = dataset.phrases
        if len(input_descs) > visualize_count:
            input_descs = np.random.choice(input_descs,
                                           visualize_count,
                                           replace=False)
    else:
        raise NotImplementedError

    for input_i in tqdm(range(len(input_descs)), desc='generating html'):
        desc = input_descs[input_i]
        html_str += '<span class="desc">%s</span><br>\n' % desc
        if gt_img_names is not None:
            img_name = gt_img_names[input_i]
            html_str += 'gt image: {img_name} <img src="{img_pref}{img_name}" alt="{img_name}"><br>\n'\
                .format(img_pref=img_pref, img_name=img_name)
        for pred_fn in pred_fns:
            img_scores = pred_fn(desc, split=split)
            sorted_img_idxs = np.argsort(img_scores * -1.0)
            top_k_idxs = sorted_img_idxs[:top_k]
            for img_idx in top_k_idxs:
                img_name = dataset.img_splits[split][img_idx]
                html_str += '<img src="{img_pref}{img_name}" alt="{img_name}">\n'\
                    .format(img_pref=img_pref, img_name=img_name)
            html_str += '<br>\n'
        html_str += '<hr>\n'

    html_str += '</body>\n</html>'
    with open(html_path, 'w') as f:
        f.write(html_str)
    return
Example #21
def retrieve_eval(match_scores, dataset=None, split='val', mode='img2phrase',
                  visualize_path=None, max_visualize_num=100, add_to_summary_name=None, verbose=True):
    """
    INPUT:
    match_scores: [img_num x phrase_num], match_scores[i,j] shows how well img_i and phrase_j matches
    dataset: instance of TextureDescriptionData
    split: default is 'val'. match_scores should cover all imgs in this split
    visualize_path: if None, no visualization
    max_visualize_num: if <= 0, visualize all in this split
    """

    if dataset is None:
        dataset = TextureDescriptionData(phid_format='set')
    gt_matrix = dataset.get_img_phrase_match_matrices(split)
    img_num = gt_matrix.shape[0]
    phrase_num = gt_matrix.shape[1]

    if mode in i2p_mode_names:
        # each row is the prediction for one image: phrases sorted by pred scores; values indicate whether each phrase is correct
        i2p_correct = np.zeros_like(gt_matrix, dtype=bool)  # img_num x phrase_num
        i2p_phrase_idxs = np.zeros_like(i2p_correct, dtype=int)
        for img_i in range(img_num):
            phrase_idx_sorted = np.argsort(-match_scores[img_i, :])
            i2p_phrase_idxs[img_i] = phrase_idx_sorted
            i2p_correct[img_i] = gt_matrix[img_i, phrase_idx_sorted]
        retrieve_binary_lists = i2p_correct
        retrieve_idxs = i2p_phrase_idxs
        # gt_count = dataset.get_gt_phrase_count(split)
    elif mode in p2i_mode_names:
        # each row is the prediction for one phrase: images sorted by pred scores; values indicate whether each image is correct
        p2i_correct = np.zeros_like(gt_matrix, dtype=bool).transpose()  # class_num x img_num
        p2i_img_idxs = np.zeros_like(p2i_correct, dtype=int)
        for pi in range(phrase_num):
            img_idx_sorted = np.argsort(-match_scores[:, pi])
            p2i_img_idxs[pi] = img_idx_sorted
            p2i_correct[pi] = gt_matrix[img_idx_sorted, pi]
        retrieve_binary_lists = p2i_correct
        retrieve_idxs = p2i_img_idxs
        # gt_count = np.sum(gt_matrix, axis=0)
    else:
        raise NotImplementedError

    # calculate metrics
    metrics = dict()
    mean_reciprocal_rank_ = mean_reciprocal_rank(retrieve_binary_lists)
    r_precision_ = r_precision(retrieve_binary_lists)
    mean_average_precision_ = mean_average_precision(retrieve_binary_lists)
    metrics['mean_reciprocal_rank'] = mean_reciprocal_rank_
    metrics['r_precision'] = r_precision_
    metrics['mean_average_precision'] = mean_average_precision_

    for k in [5, 10, 20, 50, 100]:
        precision_at_k_ = mean_precision_at_k(retrieve_binary_lists, k)
        recall_at_k_ = mean_recall_at_k(retrieve_binary_lists, k, gt_count=None)
        metrics['precision_at_%03d' % k] = precision_at_k_
        metrics['recall_at_%03d' % k] = recall_at_k_

    # print metrics
    if verbose:
        print('## retrieve_eval {mode} on {split} ##'.format(mode=mode, split=split))
        for m, v in sorted(metrics.items(), key=lambda mv: mv[0]):
            print('%s: %.4f' % (m, v))

    # add to summary
    if mode in i2p_mode_names:
        log_to_summary(add_to_summary_name, i2p_metrics=metrics)
    elif mode in p2i_mode_names:
        log_to_summary(add_to_summary_name, p2i_metrics=metrics)

    # visualization
    if visualize_path is not None:
        if max_visualize_num <= 0 or max_visualize_num > len(retrieve_binary_lists):
            max_visualize_num = len(retrieve_binary_lists)
        if not os.path.exists(visualize_path):
            os.makedirs(visualize_path)

        precisions = list()
        recalls = list()
        for k in range(1, 101):
            precisions.append(mean_precision_at_k(retrieve_binary_lists, k))
            recalls.append(mean_recall_at_k(retrieve_binary_lists, k, gt_count=None))

        # plot pr curve and topk-recall curve
        plot_precision_recall_curves(mode, precisions, recalls, visualize_path)

        # generate html file
        generate_html(dataset, split, mode, visualize_path, max_visualize_num, img_num, phrase_num, gt_matrix,
                      match_scores, metrics, retrieve_idxs)

    return metrics
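retrieve_eval relies on helpers such as mean_reciprocal_rank, mean_precision_at_k and mean_recall_at_k, all operating on the rows of retrieve_binary_lists. Minimal sketches under the usual definitions (the project's versions may differ, e.g. in how gt_count is used):

import numpy as np

def mean_reciprocal_rank(binary_lists):
    # Reciprocal of the rank of the first correct item in each row; 0 if none.
    rrs = []
    for row in np.asarray(binary_lists, dtype=bool):
        hits = np.flatnonzero(row)
        rrs.append(1.0 / (hits[0] + 1) if hits.size else 0.0)
    return float(np.mean(rrs))

def mean_precision_at_k(binary_lists, k):
    # Fraction of correct items among each row's top k, averaged over rows.
    return float(np.asarray(binary_lists, dtype=float)[:, :k].mean())

def mean_recall_at_k(binary_lists, k, gt_count=None):
    # Fraction of each row's relevant items retrieved within the top k.
    rows = np.asarray(binary_lists, dtype=float)
    if gt_count is None:
        gt_count = rows.sum(axis=1)
    return float(np.mean(rows[:, :k].sum(axis=1) / np.maximum(gt_count, 1)))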
Example #22
def eval_caption(split,
                 dataset=None,
                 pred_captions=None,
                 pred_captions_fpath=None,
                 html_path=None,
                 visualize_count=100):
    if pred_captions is None:
        with open(pred_captions_fpath, 'r') as f:
            pred_captions = json.load(f)
    assert isinstance(pred_captions, dict)

    if dataset is None:
        dataset = TextureDescriptionData(phid_format=None)
    gt_captions = dict()
    for img_name in dataset.img_splits[split]:
        img_data = dataset.img_data_dict[img_name]
        gt_captions[img_name] = img_data['descriptions']
        # gt_captions[img_name] = list()
        # for desc in img_data['descriptions']:
        #     cap = ' '.join(WordEncoder.tokenize(desc))
        #     gt_captions[img_name].append(cap)

    pred_k_metrics_list = list()
    pred_per_img = len(list(pred_captions.values())[0])

    for pred_k in range(pred_per_img):
        print('Metrics on %d-th predicted caption:' % (pred_k + 1))
        tic = time.time()
        pred_caps_k = {
            img_name: [caps[pred_k]]
            for img_name, caps in pred_captions.items()
        }
        metrics_k = compute_metrics(gt_captions, pred_caps_k)
        pred_k_metrics_list.append(metrics_k)
        toc = time.time()
        print('time cost: %.1f s' % (toc - tic))

    pred_k_metrics_dict = dict()
    for metric in pred_k_metrics_list[0].keys():
        pred_k_metrics_dict[metric] = [
            metric_dict[metric] for metric_dict in pred_k_metrics_list
        ]

    if html_path is not None:
        html_str = '<html><body>\n'
        html_str += '<h1>Captioning metrics</h1>\n'

        for pred_k in range(len(pred_k_metrics_list)):
            html_str += '<b>Metrics on %d-th predicted captions:</b><br>\n' % (
                pred_k + 1)
            for k, v in pred_k_metrics_list[pred_k].items():
                mean = np.mean(pred_k_metrics_dict[k][:pred_k + 1])
                html_str += '%s: %f (mean of top %d: %f)<br>\n' % (
                    k, v, pred_k + 1, mean)

        img_names = dataset.img_splits[split]
        html_str += '<table>\n'
        for img_i, img_name in enumerate(img_names):
            html_str += '<tr style="border-bottom:1px solid black; border-collapse: collapse;">'
            html_str += '<td><img src=https://maxwell.cs.umass.edu/mtimm/images/%s width=300></td>\n' % img_name
            # pred caps
            pred_caps = pred_captions[img_name]
            pred_str = '<b>Predicted captions:</b><br><br>\n'
            for ci, cap in enumerate(pred_caps):
                pred_str += '({ci}) {cap}<br>\n'.format(ci=ci, cap=cap)
            html_str += '<td>' + pred_str + '</td>\n'
            # gt descriptions
            descriptions = dataset.img_data_dict[img_name]['descriptions']
            desc_str = '<b>Ground-truth descriptions:</b><br><br>\n'
            for di, desc in enumerate(descriptions):
                desc_str += '({di}) {desc}<br>\n'.format(di=di, desc=desc)
            html_str += '<td>' + desc_str + '</td>\n'
            html_str += '</tr>\n'

            if img_i >= visualize_count:
                break

        html_str += '</table></body></html>'
        with open(html_path, 'w') as f:
            f.write(html_str)

    return pred_k_metrics_list
Example #23
def compare_visualize(split='test',
                      html_path='visualizations/caption.html',
                      visualize_count=100):
    dataset = TextureDescriptionData()
    word_encoder = WordEncoder()
    # cls_predictions = top_k_caption(top_k=5, model_type='cls', dataset=dataset, split=split)
    # with open('output/naive_classify/v1_35_ft2,4_fc512_tuneTrue/caption_top5_%s.json' % split, 'w') as f:
    #     json.dump(cls_predictions, f)
    # tri_predictions = top_k_caption(top_k=5, model_type='tri', dataset=dataset, split=split)
    # with open('output/triplet_match/c34_bert_l2_s_lr0.00001/caption_top5_%s.json' % split, 'w') as f:
    #     json.dump(tri_predictions, f)
    cls_predictions = json.load(
        open(
            'output/naive_classify/v1_35_ft2,4_fc512_tuneTrue/caption_top5_%s.json'
            % split))
    tri_predictions = json.load(
        open(
            'output/triplet_match/c34_bert_l2_s_lr0.00001/caption_top5_%s.json'
            % split))
    sat_predictions = json.load(
        open('output/show_attend_tell/results/pred_v2_last_beam5_%s.json' %
             split))
    pred_dicts = [cls_predictions, tri_predictions, sat_predictions]
    img_pref = 'https://www.robots.ox.ac.uk/~vgg/data/dtd/thumbs/'

    html_str = '''<!DOCTYPE html>
<html lang="en">
<head>
    <title>Caption visualize</title>
    <link rel="stylesheet" href="caption_style.css">
</head>
<body>
<table>
    <col class="column-one">
    <col class="column-two">
    <col class="column-three">
    <tr>
        <th style="text-align: center">Image</th>
        <th>Predicted captions</th>
        <th>Ground-truth descriptions</th>
    </tr>
'''

    for img_i, img_name in enumerate(dataset.img_splits[split]):
        gt_descs = dataset.img_data_dict[img_name]['descriptions']
        gt_desc_str = '|'.join(gt_descs)
        gt_html_str = ''
        for ci, cap in enumerate(gt_descs):
            gt_html_str += '[%d] %s<br>\n' % (ci + 1, cap)

        pred_caps = [pred_dict[img_name][0] for pred_dict in pred_dicts]
        for ci, cap in enumerate(pred_caps):
            tokens = WordEncoder.tokenize(cap)
            for ti, t in enumerate(tokens):
                if t in gt_desc_str and len(t) > 1:
                    tokens[ti] = '<span class="correct">%s</span>' % t
            pred_caps[ci] = word_encoder.detokenize(tokens)
        html_str += '''
<tr>
    <td>
        <img src={img_pref}{img_name} alt="{img_name}">
    </td>
    <td>
        <span class="pred_name">Classifier top 5:</span><br>
        {pred0}<br>
        <span class="pred_name">Triplet top 5:</span><br>
        {pred1}<br>
        <span class="pred_name">Show-attend-tell:</span><br>
        {pred2}<br>
    </td>
    <td>
        {gt}
    </td>
</tr>
'''.format(img_pref=img_pref,
           img_name=img_name,
           pred0=pred_caps[0],
           pred1=pred_caps[1],
           pred2=pred_caps[2],
           gt=gt_html_str)

        if img_i >= visualize_count:
            break

    html_str += '</table>\n</body>\n</html>'
    with open(html_path, 'w') as f:
        f.write(html_str)

    return
Example #24
    # scorer = Bleu(4)
    scorer = Meteor()

    for img_i, img_name in tqdm(enumerate(dataset.img_splits['test']),
                                total=len(dataset.img_splits['test']),
                                desc='analyzing %s over images' % metric_name):
        img_data = dataset.img_data_dict[img_name]
        phids = img_data['phrase_ids']
        phrases = [dataset.phid_to_phrase(i) for i in phids]
        gt_captions = img_data['descriptions']
        pred_caption = pred_captions[img_name][0]
        score, _ = scorer.compute_score({0: gt_captions}, {0: [pred_caption]})
        # bleu4 = score[3]
        meteor = score
        analyzer.update(value=meteor, img_names=[img_name], phrases=phrases)

    wc = analyzer.report('visualizations/subset/' + metric_name,
                         wc=wc,
                         cm_range=cm_range)
    return wc


if __name__ == '__main__':
    dataset = TextureDescriptionData(phid_format='set')
    wc = None
    # analyze_phrase_retrieval(dataset, cm_range=(0.25, 0.35))
    # wc = analyze_phrase_retrieval(dataset)
    # analyze_image_retrieval(dataset, cm_range=(0, 0.5))
    analyze_image_retrieval_desc(dataset, cm_range=(0, 0.2))
    # analyze_caption(dataset, cm_range=(0.17, 0.24))