# Imports used by the snippets below; bbox_overlaps_batch is assumed to be provided
# by the surrounding repo (its module path is repo-specific and not shown here).
from collections import defaultdict
import json

import numpy as np
import torch
from tqdm import tqdm
from stanfordcorenlp import StanfordCoreNLP


def bbox_overlaps(rois, gt_box):
    max_bbox = gt_box.size(1)
    batch_size = rois.size(0)
    max_rois = rois.size(1)
    overlaps = bbox_overlaps_batch(rois[:, :, :4], gt_box)
    # Replicate the overlaps along a new size-5 dimension, then fold it into the batch.
    overlaps = overlaps.view(batch_size, 1, max_rois, max_bbox) \
        .expand(batch_size, 5, max_rois, max_bbox).contiguous() \
        .view(-1, max_rois, max_bbox)
    return overlaps
def gt_grd_eval(self):
    ref = self.ref
    pred = self.pred
    print('Number of images in the reference: {}, number of images in the submission: {}'
          .format(len(ref), len(pred)))

    results = defaultdict(list)
    for lst_idx, anns in enumerate(ref):
        img = str(anns['image_id'])
        for num_sent, ann in enumerate(anns['captions']):
            ref_bbox_all = torch.Tensor(ann['process_bnd_box'])
            sent_idx = ann['process_idx']  # index of word in sentence to evaluate
            for idx in sent_idx:
                sel_idx = [ind for ind, i in enumerate(ann['process_idx']) if idx == i]
                assert len(sel_idx) == 1
                ref_bbox = ref_bbox_all[sel_idx[0]]  # select matched boxes
                assert ref_bbox.size(0) > 0
                class_name = ann['process_clss'][sel_idx[0]]
                if img not in pred:
                    results[class_name].append(0)  # image not grounded
                elif len(pred[img]) != 5:
                    raise Exception('Each image must have five caption predictions!')
                elif idx not in pred[img][num_sent]['idx_in_sent']:
                    results[class_name].append(0)  # object not grounded
                else:
                    pred_ind = pred[img][num_sent]['idx_in_sent'].index(idx)
                    pred_bbox = torch.Tensor(pred[img][num_sent]['bbox'][pred_ind])
                    overlap = bbox_overlaps_batch(pred_bbox.unsqueeze(0),
                                                  ref_bbox.unsqueeze(0).unsqueeze(0))
                    results[class_name].append(1 if torch.max(overlap) > self.iou_thresh else 0)

    print('Number of groundable objects in this split: {}'.format(len(results)))
    grd_accu = np.mean([sum(hm) * 1. / len(hm) for i, hm in results.items()])

    print('-' * 80)
    print('The overall localization accuracy is {:.4f}'.format(grd_accu))
    print('-' * 80)
    if self.verbose:
        print('Object frequency and grounding accuracy per class (descending by object frequency):')
        accu_per_clss = {(i, sum(hm) * 1. / len(hm)): len(hm) for i, hm in results.items()}
        accu_per_clss = sorted(accu_per_clss.items(), key=lambda x: x[1], reverse=True)
        for accu in accu_per_clss:
            print('{} ({}): {:.4f}'.format(accu[0][0], accu[1], accu[0][1]))

    return grd_accu
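# Illustration (not part of the original code): gt_grd_eval reports a class-averaged
# localization accuracy, i.e. the mean over classes of each class's hit rate. With
# made-up hit/miss lists:
#
#   results = {'dog': [1, 0, 1], 'frisbee': [1]}
#   grd_accu = np.mean([sum(hm) * 1. / len(hm) for i, hm in results.items()])
#   # -> mean of (2/3, 1/1) = 0.8333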
def bbox_overlaps(rois, gt_box, frm_mask):
    overlaps = bbox_overlaps_batch(rois[:, :, :5], gt_box[:, :, :5], frm_mask)
    return overlaps
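# Both bbox_overlaps variants above delegate the actual IoU computation to
# bbox_overlaps_batch, which lives elsewhere in the repo. Purely as an illustrative
# sketch (an assumption, not the repo's implementation), a plain pairwise IoU between
# two sets of (x1, y1, x2, y2) boxes could look like:
def pairwise_iou(boxes_a, boxes_b):
    # boxes_a: (N, 4), boxes_b: (M, 4); returns an (N, M) IoU matrix
    area_a = (boxes_a[:, 2] - boxes_a[:, 0]) * (boxes_a[:, 3] - boxes_a[:, 1])
    area_b = (boxes_b[:, 2] - boxes_b[:, 0]) * (boxes_b[:, 3] - boxes_b[:, 1])
    lt = torch.max(boxes_a[:, None, :2], boxes_b[None, :, :2])  # intersection top-left
    rb = torch.min(boxes_a[:, None, 2:], boxes_b[None, :, 2:])  # intersection bottom-right
    wh = (rb - lt).clamp(min=0)
    inter = wh[:, :, 0] * wh[:, :, 1]
    return inter / (area_a[:, None] + area_b[None, :] - inter)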
def grd_eval(self, mode='all'):
    if mode == 'all':
        print('Evaluating on all object words.')
    elif mode == 'loc':
        print('Evaluating only on correctly-predicted object words.')
    else:
        raise Exception('Invalid loc mode!')

    ref = self.ref
    pred = self.pred
    print('Number of images in the reference: {}, number of images in the submission: {}'
          .format(len(ref), len(pred)))

    nlp = StanfordCoreNLP('tools/stanford-corenlp-full-2018-02-27')
    props = {'annotators': 'lemma', 'pipelineLanguage': 'en', 'outputFormat': 'json'}
    vocab_in_split = set()

    # precision
    prec = defaultdict(list)
    for lst_idx, anns in tqdm(enumerate(ref)):
        img = str(anns['image_id'])
        for num_sent, ann in enumerate(anns['captions']):
            if img not in pred:
                continue  # do not penalize if sentence not annotated

            ref_bbox_all = torch.Tensor(ann['process_bnd_box'])
            idx_in_sent = {}
            for box_idx, cls in enumerate(ann['process_clss']):
                vocab_in_split.update(set([cls]))
                idx_in_sent[cls] = idx_in_sent.get(cls, []) + [ann['process_idx'][box_idx]]

            sent_idx = ann['process_idx']  # index of gt object words
            exclude_obj = {json.loads(nlp.annotate(token, properties=props)
                                      )['sentences'][0]['tokens'][0]['lemma']: 1
                           for token_idx, token in enumerate(ann['tokens'])
                           if (token_idx not in sent_idx and token != '')}

            for pred_idx, class_name in enumerate(pred[img][0]['clss']):
                if class_name in idx_in_sent:
                    gt_idx = min(idx_in_sent[class_name])  # always consider the first match...
                    sel_idx = [idx for idx, i in enumerate(ann['process_idx']) if gt_idx == i]
                    assert len(sel_idx) == 1
                    ref_bbox = ref_bbox_all[sel_idx[0]]  # select matched boxes
                    assert ref_bbox.size(0) > 0
                    pred_bbox = torch.Tensor(pred[img][0]['bbox'][pred_idx])
                    overlap = bbox_overlaps_batch(pred_bbox.unsqueeze(0),
                                                  ref_bbox.unsqueeze(0).unsqueeze(0))
                    prec[class_name].append(1 if torch.max(overlap) > self.iou_thresh else 0)
                elif json.loads(nlp.annotate(class_name, properties=props)
                                )['sentences'][0]['tokens'][0]['lemma'] in exclude_obj:
                    pass  # do not penalize if gt object word not annotated (missed)
                else:
                    if mode == 'all':
                        prec[class_name].append(0)  # hallucinated object

    nlp.close()

    # recall
    recall = defaultdict(list)
    for lst_idx, anns in enumerate(ref):
        img = str(anns['image_id'])
        for num_sent, ann in enumerate(anns['captions']):
            ref_bbox_all = torch.Tensor(ann['process_bnd_box'])
            sent_idx = ann['process_idx']  # index of gt object words
            for gt_idx in sent_idx:
                sel_idx = [idx for idx, i in enumerate(ann['process_idx']) if gt_idx == i]
                assert len(sel_idx) == 1
                ref_bbox = ref_bbox_all[sel_idx[0]]  # select matched boxes
                assert ref_bbox.size(0) > 0
                class_name = ann['process_clss'][sel_idx[0]]

                if img not in pred:
                    recall[class_name].append(0)  # image not grounded
                elif class_name in pred[img][0]['clss']:
                    pred_idx = pred[img][0]['clss'].index(class_name)  # always consider the first match...
                    pred_bbox = torch.Tensor(pred[img][0]['bbox'][pred_idx])
                    overlap = bbox_overlaps_batch(pred_bbox.unsqueeze(0),
                                                  ref_bbox.unsqueeze(0).unsqueeze(0))
                    recall[class_name].append(1 if torch.max(overlap) > self.iou_thresh else 0)
                else:
                    if mode == 'all':
                        recall[class_name].append(0)  # object not grounded

    num_vocab = len(vocab_in_split)
    print('Number of groundable objects in this split: {}'.format(num_vocab))
    print('Number of objects in prec and recall: {}, {}'.format(len(prec), len(recall)))
    prec_accu = np.sum([sum(hm) * 1. / len(hm) for i, hm in prec.items()]) * 1. / num_vocab
    recall_accu = np.sum([sum(hm) * 1. / len(hm) for i, hm in recall.items()]) * 1. / num_vocab
    f1 = 2. * prec_accu * recall_accu / (prec_accu + recall_accu)

    print('-' * 80)
    print('The overall precision_{0} / recall_{0} / F1_{0} are {1:.4f} / {2:.4f} / {3:.4f}'
          .format(mode, prec_accu, recall_accu, f1))
    print('-' * 80)
    if self.verbose:
        print('Object frequency and grounding accuracy per class (descending by object frequency):')
        accu_per_clss = {}
        for i in vocab_in_split:
            prec_clss = sum(prec[i]) * 1. / len(prec[i]) if i in prec else 0
            recall_clss = sum(recall[i]) * 1. / len(recall[i]) if i in recall else 0
            accu_per_clss[(i, prec_clss, recall_clss)] = (len(prec[i]), len(recall[i]))
        accu_per_clss = sorted(accu_per_clss.items(), key=lambda x: x[1][1], reverse=True)
        for accu in accu_per_clss:
            print('{} ({} / {}): {:.4f} / {:.4f}'.format(
                accu[0][0], accu[1][0], accu[1][1], accu[0][1], accu[0][2]))

    return prec_accu, recall_accu, f1
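# Illustration (not part of the original code): grd_eval averages within each class first,
# then sums over classes and divides by the number of groundable classes in the split, so
# a class that never appears in prec/recall contributes zero. With made-up numbers:
#
#   vocab_in_split = {'dog', 'frisbee', 'tree'}            # 3 groundable classes
#   prec = {'dog': [1, 0], 'frisbee': [1]}                 # 'tree' never predicted
#   recall = {'dog': [1, 1], 'frisbee': [0], 'tree': [0]}
#   num_vocab = len(vocab_in_split)
#   prec_accu = np.sum([sum(hm) * 1. / len(hm) for i, hm in prec.items()]) / num_vocab      # (0.5 + 1.0) / 3 = 0.5
#   recall_accu = np.sum([sum(hm) * 1. / len(hm) for i, hm in recall.items()]) / num_vocab  # (1.0 + 0 + 0) / 3 = 0.3333
#   f1 = 2. * prec_accu * recall_accu / (prec_accu + recall_accu)                           # = 0.4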