Example #1
def scatterplot_recall(tubes_recall,
                       objects_recall,
                       fig_width=13,
                       fig_height=9,
                       exclude_classes=None):
    class_map = utils.class2idx_map()
    AUC = []
    for class_label in range(1, len(class_map)):  # for each class
        # area under the curve
        AUC.append(eval_utils.average_precision(objects_recall[class_label]))
    name = "tab10"
    cmap = get_cmap(name)
    colors = cmap.colors
    fig = plt.figure(figsize=(fig_width, fig_height))
    plt.grid(alpha=0.4)
    plt.xlabel('AUC (Objects Recall)', fontsize=fig_width)
    plt.ylabel('Tubes Recall', fontsize=fig_width)
    fig.axes[0].tick_params(labelsize=fig_width - 2)
    hex_colors = [matplotlib.colors.to_hex(color) for color in colors]
    for idx, class_label in enumerate(range(1, len(class_map))):
        if exclude_classes is not None:
            if utils.idx2class(class_map, class_label) in exclude_classes:
                continue
        plt.scatter(AUC[idx],
                    tubes_recall[idx],
                    c=hex_colors[idx],
                    label=utils.idx2class(class_map, class_label),
                    s=50)
    fig.axes[0].legend(fontsize=fig_width - 3)
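A minimal usage sketch, assuming the objects_recall dict produced by get_objects_recall (Example #3) and a per-class tubes_recall sequence are already available; the config path and epoch below are hypothetical placeholders:

objects_recall = get_objects_recall('experiments/run1', epoch=100, split='test')  # hypothetical path and epoch
tubes_recall = [0.7 for _ in objects_recall]  # placeholder per-class tube recall values
scatterplot_recall(tubes_recall, objects_recall, exclude_classes=None)
plt.show()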
Example #2
def plot_objects_recall(objects_recall, fig_width=13, fig_height=9):
    class_map = utils.class2idx_map()
    name = "tab10"
    cmap = get_cmap(name)
    colors = cmap.colors
    fig = plt.figure(figsize=(fig_width, fig_height))
    plt.grid(alpha=0.4)
    plt.xlabel('Attention Threshold', fontsize=fig_width)
    plt.ylabel('Recall', fontsize=fig_width)
    plt.xticks(np.linspace(0, 1, len(class_map)))
    plt.yticks(np.linspace(0, 1, len(class_map)))
    fig.axes[0].tick_params(labelsize=fig_width - 2)
    hex_colors = [matplotlib.colors.to_hex(color) for color in colors]
    for idx, class_label in enumerate(objects_recall):
        plt.plot(objects_recall[class_label][:, 1],
                 objects_recall[class_label][:, 0],
                 c=hex_colors[idx],
                 label=utils.idx2class(class_map, class_label),
                 linewidth=2)
    fig.axes[0].legend(fontsize=fig_width - 4)
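The same objects_recall dict can be drawn as per-class recall curves over the attention threshold; a usage sketch with a hypothetical path and epoch:

objects_recall = get_objects_recall('experiments/run1', epoch=100, split='test')  # hypothetical path and epoch
plot_objects_recall(objects_recall)
plt.show()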
Example #3
def get_objects_recall(config_path, epoch, split, num_threshold_points=100):
    
    with open(os.path.join(config_path, 'config.pkl'), 'rb') as f:
        cfg_dict = pickle.load(f)    
    
    with open(os.path.join(cfg_dict['am_path'], cfg_dict['filename'], split, 'am_epoch_' + str(epoch) + '_keyframes' + '.pkl'), 'rb') as f:
        am = pickle.load(f)
        
    with open(os.path.join(cfg_dict['annot_path'], 'annotated_data.pkl'), 'rb') as f:
        annotated_data = pickle.load(f)
    
    with open(os.path.join(cfg_dict['annot_path'], 'daly1.1.0.pkl'), 'rb') as f:
        annot = pickle.load(f, encoding='latin1')
    
    obj_annotations = utils.get_obj_annotations(annotated_data, annot)
    
    classes_to_exclude = cfg_dict['classes_to_exclude']
    OH = cfg_dict['out_feature_size'][0]
    OW = cfg_dict['out_feature_size'][1]
    T_fm = cfg_dict['out_feature_temp_size']
    class_map = utils.class2idx_map()
    num_layers = cfg_dict['num_layers']
    num_graphs = cfg_dict['num_graphs']
    
    # collect tubes with IoU > 0.5
    tubes_dict = {}
    for video in annotated_data[split]:
        vid_annot = annotated_data[split][video]
        w, h = vid_annot['(width, height)']
        for instance in vid_annot['action_instances']:
            instance_annot = annotated_data[split][video]['action_instances'][instance]
            keyframes_dict = instance_annot['keyframes']
            keyframe_ids = np.array(list(keyframes_dict.keys()))
            keyframe_boxes = np.copy(np.stack(list(keyframes_dict.values())))
            keyframe_boxes[:, [0, 2]] = np.copy(keyframe_boxes[:, [0, 2]]) * w
            keyframe_boxes[:, [1, 3]] = np.copy(keyframe_boxes[:, [1, 3]]) * h
            for tube_id in instance_annot['tubes']:
                tube = instance_annot['tubes'][tube_id]
                spt_iou = np.mean(utils.get_tube_iou(tube[np.in1d(tube[:, 0], keyframe_ids), 1:5], keyframe_boxes))
                if spt_iou > 0.5:
                    if video not in tubes_dict:
                        tubes_dict[video] = {}
                    if instance not in tubes_dict[video]:
                        tubes_dict[video][instance] = {}
                        tubes_dict[video][instance]['tubes'] = {}
                        tubes_dict[video][instance]['tube_labels'] = []
                    tubes_dict[video][instance]['tubes'][tube_id] = tube
                    tubes_dict[video][instance]['tube_labels'].append(instance_annot['tube_labels'][tube_id])
    
    objects_recall = {}
    thresholds = np.linspace(0, 1, num_threshold_points)
    for class_label in range(1, len(class_map)): # recall curve for each class (exclude background)
        if classes_to_exclude is not None:
            class_name = utils.idx2class(class_map, class_label)
            if class_name not in classes_to_exclude:
                continue # when classes_to_exclude is set, compute recall only for those classes
        # calculate total number of false negatives
        fn = 0
        for video in obj_annotations[split]:
            if (video not in am.keys()) or (video not in tubes_dict):
                continue # no object annotations or no positive tubes
            vid_annot = obj_annotations[split][video]
            for instance in vid_annot['action_instances']:
                if instance not in tubes_dict[video]:
                    continue
                tubes_instance = tubes_dict[video][instance]
                assert len(set(tubes_instance['tube_labels'])) == 1
                instance_label = tubes_instance['tube_labels'][0]
                if class_label != instance_label:
                    continue # skip instances of different class
                keyframes = list(vid_annot['action_instances'][instance].keys())
                for keyframe in keyframes:
                    fn_keyframe = 0
                    for box_idx in range(len(vid_annot['action_instances'][instance][keyframe])):
                        if (vid_annot['action_instances'][instance][keyframe][box_idx][5] == 1) or (vid_annot['action_instances'][instance][keyframe][box_idx][6] == 1):
                            continue
                        fn_keyframe += 1
                    fn += fn_keyframe * len(tubes_instance['tube_labels']) # total number of false negatives
        
        recall_values = np.zeros([len(thresholds), 2])
        for idx, threshold in enumerate(thresholds): # for each threshold
            tp = 0
            fn_ = fn
            for video in obj_annotations[split]:
                if (video not in am.keys()) or (video not in tubes_dict): 
                    continue # no object annotations or no positive tubes
                vid_annot = obj_annotations[split][video]
                W, H = obj_annotations[split][video]['(width, height)']
                for instance in vid_annot['action_instances']:
                    if instance not in tubes_dict[video]:
                        continue # no positive tubes
                    assert len(set(tubes_dict[video][instance]['tube_labels'])) == 1
                    instance_label = tubes_dict[video][instance]['tube_labels'][0]
                    if class_label != instance_label:
                        continue # skip instances of different class
                    keyframes = list(vid_annot['action_instances'][instance].keys())
                            
                    for tube_id in tubes_dict[video][instance]['tubes']: # for each (positive) tube                   
                        for keyframe in keyframes: 
                            att_map_list = []
                            for layer_num in range(num_layers):
                                for graph_num in range(num_graphs):
                                    lngn = str(layer_num) + str(graph_num) # layer number and graph number
                                    att_map = am[video][instance][keyframe][lngn][tube_id]
                                    att_map = att_map.reshape(T_fm, OH, OW)[3] # keep temporal slice 3 of the (T_fm, OH, OW) map
                                    att_map = att_map.reshape(-1)
                                    att_map = scipy.special.softmax(att_map)
                                    att_map = att_map.reshape(OH, OW)
                                    att_map_list.append(att_map)

                            # get obj annotation for keyframe
                            for box_idx in range(len(vid_annot['action_instances'][instance][keyframe])): # for each object annotation in keyframe
                                obj_box = vid_annot['action_instances'][instance][keyframe][box_idx][0:4]
                                obj_box = obj_box * OH
                                x1 = int(round(obj_box[0]))
                                y1 = int(round(obj_box[1]))
                                x2 = int(round(obj_box[2]))
                                y2 = int(round(obj_box[3]))
                                sum_list = []
                                att_map_idx = 0
                                for layer_num in range(num_layers):
                                    for graph_num in range(num_graphs):
                                        patch = att_map_list[att_map_idx][y1:y2 + 1, x1:x2 + 1]
                                        att_sum = np.sum(patch) # add attention values inside the object bounding box
                                        sum_list.append(att_sum)
                                        att_map_idx += 1
                                is_positive = any(np.array(sum_list) > threshold) # if any of the graphs satisfies condition
                                if is_positive:
                                    tp += 1
                                    fn_ -= 1

            recall_values[idx, 0] = tp / (tp + fn_)
            recall_values[idx, 1] = threshold
        objects_recall[class_label] = recall_values
    return objects_recall
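The per-box test at the heart of the loop above can be isolated as follows: each attention map is softmax-normalized over all spatial positions, and an object box counts as recovered if the attention mass inside it exceeds the threshold. A self-contained sketch with dummy shapes and values (the map size, box coordinates, and threshold are illustrative only):

import numpy as np
import scipy.special

OH = OW = 14  # illustrative feature-map size
att_map = scipy.special.softmax(np.random.rand(OH * OW)).reshape(OH, OW)  # dummy attention map
x1, y1, x2, y2 = 3, 4, 8, 9  # object box already scaled to feature-map coordinates
att_sum = np.sum(att_map[y1:y2 + 1, x1:x2 + 1])  # attention mass inside the box
is_positive = att_sum > 0.25  # the threshold is swept over np.linspace(0, 1, num_threshold_points)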
Example #4
def videomAP(scores, annot_data, split, cfg, iou_threshold=0.5):    
    
    PR = {}

    scored_tubes = score_tubes(scores, cfg.num_actions)
    pred_tubes = get_tube_predictions(scored_tubes, annot_data, split, cfg.num_actions, nms_threshold=0.2)
    gt_tubes = get_gt_tubes(annot_data, split, list(scored_tubes.keys()), cfg)
    
    for class_label in range(1, cfg.num_actions):

        class_pred_tubes = pred_tubes[class_label]
        class_gt_tubes = gt_tubes[class_label]

        pr = np.empty((len(class_pred_tubes) + 1, 2), dtype=np.float32)
        pr[0,0] = 1.0
        pr[0,1] = 0.0
        
        fp = 0
        tp = 0
        fn = 0
        covered_gt_tubes = {}
        for video in class_gt_tubes:
            covered_gt_tubes[video] = {}
            instances = class_gt_tubes[video]
            for instance in instances:
                num_gt_tubes = len(class_gt_tubes[video][instance])
                covered_gt_tubes[video][instance] = num_gt_tubes * [0]
                fn += num_gt_tubes

        for i, j in enumerate(np.argsort(-np.array([pred_tube[2] for pred_tube in class_pred_tubes]))): # iterate predictions in decreasing order of confidence score
            video, instance, score, tube_id, tube = class_pred_tubes[j]

            is_positive = False
            if video in class_gt_tubes:
                if instance in class_gt_tubes[video]:
                    gt_kf_tubes = class_gt_tubes[video][instance]
                    ious = []
                    for gt_tube in gt_kf_tubes:
                        keyframes = gt_tube[:, 0]
                        ious.append(np.mean(utils.get_tube_iou(tube[np.in1d(tube[:, 0], keyframes), 1:5], gt_tube[:, 1:5])))
                    amax = np.argmax(ious)

                    if ious[amax] >= iou_threshold:
                        if covered_gt_tubes[video][instance][amax] == 0:
                            is_positive = True
                            covered_gt_tubes[video][instance][amax] = 1

            if is_positive:
                tp += 1
                fn -= 1
            else:
                fp += 1

            pr[i+1,0] = tp / (tp + fp)
            pr[i+1,1] = tp / (tp + fn)

        PR[utils.idx2class(cfg.class_map, class_label)] = pr

    AP = {class_name:100 * average_precision(PR[class_name]) for class_name in PR}
    mAP = sum(list(AP.values())) / len(AP)
    return mAP, AP
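A usage sketch, assuming scores and annot_data already hold the per-tube score dict and annotation dict expected by score_tubes, get_tube_predictions and get_gt_tubes, and that cfg carries num_actions and class_map; the variable names below are placeholders:

mAP, AP = videomAP(scores, annot_data, split='test', cfg=cfg, iou_threshold=0.5)
print('video mAP: {:.2f}'.format(mAP))
for class_name, ap in AP.items():
    print(class_name, '{:.2f}'.format(ap))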
Example #5
def plot_attention_maps(config_path,
                        epoch,
                        pred_type,
                        size=5,
                        split='test',
                        fig_width=15,
                        fig_height=25):

    with open(os.path.join(config_path, 'config.pkl'), 'rb') as f:
        cfg_dict = pickle.load(f)

    am_path = cfg_dict['am_path']
    filename = cfg_dict['filename']
    annot_path = cfg_dict['annot_path']
    class_map = cfg_dict['class_map']
    T_fm = cfg_dict['out_feature_temp_size']
    OH, OW = cfg_dict['out_feature_size']
    num_layers = cfg_dict['num_layers']
    num_graphs = cfg_dict['num_graphs']
    h, w = cfg_dict['img_size']
    data_path = cfg_dict['data_path']

    with open(os.path.join(annot_path, 'annotated_data.pkl'), 'rb') as f:
        annotated_data = pickle.load(f)

    with open(
            os.path.join(am_path, filename, split,
                         'am_epoch_' + str(epoch) + '.pkl'), 'rb') as f:
        am = pickle.load(f)

    for class_label in range(1, cfg_dict['num_actions']):

        print(utils.idx2class(class_map, class_label))
        print(15 * '-')
        class_preds = pred_type[class_label]
        num_preds = len(class_preds)
        if num_preds == 0:
            print()
            continue
        if num_preds > size:
            sample = np.random.choice(num_preds, size=size, replace=False)
        else:
            sample = np.random.choice(num_preds, size=num_preds, replace=False)
        preds = [class_preds[s] for s in sample]

        for idx, pred in enumerate(preds):
            video, instance, score, tube_id, tube = pred
            tbound = annotated_data[split][video]['action_instances'][
                instance]['tbound']
            W, H = annotated_data[split][video]['(width, height)']
            tube_label = annotated_data[split][video]['action_instances'][
                instance]['tube_labels'][tube_id]
            center_frame = int(
                np.random.choice(list(am[video][instance].keys()),
                                 1))  # sample center frame
            frame_seq = np.array([
                frame_num for frame_num in range(center_frame -
                                                 15, center_frame + 16 + 1)
            ])
            frames_in_tbound = []
            for frame_num in frame_seq:
                if (frame_num > tbound[0]) and (frame_num < tbound[-1]):
                    frames_in_tbound.append(frame_num)
            clip = []
            for frame_num in frame_seq:
                if frame_num in frames_in_tbound:
                    clip.append(frame_num)
                else:
                    if frame_num < center_frame:
                        clip.append(frames_in_tbound[0])
                    else:
                        clip.append(frames_in_tbound[-1])
            clip = clip[3::4]  # get every 4th frame of clip

            print('Prediction:', utils.idx2class(class_map, class_label), '|',
                  'Ground Truth:', utils.idx2class(class_map, tube_label))

            fig, ax = plt.subplots(T_fm,
                                   num_layers * num_graphs + 1,
                                   figsize=(fig_width, fig_height))
            for t in range(T_fm):
                frame_num = clip[t]
                #frame_num = center_frame
                boxes = np.copy(
                    annotated_data[split][video]['action_instances'][instance]
                    ['tubes'][tube_id])
                box = boxes[np.where(boxes[:, 0] == frame_num)[0], 1:5][0]
                box[[0, 2]] = (box[[0, 2]] / W) * w
                box[[1, 3]] = (box[[1, 3]] / H) * h
                img = plt.imread(
                    os.path.join(data_path, video,
                                 'frame' + str(frame_num).zfill(6) + '.jpg'))
                rect = plt.Rectangle((box[0], box[1]),
                                     box[2] - box[0],
                                     box[3] - box[1],
                                     fill=False,
                                     edgecolor='r',
                                     linewidth=1)
                ax[t][0].add_patch(rect)
                ax[t][0].imshow(img)
                ax_idx = 0
                for j in range(num_layers):
                    for i in range(num_graphs):
                        ax_idx += 1
                        img2 = plt.imread(
                            os.path.join(
                                data_path, video,
                                'frame' + str(frame_num).zfill(6) + '.jpg'))
                        lngn = str(j) + str(i)
                        att_map = scipy.special.softmax(
                            np.copy(am[video][instance][center_frame][lngn]
                                    [tube_id]))
                        att_map = att_map.reshape(
                            T_fm, OH, OW)[t]  # temporal attention maps
                        #att_map = np.mean(att_map.reshape(T_fm, OH, OW), axis=0) # temporal average attention map
                        res = cv2.resize(att_map,
                                         dsize=(w, h),
                                         interpolation=cv2.INTER_CUBIC)
                        extent = 0, w, 0, h
                        ax[t][ax_idx].imshow(img2, extent=extent)
                        ax[t][ax_idx].imshow(res,
                                             alpha=0.5,
                                             cmap='Reds',
                                             extent=extent)

            plt.pause(.5)
            print('===' * 20)
        print()
        print()
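A usage sketch, assuming pred_type holds per-class lists of (video, instance, score, tube_id, tube) predictions such as those built by get_tube_predictions in Example #4; the config path, epoch, and prediction variable are hypothetical:

plot_attention_maps('experiments/run1', epoch=100, pred_type=pred_tubes, size=3, split='test')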
Example #6
def extract_features(dataloader, dataset, model, device, annot_data, annot):
    
    model.eval()
    
    obj_annotations = utils.get_obj_annotations(annot_data, annot)
    class_map = utils.class2idx_map(classes_to_exclude=None)
    features_dict = {}
    
    for idx, batch_data in enumerate(dataloader):
        
        imgs = batch_data[0]
        person_boxes = batch_data[1]
        action_labels = batch_data[2]
        num_boxes_per_frame = batch_data[3]
        video_names = batch_data[4]
        instances = batch_data[5]
        center_frames = batch_data[6]
        
        video_name = video_names[0]
        instance = instances[0].item()
        keyframe = center_frames[0].item()
            
        if video_name not in obj_annotations[dataset.split]:
            continue
        if instance not in obj_annotations[dataset.split][video_name]['action_instances']:
            continue
        if keyframe not in obj_annotations[dataset.split][video_name]['action_instances'][instance]:
            continue
       
        num_actors_list = [num_boxes_per_frame[b][15].item() for b in range(imgs.shape[0])] # number of actor boxes at clip index 15, i.e. the center frame
        
        batch = [data.to(device=device) for data in [imgs, person_boxes]]
        batch.append(num_boxes_per_frame)
        batch.append(action_labels)

        with torch.set_grad_enabled(False):
            actor_features_emb_list, context_features_list = model(batch, 'return_features')
        
        for graph_num in range(len(actor_features_emb_list)):
            if graph_num not in features_dict:
                features_dict[graph_num] = []
            actor_features_emb = actor_features_emb_list[graph_num].detach().cpu().numpy()    
            tube_labels = np.copy(annot_data[dataset.split][video_name]['action_instances'][instance]['tube_labels'])
            for tube_id, tube_label in enumerate(tube_labels):
                if tube_label > 0: # not background
                    features_dict[graph_num].append([actor_features_emb[tube_id, :], utils.idx2class(class_map, tube_label)])
        
        vid_annot = obj_annotations[dataset.split][video_name]
        for box_idx in range(len(vid_annot['action_instances'][instance][keyframe])):
            obj_box = vid_annot['action_instances'][instance][keyframe][box_idx][0:4]
            obj_box = obj_box * 14 # scale normalized box coordinates to the 14x14 feature map
            x1 = int(round(obj_box[0]))
            y1 = int(round(obj_box[1]))
            x2 = int(round(obj_box[2]))
            y2 = int(round(obj_box[3]))
            if x1 == x2:
                x1 = int(np.floor(obj_box[0]))
                x2 = int(np.ceil(obj_box[2]))
            if y1 == y2:
                y1 = int(np.floor(obj_box[1]))
                y2 = int(np.ceil(obj_box[3]))
            for graph_num in range(len(context_features_list)):
                obj_features = context_features_list[graph_num][0, :, 3, y1:y2 + 1, x1:x2 + 1].detach().cpu().numpy()
                obj_features = np.mean(obj_features, axis=(1, 2))
                obj_id = int(vid_annot['action_instances'][instance][keyframe][box_idx][4])
                obj_name = annot['objectList'][obj_id]
                features_dict[graph_num].append([obj_features, obj_name])
    
    return features_dict
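A hypothetical follow-up showing how the returned dictionary might be unpacked, e.g. for a 2-D projection of the embeddings; each features_dict[graph_num] entry is a list of [feature_vector, name] pairs:

import numpy as np
feats = features_dict[0]                    # features produced by the first graph
X = np.stack([vec for vec, _ in feats])     # (num_samples, feature_dim)
names = [name for _, name in feats]         # actor class or object name per row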