import os
import pickle

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import scipy.special
import torch
from matplotlib.cm import get_cmap
from sklearn.manifold import TSNE

# project-local modules
import config
import eval_utils
import utils


def scatterplot_recall(tubes_recall, objects_recall, fig_width=13, fig_height=9, exclude_classes=None):
    class_map = utils.class2idx_map()
    AUC = []
    for class_label in range(1, len(class_map)):  # for each class
        # area under the recall curve
        AUC.append(eval_utils.average_precision(objects_recall[class_label]))
    cmap = get_cmap("tab10")
    colors = cmap.colors
    fig = plt.figure(figsize=(fig_width, fig_height))
    plt.grid(alpha=0.4)
    plt.xlabel('AUC (Objects Recall)', fontsize=fig_width)
    plt.ylabel('Tubes Recall', fontsize=fig_width)
    fig.axes[0].tick_params(labelsize=fig_width - 2)
    hex_colors = [matplotlib.colors.to_hex(color) for color in colors]
    for idx, class_label in enumerate(range(1, len(class_map))):
        if exclude_classes is not None and utils.idx2class(class_map, class_label) in exclude_classes:
            continue
        plt.scatter(AUC[idx], tubes_recall[idx],
                    c=hex_colors[idx],
                    label=utils.idx2class(class_map, class_label),
                    s=50)
    fig.axes[0].legend(fontsize=fig_width - 3)
def plot_objects_recall(objects_recall, fig_width=13, fig_height=9):
    class_map = utils.class2idx_map()
    cmap = get_cmap("tab10")
    colors = cmap.colors
    fig = plt.figure(figsize=(fig_width, fig_height))
    plt.grid(alpha=0.4)
    plt.xlabel('Attention Threshold', fontsize=fig_width)
    plt.ylabel('Recall', fontsize=fig_width)
    plt.xticks(np.linspace(0, 1, len(class_map)))
    plt.yticks(np.linspace(0, 1, len(class_map)))
    fig.axes[0].tick_params(labelsize=fig_width - 2)
    hex_colors = [matplotlib.colors.to_hex(color) for color in colors]
    for idx, class_label in enumerate(objects_recall):
        plt.plot(objects_recall[class_label][:, 1], objects_recall[class_label][:, 0],
                 c=hex_colors[idx],
                 label=utils.idx2class(class_map, class_label),
                 linewidth=2)
    fig.axes[0].legend(fontsize=fig_width - 4)
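# Usage sketch (added for illustration; the config directory and epoch are
# hypothetical): drive the two plots above with the outputs of
# get_tubes_recall and get_objects_recall defined below, where
# objects_recall[c] is a (num_thresholds, 2) array of (recall, threshold)
# rows for class index c.
#
#   tubes_recall = get_tubes_recall('path/to/config_dir', epoch=400, split='test')
#   objects_recall = get_objects_recall('path/to/config_dir', epoch=400, split='test')
#   plot_objects_recall(objects_recall)               # recall vs. attention threshold
#   scatterplot_recall(tubes_recall, objects_recall)  # per-class AUC vs. tube recall
#   plt.show()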
parser.add_argument('--batch_size', type=int, default=3,
                    help='Batch size')
parser.add_argument('--gpu_device', type=str, default='0',
                    help='GPU device (number) to use; defaults to 0')
parser.add_argument('--cpu', action='store_true',
                    help='Use the CPU instead of the GPU; this option overrides the --gpu_device argument')
parser.add_argument('--save_scores', action='store_true',
                    help='Save model scores during inference')
parser.add_argument('--split', default='test',
                    help="Dataset split; possible values are 'training', 'validation', 'test'")
args = parser.parse_args()

with open(os.path.join(args.config_path, 'config.pkl'), 'rb') as f:
    cfg_dict = pickle.load(f)
# Fill in defaults for options that were added after older checkpoints were saved.
if 'merge_function' not in cfg_dict:
    cfg_dict['merge_function'] = 'concat'
if 'zero_shot' not in cfg_dict:
    cfg_dict['zero_shot'] = False
    cfg_dict['classes_to_exclude'] = None
if 'num_features_mixed5c' not in cfg_dict:
    cfg_dict['num_features_mixed5c'] = 1024
cfg_dict['i3d_weights_path'] = 'models/'  # always overwrite the stored I3D weights path
if 'class_map' not in cfg_dict:
    cfg_dict['class_map'] = utils.class2idx_map(cfg_dict['classes_to_exclude'])
cfg = config.GetConfig(**cfg_dict)
utils.print_config(cfg)
dataloader, test_set, model, device = prepare_inference(cfg, args)
inference(dataloader, test_set, model, device, cfg)
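# Example invocation (illustrative; the script name and config directory are
# assumptions, and --config_path is parsed elsewhere in this script):
#
#   python inference.py --config_path path/to/config_dir --split test --save_scores
#
# Add --cpu to force CPU inference; it overrides --gpu_device.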
def get_tubes_recall(config_path, epoch, split, num_threshold_points=100):
    with open(os.path.join(config_path, 'config.pkl'), 'rb') as f:
        cfg_dict = pickle.load(f)
    scores_path = cfg_dict['scores_path']
    model_name = cfg_dict['model_name']
    filename = cfg_dict['filename']
    with open(os.path.join(scores_path, model_name, filename, split,
                           'scores_epoch_' + str(epoch) + '.pkl'), 'rb') as f:
        scores = pickle.load(f)
    scored_tubes = score_tubes(scores, cfg_dict['num_actions'])
    with open(os.path.join(cfg_dict['annot_path'], 'annotated_data.pkl'), 'rb') as f:
        annotated_data = pickle.load(f)
    classes_to_exclude = cfg_dict['classes_to_exclude']
    class_map = utils.class2idx_map()

    # collect tubes with spatial IoU > 0.5 against the annotated keyframe boxes
    tubes_dict = {}
    for video in annotated_data[split]:
        vid_annot = annotated_data[split][video]
        w, h = vid_annot['(width, height)']
        for instance in vid_annot['action_instances']:
            instance_annot = vid_annot['action_instances'][instance]
            keyframes_dict = instance_annot['keyframes']
            keyframe_ids = np.array(list(keyframes_dict.keys()))
            keyframe_boxes = np.copy(np.stack(list(keyframes_dict.values())))
            # keyframe boxes are stored normalized; scale them to pixel coordinates
            keyframe_boxes[:, [0, 2]] *= w
            keyframe_boxes[:, [1, 3]] *= h
            for tube_id in instance_annot['tubes']:
                tube = instance_annot['tubes'][tube_id]
                spt_iou = np.mean(utils.get_tube_iou(
                    tube[np.in1d(tube[:, 0], keyframe_ids), 1:5], keyframe_boxes))
                if spt_iou > 0.5:
                    if video not in tubes_dict:
                        tubes_dict[video] = {}
                    if instance not in tubes_dict[video]:
                        tubes_dict[video][instance] = {'tubes': {}, 'tube_labels': []}
                    tubes_dict[video][instance]['tubes'][tube_id] = tube
                    tubes_dict[video][instance]['tube_labels'].append(
                        instance_annot['tube_labels'][tube_id])

    tubes_recall = []
    for class_label in range(1, len(class_map)):  # for each class
        running_corrects = 0
        running_total = 0
        for video in scored_tubes:
            if video not in tubes_dict:
                continue
            for instance in scored_tubes[video]:
                if instance not in tubes_dict[video]:
                    continue
                tubes_instance = tubes_dict[video][instance]
                assert len(set(tubes_instance['tube_labels'])) == 1
                instance_label = tubes_instance['tube_labels'][0]
                if class_label != instance_label:
                    continue
                tube_ids = np.array(list(tubes_instance['tubes'].keys()))
                predicted_labels = scored_tubes[video][instance][tube_ids, 0]
                gt_labels = np.array(tubes_instance['tube_labels'],
                                     dtype=predicted_labels.dtype)
                running_corrects += np.sum(predicted_labels == gt_labels)
                running_total += len(gt_labels)
        tubes_recall.append(running_corrects / running_total)
    return tubes_recall
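# `score_tubes` is defined elsewhere; a minimal sketch of the contract the code
# above assumes (hypothetical implementation): scored_tubes[video][instance] is
# an array indexable by tube id whose column 0 holds the predicted class label.
#
#   def score_tubes(scores, num_actions):
#       scored_tubes = {}
#       for video in scores:
#           scored_tubes[video] = {}
#           for instance in scores[video]:
#               s = scores[video][instance]  # (num_tubes, num_actions) softmax scores
#               labels = np.argmax(s, axis=1).astype(np.float64)
#               scored_tubes[video][instance] = np.stack([labels, np.max(s, axis=1)], axis=1)
#       return scored_tubes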
def get_objects_recall(config_path, epoch, split, num_threshold_points=100):
    with open(os.path.join(config_path, 'config.pkl'), 'rb') as f:
        cfg_dict = pickle.load(f)
    with open(os.path.join(cfg_dict['am_path'], cfg_dict['filename'], split,
                           'am_epoch_' + str(epoch) + '_keyframes' + '.pkl'), 'rb') as f:
        am = pickle.load(f)
    with open(os.path.join(cfg_dict['annot_path'], 'annotated_data.pkl'), 'rb') as f:
        annotated_data = pickle.load(f)
    with open(os.path.join(cfg_dict['annot_path'], 'daly1.1.0.pkl'), 'rb') as f:
        annot = pickle.load(f, encoding='latin1')
    obj_annotations = utils.get_obj_annotations(annotated_data, annot)
    classes_to_exclude = cfg_dict['classes_to_exclude']
    OH = cfg_dict['out_feature_size'][0]
    OW = cfg_dict['out_feature_size'][1]
    T_fm = cfg_dict['out_feature_temp_size']
    class_map = utils.class2idx_map()
    num_layers = cfg_dict['num_layers']
    num_graphs = cfg_dict['num_graphs']

    # collect tubes with spatial IoU > 0.5 (same procedure as in get_tubes_recall)
    tubes_dict = {}
    for video in annotated_data[split]:
        vid_annot = annotated_data[split][video]
        w, h = vid_annot['(width, height)']
        for instance in vid_annot['action_instances']:
            instance_annot = vid_annot['action_instances'][instance]
            keyframes_dict = instance_annot['keyframes']
            keyframe_ids = np.array(list(keyframes_dict.keys()))
            keyframe_boxes = np.copy(np.stack(list(keyframes_dict.values())))
            keyframe_boxes[:, [0, 2]] *= w
            keyframe_boxes[:, [1, 3]] *= h
            for tube_id in instance_annot['tubes']:
                tube = instance_annot['tubes'][tube_id]
                spt_iou = np.mean(utils.get_tube_iou(
                    tube[np.in1d(tube[:, 0], keyframe_ids), 1:5], keyframe_boxes))
                if spt_iou > 0.5:
                    if video not in tubes_dict:
                        tubes_dict[video] = {}
                    if instance not in tubes_dict[video]:
                        tubes_dict[video][instance] = {'tubes': {}, 'tube_labels': []}
                    tubes_dict[video][instance]['tubes'][tube_id] = tube
                    tubes_dict[video][instance]['tube_labels'].append(
                        instance_annot['tube_labels'][tube_id])

    objects_recall = {}
    thresholds = np.linspace(0, 1, num_threshold_points)
    for class_label in range(1, len(class_map)):  # recall curve for each class (exclude background)
        if classes_to_exclude is not None:
            # in the zero-shot setting, evaluate only the classes held out during training
            class_name = utils.idx2class(class_map, class_label)
            if class_name not in classes_to_exclude:
                continue
        # calculate the total number of false negatives
        fn = 0
        for video in obj_annotations[split]:
            if (video not in am) or (video not in tubes_dict):
                continue  # no object annotations or no positive tubes
            vid_annot = obj_annotations[split][video]
            for instance in vid_annot['action_instances']:
                if instance not in tubes_dict[video]:
                    continue
                tubes_instance = tubes_dict[video][instance]
                assert len(set(tubes_instance['tube_labels'])) == 1
                instance_label = tubes_instance['tube_labels'][0]
                if class_label != instance_label:
                    continue  # skip instances of a different class
                keyframes = list(vid_annot['action_instances'][instance].keys())
                for keyframe in keyframes:
                    fn_keyframe = 0
                    for box_idx in range(len(vid_annot['action_instances'][instance][keyframe])):
                        # skip boxes whose flags (columns 5 and 6) are set
                        if (vid_annot['action_instances'][instance][keyframe][box_idx][5] == 1) or \
                           (vid_annot['action_instances'][instance][keyframe][box_idx][6] == 1):
                            continue
                        fn_keyframe += 1
                    # each positive tube is tested against every annotated box
                    fn += fn_keyframe * len(tubes_instance['tube_labels'])  # total number of false negatives

        recall_values = np.zeros([len(thresholds), 2])
        for idx, threshold in enumerate(thresholds):  # for each threshold
            tp = 0
            fn_ = fn
            for video in obj_annotations[split]:
                if (video not in am) or (video not in tubes_dict):
                    continue  # no object annotations or no positive tubes
                vid_annot = obj_annotations[split][video]
                W, H = obj_annotations[split][video]['(width, height)']
                for instance in vid_annot['action_instances']:
                    if instance not in tubes_dict[video]:
                        continue  # no positive tubes
                    assert len(set(tubes_dict[video][instance]['tube_labels'])) == 1
                    instance_label = tubes_dict[video][instance]['tube_labels'][0]
                    if class_label != instance_label:
                        continue  # skip instances of a different class
                    keyframes = list(vid_annot['action_instances'][instance].keys())
                    for tube_id in tubes_dict[video][instance]['tubes']:  # for each (positive) tube
                        for keyframe in keyframes:
                            # softmax-normalized attention map per (layer, graph) pair
                            att_map_list = []
                            for layer_num in range(num_layers):
                                for graph_num in range(num_graphs):
                                    lngn = str(layer_num) + str(graph_num)  # layer number and graph number
                                    att_map = am[video][instance][keyframe][lngn][tube_id]
                                    # temporal slice 3 corresponds to the center frame
                                    att_map = att_map.reshape(T_fm, OH, OW)[3]
                                    att_map = scipy.special.softmax(att_map.reshape(-1))
                                    att_map_list.append(att_map.reshape(OH, OW))
                            # test every object annotation in the keyframe
                            for box_idx in range(len(vid_annot['action_instances'][instance][keyframe])):
                                obj_box = vid_annot['action_instances'][instance][keyframe][box_idx][0:4]
                                obj_box = obj_box * OH  # normalized box to feature-map coordinates
                                x1 = int(round(obj_box[0]))
                                y1 = int(round(obj_box[1]))
                                x2 = int(round(obj_box[2]))
                                y2 = int(round(obj_box[3]))
                                sum_list = []
                                for att_map in att_map_list:
                                    # add up attention values inside the object bounding box
                                    sum_list.append(np.sum(att_map[y1:y2 + 1, x1:x2 + 1]))
                                # positive if any of the graphs satisfies the condition
                                if any(np.array(sum_list) > threshold):
                                    tp += 1
                                    fn_ -= 1
            recall_values[idx, 0] = tp / (tp + fn_)
            recall_values[idx, 1] = threshold
        objects_recall[class_label] = recall_values
    return objects_recall
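# A self-contained toy version (added for illustration) of the per-box test in
# get_objects_recall: softmax-normalize an attention map over all spatial
# positions, then sum the attention mass that falls inside an object box.
def demo_attention_in_box(threshold=0.5, OH=14, OW=14):
    att_map = np.random.rand(OH * OW)
    att_map = scipy.special.softmax(att_map).reshape(OH, OW)  # sums to 1 over the map
    x1, y1, x2, y2 = 3, 4, 7, 9  # object box in feature-map coordinates
    att_sum = np.sum(att_map[y1:y2 + 1, x1:x2 + 1])  # attention mass inside the box
    return att_sum > threshold  # counted as a true positive if above the threshold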
    def __init__(self, args):
        self.img_size = 224, 224  # size of input frame (h, w)
        self.out_feature_size = 14, 14  # size of output feature map (Mixed_4f) (h, w)
        self.out_feature_temp_size = 8  # temporal output size of feature map (Mixed_4f)
        self.num_person_boxes = 20  # max number of tracks in an action instance
        self.num_in_frames = 32  # number of input frames

        self.model_name = args.model_name  # 'baseline' or 'gcn'
        self.data_path = args.data_path  # e.g. 'data/DALY/frames/'
        self.annot_path = args.annot_path
        self.results_path = args.results_path
        self.scores_path = args.scores_path
        self.am_path = 'am/'
        self.features_path = 'extracted_features/'
        self.i3d_weights_path = 'models/'
        self.filename = ''

        if not args.cpu:
            self.use_gpu = True
            self.device_list = args.gpu_device
        else:
            self.use_gpu = False

        self.num_actions = 11  # 10 actions + background
        self.label_tracks = False  # whether to compute track annotations (True) or load them instead (False)
        self.training_batch_size = args.batch_size
        self.validation_batch_size = args.batch_size
        self.momentum = 0.9
        self.weight_decay = 0

        # Learning rate with a cosine annealing schedule; 'warmup' refers to linear warm-up.
        self.start_epoch = 0
        self.total_epochs = args.total_epochs  # e.g. 450
        # 'total_steps' is inferred as (per-epoch) 'num_steps' * 'total_epochs'
        self.warmup_epochs = args.warmup_epochs  # e.g. 0
        # 'warmup_steps' is inferred as (per-epoch) 'num_steps' * 'warmup_epochs' (0-indexed)
        self.init_lr = args.init_lr  # e.g. 4.5e-6
        self.max_lr = args.max_lr  # e.g. 4.7e-5
        self.min_lr = args.min_lr
        # if self.warmup_epochs == 0 and self.init_lr is not None:
        #     warnings.warn("warmup_epochs is 0 while init_lr is set; defaulting init_lr to None")
        #     self.init_lr = None
        # Baseline defaults:
        # if self.model_name == 'baseline':
        #     self.total_epochs = 150
        #     self.warmup_epochs = 0
        #     self.init_lr = None
        #     self.max_lr = 2.5e-4
        #     self.min_lr = 0

        self.num_features_mixed4f = 832  # number of output channels of Mixed_4f
        self.num_features_mixed5c = 1024  # number of output channels of Mixed_5c
        self.num_features_gcn = 256
        self.crop_size = 7, 7  # output size of RoI pooling
        self.dropout_prob = 0.5
        self.num_layers = args.num_layers  # number of GCN layers
        self.num_graphs = args.num_graphs  # number of graphs per layer
        self.merge_function = args.merge_function  # merges the outputs of multiple graphs in the final layer: 'sum' or 'concat'
        if self.model_name == 'baseline':
            self.use_i3d_tail = True

        self.zero_shot = args.zero_shot
        self.classes_to_exclude = None
        if self.model_name == 'gcn':
            if self.zero_shot:
                # classes to exclude during training
                self.classes_to_exclude = ['Ironing', 'TakingPhotosOrVideos']
                self.num_actions = self.num_actions - len(self.classes_to_exclude)
        elif self.model_name == 'baseline':
            self.zero_shot = False
        self.class_map = utils.class2idx_map(self.classes_to_exclude)

        # self.save_log = True  # whether to save the model, state_dict, and loss every x epochs
        self.set_bn_eval = False  # if True, freeze batch-normalization layers
        self.save_scores = args.save_scores  # whether to save output (softmax) scores every x epochs
        self.save_am = False  # whether to save the adjacency matrix of every clip
        if self.model_name == 'baseline':
            self.save_am = False
        self.plot_grad_flow = False
        self.num_epochs_to_val = args.num_epochs_to_val
        self.resume_training = args.resume_training  # load weights from a checkpoint to resume training
        if self.resume_training:
            self.checkpoint_path = args.checkpoint_path
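# A minimal sketch (added for illustration, not the training loop's actual
# implementation) of the schedule the fields above describe: linear warm-up
# from init_lr to max_lr over warmup_steps, then cosine annealing from max_lr
# down to min_lr over the remaining steps.
import math

def lr_at_step(step, total_steps, warmup_steps, init_lr, max_lr, min_lr):
    if warmup_steps > 0 and step < warmup_steps:
        return init_lr + (max_lr - init_lr) * step / warmup_steps  # linear warm-up
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return min_lr + 0.5 * (max_lr - min_lr) * (1 + math.cos(math.pi * progress))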
def plot_tSNE(config_path, epoch, split, plot_type='actions', fig_width=14, fig_height=9):
    with open(os.path.join(config_path, 'config.pkl'), 'rb') as f:
        cfg_dict = pickle.load(f)
    with open(os.path.join(cfg_dict['features_path'], cfg_dict['filename'], split,
                           'features_epoch_' + str(epoch) + '.pkl'), 'rb') as f:
        actor_and_obj_features = pickle.load(f)
    with open(os.path.join(cfg_dict['annot_path'], 'daly1.1.0.pkl'), 'rb') as f:
        annot = pickle.load(f, encoding='latin1')
    filename = cfg_dict['filename']

    if plot_type == 'actions':
        class_map = utils.class2idx_map(cfg_dict['classes_to_exclude'])
        action_list = list(class_map.keys())
        action_list.remove('Background')
        cmap = get_cmap("tab10")
        colors = cmap.colors  # 10 colors
        hex_colors = [matplotlib.colors.to_hex(color) for color in colors]
    elif plot_type == 'objects':
        obj_list = annot['objectList']
        cmap = get_cmap("Paired")
        colors = cmap.colors  # 12 colors; black and grey are appended for a total of 14
        hex_colors = [matplotlib.colors.to_hex(color) for color in colors]
        hex_colors.append(matplotlib.colors.to_hex('black'))
        hex_colors.append(matplotlib.colors.to_hex('grey'))

    features_dict = {}
    names_dict = {}
    for graph_num in range(len(actor_and_obj_features)):
        features_dict[graph_num] = []
        names_dict[graph_num] = []
    for graph_num in range(len(actor_and_obj_features)):
        for features in actor_and_obj_features[graph_num]:
            name = features[1]
            if (plot_type == 'actions') and (name not in action_list):
                continue
            if (plot_type == 'objects') and (name not in obj_list):
                continue
            features_dict[graph_num].append(features[0])
            names_dict[graph_num].append(features[1])

    for graph_num in features_dict:
        fig = plt.figure(figsize=(fig_width, fig_height))
        features = np.vstack(features_dict[graph_num])
        names = np.array(names_dict[graph_num])
        if plot_type == 'objects':
            # collect the names of the 14 most frequent objects
            obj_freqs = []
            for obj in obj_list:
                idxs = np.where(names == obj)[0]
                obj_freqs.append((obj, len(idxs)))
            most_freq_obj = sorted(obj_freqs, key=lambda tup: tup[1], reverse=True)
            most_freq_obj = most_freq_obj[0:14]
            most_freq_obj = [obj for obj, _ in most_freq_obj]
            # keep only the features of the 14 most frequent objects
            idxs = np.in1d(names, most_freq_obj)
            features = features[idxs, :]
            names = names[idxs]
        np.random.seed(5186)
        print('Fitting t-SNE for graph', str(graph_num) + '...')
        features_emb = TSNE(n_components=2, perplexity=30.0, n_iter=1000,
                            learning_rate=200).fit_transform(features)
        if plot_type == 'actions':
            for idx, action in enumerate(action_list):
                idxs = np.where(names == action)[0]
                plt.scatter(features_emb[idxs, 0], features_emb[idxs, 1],
                            s=13, c=hex_colors[idx], label=action)
        elif plot_type == 'objects':
            for idx, obj in enumerate(most_freq_obj):
                idxs = np.where(names == obj)[0]
                plt.scatter(features_emb[idxs, 0], features_emb[idxs, 1],
                            s=13, c=hex_colors[idx], label=obj)
        if graph_num == 0:
            plt.legend()
        plt.show()
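# Usage sketch (illustrative; the config directory and epoch are hypothetical):
#
#   plot_tSNE('path/to/config_dir', epoch=400, split='test', plot_type='actions')
#   plot_tSNE('path/to/config_dir', epoch=400, split='test', plot_type='objects')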
def extract_features(dataloader, dataset, model, device, annot_data, annot):
    model.eval()
    obj_annotations = utils.get_obj_annotations(annot_data, annot)
    class_map = utils.class2idx_map(classes_to_exclude=None)
    features_dict = {}
    for idx, batch_data in enumerate(dataloader):
        imgs = batch_data[0]
        person_boxes = batch_data[1]
        action_labels = batch_data[2]
        num_boxes_per_frame = batch_data[3]
        video_names = batch_data[4]
        instances = batch_data[5]
        center_frames = batch_data[6]

        video_name = video_names[0]
        instance = instances[0].item()
        keyframe = center_frames[0].item()
        # skip clips without object annotations for this keyframe
        if video_name not in obj_annotations[dataset.split]:
            continue
        if instance not in obj_annotations[dataset.split][video_name]['action_instances']:
            continue
        if keyframe not in obj_annotations[dataset.split][video_name]['action_instances'][instance]:
            continue

        num_actors_list = [num_boxes_per_frame[b][15].item() for b in range(imgs.shape[0])]  # boxes in the center frame
        batch = [data.to(device=device) for data in [imgs, person_boxes]]
        batch.append(num_boxes_per_frame)
        batch.append(action_labels)
        with torch.set_grad_enabled(False):
            actor_features_emb_list, context_features_list = model(batch, 'return_features')

        # actor features: one embedding per (non-background) tube
        for graph_num in range(len(actor_features_emb_list)):
            if graph_num not in features_dict:
                features_dict[graph_num] = []
            actor_features_emb = actor_features_emb_list[graph_num].detach().cpu().numpy()
            tube_labels = np.copy(annot_data[dataset.split][video_name]['action_instances'][instance]['tube_labels'])
            for tube_id, tube_label in enumerate(tube_labels):
                if tube_label > 0:  # not background
                    features_dict[graph_num].append(
                        [actor_features_emb[tube_id, :], utils.idx2class(class_map, tube_label)])

        # object features: average-pool the context features inside each annotated box
        vid_annot = obj_annotations[dataset.split][video_name]
        for box_idx in range(len(vid_annot['action_instances'][instance][keyframe])):
            obj_box = vid_annot['action_instances'][instance][keyframe][box_idx][0:4]
            obj_box = obj_box * 14  # normalized box to 14x14 feature-map coordinates
            x1 = int(round(obj_box[0]))
            y1 = int(round(obj_box[1]))
            x2 = int(round(obj_box[2]))
            y2 = int(round(obj_box[3]))
            # make sure the box covers at least one feature-map cell
            if x1 == x2:
                x1 = int(np.floor(obj_box[0]))
                x2 = int(np.ceil(obj_box[2]))
            if y1 == y2:
                y1 = int(np.floor(obj_box[1]))
                y2 = int(np.ceil(obj_box[3]))
            for graph_num in range(len(context_features_list)):
                # temporal slice 3 corresponds to the center frame
                obj_features = context_features_list[graph_num][0, :, 3, y1:y2 + 1, x1:x2 + 1].detach().cpu().numpy()
                obj_features = np.mean(obj_features, axis=(1, 2))
                obj_id = int(vid_annot['action_instances'][instance][keyframe][box_idx][4])
                obj_name = annot['objectList'][obj_id]
                features_dict[graph_num].append([obj_features, obj_name])
    return features_dict
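# The features returned by extract_features are what plot_tSNE above loads from
# 'features_epoch_<epoch>.pkl'. A sketch of tying the two together (the output
# path and epoch are assumptions):
#
#   features_dict = extract_features(dataloader, test_set, model, device, annot_data, annot)
#   out_path = os.path.join('extracted_features', cfg.filename, 'test',
#                           'features_epoch_' + str(epoch) + '.pkl')
#   with open(out_path, 'wb') as f:
#       pickle.dump(features_dict, f)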