# Third-party imports used by the functions below. Repo-internal helpers
# (e.g. segment_format, segment_iou, segment_unit_scaling, generate_segments,
# TempPriorsNoScale, BaselineData, Feature, forward_pass, floatX, dump_json,
# proposals_per_video, compute_priors_over_time, REQ_INFO_CP, NUM_PROPOSALS)
# are assumed to be imported from elsewhere in the package.
import glob
import os
import warnings

import hickle as hkl
import numpy as np
import pandas as pd


def eval_temporal_priors(train_file, test_file, n_prop=NUM_PROPOSALS,
                         filename=None):
    """Run TempPriorsNoScale over a range of numbers of proposals and dump
    the resulting proposals of each video as JSON.
    """
    ds_train = BaselineData.fromcsv(train_file)
    Xtrain = ds_train.get_temporal_loc()
    ds_test_df = pd.read_csv(test_file, sep=' ')
    Ztest = np.array(ds_test_df.loc[:, 'n-frames'])
    for i, v in enumerate(n_prop):
        if v > Xtrain.shape[0]:
            # More priors requested than annotations available; skip.
            continue
        m = TempPriorsNoScale(v)
        m.fit(Xtrain)
        Ypred_centered, idx = m.proposals(Ztest, return_index=True)
        Ypred = segment_format(Ypred_centered, 'c2b')
        # Form video-proposals format [f-init, f-end, score]
        vid_prop_all = np.hstack([Ypred, np.zeros((Ypred.shape[0], 1))])
        vid_prop = proposals_per_video(vid_prop_all, v)
        id_prop = dict(zip(ds_test_df.loc[:, 'video-name'].tolist(),
                           vid_prop.tolist()))
        if isinstance(filename, str):
            idfile = filename + '.n-prop_{}'.format(v)
            dump_json(idfile, id_prop)
    return None
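
# Hedged usage sketch for eval_temporal_priors. The CSV paths, the codebook
# sizes and the output stem are placeholders, not files shipped with the
# repo; each size v writes '<stem>.n-prop_<v>'.
def _demo_eval_temporal_priors():
    eval_temporal_priors('train_annotations.csv', 'test_annotations.csv',
                         n_prop=(64, 128, 256),
                         filename='priors_baseline')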
def load_proposals(proposal_dir, stride=128, T=256, file_filter=None,
                   priors_filename=None):
    """Load per-video '*.proposals' tables and concatenate them into a
    single DataFrame. If priors_filename is given, f-init/f-end are
    re-derived by sliding the priors over each video.
    """
    proposal_df = []
    vds_true = None
    if file_filter:
        vds_true = pd.read_csv(file_filter)['video-name'].tolist()
    filenames = glob.glob(os.path.join(proposal_dir, '*.proposals'))
    priors = None
    if priors_filename:
        priors = hkl.load(priors_filename)
    for f in filenames:
        vid = os.path.basename(f).split('.')[0]
        if file_filter and vid not in vds_true:
            continue
        this_df = pd.read_csv(f, sep=' ', index_col=False)
        if priors_filename:
            n_proposals = priors.shape[0]
            # Integer division: each segment contributes n_proposals rows.
            n_segments = this_df.shape[0] // n_proposals
            this_priors = np.tile(priors, (n_segments, 1))
            l_size = this_df['video-frames'].mean()
            f_init_array = np.arange(0, l_size - T, stride)
            map_array = np.stack((f_init_array, np.zeros(n_segments)))
            map_array = map_array.repeat(n_proposals, axis=-1).T
            proposals = segment_format(
                map_array + (this_priors.clip(0, 1) * T), 'c2b').astype(int)
            this_df['f-init'] = proposals[:, 0]
            this_df['f-end'] = proposals[:, 1]
        proposal_df.append(this_df)
    return pd.concat(proposal_df, axis=0)
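
# Hedged usage sketch for load_proposals. The directory, the filter CSV and
# the hickle file are assumed names; only the '*.proposals' extension comes
# from the function itself.
def _demo_load_proposals():
    df = load_proposals('results/val', stride=128, T=256,
                        file_filter='val_videos.csv',
                        priors_filename='priors.hkl')
    print(df.shape)
    print(df.head())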
def retrieve_proposals(self, c3d_stack, f_init_array, override=False):
    """Retrieve proposals for multiple streams.

    Parameters
    ----------
    c3d_stack : ndarray
        3d-ndarray [num-streams, seq-length, input-size] with the visual
        encoder representation of each stream. The first dimension is
        sequence agnostic, so you can push as many videos as your hardware
        allows.
    f_init_array : ndarray
        1d-ndarray with the initial frame of each stream.
    override : bool, optional
        If True, override the predicted locations with the anchors. Make
        sure to initialize your instance properly in order to use the
        anchors.

    Returns
    -------
    proposals : ndarray
        3d-ndarray [num-streams, num-outputs, 2] with proposal locations
        in terms of f-init, f-end.
    conf : ndarray
        2d-ndarray [num-streams, num-outputs] with the action likelihood
        of each proposal.

    Raises
    ------
    ValueError
        Mismatch between c3d_stack.shape[0] and f_init_array.size.

    """
    if c3d_stack.ndim == 2 and c3d_stack.shape[0] == self.seq_length:
        c3d_stack = c3d_stack[np.newaxis, ...]
    if c3d_stack.shape[0] != f_init_array.size:
        raise ValueError('Mismatch between c3d_stack and f_init_array')
    n_streams = c3d_stack.shape[0]

    loc, score = self.forward_pass(floatX(c3d_stack))
    if override and self.anchors is not None:
        loc[:, ...] = self.anchors.reshape(-1)
    # Clip proposals inside the receptive field
    loc.clip(0, 1, out=loc)
    loc *= self.receptive_field
    # Shift centers to absolute locations in the video
    loc = loc.reshape((n_streams, -1, 2))
    loc[:, :, 0] += f_init_array.reshape((n_streams, 1))
    # Transform center-duration format into boundaries
    proposals = np.reshape(segment_format(loc.reshape((-1, 2)), 'c2b'),
                           (n_streams, -1, 2)).astype(int)
    return proposals, score
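
# Hedged usage sketch for the retrieve_proposals method. `model` stands for
# an already-initialized instance of the class this method belongs to; the
# array shapes follow the docstring above.
def _demo_retrieve_proposals(model, c3d_stack, f_init_array):
    # c3d_stack: [num-streams, seq-length, input-size]
    # f_init_array: [num-streams]
    proposals, conf = model.retrieve_proposals(c3d_stack, f_init_array)
    best = conf.argmax(axis=1)
    for i, k in enumerate(best):
        print('stream {}: frames {}-{} (score {:.3f})'.format(
            i, proposals[i, k, 0], proposals[i, k, 1], conf[i, k]))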
def compute_priors(df, T, K=200, iou_thr=0.5, norm_fcn=wrapper_unit_scaling,
                   i_thr=1.0, rng_seed=None):
    """Clustering of ground-truth locations.

    Parameters
    ----------
    df : DataFrame
        Pandas table with annotations of the dataset. It must include the
        columns listed in data_generation.REQ_INFO_CP.
    T : int
        Canonical temporal size of the evaluation window.
    K : int, optional
        Number of priors.
    iou_thr : float
        IOU threshold to consider that an annotation matches a prior.
    norm_fcn : function
        Function applied over an ndarray [m x 2] of segments with format
        [f-init, f-end] before computing the priors.
    i_thr : float
        Ratio in [0, 1] required to include an annotation inside a segment.
    rng_seed : int
        Seed for the random number generator.

    Outputs
    -------
    priors : ndarray
        2-dim array of discovered priors. The first dimension iterates over
        the different priors.
    new_df : DataFrame
        Table with information about the instances to use during training.
    """
    # Input validation
    if not isinstance(df, pd.DataFrame):
        raise ValueError('df argument must be a pd.DataFrame')
    if not set(REQ_INFO_CP).issubset(df.columns.tolist()):
        msg = 'df must include these column names: {}'.format(REQ_INFO_CP)
        raise ValueError(msg)
    if iou_thr > 1 or iou_thr < 0:
        raise ValueError('Invalid value of IOU')

    # Loop over videos
    videos = df['video-name'].unique()
    L = np.empty(videos.size, dtype=int)
    segment_lst = [None] * videos.size
    n_seg = np.empty(videos.size, dtype=int)
    mapped_gt_lst, n_gt_lst = [None] * videos.size, [None] * videos.size
    for i, v in enumerate(videos):
        idx = df['video-name'] == v
        L[i] = df.loc[idx, 'video-frames'].mean()
        gtruth_c = df.loc[idx, ['f-init', 'n-frames']]
        gtruth_b = segment_format(np.array(gtruth_c), 'd2b')
        segment_lst[i], gt_list_i, n_gt_lst[i] = generate_segments(
            T, L[i], gtruth_b, method='iou', rng_seed=rng_seed, i_thr=i_thr)
        n_seg[i] = segment_lst[i].shape[0]
        if len(gt_list_i) > 0:
            mapped_gt_lst[i] = np.vstack(gt_list_i)
        else:
            mapped_gt_lst[i] = np.empty((0, 2))

    # Standardize mapped annotations into a common reference + normalization
    segments = np.vstack(segment_lst)
    mapped_gt = np.vstack(mapped_gt_lst)
    n_gt = np.hstack(n_gt_lst)
    X = norm_fcn(mapped_gt, T, segments, n_gt)

    # Clustering
    model = TempPriorsNoScale(K, rng_seed=rng_seed)
    model.fit(X)
    priors = model.priors

    # Matching
    score = np.empty((segments.shape[0], priors.shape[0]), dtype=int)
    j = 0
    for i, seg_i in enumerate(segment_lst):
        # Scale priors and use boundary format
        mapped_priors_b = segment_format(priors * T, 'c2b')
        s_ref = np.expand_dims(np.repeat(seg_i[:, 0], n_gt_lst[i]), 1)
        if mapped_gt_lst[i].size == 0:
            # No ground truth mapped to this video; mark its segments as
            # negatives and keep the row pointer aligned.
            score[j:j + n_seg[i], :] = 0
            j += n_seg[i]
            continue
        # Reference mapped gt on the [0, T] interval
        mapped_gt_i_ref = mapped_gt_lst[i] - s_ref
        if (mapped_gt_i_ref[:, 0] < 0).sum() > 0:
            msg = ('Initial frame must be greater than zero. Running at '
                   'your own risk. Debug is needed.')
            warnings.warn(msg)
        # IOU computation
        iou = segment_iou(mapped_priors_b, mapped_gt_i_ref)
        # Max IOU of the priors against each segment's ground truth
        idx = [0] + np.cumsum(n_gt_lst[i]).tolist()
        max_iou = np.vstack(
            [np.zeros(K, dtype=int) if u == w else iou[:, u:w].max(axis=1)
             for u, w in zip(idx[:-1], idx[1:])])
        score[j:j + n_seg[i], :] = max_iou > iou_thr
        j += n_seg[i]

    # Build DataFrame
    col_triads = ['c_{}'.format(i) for i in range(K)]
    new_df = pd.concat(
        [pd.DataFrame({'video-name': videos.repeat(n_seg),
                       'f-init': segments[:, 0],
                       'duration': np.repeat(T, segments.shape[0]),
                       'video-frames': np.repeat(L, n_seg)}),
         pd.DataFrame(score, columns=col_triads)], axis=1)
    return priors, new_df
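
# Hedged usage sketch for compute_priors. The annotation CSV and the output
# file names are assumptions; K, T and the seed are arbitrary choices.
def _demo_compute_priors():
    df = pd.read_csv('train_annotations.csv', sep=' ')
    priors, train_df = compute_priors(df, T=256, K=64, iou_thr=0.5,
                                      rng_seed=313)
    np.save('priors_K64.npy', priors)
    train_df.to_csv('train_segments_K64.csv', sep=' ', index=False)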
def wrapper_unit_scaling(x, T, s_ref, n_gt, *args, **kwargs):
    """Normalize segments to unit length and use center-duration format.
    """
    xc = segment_format(x, 'b2c')
    init_ref = np.repeat(s_ref[:, 0], n_gt)
    return segment_unit_scaling(xc, T, init_ref)
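
# Hedged numerical sketch for wrapper_unit_scaling. The two ground-truth
# segments and the T=256 window starting at frame 512 are made-up values;
# in compute_priors, s_ref and n_gt come from generate_segments.
def _demo_wrapper_unit_scaling():
    x = np.array([[520, 583], [600, 727]])   # gt segments as [f-init, f-end]
    s_ref = np.array([[512, 767]])           # window that contains both
    n_gt = np.array([2])                     # both gt mapped to that window
    print(wrapper_unit_scaling(x, 256, s_ref, n_gt))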
def evaluate_priors(df, priors, T, stride=16, iou_thr=0.5,
                    return_recall=False):
    """Slide the priors over every annotated video and match each annotation
    against its closest prior.

    Parameters
    ----------
    df : DataFrame
        Pandas table with annotations of the dataset. It must include the
        columns listed in data_generation.REQ_INFO_CP.
    priors : ndarray
        2-dim array of discovered priors. The first dimension iterates over
        the different priors.
    T : int
        Canonical temporal size of the evaluation window.
    stride : int, optional
        Size of the sliding step.
    iou_thr : float, optional
        IOU threshold to consider that an annotation matches a prior.
    return_recall : bool, optional
        Return one extra output (recall, computed at the given iou_thr).

    Outputs
    -------
    eval_df : DataFrame
        Table with information about each annotation and its matched prior.
    recall : float
        Recall at the given IOU threshold.
    """
    # Sanitize input.
    mapped_priors_b = segment_format(priors * T, 'c2b').clip(1, T)
    mapped_priors_b = np.array(mapped_priors_b).astype(int)

    # Iterate over each instance.
    n_annotations = df['video-name'].size
    best_iou, v_pointer = np.empty(n_annotations), 0
    best_priors_t = np.empty((n_annotations, 2))
    best_priors_index = np.empty(n_annotations)
    for i, sgm_i in df.iterrows():
        # Parse ground truth.
        L = sgm_i['video-frames']
        gtruth_c = np.empty((1, 2))
        gtruth_c[0, :] = np.stack([sgm_i['f-init'], sgm_i['n-frames']],
                                  axis=-1)
        gtruth_b = segment_format(gtruth_c, 'd2b')

        # Slide priors over time.
        priors_t, k_idx = compute_priors_over_time(mapped_priors_b, T, L,
                                                   stride)
        # No priors found for this video.
        if priors_t.shape[0] == 0:
            best_iou[v_pointer] = 0.0
            best_priors_t[v_pointer, :] = np.nan
            best_priors_index[v_pointer] = np.nan
            v_pointer += 1
            continue

        # Compute IOU and keep the best prior for this annotation.
        iou = segment_iou(gtruth_b, priors_t).flatten()
        max_idx = int(iou.argmax())
        best_iou[v_pointer] = iou[max_idx]
        best_priors_t[v_pointer, :] = priors_t[max_idx, :]
        best_priors_index[v_pointer] = k_idx[max_idx]
        v_pointer += 1

    # Build DataFrame.
    s_init = best_priors_t[:, 0]
    n_frames = best_priors_t[:, 1] - best_priors_t[:, 0] + 1
    eval_df = pd.concat([df, pd.DataFrame({'priors-f-init': s_init,
                                           'priors-n-frames': n_frames,
                                           'k-idx': best_priors_index,
                                           'iou': best_iou})], axis=1)
    if return_recall:
        recall = float((eval_df['iou'] >= iou_thr).sum()) / n_annotations
        return eval_df, recall
    return eval_df
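
# Hedged usage sketch for evaluate_priors. The CSV and the .npy file are
# assumed artifacts (e.g. saved by a compute_priors run); the thresholds
# are arbitrary.
def _demo_evaluate_priors():
    val_df = pd.read_csv('val_annotations.csv', sep=' ')
    priors = np.load('priors_K64.npy')
    eval_df, recall = evaluate_priors(val_df, priors, T=256, stride=16,
                                      iou_thr=0.5, return_recall=True)
    print('Recall@0.5: {:.3f}'.format(recall))
    return eval_df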
def retrieve_proposals(video_name, l_size, network, T=256, stride=128,
                       c3d_size=16, c3d_stride=8, pool_type='mean',
                       hdf5_dataset=None, model_prm=None):
    """Retrieve proposals for an input video.

    Parameters
    ----------
    video_name : str
        Video identifier.
    l_size : int
        Size of the video in frames.
    network : (localization, conf)
        Lasagne layers.
    T : int, optional
        Canonical temporal size of the evaluation window.
    stride : int, optional
        Size of the sliding step.
    c3d_size : int, optional
        Size of the temporal field of the C3D network.
    c3d_stride : int, optional
        Size of the temporal stride between extracted features.
    pool_type : str, optional
        Global pooling strategy over a bunch of features:
        'mean', 'max', 'pyr-2-mean/max', 'concat-2-mean/max'.
    hdf5_dataset : str
        Path to the feature file.
    model_prm : str
        Model-architecture string. If it starts with 'lstm:', the remainder
        is parsed as 'n_outputs,seq_length,width,depth' and the features are
        reshaped into per-step chunks.
    """
    # IO interface.
    fobj = Feature(filename=hdf5_dataset, t_size=c3d_size,
                   t_stride=c3d_stride, pool_type=pool_type)
    fobj.open_instance()

    # Video scanning.
    f_init_array = np.arange(0, l_size - T, stride)
    feat_stack = fobj.read_feat_batch_from_video(
        video_name, f_init_array, duration=T).astype(np.float32)
    if model_prm.startswith('lstm:'):
        user_prm = model_prm.split(':', 1)[1].split(',')
        n_outputs, seq_length, width, depth = user_prm
        feat_stack = feat_stack.reshape(
            feat_stack.shape[0], int(seq_length),
            feat_stack.shape[1] // int(seq_length))

    # Close instance.
    fobj.close_instance()

    # Generate proposals.
    loc, score = forward_pass(network, feat_stack)
    n_proposals = score.shape[1]
    n_segments = score.shape[0]
    score = score.flatten()
    map_array = np.stack(
        (f_init_array, np.zeros(n_segments))).repeat(n_proposals, axis=-1).T
    proposal = segment_format(map_array + (loc.clip(0, 1) * T),
                              'c2b').astype(int)
    return proposal, score
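
# Hedged usage sketch for retrieve_proposals. The video id, the feature file
# and the non-LSTM model string are assumptions; `network` stands for the
# (localization, conf) Lasagne layers described in the docstring.
def _demo_retrieve_proposals_for_video(network):
    proposal, score = retrieve_proposals(
        'video_validation_0000001', l_size=5000, network=network,
        T=256, stride=128, c3d_size=16, c3d_stride=8, pool_type='mean',
        hdf5_dataset='c3d_features.hdf5', model_prm='mlp')
    order = score.argsort()[::-1]   # rank proposals by confidence
    return proposal[order], score[order]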