def run(cfg, model, video_data, num_frames, step_frames, fout):
    """
    Extract sliding-window features for one video and log its metadata.

    Windows of up to `num_frames` frames are taken every `step_frames` frames,
    packed into two pathways and passed through `model` one window at a time.
    The concatenated features are saved as `<video_name>.feat.npy` under
    `cfg.OUTPUT_DIR`; `meta` is extended in place with the feature info and
    written to `fout` as one JSON line.

    Args:
        cfg (CfgNode): configs. `SLOWFAST.ALPHA` and `OUTPUT_DIR` are read here.
        model (model): called as `model([slow, fast], ftype="video")`.
        video_data (tuple): `(frames, labels, video_idx, meta)`. Only `frames`
            (sliced along dimension 1) and `meta` are used.
        num_frames (int): maximum number of frames per window.
        step_frames (int): stride between consecutive window starts.
        fout (file): writable text stream that receives the JSON line.
    """
    frames, _, _, meta = video_data
    alpha = cfg.SLOWFAST.ALPHA
    # Ignore trailing frames so the usable length is a multiple of ALPHA.
    length = (frames.shape[1] // alpha) * alpha

    window_feats = []
    for start in range(0, length, step_frames):
        stop = min(start + num_frames, length)
        slow, fast = utils.pack_pathway_output(cfg, frames[:, start:stop])
        # Add the batch dimension expected by the model.
        slow = slow.unsqueeze(0).contiguous()
        fast = fast.unsqueeze(0).contiguous()
        if torch.cuda.is_available():
            slow = slow.cuda(non_blocking=True)
            fast = fast.cuda(non_blocking=True)
        window_feats.append(
            model([slow, fast], ftype="video").detach().cpu()
        )

    features = torch.cat(window_feats, dim=0).numpy()
    feat_name = os.path.join(
        cfg.OUTPUT_DIR, meta["video_name"] + ".feat.npy"
    )
    np.save(feat_name, features)

    for key, value in (
        ("feature_shape", features.shape),
        ("feature_frame", length),
        ("video_feature", feat_name),
        ("step_frames", step_frames),
    ):
        meta[key] = value

    fout.write(json.dumps(meta) + "\n")
    fout.flush()
def _get_model_analysis_input(cfg, is_train):
    """
    Return a dummy input for model analysis with batch size 1. The input is
    used for analyzing the model (counting flops and activations etc.).

    The random clip is moved to the GPU only when CUDA is available, so the
    analysis no longer fails on CPU-only hosts.

    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        is_train (bool): if True, return the input for training. Otherwise,
            return the input for testing.
    Returns:
        inputs (tuple): one-element tuple holding the list of pathway
            tensors, each with a leading batch dimension of 1.
    """
    rgb_dimension = 3
    crop_size = (
        cfg.DATA.TRAIN_CROP_SIZE if is_train else cfg.DATA.TEST_CROP_SIZE
    )
    input_tensors = torch.rand(
        rgb_dimension,
        cfg.DATA.NUM_FRAMES,
        crop_size,
        crop_size,
    )
    model_inputs = pack_pathway_output(cfg, input_tensors)
    use_cuda = torch.cuda.is_available()
    for i, pathway in enumerate(model_inputs):
        pathway = pathway.unsqueeze(0)
        if use_cuda:
            pathway = pathway.cuda(non_blocking=True)
        model_inputs[i] = pathway
    return (model_inputs,)
def _get_model_analysis_input(cfg):
    """
    Build a random spectrogram input with batch size 1 for model analysis
    (counting flops, activations etc.).

    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    Returns:
        inputs (tuple): one-element tuple holding the list of pathway
            tensors, each with a leading batch dimension of 1 and moved to
            the GPU when `cfg.NUM_GPUS` is truthy.
    """
    spectrogram_dimension = 1
    dummy_spectrogram = torch.rand(
        spectrogram_dimension,
        cfg.AUDIO_DATA.NUM_FRAMES,
        cfg.AUDIO_DATA.NUM_FREQUENCIES,
    )
    model_inputs = pack_pathway_output(cfg, dummy_spectrogram)
    for position, pathway in enumerate(model_inputs):
        batched = pathway.unsqueeze(0)
        if cfg.NUM_GPUS:
            batched = batched.cuda(non_blocking=True)
        model_inputs[position] = batched
    return (model_inputs,)
def _get_model_analysis_input(cfg, use_train_input):
    """
    Build a random input with batch size 1 for model analysis (counting
    flops, activations etc.).

    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        use_train_input (bool): if True, use the training dataset name and
            crop size. Otherwise use the testing ones.
    Returns:
        inputs (tuple): `(model_inputs,)`, or `(model_inputs, bbox)` when
            `cfg.DETECTION.ENABLE` is set. `model_inputs` is the list of
            pathway tensors with a leading batch dimension of 1.
    """
    rgb_dimension = 3
    if use_train_input:
        dataset_name = cfg.TRAIN.DATASET
        crop_size = cfg.DATA.TRAIN_CROP_SIZE
    else:
        dataset_name = cfg.TEST.DATASET
        crop_size = cfg.DATA.TEST_CROP_SIZE

    # The image datasets get no temporal dimension.
    if dataset_name in ["imagenet", "imagenetprefetch"]:
        shape = (rgb_dimension, crop_size, crop_size)
    else:
        shape = (rgb_dimension, cfg.DATA.NUM_FRAMES, crop_size, crop_size)
    input_tensors = torch.rand(*shape)

    model_inputs = pack_pathway_output(cfg, input_tensors)
    for position, pathway in enumerate(model_inputs):
        batched = pathway.unsqueeze(0)
        if cfg.NUM_GPUS:
            batched = batched.cuda(non_blocking=True)
        model_inputs[position] = batched

    if not cfg.DETECTION.ENABLE:
        return (model_inputs,)

    # If detection is enabled, count flops for one proposal.
    bbox = torch.tensor([[0, 0, 1.0, 0, 1.0]])
    if cfg.NUM_GPUS:
        bbox = bbox.cuda()
    return (model_inputs, bbox)
def __getitem__(self, index):
    """
    Return the normalized, pathway-packed frames for `index`.

    Args:
        index (int): passed to `self.sample_frames`.
    Returns:
        frames: output of `utils.pack_pathway_output` for the sampled clip.
        index (int): the index that was passed in.
    """
    # Sample the clip, then normalize colors with the configured mean / std.
    clip = self.sample_frames(index)
    clip = utils.tensor_normalize(
        clip, self.cfg.DATA.MEAN, self.cfg.DATA.STD
    )
    # T H W C -> C T H W.
    clip = clip.permute(3, 0, 1, 2)
    return utils.pack_pathway_output(self.cfg, clip), index
def prepare_action_inference_input(self, imgs, pred_person_boxes):
    """
    Preprocess images and person boxes so they can be fed directly to the
    action prediction model.

    The preprocessing is analogous to the preprocessing of test data in
    tools/test_net.py. Depending on `cfg.ACTIONRECOGNIZER.IMG_PROC_BACKEND`,
    either the pytorch or the cv2 preprocessing routine is used. The images
    are then packed into pathways and given a batch dimension, and a batch
    index column of zeros is prepended to the boxes.

    :param imgs: (list of ndarrays with shape (H, W, C)) (in BGR order and
        [0,255]) the images that are preprocessed
    :param pred_person_boxes: (ndarray(float32) of shape
        (num_boxes, 4 = x1, y1, x2, y2)) the predicted person boxes
    :return:
        imgs: (list of tensors with shape
            (1=number_of_batches, C, num_frames, H, W)) the images used for
            inference. Important: they are usually transferred to RGB,
            since Kinetics pre-training uses RGB
        pred_person_boxes: (tensor, shape(num_boxes,
            5 = BatchIdx, x1, y1, x2, y2)) the boxes for the current clip -
            not normalized.
    """
    if self.cfg.ACTIONRECOGNIZER.IMG_PROC_BACKEND == "pytorch":
        # Transform images to required format for pytorch backend.
        # NOTE(review): when any image is None the stack is skipped and
        # `imgs` is left as passed in -- confirm callers never pass None
        # frames, since the permute below expects a tensor.
        if all(img is not None for img in imgs):
            imgs = torch.as_tensor(np.stack(imgs))
        # T H W C -> T C H W.
        imgs = imgs.permute(0, 3, 1, 2)
        # Preprocess images and pred_person_boxes.
        imgs, pred_person_boxes = self.images_and_boxes_preprocessing(
            imgs, boxes=pred_person_boxes)
        # T C H W -> C T H W.
        imgs = imgs.permute(1, 0, 2, 3)
    else:
        # Preprocess images and pred_person_boxes
        imgs, pred_person_boxes = self.images_and_boxes_preprocessing_cv2(
            imgs, boxes=pred_person_boxes)

    # Change to list. If we have a model with multi input arch, a second
    # pathway is created on the basis of imgs.
    # Tensor with shape (C, num_frames, H, W) -> List(s) of tensor with same shape
    imgs = utils.pack_pathway_output(self.cfg, imgs)

    # Reformat the tensors included in the list
    # tensor shape (C, num_frames, H, W) -> shape (1=number_of_batches, C, num_frames, H, W)
    if isinstance(imgs, (list, )):
        for i in range(len(imgs)):
            imgs[i] = torch.unsqueeze(imgs[i], 0)

    # ndarray shape (num_boxes, 4=x1, y1, x2, y2)) -> tensor shape (num_boxes, 4= x1, y1, x2, y2))
    pred_person_boxes = torch.from_numpy(pred_person_boxes)

    # For each box, we add the batch_id (in our case always 0)
    # tensor shape (num_boxes, 4= x1, y1, x2, y2)) -> tensor shape (num_boxes, 5= batch_id, x1, y1, x2, y2)))
    pred_person_boxes = torch.cat([
        torch.full(
            (pred_person_boxes.shape[0], 1), float(0)), pred_person_boxes
    ], axis=1)

    return imgs, pred_person_boxes
def get_flop_stats(model, cfg, is_train):
    """
    Compute the gflops for the current model given the config.

    A random clip with batch size 1 is built and passed to `flop_count`.
    All dummy tensors are moved to the GPU only when CUDA is available, so
    the function no longer fails on CPU-only hosts.

    Args:
        model (model): model to compute the flop counts.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        is_train (bool): if True, compute flops for training. Otherwise,
            compute flops for testing.
    Returns:
        float: the sum of the per-operator counts returned by `flop_count`.
    """
    rgb_dimension = 3
    crop_size = (
        cfg.DATA.TRAIN_CROP_SIZE if is_train else cfg.DATA.TEST_CROP_SIZE
    )
    input_tensors = torch.rand(
        rgb_dimension,
        cfg.DATA.NUM_FRAMES,
        crop_size,
        crop_size,
    )
    use_cuda = torch.cuda.is_available()

    flop_inputs = pack_pathway_output(cfg, input_tensors)
    for i, pathway in enumerate(flop_inputs):
        pathway = pathway.unsqueeze(0)
        if use_cuda:
            pathway = pathway.cuda(non_blocking=True)
        flop_inputs[i] = pathway

    if cfg.MODEL.LSTM:
        # The LSTM variant takes the clip together with a zero label history.
        # NOTE(review): the 10 is presumably the history length -- confirm
        # against the model definition.
        label_history = torch.zeros(
            [1, 10, cfg.MODEL.NUM_CLASSES[0] + cfg.MODEL.NUM_CLASSES[1]]
        )
        if use_cuda:
            label_history = label_history.cuda()
        inputs = ([flop_inputs, label_history],)
    elif cfg.DETECTION.ENABLE:
        # If detection is enabled, count flops for one proposal.
        bbox = torch.tensor([[0, 0, 1.0, 0, 1.0]])
        if use_cuda:
            bbox = bbox.cuda()
        inputs = (flop_inputs, bbox)
    else:
        inputs = (flop_inputs,)

    gflop_dict, _ = flop_count(model, inputs)
    return sum(gflop_dict.values())
def get_flop_stats(model, cfg, is_train):
    """
    Compute the gflops for the current model given the config.

    A random clip with batch size 1 is built and passed to `flop_count`
    together with an operator whitelist. The dummy tensors are moved to the
    GPU only when CUDA is available, so the function no longer fails on
    CPU-only hosts.

    Args:
        model (model): model to compute the flop counts.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        is_train (bool): if True, compute flops for training. Otherwise,
            compute flops for testing.
    Returns:
        float: the sum of the per-operator counts returned by `flop_count`.
    """
    rgb_dimension = 3
    crop_size = (
        cfg.DATA.TRAIN_CROP_SIZE if is_train else cfg.DATA.TEST_CROP_SIZE
    )
    input_tensors = torch.rand(
        rgb_dimension,
        cfg.DATA.NUM_FRAMES,
        crop_size,
        crop_size,
    )
    # Passed to flop_count as its operator whitelist.
    whitelist_ops = [
        "aten::addmm",
        "aten::_convolution",
        "aten::einsum",
        "aten::matmul",
    ]
    use_cuda = torch.cuda.is_available()

    flop_inputs = pack_pathway_output(cfg, input_tensors)
    for i, pathway in enumerate(flop_inputs):
        pathway = pathway.unsqueeze(0)
        if use_cuda:
            pathway = pathway.cuda(non_blocking=True)
        flop_inputs[i] = pathway

    # If detection is enabled, count flops for one proposal.
    if cfg.DETECTION.ENABLE:
        bbox = torch.tensor([[0, 0, 1.0, 0, 1.0]])
        if use_cuda:
            bbox = bbox.cuda()
        inputs = (flop_inputs, bbox)
    else:
        inputs = (flop_inputs,)

    gflop_dict = flop_count(model, inputs, whitelist_ops)
    return sum(gflop_dict.values())
def _get_model_analysis_input(cfg, is_train):
    """
    Return a dummy input for model analysis with batch size 1. The input is
    used for analyzing the model (counting flops and activations etc.).

    When `cfg.DATA.USE_AUDIO` is set, a random audio tensor is packed along
    with the video clip. The dummy tensors are moved to the GPU only when
    CUDA is available, so the analysis no longer fails on CPU-only hosts.

    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        is_train (bool): if True, return the input for training. Otherwise,
            return the input for testing.
    Returns:
        inputs (tuple): `(model_inputs,)`, or `(model_inputs, bbox)` when
            `cfg.DETECTION.ENABLE` is set. `model_inputs` is the list of
            pathway tensors with a leading batch dimension of 1.
    """
    rgb_dimension = 3
    crop_size = (
        cfg.DATA.TRAIN_CROP_SIZE if is_train else cfg.DATA.TEST_CROP_SIZE
    )
    input_tensors = torch.rand(
        rgb_dimension,
        cfg.DATA.NUM_FRAMES,
        crop_size,
        crop_size,
    )

    input_audio = None
    if cfg.DATA.USE_AUDIO:
        # Two channels are generated when misaligned audio is requested.
        chn = 2 if cfg.DATA.GET_MISALIGNED_AUDIO else 1
        input_audio = torch.rand(
            chn,
            1,
            cfg.DATA.AUDIO_FRAME_NUM,
            cfg.DATA.AUDIO_MEL_NUM,
        )

    use_cuda = torch.cuda.is_available()
    model_inputs = pack_pathway_output(cfg, input_tensors, input_audio)
    for i, pathway in enumerate(model_inputs):
        pathway = pathway.unsqueeze(0)
        if use_cuda:
            pathway = pathway.cuda(non_blocking=True)
        model_inputs[i] = pathway

    # If detection is enabled, count flops for one proposal.
    if cfg.DETECTION.ENABLE:
        bbox = torch.tensor([[0, 0, 1.0, 0, 1.0]])
        if use_cuda:
            bbox = bbox.cuda()
        return (model_inputs, bbox)
    return (model_inputs,)
def run(cfg, model, video_data, num_frames, step_frames, fout, batch_size):
    """
    Extract batched sliding-window features and classifier outputs for one
    video and log its metadata.

    Full windows of `num_frames` frames are taken every `step_frames`
    frames, packed into pathways and forwarded in groups of up to
    `batch_size`. The first window shorter than `num_frames` ends the scan.
    Windows still queued when the scan ends are forwarded as well;
    previously they were dropped whenever the last window was full-length
    (e.g. `step_frames >= num_frames`).

    The outputs are saved as `<video_name>.feat.npy` and
    `<video_name>.cls.npy` under `cfg.OUTPUT_DIR`; `meta` is extended in
    place and written to `fout` as one JSON line. If the video holds no
    full window, nothing is forwarded and the concatenation of the empty
    feature list fails.

    Args:
        cfg (CfgNode): configs. `OUTPUT_DIR` is read here.
        model (model): called with `[pathway0_batch, pathway1_batch]` and
            expected to return `(features, classifier_outputs)`.
        video_data (tuple): `(frames, labels, video_idx, meta)`. Only
            `frames` (sliced along dimension 1) and `meta` are used.
        num_frames (int): number of frames per window.
        step_frames (int): stride between consecutive window starts.
        fout (file): writable text stream that receives the JSON line.
        batch_size (int): number of windows forwarded together.
    """
    frames, _, _, meta = video_data
    length = frames.shape[1]
    features = []
    classifiers = []

    def _forward(windows):
        # Stack each pathway across the queued windows and run the model.
        b0 = torch.as_tensor(np.stack([w[0] for w in windows])).contiguous()
        b1 = torch.as_tensor(np.stack([w[1] for w in windows])).contiguous()
        if torch.cuda.is_available():
            b0 = b0.cuda(non_blocking=True)
            b1 = b1.cuda(non_blocking=True)
        feat, cls = model([b0, b1])
        features.append(feat.detach().cpu())
        classifiers.append(cls.detach().cpu())

    batch = []
    for start in range(0, length, step_frames):
        end = min(start + num_frames, length)
        is_short = end - start < num_frames
        if len(batch) == batch_size or (is_short and len(batch) > 0):
            _forward(batch)
            batch = []
        if is_short:
            break
        batch.append(utils.pack_pathway_output(cfg, frames[:, start:end]))

    # Forward the windows left over when the loop ended on a full window.
    if batch:
        _forward(batch)

    # length of features: ceil((length - num_frames + 1)/step_frames)
    features = torch.cat(features, dim=0).numpy()
    classifiers = torch.cat(classifiers, dim=0).numpy()

    feat_name = os.path.join(
        cfg.OUTPUT_DIR, meta["video_name"] + ".feat.npy"
    )
    np.save(feat_name, features)
    cls_name = os.path.join(cfg.OUTPUT_DIR, meta["video_name"] + ".cls.npy")
    np.save(cls_name, classifiers)

    meta["feature_shape"] = features.shape
    meta["cls_shape"] = classifiers.shape
    # Index one past the last frame covered by the final window.
    meta["feature_frame"] = (len(features) - 1) * step_frames + num_frames
    meta["video_feature"] = feat_name
    meta["video_classifier"] = cls_name
    meta["step_frames"] = step_frames

    fout.write(json.dumps(meta) + "\n")
    fout.flush()
def __getitem__(self, index):
    """
    Build the frame segment centered on `index` and pack it into pathways.

    A zero tensor of shape `3 x out_size x TEST_CROP_SIZE x TEST_CROP_SIZE`
    is filled with frames taken every `step_size` positions around `index`.
    Positions that fall outside `[0, len(self) - 1]` are left as zeros.

    Args:
        index (int): the center frame index.
    Returns:
        frame_list: the output of `pack_pathway_output` for the segment.
    """
    crop_size = self.cfg.DATA.TEST_CROP_SIZE
    frame_seg = torch.zeros(
        (3, self.out_size, crop_size, crop_size)
    ).float()

    half_span = self.step_size * self.out_size / 2
    first = int(index - half_span)
    stop = int(index + half_span)
    last_valid = self.__len__() - 1

    for slot, src in enumerate(range(first, stop, self.step_size)):
        if src < 0 or src > last_valid:
            # Out-of-range positions keep their zero padding.
            continue
        if self.read_vid_file:
            frame_seg[:, slot, :, :] = self.frames[:, src, :, :]
        else:
            frame_seg[:, slot, :, :] = self._read_img_file(
                os.path.join(self.vid_path, self.vid_id), self.frames[src]
            )

    # create the pathways
    return pack_pathway_output(self.cfg, frame_seg)
def process_cv2_inputs(frames, cfg):
    """
    Normalize one clip and turn it into a list of batched pathway tensors.

    Args:
        frames (list of array): list of input images (correspond to one
            clip) in range [0, 255].
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    Returns:
        inputs (list): one tensor per pathway, each with a leading batch
            dimension of 1.
    """
    clip = torch.from_numpy(np.array(frames)).float() / 255
    clip = tensor_normalize(clip, cfg.DATA.MEAN, cfg.DATA.STD)
    # T H W C -> C T H W.
    clip = clip.permute(3, 0, 1, 2)
    # Pick NUM_FRAMES evenly spaced frame positions along the time axis.
    last_frame = clip.shape[1] - 1
    sample_idx = torch.linspace(0, last_frame, cfg.DATA.NUM_FRAMES).long()
    clip = torch.index_select(clip, 1, sample_idx)
    pathways = pack_pathway_output(cfg, clip)
    return [pathway.unsqueeze(0) for pathway in pathways]
def __getitem__(self, idx):
    """
    Build the clip, boxes, labels and metadata for the keyframe at `idx`.

    Args:
        idx (int): the video index provided by the pytorch sampler.
    Returns:
        frames: the clip after preprocessing and `pack_pathway_output`.
        label (ndarray): multi-hot labels, one row per box, with
            `self._num_classes` columns.
        idx (int): the video index provided by the pytorch sampler.
        extra_data (dict): a dict containing extra data fields, like
            "boxes", "ori_boxes" and "metadata".
    """
    video_idx, sec_idx, sec, center_idx = self._keyframe_indices[idx]
    video_image_paths = self._image_paths[video_idx]

    # Get the frame idxs for current clip.
    seq = utils.get_sequence(
        center_idx,
        self._seq_len // 2,
        self._sample_rate,
        num_frames=len(video_image_paths),
    )

    clip_label_list = self._keyframe_boxes_and_labels[video_idx][sec_idx]
    assert len(clip_label_list) > 0

    # Split every entry into its box and its label list.
    boxes = [entry[0] for entry in clip_label_list]
    labels = [entry[1] for entry in clip_label_list]
    # Score is not used.
    boxes = np.array(boxes)[:, :4].copy()
    ori_boxes = boxes.copy()

    # Load images of current clip.
    backend = self.cfg.AVA.IMG_PROC_BACKEND
    imgs = utils.retry_load_images(
        [video_image_paths[frame] for frame in seq], backend=backend
    )
    if backend == "pytorch":
        # T H W C -> T C H W.
        imgs = imgs.permute(0, 3, 1, 2)
        # Preprocess images and boxes.
        imgs, boxes = self._images_and_boxes_preprocessing(
            imgs, boxes=boxes
        )
        # T C H W -> C T H W.
        imgs = imgs.permute(1, 0, 2, 3)
    else:
        # Preprocess images and boxes
        imgs, boxes = self._images_and_boxes_preprocessing_cv2(
            imgs, boxes=boxes
        )

    # Construct label arrays.
    label_arrs = np.zeros((len(labels), self._num_classes), dtype=np.int32)
    for row, box_labels in enumerate(labels):
        # AVA label index starts from 1.
        for label in box_labels:
            if label != -1:
                assert label >= 1 and label <= 80
                label_arrs[row][label - 1] = 1

    imgs = utils.pack_pathway_output(self.cfg, imgs)
    extra_data = {
        "boxes": boxes,
        "ori_boxes": ori_boxes,
        "metadata": [[video_idx, sec]] * len(boxes),
    }
    return imgs, label_arrs, idx, extra_data
def run(loader, model, cfg):
    """
    Extract sliding-window features and classifier outputs for every video
    in `loader`.

    Each video is scanned with windows of `cfg.DATA.NUM_FRAMES` frames and
    a stride of half that size; full windows are forwarded in groups of up
    to `cfg.TEST.BATCH_SIZE`, and the first short window ends the scan. Per
    video, the outputs are saved as `<video_name>.feat.npy` and
    `<video_name>.cls.npy` under `cfg.OUTPUT_DIR`, and the extended `meta`
    is appended as one JSON line to `cfg.TEST.OUTPUT_FEATURE_FILE`.

    The output file is opened in a `with` block so it is closed even when
    processing a video raises.

    Args:
        loader (loader): iterable yielding
            `(frames, labels, video_idx, meta)` per video.
        model (model): put in eval mode, called with
            `[pathway0_batch, pathway1_batch]` and expected to return
            `(features, classifier_outputs)`.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    model.eval()
    num_frames = cfg.DATA.NUM_FRAMES
    step_frames = int(num_frames / 2)
    batch_size = cfg.TEST.BATCH_SIZE

    with open(cfg.TEST.OUTPUT_FEATURE_FILE, "w") as fout:
        start_time = time.time()
        for v_ind, (frames, _labels, _video_idx, meta) in enumerate(loader):
            print("load frames time:", time.time() - start_time)
            if v_ind % 10 == 0:
                print("process video index:", v_ind, "total:", len(loader))

            length = frames.shape[1]
            features = []
            classifiers = []
            batch = []
            for k in range(0, length, step_frames):
                start = k
                end = min(k + num_frames, length)
                if len(batch) == batch_size or (
                    (end - start < num_frames) and len(batch) > 0
                ):
                    # forward
                    # batch_size x 3 num_slow_frames x 224 x 224
                    b0 = torch.as_tensor(
                        np.stack([b[0] for b in batch])
                    ).contiguous()
                    # batch_size x 3 num_fast_frames x 224 x 224
                    b1 = torch.as_tensor(
                        np.stack([b[1] for b in batch])
                    ).contiguous()
                    if torch.cuda.is_available():
                        b0 = b0.cuda(non_blocking=True)
                        b1 = b1.cuda(non_blocking=True)
                    feat, cls = model([b0, b1])
                    features.append(feat.detach().cpu())
                    classifiers.append(cls.detach().cpu())
                    batch = []
                # Only full windows are used; the first short one ends the scan.
                if end - start < num_frames:
                    break
                inputs = frames[:, start:end]
                inputs = utils.pack_pathway_output(cfg, inputs)
                batch.append(inputs)

            # length of features: ceil((length - num_frames + 1)/step_frames)
            features = torch.cat(features, dim=0).numpy()
            classifiers = torch.cat(classifiers, dim=0).numpy()

            feat_name = os.path.join(
                cfg.OUTPUT_DIR, meta["video_name"] + ".feat.npy"
            )
            np.save(feat_name, features)
            cls_name = os.path.join(
                cfg.OUTPUT_DIR, meta["video_name"] + ".cls.npy"
            )
            np.save(cls_name, classifiers)

            meta["feature_shape"] = features.shape
            meta["cls_shape"] = classifiers.shape
            meta["feature_frame"] = (
                (len(features) - 1) * step_frames + num_frames
            )
            meta["video_feature"] = feat_name
            meta["video_classifier"] = cls_name
            meta["step_frames"] = step_frames

            json_str = json.dumps(meta)
            fout.write(json_str + "\n")
            fout.flush()

            period = time.time() - start_time
            print(
                "video index: %d, length: %d, period: %.2f sec, speed: %.2f sec/f."
                % (v_ind, length, period, period / length)
            )
            start_time = time.time()
def __getitem__(self, index):
    """
    Fetch, decode and preprocess one clip. If the video at `index` cannot
    be opened or decoded, a randomly chosen video is tried instead, up to
    `self._num_retries` attempts in total.

    Args:
        index (int): the video index provided by the pytorch sampler.
    Returns:
        frames: the sampled clip after normalization, spatial sampling and
            `pack_pathway_output`.
        label: the entry of `self._labels` for the video actually used.
        index (int): the index of the video actually used; it differs from
            the requested one when a replacement was picked.
        dict: an empty dict.
    Raises:
        NotImplementedError: if `self.mode` is not "train", "val" or "test".
        RuntimeError: if no video could be fetched within the retries.
    """
    cfg = self.cfg
    if self.mode in ("train", "val"):
        # -1 indicates random sampling.
        temporal_sample_index = -1
        spatial_sample_index = -1
        min_scale, max_scale = cfg.DATA.TRAIN_JITTER_SCALES[:2]
        crop_size = cfg.DATA.TRAIN_CROP_SIZE
    elif self.mode == "test":
        # spatial_sample_index is in [0, 1, 2]. Corresponding to left,
        # center, or right if width is larger than height, and top, middle,
        # or bottom if height is larger than width.
        temporal_sample_index, spatial_sample_index = divmod(
            self._spatial_temporal_idx[index], cfg.TEST.NUM_SPATIAL_CROPS
        )
        # The testing is deterministic and no jitter should be performed.
        # min_scale, max_scale, and crop_size are expect to be the same.
        min_scale = max_scale = crop_size = cfg.DATA.TEST_CROP_SIZE
        assert len({min_scale, max_scale, crop_size}) == 1
    else:
        raise NotImplementedError(
            "Does not support {} mode".format(self.mode)
        )

    num_videos_minus_one = None  # computed lazily on the first failure
    for _ in range(self._num_retries):
        video_container = None
        try:
            video_container = container.get_video_container(
                self._path_to_videos[index],
                cfg.DATA_LOADER.ENABLE_MULTI_THREAD_DECODE,
            )
        except Exception as e:
            logger.info(
                "Failed to load video from {} with error {}".format(
                    self._path_to_videos[index], e
                )
            )
        # Select a random video if the current video was not able to access.
        if video_container is None:
            num_videos_minus_one = len(self._path_to_videos) - 1
            index = random.randint(0, num_videos_minus_one)
            continue

        # Decode video. Meta info is used to perform selective decoding.
        frames = decoder.decode(
            video_container,
            cfg.DATA.SAMPLING_RATE,
            cfg.DATA.NUM_FRAMES,
            temporal_sample_index,
            cfg.TEST.NUM_ENSEMBLE_VIEWS,
            video_meta=self._video_meta[index],
            target_fps=30,
        )
        # If decoding failed (wrong format, video is too short, and etc),
        # select another video.
        if frames is None:
            num_videos_minus_one = len(self._path_to_videos) - 1
            index = random.randint(0, num_videos_minus_one)
            continue

        # Perform color normalization.
        frames = frames.float() / 255.0
        frames = (frames - torch.tensor(cfg.DATA.MEAN)) / torch.tensor(
            cfg.DATA.STD
        )
        # T H W C -> C T H W.
        frames = frames.permute(3, 0, 1, 2)
        # Perform data augmentation.
        frames = self.spatial_sampling(
            frames,
            spatial_idx=spatial_sample_index,
            min_scale=min_scale,
            max_scale=max_scale,
            crop_size=crop_size,
        )
        label = self._labels[index]
        return utils.pack_pathway_output(cfg, frames), label, index, {}

    # Every attempt either returned or moved on to another video.
    raise RuntimeError(
        "Failed to fetch video after {} retries.".format(self._num_retries)
    )