def load_real_and_fake_frame(real, fake, random):
    """Read the same randomly chosen frames from a real and a fake video.

    Args:
        real: Source of the real video, passed to ``decord.VideoReader``.
        fake: Source of the fake video, passed to ``decord.VideoReader``.
        random: Number of frame indices to draw with ``np.random.choice``.

    Returns:
        ``(real_frames, fake_frames, frame_index)``: the two frame batches
        converted with ``asnumpy()`` and the list of sampled indices.
    """
    real_reader = decord.VideoReader(real, ctx=decord.cpu())
    # Indices are drawn from the length of the real video only and are then
    # reused unchanged for the fake video.
    frame_index = list(np.random.choice(range(len(real_reader)), random))
    real_frames = real_reader.get_batch(frame_index).asnumpy()

    fake_reader = decord.VideoReader(fake, ctx=decord.cpu())
    fake_frames = fake_reader.get_batch(frame_index).asnumpy()
    return real_frames, fake_frames, frame_index
def decord_sequential_cpu_benchmark(config):
    """Benchmark sequential frame reads with the decord library.

    For every timer yielded by ``_TIME.measure_many`` the function reads up
    to ``config["n_frames"]`` frames with ``video_reader.next()``, converts
    each with ``cv2.cvtColor``, optionally shows it, and calls
    ``blocking_call`` once per frame. The reader is re-created after each
    timed repetition.

    Args:
        config: Mapping read for the keys ``video_path``, ``resize_shape``,
            ``downsample``, ``repeats``, ``n_frames``, ``show_img`` and
            ``consumer_blocking_config`` (``io_limited`` / ``duration``).

    Raises:
        AssertionError: If resizing or downsampling is requested (not
            implemented here), or if fewer than ``n_frames`` frames were read.
    """
    device = "cpu"
    # ``device`` is hard-coded above, so the GPU context is never selected.
    ctx = decord.gpu(0) if device == "gpu" else decord.cpu()
    video_reader = decord.VideoReader(config["video_path"], ctx)

    assert config["resize_shape"] is False, (
        "TODO: implement tranformation of image size for "
        "decord_sequential_cpu_benchmark; note it has inbuilt"
        "support for this. ")
    assert config["downsample"] == 1, (
        "TODO: implement downsampling,"
        " note that decord has options "
        "to sample frames every N frames"
        " https://github.com/dmlc/decord#videoloader"
        "Also the video reader has "
        "video_reader.skip_frames(N) function")
    # NOTE: resizing could be requested through the ``width``/``height``
    # arguments of ``decord.VideoReader`` once implemented.

    benchmark_name = inspect.currentframe().f_code.co_name
    for timer in tqdm(_TIME.measure_many(benchmark_name,
                                         samples=config["repeats"])):
        frames_read = 0
        with tqdm(total=config["n_frames"]) as pbar:
            while frames_read < config["n_frames"]:
                try:
                    img = video_reader.next()
                except StopIteration:
                    break

                img = cv2.cvtColor(img.asnumpy(), cv2.COLOR_BGR2RGB)
                if config["show_img"]:
                    cv2.imshow("img", img)
                    # Pressing "q" aborts the read loop early.
                    if cv2.waitKey(1) == ord("q"):
                        break

                consumer_cfg = config["consumer_blocking_config"]
                blocking_call(consumer_cfg["io_limited"],
                              consumer_cfg["duration"])
                frames_read += 1
                pbar.update()

        assert frames_read == config["n_frames"]
        timer.stop()
        # Drop the last frame and the exhausted reader, then start over with
        # a fresh reader for the next repetition.
        del img
        del video_reader
        video_reader = decord.VideoReader(config["video_path"], ctx)
def __call__(self, results):
    """
    Perform mp4 decode operations.

    Args:
        results (dict): Must provide 'filename', 'temporal_sample_index'
            and 'temporal_num_clips'.

    return:
        The same dict, with 'imgs' set to a list of PIL images (mode 'RGB'),
        one per sampled frame.
    """
    filepath = results['filename']
    temporal_sample_index = results['temporal_sample_index']
    temporal_num_clips = results['temporal_num_clips']

    vr = de.VideoReader(filepath)
    videolen = len(vr)
    fps = vr.get_avg_fps()

    # Clip length expressed in frames of the source video.
    clip_size = self.num_frames * self.sampling_rate * fps / self.target_fps
    start_idx, end_idx = self.get_start_end_idx(videolen, clip_size,
                                                temporal_sample_index,
                                                temporal_num_clips)
    index = np.linspace(start_idx, end_idx, self.num_frames).astype("int64")
    # Valid frame indices are 0 .. videolen - 1; clipping to ``videolen``
    # would let an out-of-range index reach ``get_batch``.
    index = np.clip(index, 0, videolen - 1)

    np_frames = vr.get_batch(index).asnumpy()
    results['imgs'] = [
        Image.fromarray(frame, mode='RGB') for frame in np_frames
    ]
    return results
def next_train_batch(self):
    """Build the next training batch and advance the batch index.

    Returns:
        ``(input_batch, gt_batch)``: a zero-initialised array of shape
        (batch_size, time_dimen, frame_height, frame_width, frame_channels)
        filled with decoded frames, and a (batch_size, ncls) array with 1.0
        at the mapped label position of every sample.

    Raises:
        AssertionError: If a video's length or frame size differs from the
            configured ``time_dimen`` / ``frame_height`` / ``frame_width``.
    """
    cfg = self._config
    input_batch = np.zeros(shape=(cfg.batch_size,
                                  cfg.time_dimen,
                                  cfg.frame_height,
                                  cfg.frame_width,
                                  cfg.frame_channels))
    gt_batch = np.zeros(shape=(cfg.batch_size, cfg.ncls))

    first = self._train_batch_index * cfg.batch_size
    last = (1 + self._train_batch_index) * cfg.batch_size
    for b_idx, sample in enumerate(self._train_samples[first:last]):
        video_path = os.path.join(cfg.trainval_set_dir, sample['video_path'])
        label = sample['label']
        video = decord.VideoReader(video_path)
        assert len(video) == cfg.time_dimen
        for t_idx, raw_frame in enumerate(video):
            pixels = raw_frame.asnumpy()  # (height, width, channels)
            assert pixels.shape[0] == cfg.frame_height
            assert pixels.shape[1] == cfg.frame_width
            input_batch[b_idx][t_idx] = pixels
        gt_batch[b_idx][self._label_mapping[label]] = 1.0

    self._train_batch_index += 1
    return input_batch, gt_batch
def next_val_batch(self):
    """Build the next validation batch and advance the batch index.

    Frames are picked with ``self.sample_t_dimen`` and resized with
    ``cv2.resize`` to the configured frame width and height.

    Returns:
        ``(input_batch, gt_batch)``: an array of shape (batch_size,
        time_dimen, frame_height, frame_width, frame_channels) and a
        (batch_size, ncls) array with 1.0 at each sample's mapped label.
    """
    cfg = self._config
    input_batch = np.zeros(shape=(cfg.batch_size,
                                  cfg.time_dimen,
                                  cfg.frame_height,
                                  cfg.frame_width,
                                  cfg.frame_channels))
    gt_batch = np.zeros(shape=(cfg.batch_size, cfg.ncls))

    first = self._val_batch_index * cfg.batch_size
    last = (1 + self._val_batch_index) * cfg.batch_size
    for b_idx, sample in enumerate(self._val_samples[first:last]):
        video_path = os.path.join(cfg.trainval_set_dir, 'videos',
                                  sample['video_name'] + '.mp4')
        label = sample['gt']
        video = decord.VideoReader(video_path)
        picked = self.sample_t_dimen(len(video),
                                     target_frames=cfg.time_dimen)
        for t_idx, frame_no in enumerate(picked):
            pixels = video[frame_no].asnumpy()  # (height, width, channels)
            # cv2.resize takes the target size as (width, height).
            input_batch[b_idx][t_idx] = cv2.resize(
                pixels, (cfg.frame_width, cfg.frame_height))
        gt_batch[b_idx][self._label_mapping[label]] = 1.0

    self._val_batch_index += 1
    return input_batch, gt_batch
def _get_record(self, idx):
    """Return the video info at ``idx`` together with an opened reader.

    The record's ``num_frames`` attribute is overwritten with the length
    reported by the decord reader.
    """
    record = self.video_infos[idx]
    video_path = osp.join(self.img_prefix, record.path)
    video_reader = decord.VideoReader(video_path)
    record.num_frames = len(video_reader)
    return record, video_reader
def __getitem__(self, index):
    """Load the sample at ``index``.

    For datasets whose name contains 'something', or when opening the mp4
    raises ``UnicodeDecodeError``, the record path is listed as a directory
    and ``decode_boo`` is False. Otherwise the record is opened as an mp4
    with decord and ``decode_boo`` is True. The result of ``self.get`` is
    returned.
    """
    record = self.video_list[index]

    decode_boo = 'something' not in self.dataset
    if not decode_boo:
        video_list = os.listdir(record.path)
    else:
        try:
            directory = record.path
            # Append the extension only when the path does not carry it yet.
            video_path = (directory if directory[-4:] == ".mp4"
                          else directory + ".mp4")
            video_list = decord.VideoReader(video_path)
        except UnicodeDecodeError:
            decode_boo = False
            video_list = os.listdir(record.path)

    # Pick the index sampler for the current mode.
    if self.test_mode:
        if self.dataset == 'kinetics':
            sampler = self._sample_indices
        else:
            sampler = self._get_test_indices
    elif self.I3D_sample or self.random_shift:
        sampler = self._sample_indices
    else:
        sampler = self._get_val_indices
    segment_indices = sampler(video_list)

    return self.get(record, video_list, segment_indices, decode_boo)
def read_data(video_name, transform):
    """Load, transform and reshape the clips of one video.

    Reads settings from the module-level ``opt`` mapping and writes
    ``opt['skip_length']`` as a side effect.

    Args:
        video_name: Video source passed to ``decord.VideoReader``.
        transform: Callable applied to the loaded clip frames.

    Returns:
        ``nd.array`` built from the stacked frames after reshaping to
        (-1, T, 3, input_size, input_size) and swapping axes 1 and 2; axis 2
        is squeezed when ``opt['new_length'] == 1``.
    """
    decord_vr = decord.VideoReader(video_name,
                                   width=opt['new_width'],
                                   height=opt['new_height'])
    duration = len(decord_vr)

    opt['skip_length'] = opt['new_length'] * opt['new_step']
    segment_indices, skip_offsets = sample_indices(duration)

    # NOTE(review): when opt['video_loader'] is falsy nothing is loaded and
    # the transform call below fails on the unbound name.
    if opt['video_loader']:
        if opt['slowfast']:
            loader = video_TSN_decord_slowfast_loader
        else:
            loader = video_TSN_decord_batch_loader
        clip_input = loader(video_name, decord_vr, duration,
                            segment_indices, skip_offsets)

    clip_input = transform(clip_input)

    if opt['slowfast']:
        frames_per_clip = len(clip_input) // (opt['num_segments'] *
                                              opt['num_crop'])
    else:
        frames_per_clip = opt['new_length']
    stacked = np.stack(clip_input, axis=0)
    stacked = stacked.reshape(
        (-1, frames_per_clip, 3, opt['input_size'], opt['input_size']))
    clip_input = np.transpose(stacked, (0, 2, 1, 3, 4))

    if opt['new_length'] == 1:
        # this is for 2D input case
        clip_input = np.squeeze(clip_input, axis=2)

    return nd.array(clip_input)
def __getitem__(self, idx: int) -> Tuple[torch.tensor, int]:
    """Load and transform the clips of one video record.

    Return:
        (clips (torch.tensor), label (int), path), where ``path`` is
        ``record.path``. With ``num_samples == 1`` the clips entry is a
        single transformed clip, otherwise a stack of transformed clips.

    Raises:
        Whatever ``decord.VideoReader`` raises when the file cannot be
        opened; the offending path is printed before re-raising.
    """
    record = self.video_records[idx]
    video_path = "{}.{}".format(
        os.path.join(self.root, record.path), self.video_ext)
    try:
        video_reader = decord.VideoReader(
            video_path,
            # TODO try to add `ctx=decord.ndarray.gpu(0) or .cuda(0)`
        )
    except Exception:
        # Report which file failed, then propagate the real error instead
        # of continuing with an unbound ``video_reader``.
        print(video_path)
        raise

    record._num_frames = len(video_reader)

    offsets = self._sample_indices(record)
    clips = np.array([self._get_frames(video_reader, o) for o in offsets])

    if self.num_samples == 1:
        return (
            # [T, H, W, C] -> [C, T, H, W]
            self.transforms(torch.from_numpy(clips[0])),
            record.label,
            record.path)
    else:
        return (
            # [S, T, H, W, C] -> [S, C, T, H, W]
            torch.stack(
                [self.transforms(torch.from_numpy(c)) for c in clips]),
            record.label,
            record.path)
def key_frames(
    video_file=None,
    out_dir=None,
    ctx=None,
    sub_clip=False,
    start_seconds=None,
    end_seconds=None,
):
    """Save every key frame of a video as a PNG file.

    Args:
        video_file: Path of the input video.
        out_dir: Output directory; when None, a directory named after the
            video (spaces replaced by underscores) is used next to it.
        ctx: Accepted but not used by this function.
        sub_clip: When true and both bounds are given, ``extract_subclip``
            is applied first and its result is used as the video.
        start_seconds: Sub-clip start passed to ``extract_subclip``.
        end_seconds: Sub-clip end passed to ``extract_subclip``.

    Returns:
        The output directory as a ``Path``.
    """
    wants_subclip = (sub_clip and start_seconds is not None
                     and end_seconds is not None)
    if wants_subclip:
        video_file = extract_subclip(video_file, start_seconds, end_seconds)

    vr = de.VideoReader(video_file)
    key_idxs = vr.get_key_indices()

    source = Path(video_file)
    video_name = source.stem.replace(' ', '_')
    if out_dir is None:
        out_dir = (source.parent / video_name).with_suffix('')
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    for ki in key_idxs:
        # Swap the channel order before handing the frame to cv2.imwrite.
        swapped = cv2.cvtColor(vr[ki].asnumpy(), cv2.COLOR_BGR2RGB)
        out_frame_file = out_dir / f"{video_name}_{ki:08}.png"
        cv2.imwrite(str(out_frame_file), swapped)
        print(f"Saved {str(out_frame_file)}")

    print(f"Please check your frames located at {out_dir}")
    return out_dir
def __call__(self, results):
    """Decode the frames listed in ``results['frame_inds']`` with decord.

    Args:
        results (dict): Must provide 'filename' and 'frame_inds'; a
            'frame_inds' array that is not 1-D is squeezed in place.

    Returns:
        ``results`` with 'img_group', 'ori_shape' and 'img_shape' set, or
        None when decoding fails (the failure is printed).

    Raises:
        ImportError: If decord is not installed.
    """
    try:
        import decord
    except ImportError:
        raise ImportError(
            'Please run "pip install decord" to install Decord first.')

    decord.logging.set_level(5)

    if results['frame_inds'].ndim != 1:
        results['frame_inds'] = np.squeeze(results['frame_inds'])

    try:
        reader = decord.VideoReader(results['filename'],
                                    num_threads=self.num_threads)
        total = len(reader)
        # Indices beyond the end of the video wrap around.
        wrapped = [idx % total for idx in results['frame_inds']]
        img_group = reader.get_batch(wrapped).asnumpy()
        del reader

        results['img_group'] = img_group
        results['ori_shape'] = img_group[0].shape
        results['img_shape'] = img_group[0].shape
    except Exception as e:
        print("Failed to decode {} with exception: {}".format(
            results['filename'], e))
        return None

    return results
def __getitem__(self, i):
    """Return a random window of ``self.NUMFRAMES`` consecutive frames.

    Returns:
        ``(vid, start)`` with the frame array and the window's start index,
        or 0 when anything goes wrong while reading the video.
    """
    try:
        reader = decord.VideoReader(self.videos[i], ctx=decord.cpu())
        start = np.random.choice(len(reader) - self.NUMFRAMES)
        window = list(range(start, start + self.NUMFRAMES))
        vid = reader.get_batch(window).asnumpy()
    except Exception:
        # Any failure is reported to the caller as the sentinel 0.
        return 0
    else:
        return (vid, start)
def load_video(filepath, num_frames, scale_factor):
    """Load the first ``num_frames`` frames of a video, optionally rescaled.

    Args:
        filepath: Video source passed to ``decord.VideoReader``.
        num_frames: Number of leading frames to read.
        scale_factor: Zoom factor applied to axes 1 and 2 of the frame
            array; 1 leaves the frames untouched.

    Returns:
        The frame array from ``asnumpy()``, passed through ``zoom`` when
        ``scale_factor != 1``.
    """
    vr = decord.VideoReader(filepath, ctx=decord.cpu())
    # Honour the caller's ``num_frames`` rather than a module-level constant.
    vid = vr[:num_frames].asnumpy()
    if scale_factor != 1:
        # order=0 without prefilter, applied to axes 1 and 2 only.
        vid = zoom(vid, [1, scale_factor, scale_factor, 1],
                   prefilter=False, order=0)
    return vid
def __getitem__(self, i):
    """Return the first ``self.NUMFRAMES`` frames of video ``i``.

    Returns:
        ``(vid, 0)`` with the stacked frames and a start index of 0, or 0
        when reading fails (the exception is printed).
    """
    try:
        reader = decord.VideoReader(self.videos[i], ctx=decord.cpu())
        decoded = []
        for frame_no in range(self.NUMFRAMES):
            decoded.append(reader[frame_no].asnumpy())
        vid = np.asarray(decoded)
        start = 0
    except Exception as err:
        print(err)
        return 0
    return (vid, start)
def __init__(self, video_url=None):
    """Open ``video_url`` with decord on CPU and reset the control flags.

    Args:
        video_url: Video source passed to ``de.VideoReader``.
    """
    super().__init__()
    self.video_url = video_url
    # Control flags: only the run flag starts out set.
    self.run_flag, self.back_flag, self.pause_flag = True, False, False
    self.vr = de.VideoReader(self.video_url, ctx=de.cpu(0))
    frame_total = len(self.vr)
    self.frame_numbers = range(frame_total)
def load_frames(vid, clip_len, root):
    """Load ``clip_len`` evenly spaced frames of a video as a float tensor.

    Args:
        vid: File name of the video, relative to ``root``.
        clip_len: Number of frames to sample.
        root: Directory containing the video.

    Returns:
        The sampled frames scaled by 1/255 as float32, with the last axis
        moved to the front via ``permute((3, 0, 1, 2))``.
    """
    vname = os.path.join(root, f'{vid}')
    with open(vname, 'rb') as f:
        vr = decord.VideoReader(f, width=342, height=256, num_threads=1)

    num_frames = len(vr)
    frame_rate = int(np.floor(float(num_frames) / float(clip_len)))
    start_frame = 1
    # start_frame = random.randint(1, num_frames - clip_len * frame_rate + 1)
    # Sampling starts at frame 1, so the last index can reach ``num_frames``
    # (e.g. when the video has exactly ``clip_len`` frames); clamp to the
    # final valid frame instead of indexing out of range.
    last_frame = num_frames - 1
    idx_list = [
        min(start_frame + i * frame_rate, last_frame)
        for i in range(clip_len)
    ]

    frames = vr.get_batch(idx_list)
    # NOTE(review): ``.type`` / ``.permute`` assume get_batch returns torch
    # tensors (decord torch bridge enabled elsewhere) -- confirm.
    frames = frames.type(torch.float32) / 255.
    return frames.permute((3, 0, 1, 2))
def __init__(self, path: str, extractor: str):
    """Open the video at ``path`` and attach its feature store.

    Parameters
    ----------
    path: str
        Location of the video file
    extractor: str
        Name of the feature extractor in use, forwarded to ``Features``
    """
    self._path = path
    # The reader is created before the feature store, both from the stored
    # path.
    video_reader = decord.VideoReader(self._path)
    self.reader = video_reader
    self.features = Features(self._path, extractor)
def analyze(self):
    """Print basic shape information about every sample's video.

    For each sample this prints its 1-based position, name, frame count and
    first-frame shape, then the shape of the first frame as a numpy array
    and its shape after resizing to the configured frame size.
    """
    cfg = self._config
    for position, sample in enumerate(self._samples, start=1):
        name = sample['video_name']
        video_path = os.path.join(cfg.trainval_set_dir, 'videos',
                                  name + '.mp4')
        video = decord.VideoReader(video_path)
        print(position, name, len(video), video[0].shape)

        pixels = video[0].asnumpy()  # (height, width, channels)
        print(pixels.shape)

        # cv2.resize takes the target size as (width, height).
        shrunk = cv2.resize(pixels, (cfg.frame_width, cfg.frame_height))
        print(shrunk.shape)
def preprocess(self, data):
    """Decode every request row into an array of video frames.

    Args:
        data: Iterable of mappings; the video payload is taken from the
            'data' key, falling back to 'body'. A ``str`` payload is
            treated as base64 and decoded first.

    Returns:
        A list with one stacked frame array (``np.stack`` of all frames)
        per row.
    """
    import tempfile

    videos = []
    for row in data:
        video = row.get('data') or row.get('body')
        if isinstance(video, str):
            video = base64.b64decode(video)

        # Use a private temporary file instead of a fixed shared path so
        # concurrent requests cannot overwrite each other's payload, and so
        # the file is removed afterwards.
        with tempfile.NamedTemporaryFile(suffix='.mp4') as tmp:
            tmp.write(video)
            tmp.flush()
            reader = decord.VideoReader(tmp.name)
            # Materialise all frames while the file still exists.
            frames = [x.asnumpy() for x in reader]
        videos.append(np.stack(frames))
    return videos
def __getitem__(self, index):
    """Return the transformed clips and class index of one sample.

    When the video cannot be opened, a notice is printed and the previously
    returned item (``self.prev``) is returned instead.

    Returns:
        ``(clip_list, class_index)`` where ``clip_list`` holds
        ``num_clips_per_sample`` spatially transformed clips that share the
        same temporal frame selection.

    Raises:
        Exception: If the video contains no frames.
    """
    sample = self.samples[index]

    try:
        vr = decord.VideoReader(
            str(sample.video_path),
            width=self.video_width,
            height=self.video_height,
            num_threads=1,
        )
    except Exception:
        # Catch decoding failures only; a bare ``except`` would also swallow
        # KeyboardInterrupt / SystemExit.
        print('no validate video ')
        print(f'{sample.video_path}')
        print('fallback')
        # NOTE(review): ``self.prev`` is only assigned at the end of a
        # successful call in this method -- confirm it is initialised
        # elsewhere for the very first sample.
        return self.prev

    num_frames = len(vr)
    if num_frames == 0:
        raise Exception(f'Empty video: {sample.video_path}')
    frame_indices = np.arange(num_frames)  # [0, 1, 2, ..., N - 1]

    if self.frame_rate is not None:
        frame_indices = self.resample_fps(frame_indices, vr.get_avg_fps())

    # same temporal frame but different spatial transform
    clip_frame_indices_list = [self.temporal_transform(frame_indices)]
    clip_frame_indices_list = clip_frame_indices_list * self.num_clips_per_sample

    # Fetch all frames in one `vr.get_batch` call
    clip_frame_indices = np.concatenate(
        clip_frame_indices_list)  # [a1, a2, ..., an, b1, b2, ...,bn]
    clips: torch.Tensor = vr.get_batch(clip_frame_indices)  # [N*T, H, W, C]
    clip_list = clips.chunk(len(clip_frame_indices_list),
                            dim=0)  # List[Tensor[T, H, W, C]]

    clip_list = [self.spatial_transform(clip) for clip in clip_list]
    for c in clip_list:
        assert isinstance(c, torch.Tensor)

    # Remember this item as the fallback for a later unreadable video.
    self.prev = (clip_list, sample.class_index)
    return clip_list, sample.class_index
def read_data(opt, video_name, transform):
    """Load, transform and reshape the clips of one video.

    Writes ``opt.skip_length`` as a side effect.

    Args:
        opt: Options object; the attributes read here are new_width,
            new_height, new_length, new_step, video_loader, slowfast,
            num_segments, num_crop and input_size.
        video_name: Video source passed to ``decord.VideoReader``.
        transform: Callable applied to the loaded clip frames.

    Returns:
        ``nd.array`` built from the stacked frames after reshaping to
        (-1, T, 3, input_size, input_size) and swapping axes 1 and 2; axis 2
        is squeezed when ``opt.new_length == 1``.
    """
    decord_vr = decord.VideoReader(video_name,
                                   width=opt.new_width,
                                   height=opt.new_height)
    duration = len(decord_vr)

    opt.skip_length = opt.new_length * opt.new_step
    segment_indices, skip_offsets = sample_indices(opt, duration)

    # NOTE(review): when opt.video_loader is falsy nothing is loaded and the
    # transform call below fails on the unbound name.
    if opt.video_loader:
        if opt.slowfast:
            loader = video_TSN_decord_slowfast_loader
        else:
            loader = video_TSN_decord_batch_loader
        clip_input = loader(opt, video_name, decord_vr, duration,
                            segment_indices, skip_offsets)

    clip_input = transform(clip_input)

    if opt.slowfast:
        frames_per_clip = len(clip_input) // (opt.num_segments *
                                              opt.num_crop)
    else:
        frames_per_clip = opt.new_length
    stacked = np.stack(clip_input, axis=0)
    stacked = stacked.reshape(
        (-1, frames_per_clip, 3, opt.input_size, opt.input_size))
    clip_input = np.transpose(stacked, (0, 2, 1, 3, 4))

    if opt.new_length == 1:
        # this is for 2D input case
        clip_input = np.squeeze(clip_input, axis=2)

    return nd.array(clip_input)
def __call__(self, results):
    """Open the video with Decord and record its reader and length.

    The file content is fetched through the (lazily created) file client
    and handed to ``decord.VideoReader`` as an in-memory byte stream.

    Args:
        results (dict): The resulting dict to be modified and passed
            to the next transform in pipeline. Must provide 'filename'.

    Returns:
        ``results`` with 'video_reader' and 'total_frames' set.

    Raises:
        ImportError: If decord is not installed.
    """
    try:
        import decord
    except ImportError:
        raise ImportError(
            'Please run "pip install decord" to install Decord first.')

    # Create the file client on first use only.
    if self.file_client is None:
        self.file_client = FileClient(self.io_backend, **self.kwargs)

    raw_bytes = self.file_client.get(results['filename'])
    reader = decord.VideoReader(io.BytesIO(raw_bytes),
                                num_threads=self.num_threads)
    results['video_reader'] = reader
    results['total_frames'] = len(reader)
    return results
def __getitem__(self, idx):
    """Load and transform the clips of one video record.

    Return:
        clips (torch.tensor), label (int). With ``num_segments == 1`` the
        clips entry is a single transformed clip, otherwise a stack of
        transformed clips.
    """
    record = self.video_records[idx]
    video_path = "{}.{}".format(
        os.path.join(self.video_dir, record.path), self.video_ext)
    # TODO try to add `ctx=decord.ndarray.gpu(0) or .cuda(0)`
    video_reader = decord.VideoReader(video_path)
    record._num_frames = len(video_reader)

    offsets = self._sample_indices(record)
    clips = np.array(
        [self._get_frames(video_reader, offset) for offset in offsets])

    if self.num_segments == 1:
        # [T, H, W, C] -> [C, T, H, W]
        single = self.transforms(torch.from_numpy(clips[0]))
        return single, record.label

    # [S, T, H, W, C] -> [S, C, T, H, W]
    transformed = [self.transforms(torch.from_numpy(clip)) for clip in clips]
    return (torch.stack(transformed), record.label)
def play_video(results: Dict[int, List[TrackingBbox]],
               input_video: str) -> None:
    """Show the input video frame by frame with the predicted tracks.

    Each frame is encoded as JPEG and pushed to a single IPython display
    handle, so the notebook output updates in place like a video.

    Args:
        results: dictionary mapping frame id to a list of predicted
            TrackingBboxes
        input_video: path to the input video
    """
    results = OrderedDict(sorted(results.items()))

    # One colour per distinct track id.
    unique_ids = list(
        {bb.track_id for boxes in results.values() for bb in boxes})
    color_map = assign_colors(unique_ids)

    video_reader = decord.VideoReader(input_video)

    # IPython display handle that is updated for every frame.
    d_video = IPython.display.display("", display_id=1)

    frame_total = len(results) - 1
    for frame_idx in range(frame_total):
        cur_tracks = results[frame_idx]
        im = video_reader.next().asnumpy()

        if len(cur_tracks) > 0:
            # NOTE(review): the return value is not used below; presumably
            # draw_boxes draws on ``im`` in place -- confirm.
            cur_image = draw_boxes(im, cur_tracks, color_map)

        buffer = io.BytesIO()
        Image.fromarray(im).save(buffer, "jpeg")
        d_video.update(IPython.display.Image(data=buffer.getvalue()))
        sleep(0.000001)
def __getitem__(self, idx):
    """Load, transform and reshape the frames of one video record.

    The first modality is sampled and transformed with a random flip. Any
    further modality reuses the first modality's crop and flip so all
    modalities stay spatially aligned.

    Returns:
        ``(img_group, label)``: the transformed frames of the last handled
        modality (reshaped for the "NCTHW" input format when configured)
        and the record's label.
    """
    record = self.video_infos[idx]
    label = record.label

    video_file = '{}.{}'.format(
        osp.join(self.img_prefix, record.path), self.video_ext)
    if self.use_decord:
        video_reader = decord.VideoReader(video_file)
    else:
        video_reader = mmcv.VideoReader(video_file)
    record.num_frames = len(video_reader)

    if self.test_mode:
        segment_indices, skip_offsets = self._get_test_indices(record)
    else:
        segment_indices, skip_offsets = self._sample_indices(
            record) if self.random_shift else self._get_val_indices(record)

    # handle the first modality
    modality = self.modalities[0]
    image_tmpl = self.image_tmpls[0]
    img_group = self._get_frames(record, video_reader, image_tmpl, modality,
                                 segment_indices, skip_offsets)

    flip = True if np.random.rand() < self.flip_ratio else False
    if (self.img_scale_dict is not None
            and record.path in self.img_scale_dict):
        img_scale = self.img_scale_dict[record.path]
    else:
        img_scale = self.img_scale
    (img_group, img_shape, pad_shape, scale_factor,
     crop_quadruple) = self.img_group_transform(
         img_group,
         img_scale,
         crop_history=None,
         flip=flip,
         keep_ratio=self.resize_keep_ratio,
         div_255=self.div_255,
         is_flow=True if modality == 'Flow' else False)
    ori_shape = (256, 340, 3)
    # Transform parameters of the first modality, reused by the others.
    img_meta = dict(ori_shape=ori_shape,
                    img_shape=img_shape,
                    pad_shape=pad_shape,
                    scale_factor=scale_factor,
                    crop_quadruple=crop_quadruple,
                    flip=flip)

    # [M x C x H x W]
    # M = 1 * N_oversample * N_seg * L
    if self.input_format == "NCTHW":
        img_group = img_group.reshape((-1, self.num_segments,
                                       self.new_length) +
                                      img_group.shape[1:])
        # N_over x N_seg x L x C x H x W
        img_group = np.transpose(img_group, (0, 1, 3, 2, 4, 5))
        # N_over x N_seg x C x L x H x W
        img_group = img_group.reshape((-1, ) + img_group.shape[2:])
        # M' x C x L x H x W

    # handle the rest modalities using the same
    for modality, image_tmpl in zip(self.modalities[1:],
                                    self.image_tmpls[1:]):
        print('handle the rest modalities using the same')
        img_group = self._get_frames(record, video_reader, image_tmpl,
                                     modality, segment_indices,
                                     skip_offsets)

        # apply transforms, reusing the first modality's crop and flip from
        # ``img_meta`` (the previously referenced ``data`` is not defined in
        # this method).
        (img_group, img_shape, pad_shape, scale_factor,
         crop_quadruple) = self.img_group_transform(
             img_group,
             img_scale,
             crop_history=img_meta['crop_quadruple'],
             flip=img_meta['flip'],
             keep_ratio=self.resize_keep_ratio,
             div_255=self.div_255,
             is_flow=True if modality == 'Flow' else False)
        if self.input_format == "NCTHW":
            # Convert [M x C x H x W] to [M' x C x T x H x W]
            # M = 1 * N_oversample * N_seg * L
            # M' = 1 * N_oversample * N_seg, T = L
            img_group = img_group.reshape((-1, self.num_segments,
                                           self.new_length) +
                                          img_group.shape[1:])
            img_group = np.transpose(img_group, (0, 1, 3, 2, 4, 5))
            img_group = img_group.reshape((-1, ) + img_group.shape[2:])

    return img_group, label
# Command-line options of the decord read benchmark.
parser.add_argument(
    '--gpu', type=int, default=-1,
    help='context to run, use --gpu=-1 to use cpu only')
parser.add_argument(
    '--file', type=str, default='/tmp/testsrc_h264_100s_default.mp4',
    help='Test video')
parser.add_argument(
    '--seed', type=int, default=666,
    help='numpy random seed for random access indices')
parser.add_argument(
    '--random-frames', type=int, default=300,
    help='number of random frames to run')
parser.add_argument(
    '--width', type=int, default=320,
    help='resize frame width')
parser.add_argument(
    '--height', type=int, default=240,
    help='resize frame height')

args = parser.parse_args()

test_video = args.file
# Any GPU id above -1 selects that GPU; otherwise decode on the CPU.
ctx = de.gpu(args.gpu) if args.gpu > -1 else de.cpu()

vr = de.VideoReader(test_video, ctx, width=args.width, height=args.height)

# Sequential pass: pull frames until the reader signals the end.
cnt = 0
tic = time.time()
while True:
    try:
        frame = vr.next()
    except StopIteration:
        break
    cnt += 1
print(cnt, ' frames, elapsed time for sequential read: ', time.time() - tic)

# fix seed for all random tests
np.random.seed(args.seed)
acc_indices = np.arange(len(vr))
np.random.shuffle(acc_indices)
if args.random_frames > len(vr):
    warnings.warn(
        f'Number of random frames reduced to {len(vr)} to fit test video')
def decord_video_loader(self, path):
    """Open ``path`` with decord.

    Returns:
        ``(reader, frame_count)``: the ``decord.VideoReader`` and its length.
    """
    reader = decord.VideoReader(path)
    frame_count = len(reader)
    return reader, frame_count
def __init__(self, video_path):
    """Remember ``video_path`` and open it with decord.

    Args:
        video_path: Video source passed to ``decord.VideoReader``.
    """
    self.video_path = video_path
    # The reader is built from the path stored just above.
    self.video = decord.VideoReader(self.video_path)
def get_output(video_path,
               out_filename,
               label,
               fps=30,
               font_scale=0.5,
               font_color='white',
               target_resolution=None,
               resize_algorithm='bicubic',
               use_frames=False):
    """Get demo output using ``moviepy``.

    This function will generate video file or gif file from raw video or
    frames, by using ``moviepy``. For more information of some parameters,
    you can refer to: https://github.com/Zulko/moviepy.

    Args:
        video_path (str): The video file path or the rawframes directory
            path. If ``use_frames`` is set to True, it should be rawframes
            directory path. Otherwise, it should be video file path.
        out_filename (str): Output filename for the generated file. A
            ``.gif`` extension produces a gif, anything else a video file.
        label (str): Predicted label, drawn onto every frame.
        fps (int): Number of picture frames to read per second. Default: 30.
        font_scale (float): Font scale of the label. Default: 0.5.
        font_color (str): Font color of the label. Default: 'white'.
        target_resolution (None | tuple[int | None]): Set to
            (desired_width, desired_height) to have resized frames. If
            either dimension is None or -1, it is derived from the other
            one by keeping the existing aspect ratio. If both are None or
            -1, the frames are not resized. Default: None.
        resize_algorithm (str): Support "bicubic", "bilinear", "neighbor",
            "lanczos", etc. Default: 'bicubic'. For more information, see
            https://ffmpeg.org/ffmpeg-scaler.html
            Accepted for compatibility; this function does not read it.
        use_frames: Determine Whether to use rawframes as input.
            Default:False.

    Raises:
        NotImplementedError: If ``video_path`` is an http(s) URL.
        ImportError: If moviepy is not installed.
    """
    if video_path.startswith(('http://', 'https://')):
        raise NotImplementedError

    try:
        from moviepy.editor import ImageSequenceClip
    except ImportError:
        raise ImportError('Please install moviepy to enable output file.')

    # Channel Order is BGR
    if use_frames:
        frame_list = sorted(
            [osp.join(video_path, x) for x in os.listdir(video_path)])
        frames = [cv2.imread(x) for x in frame_list]
    else:
        video = decord.VideoReader(video_path)
        frames = [x.asnumpy()[..., ::-1] for x in video]

    if target_resolution:
        w, h = target_resolution
        frame_h, frame_w, _ = frames[0].shape
        # Treat None like -1: the dimension is derived from the other one.
        w_missing = w is None or w == -1
        h_missing = h is None or h == -1
        if not (w_missing and h_missing):
            if w_missing:
                w = int(h / frame_h * frame_w)
            elif h_missing:
                h = int(w / frame_w * frame_h)
            frames = [cv2.resize(f, (w, h)) for f in frames]

    textsize = cv2.getTextSize(label, cv2.FONT_HERSHEY_DUPLEX, font_scale,
                               1)[0]
    textheight = textsize[1]
    padding = 10
    location = (padding, padding + textheight)

    if isinstance(font_color, str):
        # Reverse the RGB triple to match the BGR frames.
        font_color = webcolors.name_to_rgb(font_color)[::-1]

    # Copy each frame into a fresh array before drawing on it.
    frames = [np.array(frame) for frame in frames]
    for frame in frames:
        cv2.putText(frame, label, location, cv2.FONT_HERSHEY_DUPLEX,
                    font_scale, font_color, 1)

    # RGB order
    frames = [x[..., ::-1] for x in frames]
    video_clips = ImageSequenceClip(frames, fps=fps)

    out_type = osp.splitext(out_filename)[1][1:]
    if out_type == 'gif':
        video_clips.write_gif(out_filename)
    else:
        video_clips.write_videofile(out_filename, remove_temp=True)
def __call__(self, results):
    """
    Perform mp4 decode operations.

    Depending on ``self.backend`` the video is decoded with cv2, decord or
    pyav.

    Args:
        results (dict): Must provide 'filename'.

    return:
        The same dict with 'format', 'backend', 'frames' and 'frames_len'
        set. For the 'cv2' backend 'frames' is a list of channel-reversed
        frames, for 'decord' it is the ``VideoReader`` itself, and for
        'pyav' it is a list of RGB arrays together with 'start_idx' and
        'end_idx'.

    Raises:
        NotImplementedError: For an unknown backend, or an unknown mode
            with the 'pyav' backend.
    """
    file_path = results['filename']
    results['format'] = 'video'
    results['backend'] = self.backend

    if self.backend == 'cv2':
        cap = cv2.VideoCapture(file_path)
        try:
            videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            sampledFrames = []
            for i in range(videolen):
                ret, frame = cap.read()
                # maybe first frame is empty
                if not ret:
                    continue
                img = frame[:, :, ::-1]
                sampledFrames.append(img)
        finally:
            # Always give the capture handle back, even if a read fails.
            cap.release()
        results['frames'] = sampledFrames
        results['frames_len'] = len(sampledFrames)

    elif self.backend == 'decord':
        container = de.VideoReader(file_path)
        frames_len = len(container)
        results['frames'] = container
        results['frames_len'] = frames_len

    elif self.backend == 'pyav':  # for TimeSformer
        if self.mode in ["train", "valid"]:
            clip_idx = -1
        elif self.mode in ["test"]:
            clip_idx = 0
        else:
            raise NotImplementedError

        container = av.open(file_path)

        num_clips = 1  # always be 1

        # decode process
        fps = float(container.streams.video[0].average_rate)

        frames_length = container.streams.video[0].frames
        duration = container.streams.video[0].duration

        if duration is None:
            # If failed to fetch the decoding information, decode the entire video.
            decode_all_video = True
            video_start_pts, video_end_pts = 0, math.inf
        else:
            decode_all_video = False
            start_idx, end_idx = get_start_end_idx(
                frames_length,
                self.sampling_rate * self.num_seg / self.target_fps * fps,
                clip_idx, num_clips)
            # Convert frame indices into presentation timestamps.
            timebase = duration / frames_length
            video_start_pts = int(start_idx * timebase)
            video_end_pts = int(end_idx * timebase)

        frames = None
        # If video stream was found, fetch video frames from the video.
        if container.streams.video:
            margin = 1024
            seek_offset = max(video_start_pts - margin, 0)

            container.seek(seek_offset,
                           any_frame=False,
                           backward=True,
                           stream=container.streams.video[0])
            tmp_frames = {}
            buffer_count = 0
            for frame in container.decode(**{"video": 0}):
                if frame.pts < video_start_pts:
                    continue
                if frame.pts <= video_end_pts:
                    tmp_frames[frame.pts] = frame
                else:
                    buffer_count += 1
                    tmp_frames[frame.pts] = frame
                    # buffer_count was just incremented from 0, so this
                    # stops at the first frame past ``video_end_pts``.
                    if buffer_count >= 0:
                        break
            video_frames = [tmp_frames[pts] for pts in sorted(tmp_frames)]

            container.close()

            frames = [
                frame.to_rgb().to_ndarray() for frame in video_frames
            ]
            clip_sz = self.sampling_rate * self.num_seg / self.target_fps * fps

            start_idx, end_idx = get_start_end_idx(
                len(frames),  # frame_len
                clip_sz,
                # If decode all video, -1 in train and valid, 0 in test;
                # else, always 0 in train, valid and test, as we has
                # selected clip size frames when decode.
                clip_idx if decode_all_video else 0,
                1)
            results['frames'] = frames
            results['frames_len'] = len(frames)
            results['start_idx'] = start_idx
            results['end_idx'] = end_idx
    else:
        raise NotImplementedError
    return results