def test_write_video_with_audio(self):
    """Round-trip a reference clip (video + audio) through ``write_video``.

    The clip is read, re-encoded losslessly (``libx264rgb`` with ``crf=0``)
    together with its audio track, read back, and compared against the
    original: identical fps and frames, and matching audio stream
    properties (the audio frame count may differ by at most one).
    """
    f_name = os.path.join(VIDEO_DIR, "R6llTwEh07w.mp4")
    video_tensor, audio_tensor, info = io.read_video(f_name, pts_unit="sec")

    with get_tmp_dir() as tmpdir:
        out_f_name = os.path.join(tmpdir, "testing.mp4")
        io.video.write_video(
            out_f_name,
            video_tensor,
            round(info["video_fps"]),
            video_codec="libx264rgb",
            options={'crf': '0'},
            audio_array=audio_tensor,
            audio_fps=info["audio_fps"],
            audio_codec="aac",
        )

        out_video_tensor, out_audio_tensor, out_info = io.read_video(
            out_f_name, pts_unit="sec")

        assert info["video_fps"] == out_info["video_fps"]
        assert_equal(video_tensor, out_video_tensor)

        # Open both containers as context managers so their file handles are
        # released even when an assertion fails; a handle left open on the
        # output file can also get in the way of removing the temp directory.
        with av.open(f_name) as container, av.open(out_f_name) as out_container:
            audio_stream = container.streams.audio[0]
            out_audio_stream = out_container.streams.audio[0]

            assert info["audio_fps"] == out_info["audio_fps"]
            assert audio_stream.rate == out_audio_stream.rate
            assert pytest.approx(out_audio_stream.frames, rel=0.0, abs=1) == audio_stream.frames
            assert audio_stream.frame_size == out_audio_stream.frame_size
def test_read_partial_video_pts_unit_sec(self):
    """Check partial reads addressed by second-based timestamps.

    Every window of 1-3 frames starting at frames 0-4 must decode to exactly
    the matching slice of the reference data. A final read starts one
    time-base tick after frame 4; on the "pyav" backend it is still expected
    to return frames 4..7.
    """
    with temp_video(10, 300, 300, 5, lossless=True) as (f_name, data):
        pts, _ = io.read_video_timestamps(f_name, pts_unit='sec')

        for start in range(5):
            for length in range(1, 4):
                lv, _, _ = io.read_video(f_name, pts[start], pts[start + length - 1], pts_unit='sec')
                s_data = data[start:(start + length)]
                self.assertEqual(len(lv), length)
                self.assertTrue(s_data.equal(lv))

        # Use a context manager so the container is closed even if one of the
        # assertions below fails (a trailing close() call would be skipped).
        with av.open(f_name) as container:
            stream = container.streams[0]
            # Start pts that lies one time-base tick after frame 4.
            shifted_start = int(pts[4] * (1.0 / stream.time_base) + 1) * stream.time_base
            lv, _, _ = io.read_video(f_name, shifted_start, pts[7], pts_unit='sec')
            if get_video_backend() == "pyav":
                # for "video_reader" backend, we don't decode the closest early frame
                # when the given start pts is not matching any frame pts
                self.assertEqual(len(lv), 4)
                self.assertTrue(data[4:8].equal(lv))
def test_write_video_with_audio(self):
    """Round-trip a reference clip (video + audio) through ``write_video``.

    The clip is read, re-encoded losslessly (``libx264rgb`` with ``crf=0``)
    together with its audio track, read back, and compared against the
    original: identical fps and frames, and matching audio stream
    properties (the audio frame count may differ by at most one).
    """
    f_name = os.path.join(VIDEO_DIR, "R6llTwEh07w.mp4")
    video_tensor, audio_tensor, info = io.read_video(f_name, pts_unit="sec")

    with get_tmp_dir() as tmpdir:
        out_f_name = os.path.join(tmpdir, "testing.mp4")
        io.video.write_video(
            out_f_name,
            video_tensor,
            round(info["video_fps"]),
            video_codec="libx264rgb",
            options={'crf': '0'},
            audio_array=audio_tensor,
            audio_fps=info["audio_fps"],
            audio_codec="aac",
        )

        out_video_tensor, out_audio_tensor, out_info = io.read_video(
            out_f_name, pts_unit="sec"
        )

        self.assertEqual(info["video_fps"], out_info["video_fps"])
        assert_equal(video_tensor, out_video_tensor)

        # Open both containers as context managers so their file handles are
        # released even when an assertion fails; a handle left open on the
        # output file can also get in the way of removing the temp directory.
        with av.open(f_name) as container, av.open(out_f_name) as out_container:
            audio_stream = container.streams.audio[0]
            out_audio_stream = out_container.streams.audio[0]

            self.assertEqual(info["audio_fps"], out_info["audio_fps"])
            self.assertEqual(audio_stream.rate, out_audio_stream.rate)
            self.assertAlmostEqual(audio_stream.frames, out_audio_stream.frames, delta=1)
            self.assertEqual(audio_stream.frame_size, out_audio_stream.frame_size)
def test_read_partial_video_pts_unit_sec(self, start, offset):
    """Read ``offset`` frames beginning at frame ``start`` using second-based pts.

    The decoded window must equal the matching slice of the reference data.
    A second read starts one time-base tick after frame 4; on the "pyav"
    backend it is expected to return frames 4..7.
    """
    with temp_video(10, 300, 300, 5, lossless=True) as (video_path, expected):
        timestamps, _ = io.read_video_timestamps(video_path, pts_unit='sec')

        first_pts = timestamps[start]
        last_pts = timestamps[start + offset - 1]
        frames, _, _ = io.read_video(video_path, first_pts, last_pts, pts_unit='sec')
        reference = expected[start:(start + offset)]
        assert len(frames) == offset
        assert_equal(reference, frames)

        with av.open(video_path) as container:
            time_base = container.streams[0].time_base
            # start pts placed one tick past frame 4
            shifted_start = int(timestamps[4] * (1.0 / time_base) + 1) * time_base
            frames, _, _ = io.read_video(video_path, shifted_start, timestamps[7], pts_unit='sec')

        if get_video_backend() != "pyav":
            # for "video_reader" backend, we don't decode the closest early frame
            # when the given start pts is not matching any frame pts
            return
        assert len(frames) == 4
        assert_equal(expected[4:8], frames)
def test_invalid_file(self):
    """Reading a non-existent file must raise RuntimeError on both backends."""
    for backend in ('video_reader', 'pyav'):
        set_video_backend(backend)
        with self.assertRaises(RuntimeError):
            io.read_video('foo.mp4')
def test_invalid_file(self):
    """Reading a non-existent file must raise RuntimeError on both backends."""
    for backend in ("video_reader", "pyav"):
        set_video_backend(backend)
        with pytest.raises(RuntimeError):
            io.read_video("foo.mp4")
def test_read_partial_video(self):
    """Check partial reads addressed by raw pts values.

    Every window of 1-3 frames starting at frames 0-4 must decode to the
    matching slice of the reference data; a read starting at ``pts[4] + 1``
    is expected to still return frames 4..7.
    """
    with temp_video(10, 300, 300, 5, lossless=True) as (video_path, expected):
        timestamps, _ = io.read_video_timestamps(video_path)

        for first in range(5):
            for count in range(1, 4):
                last = first + count - 1
                frames, _, _ = io.read_video(video_path, timestamps[first], timestamps[last])
                reference = expected[first:(first + count)]
                self.assertEqual(len(frames), count)
                self.assertTrue(reference.equal(frames))

        # start pts that does not coincide with any frame pts
        frames, _, _ = io.read_video(video_path, timestamps[4] + 1, timestamps[7])
        self.assertEqual(len(frames), 4)
        self.assertTrue(expected[4:8].equal(frames))
def test_read_partial_video_bframes(self):
    """Check partial reads on a lossy stream encoded with B-frame options.

    Because the encoding is lossy, decoded frames are compared to the
    reference with a maximum absolute difference below ``self.TOLERANCE``
    instead of exact equality.
    """
    # do not use lossless encoding, to test the presence of B-frames
    encoder_options = {'bframes': '16', 'keyint': '10', 'min-keyint': '4'}

    def max_abs_diff(reference_frames, decoded_frames):
        # largest per-element deviation between the two frame tensors
        return (reference_frames.float() - decoded_frames.float()).abs().max()

    with temp_video(100, 300, 300, 5, options=encoder_options) as (video_path, expected):
        timestamps, _ = io.read_video_timestamps(video_path)

        for first in range(0, 80, 20):
            for count in range(1, 4):
                last = first + count - 1
                frames, _, _ = io.read_video(video_path, timestamps[first], timestamps[last])
                reference = expected[first:(first + count)]
                self.assertEqual(len(frames), count)
                self.assertTrue(max_abs_diff(reference, frames) < self.TOLERANCE)

        # start pts that does not coincide with any frame pts
        frames, _, _ = io.read_video(video_path, timestamps[4] + 1, timestamps[7])
        self.assertEqual(len(frames), 4)
        self.assertTrue(max_abs_diff(expected[4:8], frames) < self.TOLERANCE)
def blur_background(video_path, respth='./res/test_res', cp='model_final_diss.pth',
                    start_pts=61, end_pts=65, blur_ksize=(15, 15)):
    """Write a copy of a video segment in which unlabeled pixels are blurred.

    Frames between ``start_pts`` and ``end_pts`` (seconds) are read, a label
    map is computed for each frame with ``label_images``, and every pixel
    whose (rescaled) label is 0 is replaced by its box-blurred counterpart.
    The result is written to
    ``<respth>/blurred_background<basename(video_path)>.mp4``.

    Args:
        video_path: path of the input video.
        respth: output directory; created if it does not exist.
        cp: checkpoint passed through to ``label_images``.
        start_pts: start of the processed segment in seconds (default 61,
            the previously hard-coded value).
        end_pts: end of the processed segment in seconds (default 65).
        blur_ksize: kernel size handed to ``cv2.blur`` (default (15, 15)).
    """
    frames, audio, info = read_video(video_path, start_pts, end_pts, pts_unit="sec")

    # Resize each label map to the frame resolution; NEAREST keeps label
    # values discrete instead of interpolating between classes.
    scale_labels = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((frames.shape[1], frames.shape[2]), interpolation=Image.NEAREST),
        transforms.ToTensor()
    ])
    labels = label_images(frames, cp)

    new_frames = []
    for frame_inx in tqdm.tqdm(
            range(frames.shape[0]),
            desc="generating segmented frames with background"):
        scaled_label = scale_labels(labels[frame_inx].type(torch.uint8)).squeeze(0)
        # Replicate the mask over three channels so it lines up with the frame.
        scaled_label = torch.stack([scaled_label, scaled_label, scaled_label], dim=2)
        blurred_img = torch.from_numpy(cv2.blur(frames[frame_inx].numpy(), blur_ksize))
        new_frames.append(torch.where(scaled_label > 0, frames[frame_inx], blurred_img))
    new_frames = torch.stack(new_frames)

    # Make sure the destination exists so the write below cannot fail on a
    # missing directory.
    os.makedirs(respth, exist_ok=True)
    write_video(
        os.path.join(respth, "blurred_background" + os.path.basename(video_path)) + ".mp4",
        new_frames,
        round(info["video_fps"]))
def __getitem__(self, idx):
    """Return the clip described by row ``idx`` of ``self.clip_metadata_df``.

    Returns:
        dict with keys ``'clip'`` (transformed frames), ``'filename'`` and
        ``'is-last-clip'``.

    Raises:
        RuntimeError: if the number of frames obtained after resampling
            differs from ``self.clip_length``.
    """
    sample = {}
    row = self.clip_metadata_df.iloc[idx]
    filename, fps, clip_t_start, is_last_clip = (
        row['filename'], row['fps'], row['clip-t-start'], row['is-last-clip'])

    # compute clip_t_end from clip_t_start and the clip duration in seconds
    clip_length_in_sec = self.clip_length / self.frame_rate
    clip_t_end = clip_t_start + clip_length_in_sec

    # get a tensor [clip_length, H, W, C] of the video frames between clip_t_start and clip_t_end seconds
    vframes, _, _ = read_video(filename=filename, start_pts=clip_t_start, end_pts=clip_t_end, pts_unit='sec')
    # presumably maps the native fps onto self.frame_rate; see _resample_video_idx
    idxs = EvalVideoDataset._resample_video_idx(self.clip_length, fps, self.frame_rate)
    # [:self.clip_length] for removing extra frames if isinstance(idxs, slice)
    vframes = vframes[idxs][:self.clip_length]
    if vframes.shape[0] != self.clip_length:
        raise RuntimeError(
            f'<EvalVideoDataset>: got clip of length {vframes.shape[0]} != {self.clip_length}. '
            f'filename={filename}, clip_t_start={clip_t_start}, clip_t_end={clip_t_end}, '
            f'fps={fps}')

    sample['clip'] = self.transforms(vframes)
    sample['filename'] = filename
    sample['is-last-clip'] = is_last_clip

    return sample
def load_rgbf(args, is_cropped, path):
    """Load a video and build one RGBF object per computed flow frame.

    Steps: read the frames as float32, crop each one (random crop of
    ``args.crop_size`` when ``is_cropped`` is true, otherwise a center crop
    of the render size), compute flow with ``flow_from_frames``, and combine
    each flow image (via ``MCM``) with the RGB frame at the same index.

    Note that ``args.inference_size`` is updated in place when it contains a
    negative entry or the frame size is not a multiple of 64.

    Args:
        args: options object providing ``inference_size`` and, when
            ``is_cropped`` is true, ``crop_size``.
        is_cropped: choose random cropping instead of center cropping.
        path: path of the video to read.

    Returns:
        list of RGBF objects, one per flow frame.
    """
    render_size = args.inference_size
    rgb_frames, _, _ = read_video(path, pts_unit='sec')
    rgb_frames = rgb_frames.to(torch.float32)

    # Step 1 crop the images
    frame_size = rgb_frames[0].shape
    if (render_size[0] < 0) or (render_size[1] < 0) or (frame_size[0] % 64) or (frame_size[1] % 64):
        # round the render size down to the nearest multiple of 64
        render_size[0] = ((frame_size[0]) // 64) * 64
        render_size[1] = ((frame_size[1]) // 64) * 64

    cropped_frames = []
    for rgb_frame in rgb_frames:
        image_size = rgb_frame.shape[:2]
        if is_cropped:
            # crop_size was previously an undefined name here (NameError);
            # it is read lazily so non-cropped callers need not provide it.
            cropper = StaticRandomCrop(image_size, args.crop_size)
        else:
            cropper = StaticCenterCrop(image_size, render_size)
        frame = cropper.crop(rgb_frame)
        cropped_frames.append(frame)
    rgb_frames = cropped_frames

    # Step 2 load the flow images
    flow_frames = flow_from_frames(args, rgb_frames)

    # Step 3 create the rgbf images
    frames = []
    for (index, flow_img) in enumerate(flow_frames):
        flow_img = flow_img.cpu().numpy()
        mcm = MCM(flow_img)
        rgb_img = rgb_frames[index]
        rgbf = RGBF(rgb_img, mcm)
        frames.append(rgbf)
    return frames
def bug_due_to_unit_in_audio_align():
    """Read the first second of ``~/tv_host.mp4`` and return its video frames.

    The audio frames and the info dict returned by ``read_video`` are
    discarded.
    """
    video_file = osp.expanduser('~/tv_host.mp4')
    video_frames, _audio_frames, _meta = read_video(
        video_file,
        start_pts=0,
        end_pts=1,
        pts_unit='sec',
    )
    return video_frames
def _get_video_frame(self, video_year, video_ID, frame_id):
    """Return frame ``frame_id`` of the given video as a PIL image.

    The frame number is converted to a start time in seconds, half a second
    of video is decoded from there, and the first decoded frame is returned.
    """
    video_path = self._get_video_path(video_year, video_ID)
    # videos are all at 25 fps, so dividing by that gives the rough second count
    start_sec = (frame_id - 1) / 25
    end_sec = start_sec + 0.5
    frames, _, _ = IO.read_video(video_path, start_sec, end_sec, pts_unit="sec")
    first_frame = frames[0, ...]
    # reorder H x W x C into C x H x W before converting
    return F.to_pil_image(first_frame.permute(2, 0, 1))
def vgpt(invid):
    """Reconstruct a video through the VQ-VAE and save it as ``output.mp4``.

    Any existing ``output.mp4`` is removed first. The number of frames to
    process is derived from the clip's fps and duration; those frames are
    read, preprocessed, encoded and decoded by ``vqvae``, and the clamped
    reconstruction is written with the source frame rate (truncated to int).

    Args:
        invid: path of the input video.

    Returns:
        The string ``'./output.mp4'``.
    """
    try:
        os.remove("output.mp4")
    except FileNotFoundError:
        pass

    clip = VideoFileClip(invid)
    try:
        rate = clip.fps
        sequence_length = int(clip.fps * clip.duration)
    finally:
        # Only the metadata is needed; release the reader right away instead
        # of leaving it open for the rest of the call.
        clip.close()

    pts = read_video_timestamps(invid, pts_unit='sec')[0]
    video = read_video(invid, pts_unit='sec', start_pts=pts[0], end_pts=pts[sequence_length - 1])[0]
    video = preprocess(video, resolution, sequence_length).unsqueeze(0).to(device)

    with torch.no_grad():
        encodings = vqvae.encode(video)
        video_recon = vqvae.decode(encodings)
        video_recon = torch.clamp(video_recon, -0.5, 0.5)

    videos = video_recon[0].permute(1, 2, 3, 0)  # CTHW -> THWC
    # shift [-0.5, 0.5] to [0, 1] and scale to 8-bit pixel values
    videos = ((videos + 0.5) * 255).cpu().numpy().astype('uint8')
    imageio.mimwrite('output.mp4', videos, fps=int(rate))
    return './output.mp4'
def __init__(self, video_path, start_time=0, end_time=None, stride=None, transforms=None):
    """
    Load the requested segment of a video and keep every ``frame_stride``-th frame.

    Args:
        video_path: path to the video file
        start_time (seconds): the start time to read the video
        end_time (seconds): the end time to read the video
        stride (seconds): time interval between frames; ``None`` keeps
            every decoded frame
        transforms (torchvision.transforms): transform to apply to each frame
    """
    assert path.exists(video_path), 'wrong video path'
    self.video_path = video_path
    self.start_time = start_time
    self.end_time = end_time
    self.stride = stride
    self.transforms = transforms

    self.video_frames, _, self.info = read_video(filename=video_path,
                                                 start_pts=self.start_time,
                                                 end_pts=self.end_time,
                                                 pts_unit='sec')
    if stride is None:
        # The default previously crashed on ``None * fps``; fall back to
        # keeping every frame.
        self.frame_stride = 1
    else:
        # A stride shorter than one frame period would truncate to 0, which
        # is not a valid range() step; clamp to at least one frame.
        self.frame_stride = max(1, int(stride * self.info['video_fps']))
    self.video_frames = self.video_frames[
        list(range(0, self.video_frames.shape[0], self.frame_stride)), ...]
def test_read_video_pts_unit_sec(self):
    """A full read with ``pts_unit='sec'`` returns the original frames and fps-only info."""
    with temp_video(10, 300, 300, 5, lossless=True) as (video_path, expected):
        frames, _, meta = io.read_video(video_path, pts_unit='sec')

        assert_equal(expected, frames)
        assert meta["video_fps"] == 5
        # no other keys (e.g. audio) are expected in the info dict
        assert meta == {"video_fps": 5}
def get_clip(self, idx):
    """
    Fetch one subclip out of the collection of videos.

    Arguments:
        idx (int): index of the subclip. Must be between 0 and num_clips().

    Returns:
        video (Tensor)
        audio (Tensor)
        info (Dict)
        video_idx (int): index of the video in `video_paths`

    Raises:
        IndexError: if ``idx`` is not smaller than ``num_clips()``.
    """
    if idx >= self.num_clips():
        message = "Index {} out of range " "({} number of clips)".format(
            idx, self.num_clips())
        raise IndexError(message)

    video_idx, clip_idx = self.get_clip_location(idx)
    pts_of_clip = self.clips[video_idx][clip_idx]
    first_pts = pts_of_clip[0].item()
    last_pts = pts_of_clip[-1].item()
    video, audio, info = read_video(self.video_paths[video_idx], first_pts, last_pts)

    if self.frame_rate is not None:
        frame_selector = self.resampling_idxs[video_idx][clip_idx]
        if isinstance(frame_selector, torch.Tensor):
            # make the indices relative to the first frame of the clip
            frame_selector = frame_selector - frame_selector[0]
        video = video[frame_selector]
        info["video_fps"] = self.frame_rate

    assert len(video) == self.num_frames, "{} x {}".format(
        video.shape, self.num_frames)
    return video, audio, info, video_idx
def test_read_partial_video(self, start, offset):
    """Read ``offset`` frames beginning at frame ``start`` using raw pts values.

    The decoded window must equal the matching slice of the reference data.
    On the "pyav" backend a read starting at ``pts[4] + 1`` is additionally
    expected to return frames 4..7.
    """
    with temp_video(10, 300, 300, 5, lossless=True) as (video_path, expected):
        timestamps, _ = io.read_video_timestamps(video_path)

        first_pts = timestamps[start]
        last_pts = timestamps[start + offset - 1]
        frames, _, _ = io.read_video(video_path, first_pts, last_pts)
        reference = expected[start:(start + offset)]
        assert len(frames) == offset
        assert_equal(reference, frames)

        if get_video_backend() != "pyav":
            # for "video_reader" backend, we don't decode the closest early frame
            # when the given start pts is not matching any frame pts
            return

        frames, _, _ = io.read_video(video_path, timestamps[4] + 1, timestamps[7])
        assert len(frames) == 4
        assert_equal(expected[4:8], frames)
def __getitem__(self, idx):
    """
    Load a video and return a one-second video/audio excerpt with its target.

    Videos shorter than one second are returned whole; otherwise a random
    whole-second offset is drawn and one second of video frames and audio
    samples is cut out starting there.

    :param idx: index of the file required
    :return: (preprocessed video, preprocessed audio, target)
    """
    rand = np.random.RandomState()
    video = self.files[idx]
    target = self.get_class(video)
    video, audio, info = io.read_video(video)

    # number of complete seconds available in the video
    video_duration = int(video.shape[0] // info['video_fps'])

    # Less than one second video
    if video_duration == 0:
        return self.preprocess_video(video), self.preprocess_audio(audio, info['audio_fps']), target

    # randint's upper bound is exclusive: valid starts are 0 .. duration - 1.
    # Passing ``duration - 1`` made 1-second videos raise ValueError
    # (low >= high) and never sampled the last full second.
    second_idx = rand.randint(0, video_duration)
    second_idx_vid = int(second_idx * info['video_fps'])
    second_idx_aud = int(second_idx * info['audio_fps'])
    video = video[second_idx_vid:second_idx_vid + int(info['video_fps'])]
    audio = audio[:, second_idx_aud:second_idx_aud + int(info['audio_fps'])]
    return self.preprocess_video(video), self.preprocess_audio(audio, info['audio_fps']), target
def test_read_video_pts_unit_sec(self):
    """A full read with ``pts_unit='sec'`` returns the original frames and fps-only info."""
    with temp_video(10, 300, 300, 5, lossless=True) as (video_path, expected):
        frames, _, meta = io.read_video(video_path, pts_unit='sec')

        self.assertTrue(expected.equal(frames))
        self.assertEqual(meta["video_fps"], 5)
        # no other keys (e.g. audio) are expected in the info dict
        self.assertEqual(meta, {"video_fps": 5})
def __init__(self, args, is_cropped, path='/path/to/frames/only/folder', replicates=1):
    """Build a dataset of consecutive frame pairs read from a video.

    ``self.frames`` holds ``[frame_i, frame_i+1]`` pairs of float32 frames.
    When the render size has a negative entry or the frame size is not a
    multiple of 64, the render size is rounded down to multiples of 64.
    ``args.inference_size`` is set to the resulting render size.
    """
    self.args = args
    self.is_cropped = is_cropped
    self.crop_size = args.crop_size
    self.render_size = args.inference_size
    self.replicates = replicates

    video, _, _ = read_video(path, pts_unit='sec')
    video = video.to(torch.float32)
    # pair every frame with its successor
    self.frames = [[video[i], video[i + 1]] for i in range(len(video) - 1)]

    self.size = len(self.frames)
    self.frame_size = self.frames[0][0].shape

    needs_rounding = (
        (self.render_size[0] < 0)
        or (self.render_size[1] < 0)
        or (self.frame_size[0] % 64)
        or (self.frame_size[1] % 64)
    )
    if needs_rounding:
        height, width = self.frame_size[0], self.frame_size[1]
        self.render_size[0] = (height // 64) * 64
        self.render_size[1] = (width // 64) * 64

    args.inference_size = self.render_size
def test_read_partial_video(self):
    """Check partial reads addressed by raw pts values.

    Every window of 1-3 frames starting at frames 0-4 must decode to the
    matching slice of the reference data. On the "pyav" backend a read
    starting at ``pts[4] + 1`` is additionally expected to return frames 4..7.
    """
    with temp_video(10, 300, 300, 5, lossless=True) as (video_path, expected):
        timestamps, _ = io.read_video_timestamps(video_path)

        for first in range(5):
            for count in range(1, 4):
                last = first + count - 1
                frames, _, _ = io.read_video(video_path, timestamps[first], timestamps[last])
                reference = expected[first:(first + count)]
                self.assertEqual(len(frames), count)
                self.assertTrue(reference.equal(frames))

        if get_video_backend() != "pyav":
            # for "video_reader" backend, we don't decode the closest early frame
            # when the given start pts is not matching any frame pts
            return

        frames, _, _ = io.read_video(video_path, timestamps[4] + 1, timestamps[7])
        self.assertEqual(len(frames), 4)
        self.assertTrue(expected[4:8].equal(frames))
def batch_iter(self, batch_size):
    """Yield full batches of frames from every ``recording.mp4`` under the task dir.

    Each sub-directory's recording is read completely and split into chunks
    of ``batch_size`` frames; a trailing chunk that is smaller than
    ``batch_size`` is dropped.
    """
    for entry in os.listdir(self._data_task_dir):
        recording = self._data_task_dir / entry / "recording.mp4"
        frames, _, _ = read_video(recording.as_posix(), pts_unit="sec")
        for chunk in torch.split(frames, batch_size):
            if chunk.size(0) != batch_size:
                # incomplete chunk: skip it
                continue
            yield chunk
def __init__(self, args, path='/path/to/frames/only/folder'):
    """Load all frames of a video as a float32 tensor.

    Stores the frame tensor, the number of frames, and a frame size, and
    writes the render size back to ``args.inference_size``.
    """
    self.args = args
    self.render_size = args.inference_size

    video, _, _ = read_video(path, pts_unit='sec')
    self.frames = video.to(torch.float32)
    self.size = len(self.frames)
    # NOTE(review): frames[0][0] indexes two leading dimensions of the frame
    # tensor, so this is the shape of one row of the first frame rather than
    # of a whole frame - confirm that this is what consumers expect.
    self.frame_size = self.frames[0][0].shape

    args.inference_size = self.render_size
def get_clip(self, idx):
    """
    Fetch one subclip out of the collection of videos.

    Only the "pyav" backend is implemented, and it rejects any non-zero
    resize / audio-sample option.

    Arguments:
        idx (int): index of the subclip. Must be between 0 and num_clips().

    Returns:
        video (Tensor)
        audio (Tensor)
        info (Dict)

    Raises:
        IndexError: if ``idx`` is not smaller than ``num_clips()``.
        ValueError: if an option unsupported by pyav is non-zero.
        NotImplementedError: for any backend other than "pyav".
    """
    if idx >= self.num_clips():
        message = "Index {} out of range " "({} number of clips)".format(
            idx, self.num_clips())
        raise IndexError(message)

    video_path = self.video_paths[idx]
    pts_of_clip = self.clips[idx]

    from torchvision import get_video_backend
    backend = get_video_backend()

    if backend != "pyav":
        raise NotImplementedError(f"backend {backend} is not implemented.")

    # check for invalid options
    if self._video_width != 0:
        raise ValueError(
            "pyav backend doesn't support _video_width != 0")
    if self._video_height != 0:
        raise ValueError(
            "pyav backend doesn't support _video_height != 0")
    if self._video_min_dimension != 0:
        raise ValueError(
            "pyav backend doesn't support _video_min_dimension != 0")
    if self._video_max_dimension != 0:
        raise ValueError(
            "pyav backend doesn't support _video_max_dimension != 0")
    if self._audio_samples != 0:
        raise ValueError(
            "pyav backend doesn't support _audio_samples != 0")

    assert len(pts_of_clip) > 0
    first_pts = pts_of_clip[0].item()
    last_pts = pts_of_clip[-1].item()
    video, audio, info = read_video(video_path, first_pts, last_pts)

    frame_selector = self.resampling_idxs[idx]
    if isinstance(frame_selector, torch.Tensor):
        # make the indices relative to the first frame of the clip
        frame_selector = frame_selector - frame_selector[0]
    video = video[frame_selector]
    info["video_fps"] = self.frame_rate

    assert len(video) == self.num_frames, "{} x {}".format(
        video.shape, self.num_frames)
    return video, audio, info
def test_read_partial_video_pts_unit_sec(self):
    """Check partial reads addressed by second-based timestamps.

    Every window of 1-3 frames starting at frames 0-4 must decode to exactly
    the matching slice of the reference data. A final read starts one
    time-base tick after frame 4 and is expected to return frames 4..7.
    """
    with temp_video(10, 300, 300, 5, lossless=True) as (f_name, data):
        pts, _ = io.read_video_timestamps(f_name, pts_unit='sec')

        for start in range(5):
            for length in range(1, 4):
                lv, _, _ = io.read_video(f_name, pts[start], pts[start + length - 1], pts_unit='sec')
                s_data = data[start:(start + length)]
                self.assertEqual(len(lv), length)
                self.assertTrue(s_data.equal(lv))

        # The container used to be opened and never closed; a context manager
        # releases the file handle even when an assertion below fails.
        with av.open(f_name) as container:
            stream = container.streams[0]
            # Start pts that lies one time-base tick after frame 4.
            shifted_start = int(pts[4] * (1.0 / stream.time_base) + 1) * stream.time_base
            lv, _, _ = io.read_video(f_name, shifted_start, pts[7], pts_unit='sec')
            self.assertEqual(len(lv), 4)
            self.assertTrue(data[4:8].equal(lv))
def read_frame(self, timestamp = None):
    """Reads the next frame or from timestamp.

    If no timestamp is provided this method just returns the next frame from
    the video. This is significantly (up to 10x) faster if the `video_loader`
    backend is available. If a timestamp is provided we first have to seek
    to the right position and then load the frame.

    Two code paths exist: when ``self.reader`` is truthy, frames are pulled
    from it (seeking only when the requested frame is not the one right
    after the last frame read); otherwise the frame is decoded with
    ``io.read_video`` for the single timestamp.

    Args:
        timestamp: Specific timestamp of frame in seconds or None (default: None).
            Must be an element of ``self.timestamps`` (it is looked up
            with ``list.index``).

    Returns:
        A PIL Image

    Raises:
        ValueError: if the loaded frame has fewer than 3 dimensions, or if
            ``timestamp`` is not contained in ``self.timestamps``.

    """
    if timestamp is not None:
        self.current_timestamp_idx = self.timestamps.index(timestamp)
    else:
        # no timestamp provided -> set current timestamp index to next frame
        # NOTE(review): when the index is already at the last entry this
        # advances it to len(self.timestamps), which the pyav fallback below
        # then uses as a list index - confirm callers never read past the end.
        if self.current_timestamp_idx < len(self.timestamps):
            self.current_timestamp_idx += 1

    if self.reader:
        if timestamp is not None:
            # Calling seek is slow. If we read next frame we can skip it!
            if self.timestamps.index(timestamp) != self.last_timestamp_idx + 1:
                self.reader.seek(timestamp)

        # make sure we have the tensor in correct shape (we want H x W x C)
        frame = next(self.reader)['data'].permute(1,2,0)
        self.last_timestamp_idx = self.current_timestamp_idx

    else:  # fallback on pyav
        if timestamp is None:
            # read next frame if no timestamp is provided
            timestamp = self.timestamps[self.current_timestamp_idx]
        # start and end pts are identical, i.e. a single frame is requested
        frame, _, _ = io.read_video(self.path, start_pts=timestamp, end_pts=timestamp, pts_unit=self.pts_unit)
        self.last_timestamp_idx = self.timestamps.index(timestamp)

    if len(frame.shape) < 3:
        raise ValueError('Unexpected error during loading of frame')

    # sometimes torchvision returns multiple frames for one timestamp (bug?)
    if len(frame.shape) > 3 and frame.shape[0] > 1:
        frame = frame[0]

    # make sure we return a H x W x C tensor and not (1 x H x W x C)
    # (squeeze() without an argument drops every size-1 dimension)
    if len(frame.shape) == 4:
        frame = frame.squeeze()

    # convert to PIL image
    image = Image.fromarray(frame.numpy())
    return image
def test_read_video_corrupted_file(self):
    """Reading a file that is not a valid video yields empty tensors and empty info."""
    with tempfile.NamedTemporaryFile(suffix='.mp4') as tmp:
        tmp.write(b'This is not an mpg4 file')
        video, audio, info = io.read_video(tmp.name)

        decoded = (video, audio)
        for tensor in decoded:
            self.assertIsInstance(tensor, torch.Tensor)
        for tensor in decoded:
            self.assertEqual(tensor.numel(), 0)
        self.assertEqual(info, {})
def test_read_video_corrupted_file(self):
    """Reading a file that is not a valid video yields empty tensors and empty info."""
    with tempfile.NamedTemporaryFile(suffix='.mp4') as tmp:
        tmp.write(b'This is not an mpg4 file')
        video, audio, info = io.read_video(tmp.name)

        decoded = (video, audio)
        for tensor in decoded:
            assert isinstance(tensor, torch.Tensor)
        for tensor in decoded:
            assert tensor.numel() == 0
        assert info == {}
def _read_video(filename, start_pts=0, end_pts=None):
    """Read a video with whichever backend ``_video_backend`` selects.

    With "pyav" the call is forwarded to ``io.read_video``. Any other
    backend goes through ``io._read_video_from_file``, where a missing
    ``end_pts`` is passed as ``-1``.
    """
    if _video_backend != "pyav":
        stop_pts = -1 if end_pts is None else end_pts
        return io._read_video_from_file(
            filename,
            video_pts_range=(start_pts, stop_pts),
        )
    return io.read_video(filename, start_pts, end_pts)