Example #1
    def test_define_clip_structure_generator(self):
        seconds_per_clip = 5
        define_clip_structure_fn = (
            EpicKitchenRecognition._define_clip_structure_generator(
                seconds_per_clip=seconds_per_clip,
                clip_sampling=ClipSampling.RandomOffsetUniform))
        frame_videos = {
            "P01_003":
            FrameVideo.from_frame_paths(
                [f"root/P01_003/frame_{i}" for i in range(100)], 10),
            "P02_004":
            FrameVideo.from_frame_paths(
                [f"root/P02_004/frame_{i}" for i in range(300)], 10),
            "P11_010":
            FrameVideo.from_frame_paths(
                [f"root/P11_010/frame_{i}" for i in range(600)], 30),
        }
        actions = {video_id: [] for video_id in frame_videos}
        random_value = 0.5
        with unittest.mock.patch("random.random",
                                 return_value=random_value) as _:
            clips = define_clip_structure_fn(frame_videos, actions)
            sorted_clips = sorted(clips,
                                  key=lambda c: c.start_time)  # For stability

            for clip in sorted_clips:
                self.assertEqual(clip.stop_time - clip.start_time,
                                 seconds_per_clip)

            clips_P01_003 = [
                c for c in sorted_clips if c.video_id == "P01_003"
            ]
            self.assertEqual(len(clips_P01_003), 1)
            for i in range(len(clips_P01_003)):
                self.assertEqual(clips_P01_003[i].start_time,
                                 seconds_per_clip * (i + random_value))

            clips_P02_004 = [
                c for c in sorted_clips if c.video_id == "P02_004"
            ]
            self.assertEqual(len(clips_P02_004), 5)
            for i in range(len(clips_P02_004)):
                self.assertEqual(clips_P02_004[i].start_time,
                                 seconds_per_clip * (i + random_value))

            clips_P11_010 = [
                c for c in sorted_clips if c.video_id == "P11_010"
            ]
            self.assertEqual(len(clips_P11_010), 3)
            for i in range(len(clips_P11_010)):
                self.assertEqual(clips_P11_010[i].start_time,
                                 seconds_per_clip * (i + random_value))
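
The expected clip counts above (1, 5 and 3) follow directly from the video durations. A minimal sketch, assuming the generator only keeps clips that end within the video and places clip i at seconds_per_clip * (i + offset), as the assertions imply:

seconds_per_clip, offset = 5, 0.5  # offset comes from the mocked random.random()
for video_id, (num_frames, fps) in {
        "P01_003": (100, 10), "P02_004": (300, 10), "P11_010": (600, 30)}.items():
    duration = num_frames / fps
    # Count clips whose [start, start + seconds_per_clip] window fits inside the video.
    n_clips = sum(
        1 for i in range(int(duration))
        if seconds_per_clip * (i + offset) + seconds_per_clip <= duration)
    print(video_id, duration, n_clips)  # 10.0 -> 1, 30.0 -> 5, 20.0 -> 3
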
Example #2
    def video_from_path(self,
                        filepath,
                        decode_audio=False,
                        decoder="pyav",
                        fps=30):
        try:
            is_file = g_pathmgr.isfile(filepath)
            is_dir = g_pathmgr.isdir(filepath)
        except NotImplementedError:
            # Not all PathManager handlers support is{file,dir} functions; when
            # this is the case, we default to assuming the path is a file.
            is_file = True
            is_dir = False

        if is_file:
            from pytorchvideo.data.encoded_video import EncodedVideo

            return EncodedVideo.from_path(filepath, decode_audio, decoder)
        elif is_dir:
            from pytorchvideo.data.frame_video import FrameVideo

            assert not decode_audio, "decode_audio must be False when using FrameVideo"
            return FrameVideo.from_directory(
                filepath, fps, path_order_cache=self.path_order_cache)
        else:
            raise FileNotFoundError(f"{filepath} not found.")
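
A short usage sketch of the two branches above, with placeholder paths: encoded video files go through EncodedVideo, frame directories through FrameVideo, mirroring the lazy imports in video_from_path.

from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.data.frame_video import FrameVideo

# "clip.mp4" and "clip_frames/" are placeholder paths for an encoded video and
# a directory of extracted frames, respectively.
encoded_video = EncodedVideo.from_path("clip.mp4", False, "pyav")  # (filepath, decode_audio, decoder)
frame_video = FrameVideo.from_directory("clip_frames/", 30)        # (filepath, fps)

# Both expose get_clip; FrameVideo clips additionally report "frame_indices".
clip = frame_video.get_clip(0, 1.0)
video_tensor = clip["video"]
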
    def test_frame_video_works(self):
        frame_names = [f"{str(i)}.png" for i in range(3)]
        with temp_frame_video(frame_names) as (f_name, data):
            frame_paths = [f_name / x for x in frame_names]
            test_video = FrameVideo.from_frame_paths(frame_paths)
            expected_duration = (
                0.1  # Total duration of 3 frames at 30fps is 0.1 seconds.
            )
            self.assertEqual(test_video.duration, expected_duration)

            # All frames (0 - 0.1 seconds)
            clip = test_video.get_clip(0, 0.1)
            frames, indices = clip["video"], clip["frame_indices"]
            self.assertTrue(frames.equal(data))
            self.assertEqual(indices, [0, 1, 2])

            # All frames (0 - 0.1 seconds), filtered to the middle frame
            clip = test_video.get_clip(0, 0.1, lambda lst: lst[1:2])
            frames, indices = clip["video"], clip["frame_indices"]
            self.assertTrue(frames.equal(data[:, 1:2]))
            self.assertEqual(indices, [1])

            # 2 frames (0 - 0.066 seconds)
            clip = test_video.get_clip(0, 0.066)
            frames, indices = clip["video"], clip["frame_indices"]
            self.assertTrue(frames.equal(data[:, :2]))
            self.assertEqual(indices, [0, 1])

            # No frames (3 - 5 seconds)
            result = test_video.get_clip(3, 5)
            self.assertEqual(result, None)
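
The timing arithmetic the test relies on can be sketched explicitly. Assuming frame i of a FrameVideo covers [i / fps, (i + 1) / fps) at the default 30 fps, three frames give a 0.1 s duration and a 0.066 s window covers only frames 0 and 1:

fps = 30           # default fps assumed by FrameVideo.from_frame_paths
num_frames = 3
duration = num_frames / fps                                     # 0.1 seconds
in_window = [i for i in range(num_frames) if i / fps < 0.066]   # frames starting before 0.066 s
print(duration, in_window)  # 0.1 [0, 1]
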
Example #4
    def __next__(self) -> dict:
        """
        Retrieves the next clip based on the clip sampling strategy and video sampler.

        Returns:
            A dictionary with the following format.

            .. code-block:: text

                {
                    'video': <video_tensor>,
                    'label': <index_label>,
                    'video_name': <video_name>,
                    'video_index': <video_index>,
                    'clip_index': <clip_index>,
                    'aug_index': <aug_index>,
                }
        """
        if not self._video_sampler_iter:
            # Setup MultiProcessSampler here - after PyTorch DataLoader workers are spawned.
            self._video_sampler_iter = iter(
                MultiProcessSampler(self._video_sampler))

        if self._loaded_video:
            video, video_index = self._loaded_video
        else:
            video_index = next(self._video_sampler_iter)
            path_to_video_frames = self._path_to_videos[video_index]
            video = FrameVideo.from_frame_paths(path_to_video_frames)
            self._loaded_video = (video, video_index)

        clip_start, clip_end, clip_index, aug_index, is_last_clip = self._clip_sampler(
            self._next_clip_start_time, video.duration, {})
        # Only load the clip once and reuse previously stored clip if there are multiple
        # views for augmentations to perform on the same clip.
        if aug_index == 0:
            self._loaded_clip = video.get_clip(0, video.duration,
                                               self._frame_filter)

        self._next_clip_start_time = clip_end

        if is_last_clip:
            self._loaded_video = None
            self._next_clip_start_time = 0.0

        sample_dict = {
            "video": self._loaded_clip["video"],
            "label": self._labels[video_index],
            "video_name": str(video_index),
            "video_index": video_index,
            "clip_index": clip_index,
            "aug_index": aug_index,
        }
        if self._transform is not None:
            sample_dict = self._transform(sample_dict)

        return sample_dict

def _load_frame_videos(
    frame_manifest_file_path: str,
    video_infos: Dict[str, VideoInfo],
    multithreaded_io: bool,
):
    video_frames: Dict[str, VideoFrameInfo] = load_dataclass_dict_from_csv(
        frame_manifest_file_path, VideoFrameInfo, "video_id")
    VideoDataset._remove_video_info_missing_or_incomplete_videos(
        video_frames, video_infos)
    return {
        video_id: FrameVideo(
            video_frame_paths=VideoDataset._frame_number_to_filepaths(
                video_id, video_frames, video_infos),
            duration=video_infos[video_id].duration,
            fps=video_infos[video_id].fps,
            multithreaded_io=multithreaded_io,
        )
        for video_id in video_infos
    }
Example #6
    def __next__(self) -> dict:
        """
        Retrieves the next clip based on the clip sampling strategy and video sampler.

        Returns:
            A video clip with the following format if transform is None:
                {
                    'video': <video_tensor>,
                    'label': <index_label> for clip-level label,
                    'video_label': <index_label> for video-level label,
                    'video_name': <video_name>,
                    'video_index': <video_index>,
                    'clip_index': <clip_index>,
                    'aug_index': <aug_index>, augmentation index as augmentations
                        might generate multiple views for one clip.
                }
            Otherwise, the transform defines the clip output.
        """
        if not self._video_sampler_iter:
            # Setup MultiProcessSampler here - after PyTorch DataLoader workers are spawned.
            self._video_sampler_iter = iter(
                MultiProcessSampler(self._video_sampler))

        if self._loaded_video:
            video, video_index = self._loaded_video
        else:
            video_index = next(self._video_sampler_iter)
            path_to_video_frames = self._path_to_videos[video_index]
            video = FrameVideo.from_frame_paths(path_to_video_frames)
            self._loaded_video = (video, video_index)

        clip_start, clip_end, clip_index, aug_index, is_last_clip = self._clip_sampler(
            self._next_clip_start_time, video.duration)
        # Only load the clip once and reuse previously stored clip if there are multiple
        # views for augmentations to perform on the same clip.
        if aug_index == 0:
            self._loaded_clip = video.get_clip(clip_start, clip_end,
                                               self._frame_filter)
        frames, frame_indices = (
            self._loaded_clip["video"],
            self._loaded_clip["frame_indices"],
        )
        self._next_clip_start_time = clip_end

        if is_last_clip:
            self._loaded_video = None
            self._next_clip_start_time = 0.0

        # Gather the per-frame labels spanning the clip's frame range into the clip label.
        labels_by_frame = [
            self._labels[video_index][i]
            for i in range(min(frame_indices),
                           max(frame_indices) + 1)
        ]
        sample_dict = {
            "video": frames,
            "label": labels_by_frame,
            "video_label": self._video_labels[video_index],
            "video_name": str(video_index),
            "video_index": video_index,
            "clip_index": clip_index,
            "aug_index": aug_index,
        }
        if self._transform is not None:
            sample_dict = self._transform(sample_dict)

        return sample_dict
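
A minimal sketch of the per-frame label merge performed above: every frame index between the first and last index returned by get_clip contributes its label to the clip label, including frames the frame filter skipped (the names and values below are placeholders):

frame_indices = [3, 5, 7]                          # indices returned by get_clip
frame_labels = [f"label_{i}" for i in range(10)]   # placeholder per-frame labels
labels_by_frame = [
    frame_labels[i] for i in range(min(frame_indices), max(frame_indices) + 1)
]
print(labels_by_frame)  # labels for frames 3 through 7, not only the sampled ones
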
    def test_define_clip_structure_generator(self):
        frame_videos = {
            "P01_003":
            FrameVideo.from_frame_paths(
                [f"root/P01_003/frame_{i}" for i in range(200)], 10),
            "P02_004":
            FrameVideo.from_frame_paths(
                [f"root/P02_004/frame_{i}" for i in range(300)], 10),
            "P11_010":
            FrameVideo.from_frame_paths(
                [f"root/P11_010/frame_{i}" for i in range(600)], 30),
        }
        actions = {
            "P01_003": [
                ActionData(
                    "P01",
                    "P01_003",
                    "turn off light",
                    "00:00:01.00",
                    "00:00:02.00",
                    262,
                    370,
                    "turn-off",
                    12,
                    "light",
                    113,
                    "['light']",
                    "[113]",
                ),
                ActionData(
                    "P01",
                    "P01_003",
                    "turn on light",
                    "00:00:04.00",
                    "00:00:05.00",
                    262,
                    370,
                    "turn-on",
                    12,
                    "light",
                    113,
                    "['light']",
                    "[113]",
                ),
                ActionData(
                    "P01",
                    "P01_003",
                    "close door",
                    "00:00:06.00",
                    "00:00:07.00",
                    418,
                    569,
                    "close",
                    3,
                    "door",
                    8,
                    "['door']",
                    "[8]",
                ),
                ActionData(
                    "P01",
                    "P01_003",
                    "slam door",
                    "00:00:10.00",
                    "00:00:11.00",
                    408,
                    509,
                    "slam",
                    3,
                    "door",
                    8,
                    "['door']",
                    "[8]",
                ),
            ],
            "P02_004": [
                ActionData(
                    "P02",
                    "P02_004",
                    "turn off light",
                    "00:00:04.00",
                    "00:00:05.00",
                    262,
                    370,
                    "turn-off",
                    12,
                    "light",
                    113,
                    "['light']",
                    "[113]",
                ),
                ActionData(
                    "P02",
                    "P02_004",
                    "turn on light",
                    "00:00:05.00",
                    "00:00:06.00",
                    262,
                    370,
                    "turn-on",
                    12,
                    "light",
                    113,
                    "['light']",
                    "[113]",
                ),
                ActionData(
                    "P02",
                    "P02_004",
                    "close door",
                    "00:00:08.00",
                    "00:00:09.00",
                    418,
                    569,
                    "close",
                    3,
                    "door",
                    8,
                    "['door']",
                    "[8]",
                ),
                ActionData(
                    "P02",
                    "P02_004",
                    "slam door",
                    "00:00:10.00",
                    "00:00:11.00",
                    408,
                    509,
                    "slam",
                    3,
                    "door",
                    8,
                    "['door']",
                    "[8]",
                ),
            ],
            "P11_010": [
                ActionData(
                    "P11",
                    "P11_010",
                    "turn off light",
                    "00:00:01.00",
                    "00:00:02.00",
                    262,
                    370,
                    "turn-off",
                    12,
                    "light",
                    113,
                    "['light']",
                    "[113]",
                ),
                ActionData(
                    "P11",
                    "P11_010",
                    "turn on light",
                    "00:00:04.00",
                    "00:00:05.50",
                    262,
                    370,
                    "turn-on",
                    12,
                    "light",
                    113,
                    "['light']",
                    "[113]",
                ),
                ActionData(
                    "P11",
                    "P11_010",
                    "turn on light",
                    "00:00:04.00",
                    "00:00:06.00",
                    262,
                    370,
                    "turn-on",
                    12,
                    "light",
                    113,
                    "['light']",
                    "[113]",
                ),
                ActionData(
                    "P11",
                    "P11_010",
                    "close door",
                    "00:00:06.00",
                    "00:00:07.00",
                    418,
                    569,
                    "close",
                    3,
                    "door",
                    8,
                    "['door']",
                    "[8]",
                ),
                ActionData(
                    "P11",
                    "P11_010",
                    "slam door",
                    "00:00:10.00",
                    "00:00:11.00",
                    408,
                    509,
                    "slam",
                    3,
                    "door",
                    8,
                    "['door']",
                    "[8]",
                ),
            ],
        }
        random_value = 0.5
        with unittest.mock.patch("random.random",
                                 return_value=random_value) as _:
            define_clip_structure_fn = (
                EpicKitchenForecasting._define_clip_structure_generator(
                    seconds_per_clip=1,
                    clip_time_stride=3,
                    num_input_clips=2,
                    num_forecast_actions=2,
                    clip_sampling=ClipSampling.Random,
                ))
            clips = define_clip_structure_fn(frame_videos, actions)
            sorted_clips = sorted(clips,
                                  key=lambda c: c.start_time)  # For stability
            for clip in sorted_clips:
                self.assertEqual(clip.stop_time - clip.start_time, 4.0)

            clips_P01_003 = [
                c for c in sorted_clips if c.video_id == "P01_003"
            ]
            self.assertEqual(len(clips_P01_003), 1)

            self.assertEqual(clips_P01_003[0].start_time,
                             actions["P01_003"][1].stop_time)

            clips_P02_004 = [
                c for c in sorted_clips if c.video_id == "P02_004"
            ]
            self.assertEqual(len(clips_P02_004), 2)
            self.assertEqual(clips_P02_004[0].start_time,
                             actions["P02_004"][0].stop_time)
            self.assertEqual(clips_P02_004[1].start_time,
                             actions["P02_004"][1].stop_time)

            clips_P11_010 = [
                c for c in sorted_clips if c.video_id == "P11_010"
            ]
            self.assertEqual(len(clips_P11_010), 1)
            self.assertEqual(clips_P11_010[0].start_time,
                             actions["P11_010"][1].stop_time)
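
The 4.0 s clip span asserted above is consistent with the forecasting parameters: two 1 s input clips whose starts are 3 s apart. A sketch of that relationship (the exact formula is an assumption inferred from the parameters and the assertion):

seconds_per_clip, clip_time_stride, num_input_clips = 1, 3, 2
clip_span = (num_input_clips - 1) * clip_time_stride + seconds_per_clip
print(clip_span)  # 4, matching clip.stop_time - clip.start_time in the test
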
    def test_empty_frames_failure(self):
        with pytest.raises(AssertionError):
            FrameVideo.from_frame_paths([])

    def test_open_video_failure(self):
        test_video = FrameVideo.from_frame_paths(["non_existent_file.txt"])
        with pytest.raises(Exception):
            # Duration is 1 / 30 s: a single frame at the default 30 fps.
            test_video.get_clip(0, 0.01)