def make_ucf11_datamodule(root='./', **kwargs):
    data_path = Path(root) / 'action_youtube_naudio'
    if not data_path.exists():
        download_and_unzip(_ucf11_url, root, False)

    # Collect all class names, scene folders, and label2id mapping
    classes = sorted(x.name for x in data_path.glob("*") if x.is_dir())
    label2id = {}
    scene_folders = []
    for class_id, class_name in enumerate(classes):
        label2id[class_name] = class_id
        class_folder = data_path / class_name
        scene_folders.extend(
            list(filter(Path.is_dir, class_folder.glob('v_*'))))

    shuffle(scene_folders)

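    # Split at the scene-folder level (80/20) rather than per clip so that
    # clips from the same scene never appear in both train and val.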
    num_train_scenes = int(0.8 * len(scene_folders))
    train_paths, val_paths = [], []
    for i, scene in enumerate(scene_folders):
        class_id = label2id[scene.parent.name]
        labeled_paths = [(video, class_id) for video in scene.glob('*.avi')]
        if i < num_train_scenes:
            train_paths.extend(labeled_paths)
        else:
            val_paths.extend(labeled_paths)

    return LabeledVideoDataModule(LabeledVideoPaths(train_paths),
                                  LabeledVideoPaths(val_paths),
                                  label2id=label2id,
                                  classes=classes,
                                  **kwargs)
def make_mini_kinetics_datamodule(root='./', **kwargs):
    kinetics_path = Path(root) / 'kinetics'
    if not kinetics_path.exists():
        download_and_unzip(_mini_kinetics_url, root)

    return LabeledVideoDataModule(
        LabeledVideoPaths.from_path(kinetics_path / 'train'),
        LabeledVideoPaths.from_path(kinetics_path / 'val'), **kwargs)
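# Hedged sketch, not part of the original snippets: LabeledVideoPaths.from_path
# accepts either a whitespace-separated "<video_path> <label>" file or a
# class-per-subfolder directory such as kinetics/train/<class>/<video>.mp4,
# assigning integer labels in sorted class-folder order (see
# test_reading_from_directory_structure below). _peek_kinetics_split is a
# hypothetical helper for illustration only.
def _peek_kinetics_split(root='./', split='train'):
    paths = LabeledVideoPaths.from_path(Path(root) / 'kinetics' / split)
    print(f"{split}: {len(paths)} labeled videos")
    return paths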
def run_distributed(rank, size, decoder, clip_duration, data_name,
                    return_dict):
    """
    This function is run by each distributed process. It samples videos
    based on the distributed split (determined by the
    DistributedSampler) and returns the dataset clips in the return_dict.
    """
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group("gloo", rank=rank, world_size=size)
    clip_sampler = make_clip_sampler("uniform", clip_duration)
    labeled_video_paths = LabeledVideoPaths.from_path(data_name)
    dataset = LabeledVideoDataset(
        labeled_video_paths,
        clip_sampler=clip_sampler,
        video_sampler=DistributedSampler,
        decode_audio=False,
        decoder=decoder,
    )
    test_dataloader = DataLoader(dataset, batch_size=None, num_workers=1)

    # Run two epochs, simulating use in a training loop
    dataset.video_sampler.set_epoch(0)
    epoch_1 = [(sample["label"], sample["video"])
               for sample in test_dataloader]
    dataset.video_sampler.set_epoch(1)
    epoch_2 = [(sample["label"], sample["video"])
               for sample in test_dataloader]
    return_dict[rank] = {"epoch_1": epoch_1, "epoch_2": epoch_2}
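# Hedged sketch of how run_distributed might be driven; the world size of 2 and
# the helper name are assumptions for illustration, not the original harness.
def _launch_distributed_sampling(data_name, decoder="pyav", clip_duration=1.0):
    import torch.multiprocessing as mp

    world_size = 2
    manager = mp.Manager()
    return_dict = manager.dict()  # shared dict collected from every rank
    processes = []
    for rank in range(world_size):
        p = mp.Process(
            target=run_distributed,
            args=(rank, world_size, decoder, clip_duration, data_name,
                  return_dict),
        )
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    # Each rank sees a disjoint subset of videos per epoch, so the union of the
    # per-rank clips should cover the dataset.
    return dict(return_dict)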
    def test_video_name_with_whitespace_works(self, decoder):
        num_frames = 10
        fps = 5
        with temp_encoded_video(num_frames=num_frames,
                                fps=fps,
                                prefix="pre fix") as (
                                    video_file_name,
                                    data,
                                ):
            with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as f:
                f.write(f"{video_file_name} 0\n".encode())
                f.write(f"{video_file_name} 1\n".encode())

            total_duration = num_frames / fps
            clip_sampler = make_clip_sampler("uniform", total_duration)
            labeled_video_paths = LabeledVideoPaths.from_path(f.name)
            dataset = LabeledVideoDataset(
                labeled_video_paths,
                clip_sampler=clip_sampler,
                video_sampler=SequentialSampler,
                decode_audio=False,
                decoder=decoder,
            )

            expected = [(0, data), (1, data)]
            for i, sample in enumerate(dataset):
                self.assertTrue(sample["video"].equal(expected[i][1]))
                self.assertEqual(sample["label"], expected[i][0])
    def test_sampling_with_more_processes_than_videos(self, decoder):
        with mock_encoded_video_dataset_file() as (
                mock_csv,
                label_videos,
                total_duration,
        ):
            half_duration = total_duration / 2 - self._EPS
            clip_sampler = make_clip_sampler("uniform", half_duration)
            labeled_video_paths = LabeledVideoPaths.from_path(mock_csv)
            dataset = LabeledVideoDataset(
                labeled_video_paths,
                clip_sampler=clip_sampler,
                video_sampler=SequentialSampler,
                decode_audio=False,
                decoder=decoder,
            )

            # Split each full video into two clips.
            expected = []
            for label, data in label_videos:
                num_frames = data.shape[0]
                half_frames = num_frames // 2
                first_half_data = data[:, :half_frames]
                second_half_data = data[:, half_frames:]
                expected.append((label, first_half_data))
                expected.append((label, second_half_data))

            test_dataloader = DataLoader(dataset,
                                         batch_size=None,
                                         num_workers=16)
            actual = [(sample["label"], sample["video"])
                      for sample in test_dataloader]
            assert_unordered_list_compare_true(self, expected, actual)
    def test_reading_from_directory_structure(self, decoder):
        # For an unknown reason this import has to be here for `buck test` to work.
        import torchvision.io as io

        with tempfile.TemporaryDirectory() as root_dir:

            # Create test directory structure with two classes and a video in each.
            root_dir_name = pathlib.Path(root_dir)
            test_class_1 = root_dir_name / "running"
            test_class_1.mkdir()
            data_1 = create_dummy_video_frames(15, 10, 10)
            test_class_2 = root_dir_name / "cleaning windows"
            test_class_2.mkdir()
            data_2 = create_dummy_video_frames(20, 15, 15)
            with tempfile.NamedTemporaryFile(
                    suffix=".mp4",
                    dir=test_class_1) as f_1, tempfile.NamedTemporaryFile(
                        suffix=".mp4", dir=test_class_2) as f_2:
                f_1.close()
                f_2.close()

                # Write lossless video for each class.
                io.write_video(
                    f_1.name,
                    data_1,
                    fps=30,
                    video_codec="libx264rgb",
                    options={"crf": "0"},
                )
                io.write_video(
                    f_2.name,
                    data_2,
                    fps=30,
                    video_codec="libx264rgb",
                    options={"crf": "0"},
                )

                clip_sampler = make_clip_sampler("uniform", 3)
                labeled_video_paths = LabeledVideoPaths.from_path(root_dir)
                dataset = LabeledVideoDataset(
                    labeled_video_paths,
                    clip_sampler=clip_sampler,
                    video_sampler=SequentialSampler,
                    decode_audio=False,
                    decoder=decoder,
                )

                # Videos are sorted alphabetically so "cleaning windows" (i.e. data_2)
                # will be first.
                sample_1 = next(dataset)
                self.assertEqual(sample_1["label"], 0)
                self.assertTrue(sample_1["video"].equal(
                    thwc_to_cthw(data_2).to(torch.float32)))

                sample_2 = next(dataset)
                self.assertEqual(sample_2["label"], 1)
                self.assertTrue(sample_2["video"].equal(
                    thwc_to_cthw(data_1).to(torch.float32)))
    def test_sampling_with_non_divisible_processes_by_clips(self, decoder):

        # Make one video with 15 frames and one with 10 frames, producing 3 clips and 2
        # clips respectively.
        num_frames = 10
        fps = 5
        with temp_encoded_video(num_frames=int(num_frames * 1.5), fps=fps) as (
                video_file_name_1,
                data_1,
        ):
            with temp_encoded_video(num_frames=num_frames, fps=fps) as (
                    video_file_name_2,
                    data_2,
            ):
                with tempfile.NamedTemporaryFile(delete=False,
                                                 suffix=".txt") as f:
                    f.write(f"{video_file_name_1} 0\n".encode())
                    f.write(f"{video_file_name_2} 1\n".encode())

                total_duration = num_frames / fps
                half_duration = total_duration / 2 - self._EPS
                clip_sampler = make_clip_sampler("uniform", half_duration)
                labeled_video_paths = LabeledVideoPaths.from_path(f.name)
                dataset = LabeledVideoDataset(
                    labeled_video_paths,
                    clip_sampler=clip_sampler,
                    video_sampler=SequentialSampler,
                    decode_audio=False,
                    decoder=decoder,
                )

                half_frames = num_frames // 2
                expected = {
                    (0, data_1[:, half_frames * 2:]),  # 1/3 clip
                    (0, data_1[:, half_frames:half_frames * 2]),  # 2/3 clip
                    (0, data_1[:, :half_frames]),  # 3/3 clip
                    (1, data_2[:, :half_frames]),  # First half
                    (1, data_2[:, half_frames:]),  # Second half
                }

                test_dataloader = DataLoader(dataset,
                                             batch_size=None,
                                             num_workers=2)
                actual = [(sample["label"], sample["video"])
                          for sample in test_dataloader]
                assert_unordered_list_compare_true(self, expected, actual)
    def test_constant_clips_per_video_sampling_works(self, decoder):
        # Make one video with 15 frames and one with 10 frames, producing 3 clips and 2
        # clips respectively.
        num_frames = 10
        fps = 5
        with temp_encoded_video(num_frames=int(num_frames * 1.5), fps=fps) as (
                video_file_name_1,
                data_1,
        ):
            with temp_encoded_video(num_frames=num_frames, fps=fps) as (
                    video_file_name_2,
                    data_2,
            ):
                with tempfile.NamedTemporaryFile(delete=False,
                                                 suffix=".txt") as f:
                    f.write(f"{video_file_name_1} 0\n".encode())
                    f.write(f"{video_file_name_2} 1\n".encode())

                clip_frames = 2
                duration_for_frames = clip_frames / fps - self._EPS
                clip_sampler = make_clip_sampler("constant_clips_per_video",
                                                 duration_for_frames, 2)
                labeled_video_paths = LabeledVideoPaths.from_path(f.name)
                dataset = LabeledVideoDataset(
                    labeled_video_paths,
                    clip_sampler=clip_sampler,
                    video_sampler=SequentialSampler,
                    decode_audio=False,
                    decoder=decoder,
                )

                # The dataset has 2 videos. Two evenly spaced clips of
                # clip_frames frames are sampled from each video. The first
                # clip always starts at second 0; the second clip starts at the
                # first frame at or after time (total_duration - clip_duration) / 2.
                half_frames_1 = math.ceil((data_1.shape[1] - clip_frames) / 2)
                half_frames_2 = math.ceil((data_2.shape[1] - clip_frames) / 2)
                expected = [
                    (0, data_1[:, :clip_frames]),
                    (0, data_1[:, half_frames_1:half_frames_1 + clip_frames]),
                    (1, data_2[:, :clip_frames]),
                    (1, data_2[:, half_frames_2:half_frames_2 + clip_frames]),
                ]
                for i, sample in enumerate(dataset):
                    self.assertTrue(sample["video"].equal(expected[i][1]))
                    self.assertEqual(sample["label"], expected[i][0])
    def _make_encoded_video_dataset(self, data: SampleCollection) -> 'EncodedVideoDataset':
        classes = self._get_classes(data)
        label_to_class_mapping = dict(enumerate(classes))
        class_to_label_mapping = {c: lab for lab, c in label_to_class_mapping.items()}

        filepaths = data.values("filepath")
        labels = data.values(self.label_field + ".label")
        targets = [class_to_label_mapping[lab] for lab in labels]
        labeled_video_paths = LabeledVideoPaths(list(zip(filepaths, targets)))

        ds: EncodedVideoDataset = EncodedVideoDataset(
            labeled_video_paths,
            self.clip_sampler,
            video_sampler=self.video_sampler,
            decode_audio=self.decode_audio,
            decoder=self.decoder,
        )
        return ds
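# Tiny worked example of the label mapping built above; the class names are
# illustrative, not taken from a real dataset.
classes = ["jumping", "running"]
label_to_class_mapping = dict(enumerate(classes))  # {0: "jumping", 1: "running"}
class_to_label_mapping = {c: lab for lab, c in label_to_class_mapping.items()}
assert class_to_label_mapping == {"jumping": 0, "running": 1}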
    def test_random_clip_sampling_works(self, decoder):
        with mock_encoded_video_dataset_file() as (
                mock_csv,
                label_videos,
                total_duration,
        ):
            half_duration = total_duration / 2 - self._EPS
            clip_sampler = make_clip_sampler("random", half_duration)
            labeled_video_paths = LabeledVideoPaths.from_path(mock_csv)
            dataset = LabeledVideoDataset(
                labeled_video_paths,
                clip_sampler=clip_sampler,
                video_sampler=SequentialSampler,
                decode_audio=False,
                decoder=decoder,
            )

            expected_labels = [label for label, _ in label_videos]
            for i, sample in enumerate(dataset):
                expected_t_shape = 5
                self.assertEqual(sample["video"].shape[1], expected_t_shape)
                self.assertEqual(sample["label"], expected_labels[i])
    def load_data(
        self,
        files: List[PATH_TYPE],
        targets: List[Any],
        clip_sampler: Union[str, "ClipSampler"] = "random",
        clip_duration: float = 2,
        clip_sampler_kwargs: Dict[str, Any] = None,
        video_sampler: Type[Sampler] = torch.utils.data.RandomSampler,
        decode_audio: bool = False,
        decoder: str = "pyav",
        target_formatter: Optional[TargetFormatter] = None,
    ) -> "LabeledVideoDataset":
        dataset = LabeledVideoDataset(
            LabeledVideoPaths(list(zip(files, targets))),
            _make_clip_sampler(clip_sampler, clip_duration, clip_sampler_kwargs),
            video_sampler=video_sampler,
            decode_audio=decode_audio,
            decoder=decoder,
        )
        if not self.predicting:
            self.load_target_metadata(
                [sample[1] for sample in dataset._labeled_videos._paths_and_labels],
                target_formatter=target_formatter,
            )
        return dataset
def Ptvkinetics(cfg, mode):
    """
    Construct the Kinetics video loader with a given csv file. The format of
    the csv file is:
    ```
    path_to_video_1 label_1
    path_to_video_2 label_2
    ...
    path_to_video_N label_N
    ```
    For `train` and `val` mode, a single clip is randomly sampled from every video
    with random cropping, scaling, and flipping. For `test` mode, multiple clips are
    uniformly sampled from every video with center cropping.
    Args:
        cfg (CfgNode): configs.
        mode (string): Options include `train`, `val`, or `test` mode.
            For the train and val mode, the data loader will take data
            from the train or val set, and sample one clip per video.
            For the test mode, the data loader will take data from test set,
            and sample multiple clips per video.
    """
    # Only support train, val, and test mode.
    assert mode in [
        "train",
        "val",
        "test",
    ], "Split '{}' not supported".format(mode)

    logger.info("Constructing Ptvkinetics {}...".format(mode))

    clip_duration = (cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE /
                     cfg.DATA.TARGET_FPS)
    path_to_file = os.path.join(cfg.DATA.PATH_TO_DATA_DIR,
                                "{}.csv".format(mode))
    labeled_video_paths = LabeledVideoPaths.from_path(path_to_file)
    num_videos = len(labeled_video_paths)
    labeled_video_paths.path_prefix = cfg.DATA.PATH_PREFIX
    logger.info("Constructing kinetics dataloader (size: {}) from {}".format(
        num_videos, path_to_file))

    if mode in ["train", "val"]:
        num_clips = 1
        num_crops = 1

        transform = Compose([
            ApplyTransformToKey(
                key="video",
                transform=Compose([
                    UniformTemporalSubsample(cfg.DATA.NUM_FRAMES),
                    Lambda(div255),
                    NormalizeVideo(cfg.DATA.MEAN, cfg.DATA.STD),
                    RandomShortSideScale(
                        min_size=cfg.DATA.TRAIN_JITTER_SCALES[0],
                        max_size=cfg.DATA.TRAIN_JITTER_SCALES[1],
                    ),
                    RandomCropVideo(cfg.DATA.TRAIN_CROP_SIZE),
                ] + ([RandomHorizontalFlipVideo(p=0.5)]
                     if cfg.DATA.RANDOM_FLIP else []) + [PackPathway(cfg)]),
            ),
            DictToTuple(num_clips, num_crops),
        ])

        clip_sampler = make_clip_sampler("random", clip_duration)
        if cfg.NUM_GPUS > 1:
            video_sampler = DistributedSampler
        else:
            video_sampler = (RandomSampler
                             if mode == "train" else SequentialSampler)
    else:
        num_clips = cfg.TEST.NUM_ENSEMBLE_VIEWS
        num_crops = cfg.TEST.NUM_SPATIAL_CROPS

        transform = Compose([
            ApplyTransformToKey(
                key="video",
                transform=Compose([
                    UniformTemporalSubsample(cfg.DATA.NUM_FRAMES),
                    Lambda(div255),
                    NormalizeVideo(cfg.DATA.MEAN, cfg.DATA.STD),
                    ShortSideScale(size=cfg.DATA.TRAIN_JITTER_SCALES[0]),
                ]),
            ),
            UniformCropVideo(size=cfg.DATA.TEST_CROP_SIZE),
            ApplyTransformToKey(key="video", transform=PackPathway(cfg)),
            DictToTuple(num_clips, num_crops),
        ])
        clip_sampler = make_clip_sampler(
            "constant_clips_per_video",
            clip_duration,
            num_clips,
            num_crops,
        )
        video_sampler = (DistributedSampler
                         if cfg.NUM_GPUS > 1 else SequentialSampler)

    return PTVDatasetWrapper(
        num_videos=num_videos,
        clips_per_video=num_clips,
        crops_per_clip=num_crops,
        dataset=LabeledVideoDataset(
            labeled_video_paths=labeled_video_paths,
            clip_sampler=clip_sampler,
            video_sampler=video_sampler,
            transform=transform,
            decode_audio=False,
        ),
    )
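# Worked example of the clip_duration formula above, using illustrative config
# values (NUM_FRAMES=8, SAMPLING_RATE=8, TARGET_FPS=30 are assumptions, not
# values read from any particular config file):
num_frames, sampling_rate, target_fps = 8, 8, 30
clip_duration = num_frames * sampling_rate / target_fps  # ~2.13 seconds per clip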