Example #1
    def default_transforms(self) -> Dict[str, Callable]:
        if self.training:
            post_tensor_transform = [
                RandomShortSideScale(min_size=256, max_size=320),
                RandomCrop(244),
                RandomHorizontalFlip(p=0.5),
            ]
        else:
            post_tensor_transform = [
                ShortSideScale(256),
            ]

        return {
            "post_tensor_transform": Compose([
                ApplyTransformToKey(
                    key="video",
                    transform=Compose([UniformTemporalSubsample(8)] + post_tensor_transform),
                ),
            ]),
            "per_batch_transform_on_device": Compose([
                ApplyTransformToKey(
                    key="video",
                    transform=K.VideoSequential(
                        K.Normalize(torch.tensor([0.45, 0.45, 0.45]), torch.tensor([0.225, 0.225, 0.225])),
                        data_format="BCTHW",
                        same_on_frame=False
                    )
                ),
            ]),
        }
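
A minimal usage sketch (not part of the original example; names and shapes are illustrative):
the "post_tensor_transform" entry runs per sample on the CPU, while
"per_batch_transform_on_device" expects a batched (B, C, T, H, W) "video" tensor.

    transforms = self.default_transforms()                        # called inside the class
    sample = {"video": torch.rand(3, 32, 256, 320), "label": 1}   # one decoded clip (C, T, H, W)
    sample = transforms["post_tensor_transform"](sample)          # per-sample stage
    batch = {"video": sample["video"].unsqueeze(0)}               # add batch dim -> (B, C, T, H, W)
    batch = transforms["per_batch_transform_on_device"](batch)    # per-batch stage, typically on GPU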
Example #2
    def test_compose_with_video_transforms(self):
        video = thwc_to_cthw(create_dummy_video_frames(
            20, 30, 40)).to(dtype=torch.float32)
        test_clip = {"video": video, "label": 0}

        # Compose using torchvision and pytorchvideo transforms to ensure they interact
        # correctly.
        num_subsample = 10
        transform = Compose([
            ApplyTransformToKey(
                key="video",
                transform=Compose([
                    UniformTemporalSubsample(num_subsample),
                    NormalizeVideo([video.mean()] * 3, [video.std()] * 3),
                    RandomShortSideScale(min_size=15, max_size=25),
                    RandomCropVideo(10),
                    RandomHorizontalFlipVideo(p=0.5),
                ]),
            )
        ])

        actual = transform(test_clip)
        c, t, h, w = actual["video"].shape
        self.assertEqual(c, 3)
        self.assertEqual(t, num_subsample)
        self.assertEqual(h, 10)
        self.assertEqual(w, 10)
Example #3
    def get_tfms(self):
        tfms_list = [
            UniformTemporalSubsample(self.transform_params["num_frames"]),
            Lambda(lambda x: x / 255.0),
            Normalize(self.mean, self.std),
        ]
        if self.resize:
            tfms_list += [
                ShortSideScale(size=self.transform_params["side_size"]),
                CenterCropVideo(crop_size=(self.transform_params["crop_size"],
                                           self.transform_params["crop_size"]))
            ]

        # Note that this transform is specific to the x3d model.
        tfms = ApplyTransformToKey(
            key="video",
            transform=Compose(tfms_list),
        )

        # The duration of the input clip is also specific to the model.
        clip_duration = (
            self.transform_params["num_frames"] *
            self.transform_params["sampling_rate"]) / self.frames_per_second

        return tfms, clip_duration
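
A hedged usage sketch (assumed, not part of the original class): the returned transform and
clip duration are typically paired with a PyTorchVideo dataset and clip sampler.

    import pytorchvideo.data

    tfms, clip_duration = self.get_tfms()  # called inside the class
    dataset = pytorchvideo.data.labeled_video_dataset(
        data_path="path/to/videos",        # hypothetical path
        clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", clip_duration),
        transform=tfms,
        decode_audio=False,
    )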
Example #4
    def _video_transform(self, mode: str):
        """
        This function contains example transforms using both PyTorchVideo and TorchVision
        in the same Callable. For 'train' mode, we use augmentations (prefixed with
        'Random'); for 'val' mode we use the corresponding deterministic function.
        """
        args = self.args
        return ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(args.video_num_subsampled),
                    Normalize(args.video_means, args.video_stds),
                ]
                + (
                    [
                        RandomShortSideScale(
                            min_size=args.video_min_short_side_scale,
                            max_size=args.video_max_short_side_scale,
                        ),
                        RandomCrop(args.video_crop_size),
                        RandomHorizontalFlip(p=args.video_horizontal_flip_p),
                    ]
                    if mode == "train"
                    else [
                        ShortSideScale(args.video_min_short_side_scale),
                        CenterCrop(args.video_crop_size),
                    ]
                )
            ),
        )
Example #5
    def per_sample_transform(self) -> Callable:
        per_sample_transform = [CenterCrop(self.image_size)]

        return ApplyToKeys(
            "video",
            Compose([
                UniformTemporalSubsample(self.temporal_sub_sample), normalize
            ] + per_sample_transform),
        )
Example #6
    def train_per_sample_transform(self) -> Callable:
        per_sample_transform = [
            RandomCrop(self.image_size, pad_if_needed=True)
        ]

        return ApplyToKeys(
            "video",
            Compose([
                UniformTemporalSubsample(self.temporal_sub_sample), normalize
            ] + per_sample_transform),
        )
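
In both methods above, `normalize` is defined outside the snippet. A plausible stand-in
(an assumption, not taken from the original source) is a rescaling of raw uint8 frames to [0, 1]:

    def normalize(x):
        # assumed stand-in for the module-level `normalize` referenced above
        return x / 255.0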
Example #7
    def __init__(self,
                 train_paths,
                 val_paths,
                 clip_duration: int = 2,
                 batch_size: int = 4,
                 num_workers: int = 2,
                 **kwargs):
        super().__init__()
        self.train_paths = train_paths
        self.val_paths = val_paths
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.clip_duration = clip_duration
        self.num_labels = len(
            {path[1]
             for path in train_paths._paths_and_labels})
        for k, v in kwargs.items():
            setattr(self, k, v)

        self.train_transforms = ApplyTransformToKey(
            key='video',
            transform=Compose([
                UniformTemporalSubsample(8),
                Lambda(lambda x: x / 255.0),
                Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
                RandomShortSideScale(min_size=256, max_size=320),
                RandomCrop(224),
                RandomHorizontalFlip(p=0.5),
            ]))
        self.val_transforms = ApplyTransformToKey(
            key='video',
            transform=Compose([
                UniformTemporalSubsample(8),
                Lambda(lambda x: x / 255.0),
                Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
                ShortSideScale(256),
                CenterCrop(224)
            ]))
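
A hedged follow-up sketch (assumed, not shown in the original example): a matching
train_dataloader hook pairing train_paths and train_transforms with a random clip sampler.
LabeledVideoDataset is an IterableDataset, so no sampler is passed to the DataLoader.

    def train_dataloader(self):
        # assumes DataLoader (torch.utils.data) plus LabeledVideoDataset and
        # make_clip_sampler (pytorchvideo.data) are imported, as in the later examples
        dataset = LabeledVideoDataset(
            labeled_video_paths=self.train_paths,
            clip_sampler=make_clip_sampler("random", self.clip_duration),
            transform=self.train_transforms,
            decode_audio=False,
        )
        return DataLoader(dataset,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers)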
Example #8
def get_transform():

    transform = Compose([
        ApplyTransformToKey(
            key="video",
            transform=Compose([
                UniformTemporalSubsample(8),
                #Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
                RandomShortSideScale(min_size=256, max_size=320),
                RandomCrop(244),
                RandomHorizontalFlip(p=0.5),
            ]),
        ),
    ])

    return transform
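
A hypothetical usage sketch (shapes and names are illustrative, not from the original): the
transform expects a dict containing a "video" tensor of shape (C, T, H, W).

    sample = {"video": torch.randint(0, 255, (3, 16, 256, 340)).float(), "label": 0}
    out = get_transform()(sample)
    print(out["video"].shape)  # torch.Size([3, 8, 244, 244])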
Example #9
    def test_torchscriptable_input_output(self):
        video = thwc_to_cthw(create_dummy_video_frames(20, 30, 40)).to(
            dtype=torch.float32
        )

        # Test all the torchscriptable tensors.
        for transform in [UniformTemporalSubsample(10), RandomShortSideScale(10, 20)]:

            transform_script = torch.jit.script(transform)
            self.assertTrue(isinstance(transform_script, torch.jit.ScriptModule))

            # Seed before each transform to force determinism.
            torch.manual_seed(0)
            output = transform(video)
            torch.manual_seed(0)
            script_output = transform_script(video)
            self.assertTrue(output.equal(script_output))
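
A hedged follow-up sketch (not part of the test): once scripted, a transform can be saved and
reloaded with the standard TorchScript APIs.

    scripted = torch.jit.script(UniformTemporalSubsample(10))
    torch.jit.save(scripted, "uniform_temporal_subsample.pt")  # hypothetical filename
    reloaded = torch.jit.load("uniform_temporal_subsample.pt")
    clip = torch.rand(3, 20, 30, 40)  # (C, T, H, W)
    assert reloaded(clip).shape == (3, 10, 30, 40)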
Example #10
    def _audio_transform(self):
        """
        This function contains example transforms using both PyTorchVideo and TorchAudio
        in the same Callable.
        """
        args = self.args
        n_fft = int(
            float(args.audio_resampled_rate) / 1000 * args.audio_mel_window_size
        )
        hop_length = int(
            float(args.audio_resampled_rate) / 1000 * args.audio_mel_step_size
        )
        eps = 1e-10
        return ApplyTransformToKey(
            key="audio",
            transform=Compose(
                [
                    Resample(
                        orig_freq=args.audio_raw_sample_rate,
                        new_freq=args.audio_resampled_rate,
                    ),
                    MelSpectrogram(
                        sample_rate=args.audio_resampled_rate,
                        n_fft=n_fft,
                        hop_length=hop_length,
                        n_mels=args.audio_num_mels,
                        center=False,
                    ),
                    Lambda(lambda x: x.clamp(min=eps)),
                    Lambda(torch.log),
                    UniformTemporalSubsample(args.audio_mel_num_subsample),
                    Lambda(lambda x: x.transpose(1, 0)),  # (F, T) -> (T, F)
                    Lambda(
                        lambda x: x.view(1, x.size(0), 1, x.size(1))
                    ),  # (T, F) -> (1, T, 1, F)
                    Normalize((args.audio_logmel_mean,), (args.audio_logmel_std,)),
                ]
            ),
        )
Example #11
def test_video_classifier_finetune_fiftyone(tmpdir):

    with mock_encoded_video_dataset_folder(tmpdir) as (
        dir_name,
        total_duration,
    ):

        half_duration = total_duration / 2 - 1e-9

        train_dataset = fo.Dataset.from_dir(
            dir_name,
            dataset_type=fo.types.VideoClassificationDirectoryTree,
        )
        datamodule = VideoClassificationData.from_fiftyone(
            train_dataset=train_dataset,
            clip_sampler="uniform",
            clip_duration=half_duration,
            video_sampler=SequentialSampler,
            decode_audio=False,
        )

        for sample in datamodule.train_dataset.data:
            expected_t_shape = 5
            assert sample["video"].shape[1] == expected_t_shape

        assert len(VideoClassifier.available_backbones()) > 5

        train_transform = {
            "post_tensor_transform": Compose([
                ApplyTransformToKey(
                    key="video",
                    transform=Compose([
                        UniformTemporalSubsample(8),
                        RandomShortSideScale(min_size=256, max_size=320),
                        RandomCrop(244),
                        RandomHorizontalFlip(p=0.5),
                    ]),
                ),
            ]),
            "per_batch_transform_on_device": Compose([
                ApplyTransformToKey(
                    key="video",
                    transform=K.VideoSequential(
                        K.Normalize(torch.tensor([0.45, 0.45, 0.45]), torch.tensor([0.225, 0.225, 0.225])),
                        K.augmentation.ColorJitter(0.1, 0.1, 0.1, 0.1, p=1.0),
                        data_format="BCTHW",
                        same_on_frame=False
                    )
                ),
            ]),
        }

        datamodule = VideoClassificationData.from_fiftyone(
            train_dataset=train_dataset,
            clip_sampler="uniform",
            clip_duration=half_duration,
            video_sampler=SequentialSampler,
            decode_audio=False,
            train_transform=train_transform
        )

        model = VideoClassifier(num_classes=datamodule.num_classes, pretrained=False)

        trainer = flash.Trainer(fast_dev_run=True)

        trainer.finetune(model, datamodule=datamodule)
Example #12
def Ptvkinetics(cfg, mode):
    """
    Construct the Kinetics video loader with a given csv file. The format of
    the csv file is:
    ```
    path_to_video_1 label_1
    path_to_video_2 label_2
    ...
    path_to_video_N label_N
    ```
    For `train` and `val` mode, a single clip is randomly sampled from every video
    with random cropping, scaling, and flipping. For `test` mode, multiple clips are
    uniformly sampled from every video with center cropping.
    Args:
        cfg (CfgNode): configs.
        mode (string): Options include `train`, `val`, or `test` mode.
            For the train and val mode, the data loader will take data
            from the train or val set, and sample one clip per video.
            For the test mode, the data loader will take data from test set,
            and sample multiple clips per video.
    """
    # Only support train, val, and test mode.
    assert mode in [
        "train",
        "val",
        "test",
    ], "Split '{}' not supported".format(mode)

    logger.info("Constructing Ptvkinetics {}...".format(mode))

    clip_duration = (cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE /
                     cfg.DATA.TARGET_FPS)
    path_to_file = os.path.join(cfg.DATA.PATH_TO_DATA_DIR,
                                "{}.csv".format(mode))
    labeled_video_paths = LabeledVideoPaths.from_path(path_to_file)
    num_videos = len(labeled_video_paths)
    labeled_video_paths.path_prefix = cfg.DATA.PATH_PREFIX
    logger.info("Constructing kinetics dataloader (size: {}) from {}".format(
        num_videos, path_to_file))

    if mode in ["train", "val"]:
        num_clips = 1
        num_crops = 1

        transform = Compose([
            ApplyTransformToKey(
                key="video",
                transform=Compose([
                    UniformTemporalSubsample(cfg.DATA.NUM_FRAMES),
                    Lambda(div255),
                    NormalizeVideo(cfg.DATA.MEAN, cfg.DATA.STD),
                    RandomShortSideScale(
                        min_size=cfg.DATA.TRAIN_JITTER_SCALES[0],
                        max_size=cfg.DATA.TRAIN_JITTER_SCALES[1],
                    ),
                    RandomCropVideo(cfg.DATA.TRAIN_CROP_SIZE),
                ] + ([RandomHorizontalFlipVideo(p=0.5)]
                     if cfg.DATA.RANDOM_FLIP else [])
                + [PackPathway(cfg)]),
            ),
            DictToTuple(num_clips, num_crops),
        ])

        clip_sampler = make_clip_sampler("random", clip_duration)
        if cfg.NUM_GPUS > 1:
            video_sampler = DistributedSampler
        else:
            video_sampler = (RandomSampler
                             if mode == "train" else SequentialSampler)
    else:
        num_clips = cfg.TEST.NUM_ENSEMBLE_VIEWS
        num_crops = cfg.TEST.NUM_SPATIAL_CROPS

        transform = Compose([
            ApplyTransformToKey(
                key="video",
                transform=Compose([
                    UniformTemporalSubsample(cfg.DATA.NUM_FRAMES),
                    Lambda(div255),
                    NormalizeVideo(cfg.DATA.MEAN, cfg.DATA.STD),
                    ShortSideScale(size=cfg.DATA.TRAIN_JITTER_SCALES[0]),
                ]),
            ),
            UniformCropVideo(size=cfg.DATA.TEST_CROP_SIZE),
            ApplyTransformToKey(key="video", transform=PackPathway(cfg)),
            DictToTuple(num_clips, num_crops),
        ])
        clip_sampler = make_clip_sampler(
            "constant_clips_per_video",
            clip_duration,
            num_clips,
            num_crops,
        )
        video_sampler = (DistributedSampler
                         if cfg.NUM_GPUS > 1 else SequentialSampler)

    return PTVDatasetWrapper(
        num_videos=num_videos,
        clips_per_video=num_clips,
        crops_per_clip=num_crops,
        dataset=LabeledVideoDataset(
            labeled_video_paths=labeled_video_paths,
            clip_sampler=clip_sampler,
            video_sampler=video_sampler,
            transform=transform,
            decode_audio=False,
        ),
    )
Example #13
    from torchvision.transforms import CenterCrop, Compose, RandomCrop, RandomHorizontalFlip
else:
    print("Please, run `pip install torchvideo kornia`")
    sys.exit(1)

if __name__ == '__main__':

    # 1. Download a video clip dataset. Find more datasets at https://pytorchvideo.readthedocs.io/en/latest/data.html
    download_data("https://pl-flash-data.s3.amazonaws.com/kinetics.zip")

    # 2. [Optional] Specify transforms to be used during training.
    # Flash helps you to place your transform exactly where you want.
    # Learn more at:
    # https://lightning-flash.readthedocs.io/en/latest/general/data.html#flash.core.data.process.Preprocess
    post_tensor_transform = [
        UniformTemporalSubsample(8),
        RandomShortSideScale(min_size=256, max_size=320)
    ]
    per_batch_transform_on_device = [
        K.Normalize(torch.tensor([0.45, 0.45, 0.45]),
                    torch.tensor([0.225, 0.225, 0.225]))
    ]

    train_post_tensor_transform = post_tensor_transform + [
        RandomCrop(244), RandomHorizontalFlip(p=0.5)
    ]
    val_post_tensor_transform = post_tensor_transform + [CenterCrop(244)]
    train_per_batch_transform_on_device = per_batch_transform_on_device

    def make_transform(
        post_tensor_transform: List[Callable] = post_tensor_transform,
Example #14
    for k, v in kinetics_classnames.items():
        kinetics_id_to_classname[v] = str(k).replace('"', "")

    # Input Transform
    # Note that these parameters are specific to the slow_r50 model!!
    side_size = 256
    mean = [0.45, 0.45, 0.45]
    std = [0.225, 0.225, 0.225]
    crop_size = 256
    num_frames = 8
    sampling_rate = 8
    frames_per_second = 30

    transform = ApplyTransformToKey(key='video',
                                    transform=Compose([
                                        UniformTemporalSubsample(num_frames),
                                        Lambda(lambda x: x / 255.0),
                                        NormalizeVideo(mean, std),
                                        ShortSideScale(size=side_size),
                                        CenterCropVideo(crop_size=(crop_size,
                                                                   crop_size))
                                    ]))

    clip_duration = (num_frames * sampling_rate) / frames_per_second

    # Load Video
    video_path = 'archery.mp4'
    start_sec = 0
    end_sec = start_sec + clip_duration

    video = EncodedVideo.from_path(video_path)
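
    # A hedged continuation (assumed, not shown in this snippet): extract the clip and run it
    # through the transform defined above, then add a batch dimension for the model input.
    video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
    video_data = transform(video_data)
    inputs = video_data["video"][None, ...]  # (1, C, T, H, W)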
Example #15
    "x3d_m": {
        "side_size": 256,
        "crop_size": 256,
        "num_frames": 16,
        "sampling_rate": 5,
    }
}

# Get transform parameters based on model
transform_params = model_transform_params[model_name]

# Note that this transform is specific to the selected model.
transform = ApplyTransformToKey(
    key="video",
    transform=Compose([
        UniformTemporalSubsample(transform_params["num_frames"]),
        Lambda(lambda x: x / 255.0),
        NormalizeVideo(mean, std),
        ShortSideScale(size=transform_params["side_size"]),
        CenterCropVideo(crop_size=(transform_params["crop_size"],
                                   transform_params["crop_size"]))
    ]),
)

# The duration of the input clip is also specific to the model.
clip_duration = (transform_params["num_frames"] *
                 transform_params["sampling_rate"]) / frames_per_second


def x3dpred(video):