def test_create_slowfast_with_callable(self):
        """
        Test builder `create_slowfast` with callable inputs.
        """
        for (norm, activation) in itertools.product(
            (nn.BatchNorm3d, None), (nn.ReLU, nn.Sigmoid, None)):
            input_clip_length = 32
            input_crop_size = 224
            input_channel = 3

            model = create_slowfast(
                slowfast_channel_reduction_ratio=8,
                slowfast_conv_channel_fusion_ratio=2,
                slowfast_fusion_conv_kernel_size=(7, 1, 1),
                slowfast_fusion_conv_stride=(4, 1, 1),
                input_channels=(input_channel, ) * 2,
                model_depth=18,
                model_num_class=400,
                dropout_rate=0,
                norm=norm,
                activation=activation,
            )

            # Test forwarding.
            for tensor in TestSlowFast._get_inputs(input_channel,
                                                   input_clip_length,
                                                   input_crop_size):
                with torch.no_grad():
                    if tensor[0].shape[1] != input_channel:
                        with self.assertRaises(RuntimeError):
                            model(tensor)
                        continue

                    model(tensor)
예제 #2
0
def _slowfast(
    pretrained: bool = False,
    progress: bool = True,
    checkpoint_path: str = "",
    **kwargs: Any,
) -> nn.Module:
    model = create_slowfast(**kwargs)
    if pretrained:
        checkpoint = load_state_dict_from_url(checkpoint_path, progress=progress)
        state_dict = checkpoint["model_state"]
        model.load_state_dict(state_dict)
    return model
예제 #3
0
def _slowfast(
    pretrained: bool = False,
    progress: bool = True,
    checkpoint_path: str = "",
    **kwargs: Any,
) -> nn.Module:
    model = create_slowfast(**kwargs)
    if pretrained:
        # All models are loaded onto CPU by default
        checkpoint = load_state_dict_from_url(
            checkpoint_path, progress=progress, map_location="cpu"
        )
        state_dict = checkpoint["model_state"]
        model.load_state_dict(state_dict)
    return model
예제 #4
0
    def _construct_network(self, cfg):
        """
        Builds a SlowFast model.

        Args:
            cfg (CfgNode): model building configs, details are in the
                comments of the config file.
        """

        # Params from configs.
        norm_module = get_norm(cfg)
        pool_size = _POOL1[cfg.MODEL.ARCH]
        num_groups = cfg.RESNET.NUM_GROUPS
        width_per_group = cfg.RESNET.WIDTH_PER_GROUP
        spatial_dilations = cfg.RESNET.SPATIAL_DILATIONS
        spatial_strides = cfg.RESNET.SPATIAL_STRIDES
        temp_kernel = _TEMPORAL_KERNEL_BASIS[cfg.MODEL.ARCH]

        self.model = create_slowfast(
            # SlowFast configs.
            slowfast_channel_reduction_ratio=cfg.SLOWFAST.BETA_INV,
            slowfast_conv_channel_fusion_ratio=cfg.SLOWFAST.
            FUSION_CONV_CHANNEL_RATIO,
            slowfast_fusion_conv_kernel_size=(
                cfg.SLOWFAST.FUSION_KERNEL_SZ,
                1,
                1,
            ),
            slowfast_fusion_conv_stride=(cfg.SLOWFAST.ALPHA, 1, 1),
            # Input clip configs.
            input_channels=cfg.DATA.INPUT_CHANNEL_NUM,
            # Model configs.
            model_depth=cfg.RESNET.DEPTH,
            model_num_class=cfg.MODEL.NUM_CLASSES,
            dropout_rate=cfg.MODEL.DROPOUT_RATE,
            # Normalization configs.
            norm=norm_module,
            # Activation configs.
            activation=partial(nn.ReLU, inplace=cfg.RESNET.INPLACE_RELU),
            # Stem configs.
            stem_dim_outs=(
                width_per_group,
                width_per_group // cfg.SLOWFAST.BETA_INV,
            ),
            stem_conv_kernel_sizes=(
                (temp_kernel[0][0][0], 7, 7),
                (temp_kernel[0][1][0], 7, 7),
            ),
            stem_conv_strides=((1, 2, 2), (1, 2, 2)),
            stem_pool=nn.MaxPool3d,
            stem_pool_kernel_sizes=((1, 3, 3), (1, 3, 3)),
            stem_pool_strides=((1, 2, 2), (1, 2, 2)),
            # Stage configs.
            stage_conv_a_kernel_sizes=(
                (
                    (temp_kernel[1][0][0], 1, 1),
                    (temp_kernel[2][0][0], 1, 1),
                    (temp_kernel[3][0][0], 1, 1),
                    (temp_kernel[4][0][0], 1, 1),
                ),
                (
                    (temp_kernel[1][1][0], 1, 1),
                    (temp_kernel[2][1][0], 1, 1),
                    (temp_kernel[3][1][0], 1, 1),
                    (temp_kernel[4][1][0], 1, 1),
                ),
            ),
            stage_conv_b_kernel_sizes=(
                ((1, 3, 3), (1, 3, 3), (1, 3, 3), (1, 3, 3)),
                ((1, 3, 3), (1, 3, 3), (1, 3, 3), (1, 3, 3)),
            ),
            stage_conv_b_num_groups=(
                (num_groups, num_groups, num_groups, num_groups),
                (num_groups, num_groups, num_groups, num_groups),
            ),
            stage_conv_b_dilations=(
                (
                    (1, spatial_dilations[0][0], spatial_dilations[0][0]),
                    (1, spatial_dilations[1][0], spatial_dilations[1][0]),
                    (1, spatial_dilations[2][0], spatial_dilations[2][0]),
                    (1, spatial_dilations[3][0], spatial_dilations[3][0]),
                ),
                (
                    (1, spatial_dilations[0][1], spatial_dilations[0][1]),
                    (1, spatial_dilations[1][1], spatial_dilations[1][1]),
                    (1, spatial_dilations[1][1], spatial_dilations[1][1]),
                    (1, spatial_dilations[1][1], spatial_dilations[1][1]),
                ),
            ),
            stage_spatial_strides=(
                (
                    spatial_strides[0][0],
                    spatial_strides[1][0],
                    spatial_strides[2][0],
                    spatial_strides[3][0],
                ),
                (
                    spatial_strides[0][1],
                    spatial_strides[1][1],
                    spatial_strides[2][1],
                    spatial_strides[3][1],
                ),
            ),
            stage_temporal_strides=((1, 1, 1, 1), (1, 1, 1, 1)),
            bottleneck=create_bottleneck_block,
            # Head configs.
            head_pool=nn.AvgPool3d,
            head_pool_kernel_sizes=(
                (
                    cfg.DATA.NUM_FRAMES // cfg.SLOWFAST.ALPHA //
                    pool_size[0][0],
                    cfg.DATA.TRAIN_CROP_SIZE // 32 // pool_size[0][1],
                    cfg.DATA.TRAIN_CROP_SIZE // 32 // pool_size[0][2],
                ),
                (
                    cfg.DATA.NUM_FRAMES // pool_size[1][0],
                    cfg.DATA.TRAIN_CROP_SIZE // 32 // pool_size[1][1],
                    cfg.DATA.TRAIN_CROP_SIZE // 32 // pool_size[1][2],
                ),
            ),
            head_activation=None,
            head_output_with_global_average=False,
        )

        self.post_act = get_head_act(cfg.MODEL.HEAD_ACT)
    def _construct_network(self, cfg):
        """
        Builds a SlowFast model.

        Args:
            cfg (CfgNode): model building configs, details are in the
                comments of the config file.
        """
        _MODEL_STAGE_DEPTH = {50: (3, 4, 6, 3), 101: (3, 4, 23, 3)}

        # Params from configs.
        norm_module = get_norm(cfg)
        pool_size = _POOL1[cfg.MODEL.ARCH]
        num_groups = cfg.RESNET.NUM_GROUPS
        width_per_group = cfg.RESNET.WIDTH_PER_GROUP
        spatial_dilations = cfg.RESNET.SPATIAL_DILATIONS
        spatial_strides = cfg.RESNET.SPATIAL_STRIDES
        temp_kernel = _TEMPORAL_KERNEL_BASIS[cfg.MODEL.ARCH]
        num_block_temp_kernel = cfg.RESNET.NUM_BLOCK_TEMP_KERNEL
        stage_depth = _MODEL_STAGE_DEPTH[cfg.RESNET.DEPTH]

        stage_conv_a_kernel_sizes = [[], []]
        for pathway in range(2):
            for stage in range(4):
                stage_conv_a_kernel_sizes[pathway].append(
                    ((temp_kernel[stage + 1][pathway][0], 1, 1),)
                    * num_block_temp_kernel[stage][pathway]
                    + ((1, 1, 1),)
                    * (
                        stage_depth[stage]
                        - num_block_temp_kernel[stage][pathway]
                    )
                )

        # Head from config
        # Number of stages = 4
        stage_dim_in = cfg.RESNET.WIDTH_PER_GROUP * 2 ** (4 + 1)
        head_in_features = stage_dim_in + stage_dim_in // cfg.SLOWFAST.BETA_INV

        if cfg.DETECTION.ENABLE:
            self.detection_head = create_res_roi_pooling_head(
                in_features=head_in_features,
                out_features=cfg.MODEL.NUM_CLASSES,
                pool=None,
                output_size=(1, 1, 1),
                dropout_rate=cfg.MODEL.DROPOUT_RATE,
                activation=None,
                output_with_global_average=False,
                pool_spatial=nn.MaxPool2d,
                resolution=[cfg.DETECTION.ROI_XFORM_RESOLUTION] * 2,
                spatial_scale=1.0 / float(cfg.DETECTION.SPATIAL_SCALE_FACTOR),
                sampling_ratio=0,
                roi=ROIAlign,
            )
            head_pool_kernel_sizes = (
                (
                    cfg.DATA.NUM_FRAMES
                    // cfg.SLOWFAST.ALPHA
                    // pool_size[0][0],
                    1,
                    1,
                ),
                (cfg.DATA.NUM_FRAMES // pool_size[1][0], 1, 1),
            )
        else:
            head_pool_kernel_sizes = (
                (
                    cfg.DATA.NUM_FRAMES
                    // cfg.SLOWFAST.ALPHA
                    // pool_size[0][0],
                    cfg.DATA.TRAIN_CROP_SIZE // 32 // pool_size[0][1],
                    cfg.DATA.TRAIN_CROP_SIZE // 32 // pool_size[0][2],
                ),
                (
                    cfg.DATA.NUM_FRAMES // pool_size[1][0],
                    cfg.DATA.TRAIN_CROP_SIZE // 32 // pool_size[1][1],
                    cfg.DATA.TRAIN_CROP_SIZE // 32 // pool_size[1][2],
                ),
            )

        self.model = create_slowfast(
            # SlowFast configs.
            slowfast_channel_reduction_ratio=cfg.SLOWFAST.BETA_INV,
            slowfast_conv_channel_fusion_ratio=cfg.SLOWFAST.FUSION_CONV_CHANNEL_RATIO,
            slowfast_fusion_conv_kernel_size=(
                cfg.SLOWFAST.FUSION_KERNEL_SZ,
                1,
                1,
            ),
            slowfast_fusion_conv_stride=(cfg.SLOWFAST.ALPHA, 1, 1),
            # Input clip configs.
            input_channels=cfg.DATA.INPUT_CHANNEL_NUM,
            # Model configs.
            model_depth=cfg.RESNET.DEPTH,
            model_num_class=cfg.MODEL.NUM_CLASSES,
            dropout_rate=cfg.MODEL.DROPOUT_RATE,
            # Normalization configs.
            norm=norm_module,
            # Activation configs.
            activation=partial(nn.ReLU, inplace=cfg.RESNET.INPLACE_RELU),
            # Stem configs.
            stem_dim_outs=(
                width_per_group,
                width_per_group // cfg.SLOWFAST.BETA_INV,
            ),
            stem_conv_kernel_sizes=(
                (temp_kernel[0][0][0], 7, 7),
                (temp_kernel[0][1][0], 7, 7),
            ),
            stem_conv_strides=((1, 2, 2), (1, 2, 2)),
            stem_pool=nn.MaxPool3d,
            stem_pool_kernel_sizes=((1, 3, 3), (1, 3, 3)),
            stem_pool_strides=((1, 2, 2), (1, 2, 2)),
            # Stage configs.
            stage_conv_a_kernel_sizes=stage_conv_a_kernel_sizes,
            stage_conv_b_kernel_sizes=(
                ((1, 3, 3), (1, 3, 3), (1, 3, 3), (1, 3, 3)),
                ((1, 3, 3), (1, 3, 3), (1, 3, 3), (1, 3, 3)),
            ),
            stage_conv_b_num_groups=(
                (num_groups, num_groups, num_groups, num_groups),
                (num_groups, num_groups, num_groups, num_groups),
            ),
            stage_conv_b_dilations=(
                (
                    (1, spatial_dilations[0][0], spatial_dilations[0][0]),
                    (1, spatial_dilations[1][0], spatial_dilations[1][0]),
                    (1, spatial_dilations[2][0], spatial_dilations[2][0]),
                    (1, spatial_dilations[3][0], spatial_dilations[3][0]),
                ),
                (
                    (1, spatial_dilations[0][1], spatial_dilations[0][1]),
                    (1, spatial_dilations[1][1], spatial_dilations[1][1]),
                    (1, spatial_dilations[1][1], spatial_dilations[1][1]),
                    (1, spatial_dilations[1][1], spatial_dilations[1][1]),
                ),
            ),
            stage_spatial_strides=(
                (
                    spatial_strides[0][0],
                    spatial_strides[1][0],
                    spatial_strides[2][0],
                    spatial_strides[3][0],
                ),
                (
                    spatial_strides[0][1],
                    spatial_strides[1][1],
                    spatial_strides[2][1],
                    spatial_strides[3][1],
                ),
            ),
            stage_temporal_strides=((1, 1, 1, 1), (1, 1, 1, 1)),
            bottleneck=create_bottleneck_block,
            # Head configs.
            head=create_res_basic_head if not self.detection_mode else None,
            head_pool=nn.AvgPool3d,
            head_pool_kernel_sizes=head_pool_kernel_sizes,
            head_activation=None,
            head_output_with_global_average=False,
        )

        self.post_act = get_head_act(cfg.MODEL.HEAD_ACT)