Пример #1
0
def create_slowfast(
    *,
    # SlowFast configs.
    slowfast_channel_reduction_ratio: Union[Tuple[int], int] = (8, ),
    slowfast_conv_channel_fusion_ratio: int = 2,
    slowfast_fusion_conv_kernel_size: Tuple[int] = (
        7,
        1,
        1,
    ),  # deprecated, use fusion_builder
    slowfast_fusion_conv_stride: Tuple[int] = (
        4,
        1,
        1,
    ),  # deprecated, use fusion_builder
    fusion_builder: Callable[
        [int, int], nn.Module] = None,  # Args: fusion_dim_in, stage_idx
    # Input clip configs.
    input_channels: Tuple[int] = (3, 3),
    # Model configs.
    model_depth: int = 50,
    model_num_class: int = 400,
    dropout_rate: float = 0.5,
    # Normalization configs.
    norm: Callable = nn.BatchNorm3d,
    # Activation configs.
    activation: Callable = nn.ReLU,
    # Stem configs.
    stem_function: Tuple[Callable] = (
        create_res_basic_stem,
        create_res_basic_stem,
    ),
    stem_dim_outs: Tuple[int] = (64, 8),
    stem_conv_kernel_sizes: Tuple[Tuple[int]] = ((1, 7, 7), (5, 7, 7)),
    stem_conv_strides: Tuple[Tuple[int]] = ((1, 2, 2), (1, 2, 2)),
    stem_pool: Union[Callable, Tuple[Callable]] = (nn.MaxPool3d, nn.MaxPool3d),
    stem_pool_kernel_sizes: Tuple[Tuple[int]] = ((1, 3, 3), (1, 3, 3)),
    stem_pool_strides: Tuple[Tuple[int]] = ((1, 2, 2), (1, 2, 2)),
    # Stage configs.
    stage_conv_a_kernel_sizes: Tuple[Tuple[Tuple[int]]] = (
        ((1, 1, 1), (1, 1, 1), (3, 1, 1), (3, 1, 1)),
        ((3, 1, 1), (3, 1, 1), (3, 1, 1), (3, 1, 1)),
    ),
    stage_conv_b_kernel_sizes: Tuple[Tuple[Tuple[int]]] = (
        ((1, 3, 3), (1, 3, 3), (1, 3, 3), (1, 3, 3)),
        ((1, 3, 3), (1, 3, 3), (1, 3, 3), (1, 3, 3)),
    ),
    stage_conv_b_num_groups: Tuple[Tuple[int]] = ((1, 1, 1, 1), (1, 1, 1, 1)),
    stage_conv_b_dilations: Tuple[Tuple[Tuple[int]]] = (
        ((1, 1, 1), (1, 1, 1), (1, 1, 1), (1, 1, 1)),
        ((1, 1, 1), (1, 1, 1), (1, 1, 1), (1, 1, 1)),
    ),
    stage_spatial_strides: Tuple[Tuple[int]] = ((1, 2, 2, 2), (1, 2, 2, 2)),
    stage_temporal_strides: Tuple[Tuple[int]] = ((1, 1, 1, 1), (1, 1, 1, 1)),
    bottleneck: Union[Callable, Tuple[Tuple[Callable]]] = (
        (
            create_bottleneck_block,
            create_bottleneck_block,
            create_bottleneck_block,
            create_bottleneck_block,
        ),
        (
            create_bottleneck_block,
            create_bottleneck_block,
            create_bottleneck_block,
            create_bottleneck_block,
        ),
    ),
    # Head configs.
    head_pool: Callable = nn.AvgPool3d,
    head_pool_kernel_sizes: Tuple[Tuple[int]] = ((8, 7, 7), (32, 7, 7)),
    head_output_size: Tuple[int] = (1, 1, 1),
    head_activation: Callable = None,
    head_output_with_global_average: bool = True,
) -> nn.Module:
    r"""
    Build SlowFast model for video recognition, SlowFast model involves a Slow pathway,
    operating at low frame rate, to capture spatial semantics, and a Fast pathway,
    operating at high frame rate, to capture motion at fine temporal resolution. The
    Fast pathway can be made very lightweight by reducing its channel capacity, yet can
    learn useful temporal information for video recognition. Details can be found from
    the paper:

    Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He.
    "SlowFast networks for video recognition."
    https://arxiv.org/pdf/1812.03982.pdf

    ::

                             Slow Input  Fast Input
                                  ↓           ↓
                                 Stem       Stem
                                  ↓ ⭠ Fusion- ↓
                               Stage 1     Stage 1
                                  ↓ ⭠ Fusion- ↓
                                  .           .
                                  ↓           ↓
                               Stage N     Stage N
                                  ↓ ⭠ Fusion- ↓
                                         ↓
                                       Head

    Args:
        slowfast_channel_reduction_ratio (int or tuple of int): Corresponds to the
            inverse of the channel reduction ratio, $\beta$ between the Slow and Fast
            pathways. A bare int is wrapped into a one-element tuple.
        slowfast_conv_channel_fusion_ratio (int): Ratio of channel dimensions
            between the Slow and Fast pathways.
        DEPRECATED slowfast_fusion_conv_kernel_size (tuple): the convolutional kernel
            size used for fusion. Only used when ``fusion_builder`` is None.
        DEPRECATED slowfast_fusion_conv_stride (tuple): the convolutional stride size
            used for fusion. Only used when ``fusion_builder`` is None.
        fusion_builder (Callable[[int, int], nn.Module]): Builder function for generating
            the fusion modules based on stage dimension and index. It is called with
            the keyword arguments ``fusion_dim_in`` and ``stage_idx``. When None, a
            ``FastToSlowFusionBuilder`` is constructed from the arguments above.

        input_channels (tuple): number of channels for the input video clip, one
            entry per pathway. Its length defines the number of pathways.

        model_depth (int): the depth of the resnet. Supported: 18, 50, 101, 152.
        model_num_class (int): the number of classes for the video dataset.
        dropout_rate (float): dropout rate.

        norm (callable): a callable that constructs normalization layer.

        activation (callable): a callable that constructs activation layer.

        stem_function (Tuple[Callable]): a callable that constructs stem layer.
            Examples include create_res_basic_stem. Indexed by pathway
        stem_dim_outs (tuple): output channel size to stem.
        stem_conv_kernel_sizes (tuple): convolutional kernel size(s) of stem.
        stem_conv_strides (tuple): convolutional stride size(s) of stem.
        stem_pool (Callable or Tuple[Callable]): a callable that constructs the stem
            pooling layer. Indexed by pathway; a single callable is repeated for
            every pathway.
        stem_pool_kernel_sizes (tuple): pooling kernel size(s).
        stem_pool_strides (tuple): pooling stride size(s).

        stage_conv_a_kernel_sizes (tuple): convolutional kernel size(s) for conv_a.
        stage_conv_b_kernel_sizes (tuple): convolutional kernel size(s) for conv_b.
        stage_conv_b_num_groups (tuple): number of groups for groupwise convolution
            for conv_b. 1 for ResNet, and larger than 1 for ResNeXt.
        stage_conv_b_dilations (tuple): dilation for 3D convolution for conv_b.
        stage_spatial_strides (tuple): the spatial stride for each stage.
        stage_temporal_strides (tuple): the temporal stride for each stage.
        bottleneck (Callable or Tuple[Tuple[Callable]]): a callable that constructs
            bottleneck block layer. Examples include: create_bottleneck_block.
            Indexed by pathway and stage index; a single callable is repeated for
            every pathway and stage.

        head_pool (callable): a callable that constructs resnet head pooling layer.
            Must be None, nn.AdaptiveAvgPool3d or nn.AvgPool3d.
        head_pool_kernel_sizes (tuple): pooling kernel size(s) used when
            ``head_pool`` is nn.AvgPool3d. Indexed by pathway.
        head_output_size (tuple): the size of output tensor for head.
        head_activation (callable): a callable that constructs activation layer.
        head_output_with_global_average (bool): if True, perform global averaging on
            the head output.
    Returns:
        (nn.Module): SlowFast model.
    Raises:
        AssertionError: if ``model_depth`` is not one of the supported depths.
        NotImplementedError: if ``head_pool`` is not None, nn.AdaptiveAvgPool3d
            or nn.AvgPool3d.
    """

    # Number of blocks for different stages given the model depth.
    _num_pathway = len(input_channels)
    _MODEL_STAGE_DEPTH = {
        18: (1, 1, 1, 1),
        50: (3, 4, 6, 3),
        101: (3, 4, 23, 3),
        152: (3, 8, 36, 3),
    }
    assert (model_depth in _MODEL_STAGE_DEPTH.keys()
            ), f"{model_depth} is not in {_MODEL_STAGE_DEPTH.keys()}"
    stage_depths = _MODEL_STAGE_DEPTH[model_depth]

    # Fix up inputs: broadcast scalar / single-callable arguments to the
    # per-pathway (and per-stage) tuples the rest of the function indexes into.
    if isinstance(slowfast_channel_reduction_ratio, int):
        slowfast_channel_reduction_ratio = (slowfast_channel_reduction_ratio, )
    if isinstance(stem_pool, Callable):
        stem_pool = (stem_pool, ) * _num_pathway
    if isinstance(bottleneck, Callable):
        bottleneck = (bottleneck, ) * len(stage_depths)
        bottleneck = (bottleneck, ) * _num_pathway
    if fusion_builder is None:
        # Fall back to the default builder, configured from the deprecated
        # fusion conv arguments.
        fusion_builder = FastToSlowFusionBuilder(
            slowfast_channel_reduction_ratio=slowfast_channel_reduction_ratio[
                0],
            conv_fusion_channel_ratio=slowfast_conv_channel_fusion_ratio,
            conv_kernel_size=slowfast_fusion_conv_kernel_size,
            conv_stride=slowfast_fusion_conv_stride,
            norm=norm,
            activation=activation,
            max_stage_idx=len(stage_depths) - 1,
        ).create_module

    # Build stem blocks, one per pathway. Padding is half the kernel size in
    # every dimension.
    stems = []
    for pathway_idx in range(_num_pathway):
        stems.append(stem_function[pathway_idx](
            in_channels=input_channels[pathway_idx],
            out_channels=stem_dim_outs[pathway_idx],
            conv_kernel_size=stem_conv_kernel_sizes[pathway_idx],
            conv_stride=stem_conv_strides[pathway_idx],
            conv_padding=[
                size // 2 for size in stem_conv_kernel_sizes[pathway_idx]
            ],
            pool=stem_pool[pathway_idx],
            pool_kernel_size=stem_pool_kernel_sizes[pathway_idx],
            pool_stride=stem_pool_strides[pathway_idx],
            pool_padding=[
                size // 2 for size in stem_pool_kernel_sizes[pathway_idx]
            ],
            norm=norm,
            activation=activation,
        ))

    stages = []
    stages.append(
        MultiPathWayWithFuse(
            multipathway_blocks=nn.ModuleList(stems),
            multipathway_fusion=fusion_builder(
                fusion_dim_in=stem_dim_outs[0],
                stage_idx=0,
            ),
        ))

    # Build stages blocks.
    stage_dim_in = stem_dim_outs[0]
    stage_dim_out = stage_dim_in * 4
    for idx in range(len(stage_depths)):
        # Entry 0 is the first (Slow) pathway. Its input width is the base
        # width plus an extra fusion_ratio / reduction_ratio share of it --
        # presumably the channels contributed by the preceding fusion module.
        pathway_stage_dim_in = [
            stage_dim_in + stage_dim_in * slowfast_conv_channel_fusion_ratio //
            slowfast_channel_reduction_ratio[0],
        ]
        pathway_stage_dim_inner = [
            stage_dim_out // 4,
        ]
        pathway_stage_dim_out = [
            stage_dim_out,
        ]
        # Remaining pathways use the base widths divided by their reduction
        # ratio.
        for reduction_ratio in slowfast_channel_reduction_ratio:
            pathway_stage_dim_in = pathway_stage_dim_in + [
                stage_dim_in // reduction_ratio
            ]
            pathway_stage_dim_inner = pathway_stage_dim_inner + [
                stage_dim_out // 4 // reduction_ratio
            ]
            pathway_stage_dim_out = pathway_stage_dim_out + [
                stage_dim_out // reduction_ratio
            ]

        stage = []
        for pathway_idx in range(_num_pathway):
            depth = stage_depths[idx]

            # Temporal striding is applied by conv_a, spatial striding by
            # conv_b.
            stage_conv_a_stride = (stage_temporal_strides[pathway_idx][idx], 1,
                                   1)
            stage_conv_b_stride = (
                1,
                stage_spatial_strides[pathway_idx][idx],
                stage_spatial_strides[pathway_idx][idx],
            )
            stage.append(
                create_res_stage(
                    depth=depth,
                    dim_in=pathway_stage_dim_in[pathway_idx],
                    dim_inner=pathway_stage_dim_inner[pathway_idx],
                    dim_out=pathway_stage_dim_out[pathway_idx],
                    bottleneck=bottleneck[pathway_idx][idx],
                    conv_a_kernel_size=stage_conv_a_kernel_sizes[pathway_idx]
                    [idx],
                    conv_a_stride=stage_conv_a_stride,
                    conv_a_padding=[
                        size // 2
                        for size in stage_conv_a_kernel_sizes[pathway_idx][idx]
                    ],
                    conv_b_kernel_size=stage_conv_b_kernel_sizes[pathway_idx]
                    [idx],
                    conv_b_stride=stage_conv_b_stride,
                    conv_b_padding=[
                        size // 2
                        for size in stage_conv_b_kernel_sizes[pathway_idx][idx]
                    ],
                    conv_b_num_groups=stage_conv_b_num_groups[pathway_idx]
                    [idx],
                    conv_b_dilation=stage_conv_b_dilations[pathway_idx][idx],
                    norm=norm,
                    activation=activation,
                ))
        stages.append(
            MultiPathWayWithFuse(
                multipathway_blocks=nn.ModuleList(stage),
                multipathway_fusion=fusion_builder(
                    fusion_dim_in=stage_dim_out,
                    stage_idx=idx + 1,
                ),
            ))
        # Channel widths double from one stage to the next.
        stage_dim_in = stage_dim_out
        stage_dim_out = stage_dim_out * 2

    # Build the per-pathway pooling applied before the pathways are merged.
    if head_pool is None:
        pool_model = None
    elif head_pool == nn.AdaptiveAvgPool3d:
        # NOTE(review): head_output_size is indexed by pathway here, not
        # passed whole -- confirm this is intended.
        pool_model = [
            head_pool(head_output_size[idx]) for idx in range(_num_pathway)
        ]
    elif head_pool == nn.AvgPool3d:
        pool_model = [
            head_pool(
                kernel_size=head_pool_kernel_sizes[idx],
                stride=(1, 1, 1),
                padding=(0, 0, 0),
            ) for idx in range(_num_pathway)
        ]
    else:
        # Report the offending argument; `pool_model` is not bound on this
        # path, so formatting it would raise UnboundLocalError instead.
        raise NotImplementedError(f"Unsupported pool_model type {head_pool}")

    stages.append(
        PoolConcatPathway(retain_list=False, pool=nn.ModuleList(pool_model)))
    # The head input width is the sum of the final widths of all pathways.
    head_in_features = stage_dim_in
    for reduction_ratio in slowfast_channel_reduction_ratio:
        head_in_features = head_in_features + stage_dim_in // reduction_ratio
    stages.append(
        create_res_basic_head(
            in_features=head_in_features,
            out_features=model_num_class,
            pool=None,
            output_size=head_output_size,
            dropout_rate=dropout_rate,
            activation=head_activation,
            output_with_global_average=head_output_with_global_average,
        ))
    return Net(blocks=nn.ModuleList(stages))
Пример #2
0
def create_r2plus1d(
    *,
    # Input clip configs.
    input_channel: int = 3,
    # Model configs.
    model_depth: int = 50,
    model_num_class: int = 400,
    dropout_rate: float = 0.0,
    # Normalization configs.
    norm: Callable = nn.BatchNorm3d,
    norm_eps: float = 1e-5,
    norm_momentum: float = 0.1,
    # Activation configs.
    activation: Callable = nn.ReLU,
    # Stem configs.
    stem_dim_out: int = 64,
    stem_conv_kernel_size: Tuple[int] = (1, 7, 7),
    stem_conv_stride: Tuple[int] = (1, 2, 2),
    # Stage configs.
    stage_conv_a_kernel_size: Tuple[Tuple[int]] = (
        (1, 1, 1),
        (1, 1, 1),
        (1, 1, 1),
        (1, 1, 1),
    ),
    stage_conv_b_kernel_size: Tuple[Tuple[int]] = (
        (3, 3, 3),
        (3, 3, 3),
        (3, 3, 3),
        (3, 3, 3),
    ),
    stage_conv_b_num_groups: Tuple[int] = (1, 1, 1, 1),
    stage_conv_b_dilation: Tuple[Tuple[int]] = (
        (1, 1, 1),
        (1, 1, 1),
        (1, 1, 1),
        (1, 1, 1),
    ),
    stage_spatial_stride: Tuple[int] = (2, 2, 2, 2),
    stage_temporal_stride: Tuple[int] = (1, 1, 2, 2),
    stage_bottleneck: Tuple[Callable] = (
        create_2plus1d_bottleneck_block,
        create_2plus1d_bottleneck_block,
        create_2plus1d_bottleneck_block,
        create_2plus1d_bottleneck_block,
    ),
    # Head configs.
    head_pool: Callable = nn.AvgPool3d,
    head_pool_kernel_size: Tuple[int] = (4, 7, 7),
    head_output_size: Tuple[int] = (1, 1, 1),
    head_activation: Callable = nn.Softmax,
    head_output_with_global_average: bool = True,
) -> nn.Module:
    """
    Assemble an R(2+1)D network as described in::
    A closer look at spatiotemporal convolutions for action recognition.
    Du Tran, Heng Wang, Lorenzo Torresani, Jamie Ray, Yann LeCun, Manohar Paluri. CVPR 2018.

    The network is a ResNet-style chain of a stem (built without pooling),
    one residual stage per entry of the depth table, and a classification
    head:

    ::

                                         Input
                                           ↓
                                         Stem
                                           ↓
                                         Stage 1
                                           ↓
                                           .
                                           .
                                           .
                                           ↓
                                         Stage N
                                           ↓
                                         Head

    Args:

        input_channel (int): channel count of the input video clip.

        model_depth (int): resnet depth; one of 50, 101, 152.
        model_num_class (int): number of output classes.
        dropout_rate (float): dropout rate passed to the head.

        norm (callable): constructor for the normalization layer.
        norm_eps (float): normalization epsilon. Accepted for interface
            compatibility; this function does not forward it to any builder.
        norm_momentum (float): normalization momentum. Accepted for interface
            compatibility; this function does not forward it to any builder.

        activation (callable): constructor for the activation layer.

        stem_dim_out (int): number of channels produced by the stem.
        stem_conv_kernel_size (tuple): kernel size of the stem convolution.
        stem_conv_stride (tuple): stride of the stem convolution.

        stage_conv_a_kernel_size (tuple): per-stage kernel size for conv_a.
        stage_conv_b_kernel_size (tuple): per-stage kernel size for conv_b.
        stage_conv_b_num_groups (tuple): per-stage group count for conv_b.
            1 for ResNet, and larger than 1 for ResNeXt.
        stage_conv_b_dilation (tuple): per-stage dilation for conv_b.
        stage_spatial_stride (tuple): per-stage spatial stride (applied in conv_b).
        stage_temporal_stride (tuple): per-stage temporal stride (applied in conv_b).
        stage_bottleneck (tuple): per-stage callable that constructs a bottleneck
            block. Examples include: create_bottleneck_block,
            create_2plus1d_bottleneck_block.

        head_pool (callable): constructor for the head pooling layer.
        head_pool_kernel_size (tuple): kernel size of the head pooling layer.
        head_output_size (tuple): size of the head's output tensor.
        head_activation (callable): constructor for the head activation layer.
        head_output_with_global_average (bool): if True, perform global averaging on
            the head output.

    Returns:
        (nn.Module): the assembled network.
    """
    # Residual block count of every stage, keyed by model depth.
    _MODEL_STAGE_DEPTH = {
        50: (3, 4, 6, 3),
        101: (3, 4, 23, 3),
        152: (3, 8, 36, 3),
    }
    assert (
        model_depth in _MODEL_STAGE_DEPTH.keys()
    ), f"{model_depth} is not in {_MODEL_STAGE_DEPTH.keys()}"
    stage_depths = _MODEL_STAGE_DEPTH[model_depth]

    # The stem has no pooling layer; padding is half the kernel per dimension.
    layers = [
        create_res_basic_stem(
            in_channels=input_channel,
            out_channels=stem_dim_out,
            conv_kernel_size=stem_conv_kernel_size,
            conv_stride=stem_conv_stride,
            conv_padding=[k // 2 for k in stem_conv_kernel_size],
            pool=None,
            norm=norm,
            activation=activation,
        )
    ]

    # First stage widens the stem output 4x; each later stage doubles it.
    width_in = stem_dim_out
    width_out = width_in * 4

    for stage_idx, num_blocks in enumerate(stage_depths):
        kernel_a = stage_conv_a_kernel_size[stage_idx]
        kernel_b = stage_conv_b_kernel_size[stage_idx]
        spatial = stage_spatial_stride[stage_idx]
        # All striding (temporal and spatial) happens in conv_b.
        stride_b = (stage_temporal_stride[stage_idx], spatial, spatial)

        layers.append(
            create_res_stage(
                depth=num_blocks,
                dim_in=width_in,
                dim_inner=width_out // 4,
                dim_out=width_out,
                bottleneck=stage_bottleneck[stage_idx],
                conv_a_kernel_size=kernel_a,
                conv_a_stride=[1, 1, 1],
                conv_a_padding=[k // 2 for k in kernel_a],
                conv_b_kernel_size=kernel_b,
                conv_b_stride=stride_b,
                conv_b_padding=[k // 2 for k in kernel_b],
                conv_b_num_groups=stage_conv_b_num_groups[stage_idx],
                conv_b_dilation=stage_conv_b_dilation[stage_idx],
                norm=norm,
                activation=activation,
            )
        )
        width_in, width_out = width_out, width_out * 2

    # The head consumes the output width of the last stage.
    layers.append(
        create_res_basic_head(
            in_features=width_in,
            out_features=model_num_class,
            pool=head_pool,
            output_size=head_output_size,
            pool_kernel_size=head_pool_kernel_size,
            dropout_rate=dropout_rate,
            activation=head_activation,
            output_with_global_average=head_output_with_global_average,
        )
    )
    return Net(blocks=nn.ModuleList(layers))
Пример #3
0
def create_csn(
    *,
    # Input clip configs.
    input_channel: int = 3,
    # Model configs.
    model_depth: int = 50,
    model_num_class: int = 400,
    dropout_rate: float = 0,
    # Normalization configs.
    norm: Callable = nn.BatchNorm3d,
    # Activation configs.
    activation: Callable = nn.ReLU,
    # Stem configs.
    stem_dim_out: int = 64,
    stem_conv_kernel_size: Tuple[int] = (3, 7, 7),
    stem_conv_stride: Tuple[int] = (1, 2, 2),
    stem_pool: Callable = None,
    stem_pool_kernel_size: Tuple[int] = (1, 3, 3),
    stem_pool_stride: Tuple[int] = (1, 2, 2),
    # Stage configs.
    stage_conv_a_kernel_size: Tuple[int] = (1, 1, 1),
    stage_conv_b_kernel_size: Tuple[int] = (3, 3, 3),
    stage_conv_b_width_per_group: int = 1,
    stage_spatial_stride: Tuple[int] = (1, 2, 2, 2),
    stage_temporal_stride: Tuple[int] = (1, 2, 2, 2),
    bottleneck: Callable = create_bottleneck_block,
    bottleneck_ratio: int = 4,
    # Head configs.
    head_pool: Callable = nn.AvgPool3d,
    head_pool_kernel_size: Tuple[int] = (1, 7, 7),
    head_output_size: Tuple[int] = (1, 1, 1),
    head_activation: Callable = None,
    head_output_with_global_average: bool = True,
) -> nn.Module:
    """
    Assemble a Channel-Separated Convolutional Network (CSN):
    Video classification with channel-separated convolutional networks.
    Du Tran, Heng Wang, Lorenzo Torresani, Matt Feiszli. ICCV 2019.

    The network is a ResNet-style chain of a stem, one residual stage per
    entry of the depth table, and a classification head:

    ::

                                         Input
                                           ↓
                                         Stem
                                           ↓
                                         Stage 1
                                           ↓
                                           .
                                           .
                                           .
                                           ↓
                                         Stage N
                                           ↓
                                         Head

    CSN uses depthwise convolution. To further reduce the computational cost, it uses
    low resolution (112x112), short clips (4 frames), different striding and kernel
    size, etc.

    Args:

        input_channel (int): channel count of the input video clip.

        model_depth (int): resnet depth. Options include: 50, 101, 152.
        model_num_class (int): number of output classes.
        dropout_rate (float): dropout rate passed to the head.

        norm (callable): constructor for the normalization layer.

        activation (callable): constructor for the activation layer.

        stem_dim_out (int): number of channels produced by the stem.
        stem_conv_kernel_size (tuple): kernel size of the stem convolution.
        stem_conv_stride (tuple): stride of the stem convolution.
        stem_pool (callable): constructor for the stem pooling layer.
        stem_pool_kernel_size (tuple): kernel size of the stem pooling layer.
        stem_pool_stride (tuple): stride of the stem pooling layer.

        stage_conv_a_kernel_size (tuple): kernel size for conv_a, shared by
            all stages.
        stage_conv_b_kernel_size (tuple): kernel size for conv_b, shared by
            all stages.
        stage_conv_b_width_per_group (int): the width of each group for conv_b.
            The group count of a stage is its inner width divided by this
            value; set it to 1 for depthwise convolution.
        stage_spatial_stride (tuple): per-stage spatial stride (applied in conv_b).
        stage_temporal_stride (tuple): per-stage temporal stride (applied in conv_b).
        bottleneck (callable): callable that constructs a bottleneck block.
            Examples include: create_bottleneck_block.
        bottleneck_ratio (int): ratio between a stage's output width and its
            inner width.

        head_pool (callable): constructor for the head pooling layer.
        head_pool_kernel_size (tuple): kernel size of the head pooling layer.
        head_output_size (tuple): size of the head's output tensor.
        head_activation (callable): constructor for the head activation layer.
        head_output_with_global_average (bool): if True, perform global averaging on
            the head output.

    Returns:
        (nn.Module): the csn model.
    """

    torch._C._log_api_usage_once("PYTORCHVIDEO.model.create_csn")

    # Residual block count of every stage, keyed by model depth.
    _MODEL_STAGE_DEPTH = {50: (3, 4, 6, 3), 101: (3, 4, 23, 3), 152: (3, 8, 36, 3)}
    assert (
        model_depth in _MODEL_STAGE_DEPTH.keys()
    ), f"{model_depth} is not in {_MODEL_STAGE_DEPTH.keys()}"
    stage_depths = _MODEL_STAGE_DEPTH[model_depth]

    # Stem: padding is half the kernel size in every dimension.
    layers = [
        create_res_basic_stem(
            in_channels=input_channel,
            out_channels=stem_dim_out,
            conv_kernel_size=stem_conv_kernel_size,
            conv_stride=stem_conv_stride,
            conv_padding=[k // 2 for k in stem_conv_kernel_size],
            pool=stem_pool,
            pool_kernel_size=stem_pool_kernel_size,
            pool_stride=stem_pool_stride,
            pool_padding=[k // 2 for k in stem_pool_kernel_size],
            norm=norm,
            activation=activation,
        )
    ]

    # First stage widens the stem output 4x; each later stage doubles it.
    width_in = stem_dim_out
    width_out = width_in * 4

    for stage_idx, num_blocks in enumerate(stage_depths):
        width_inner = width_out // bottleneck_ratio
        spatial = stage_spatial_stride[stage_idx]
        # All striding (temporal and spatial) happens in conv_b.
        stride_b = (stage_temporal_stride[stage_idx], spatial, spatial)

        layers.append(
            create_res_stage(
                depth=num_blocks,
                dim_in=width_in,
                dim_inner=width_inner,
                dim_out=width_out,
                bottleneck=bottleneck,
                conv_a_kernel_size=stage_conv_a_kernel_size,
                conv_a_stride=(1, 1, 1),
                conv_a_padding=[k // 2 for k in stage_conv_a_kernel_size],
                conv_b_kernel_size=stage_conv_b_kernel_size,
                conv_b_stride=stride_b,
                conv_b_padding=[k // 2 for k in stage_conv_b_kernel_size],
                # Group count follows from the requested per-group width.
                conv_b_num_groups=width_inner // stage_conv_b_width_per_group,
                conv_b_dilation=(1, 1, 1),
                norm=norm,
                activation=activation,
            )
        )
        width_in, width_out = width_out, width_out * 2

    # The head consumes the output width of the last stage.
    layers.append(
        create_res_basic_head(
            in_features=width_in,
            out_features=model_num_class,
            pool=head_pool,
            output_size=head_output_size,
            pool_kernel_size=head_pool_kernel_size,
            dropout_rate=dropout_rate,
            activation=head_activation,
            output_with_global_average=head_output_with_global_average,
        )
    )
    return Net(blocks=nn.ModuleList(layers))
Пример #4
0
    def test_build_head_with_callable(self):
        """
        Check that `create_res_basic_head` matches a hand-built `ResNetBasicHead`.

        Every combination of pooling and activation constructor (including
        None for either) is built through the builder and by hand; the two
        heads must share a state dict and agree on forward outputs.
        """
        pool_choices = (nn.AvgPool3d, nn.MaxPool3d, nn.AdaptiveAvgPool3d, None)
        activation_choices = (nn.ReLU, nn.Softmax, nn.Sigmoid, None)

        for pool, activation in itertools.product(pool_choices,
                                                  activation_choices):
            # Reference activation module: Softmax needs an explicit dim.
            if activation is None:
                ref_activation = None
            elif activation == nn.Softmax:
                ref_activation = activation(dim=1)
            else:
                ref_activation = activation()

            # Reference pooling module: the adaptive pool takes an output
            # size, the other pools take a kernel size and stride.
            if pool is None:
                ref_pool = None
            elif pool == nn.AdaptiveAvgPool3d:
                ref_pool = pool(1)
            else:
                ref_pool = pool(kernel_size=[5, 7, 7], stride=[1, 1, 1])

            model = create_res_basic_head(
                in_features=16,
                out_features=32,
                pool=pool,
                pool_kernel_size=(5, 7, 7),
                output_size=(1, 1, 1),
                dropout_rate=0.0,
                activation=activation,
                output_with_global_average=True,
            )
            model_gt = ResNetBasicHead(
                proj=nn.Linear(16, 32),
                activation=ref_activation,
                pool=ref_pool,
                dropout=None,
                output_pool=nn.AdaptiveAvgPool3d(1),
            )
            # Strict loading doubles as a check that both heads expose the
            # same parameter names and shapes.
            model.load_state_dict(model_gt.state_dict(), strict=True)

            # Test forwarding.
            for clip in TestHeadHelper._get_inputs(input_dim=16):
                if clip.shape[1] != 16:
                    # Inputs with the wrong channel count must be rejected.
                    with torch.no_grad(), self.assertRaises(RuntimeError):
                        model(clip)
                    continue

                with torch.no_grad():
                    result = model(clip)
                    expected = model_gt(clip)

                self.assertEqual(
                    result.shape,
                    expected.shape,
                    "Output shape {} is different from expected shape {}".
                    format(result.shape, expected.shape),
                )
                self.assertTrue(
                    np.allclose(result.numpy(), expected.numpy()))
Пример #5
0
def create_resnet(
    *,
    # Input clip configs.
    input_channel: int = 3,
    # Model configs.
    model_depth: int = 50,
    model_num_class: int = 400,
    dropout_rate: float = 0.5,
    # Normalization configs.
    norm: Callable = nn.BatchNorm3d,
    # Activation configs.
    activation: Callable = nn.ReLU,
    # Stem configs.
    stem_dim_out: int = 64,
    stem_conv_kernel_size: Tuple[int] = (3, 7, 7),
    stem_conv_stride: Tuple[int] = (1, 2, 2),
    stem_pool: Callable = nn.MaxPool3d,
    stem_pool_kernel_size: Tuple[int] = (1, 3, 3),
    stem_pool_stride: Tuple[int] = (1, 2, 2),
    stem: Callable = create_res_basic_stem,
    # Stage configs.
    stage1_pool: Callable = None,
    stage1_pool_kernel_size: Tuple[int] = (2, 1, 1),
    stage_conv_a_kernel_size: Union[Tuple[int], Tuple[Tuple[int]]] = (
        (1, 1, 1),
        (1, 1, 1),
        (3, 1, 1),
        (3, 1, 1),
    ),
    stage_conv_b_kernel_size: Union[Tuple[int], Tuple[Tuple[int]]] = (
        (1, 3, 3),
        (1, 3, 3),
        (1, 3, 3),
        (1, 3, 3),
    ),
    stage_conv_b_num_groups: Tuple[int] = (1, 1, 1, 1),
    stage_conv_b_dilation: Union[Tuple[int], Tuple[Tuple[int]]] = (
        (1, 1, 1),
        (1, 1, 1),
        (1, 1, 1),
        (1, 1, 1),
    ),
    stage_spatial_h_stride: Tuple[int] = (1, 2, 2, 2),
    stage_spatial_w_stride: Tuple[int] = (1, 2, 2, 2),
    stage_temporal_stride: Tuple[int] = (1, 1, 1, 1),
    bottleneck: Union[Tuple[Callable], Callable] = create_bottleneck_block,
    # Head configs.
    head_pool: Callable = nn.AvgPool3d,
    head_pool_kernel_size: Tuple[int] = (4, 7, 7),
    head_output_size: Tuple[int] = (1, 1, 1),
    head_activation: Callable = None,
    head_output_with_global_average: bool = True,
) -> nn.Module:
    """
    Build ResNet style models for video recognition. ResNet has three parts:
    Stem, Stages and Head. Stem is the first Convolution layer (Conv1) with an
    optional pooling layer. Stages are grouped residual blocks. There are usually
    multiple stages and each stage may include multiple residual blocks. Head
    may include pooling, dropout, a fully-connected layer and global spatial
    temporal averaging. The three parts are assembled in the following order:

    ::

                                         Input
                                           ↓
                                         Stem
                                           ↓
                                         Stage 1
                                           ↓
                                           .
                                           .
                                           .
                                           ↓
                                         Stage N
                                           ↓
                                         Head

    Args:

        input_channel (int): number of channels for the input video clip.

        model_depth (int): the depth of the resnet. Options include: 50, 101, 152.
        model_num_class (int): the number of classes for the video dataset.
        dropout_rate (float): dropout rate.


        norm (callable): a callable that constructs normalization layer.

        activation (callable): a callable that constructs activation layer.

        stem_dim_out (int): output channel size of stem.
        stem_conv_kernel_size (tuple): convolutional kernel size(s) of stem.
        stem_conv_stride (tuple): convolutional stride size(s) of stem.
        stem_pool (callable): a callable that constructs the stem pooling layer.
        stem_pool_kernel_size (tuple): pooling kernel size(s).
        stem_pool_stride (tuple): pooling stride size(s).
        stem (callable): a callable that constructs stem layer.
            Examples include: create_res_video_stem.

        stage1_pool (callable): a callable that constructs a pooling layer that
            is inserted right after the first stage. If None, no such layer is
            added.
        stage1_pool_kernel_size (tuple): kernel size of the pooling layer built
            by stage1_pool. The same value is also used as its stride.
        stage_conv_a_kernel_size (tuple): convolutional kernel size(s) for conv_a.
            A single kernel size (tuple of ints) is broadcast to every stage.
        stage_conv_b_kernel_size (tuple): convolutional kernel size(s) for conv_b.
            A single kernel size (tuple of ints) is broadcast to every stage.
        stage_conv_b_num_groups (tuple): number of groups for groupwise convolution
            for conv_b. 1 for ResNet, and larger than 1 for ResNeXt.
        stage_conv_b_dilation (tuple): dilation for 3D convolution for conv_b.
            A single dilation (tuple of ints) is broadcast to every stage. The
            conv_b padding is derived from both the kernel size and the dilation.
        stage_spatial_h_stride (tuple): the spatial height stride for each stage.
        stage_spatial_w_stride (tuple): the spatial width stride for each stage.
        stage_temporal_stride (tuple): the temporal stride for each stage.
        bottleneck (callable): a callable that constructs bottleneck block layer.
            Examples include: create_bottleneck_block. A single callable is
            reused for every stage.

        head_pool (callable): a callable that constructs resnet head pooling layer.
        head_pool_kernel_size (tuple): the pooling kernel size.
        head_output_size (tuple): the size of output tensor for head.
        head_activation (callable): a callable that constructs activation layer.
        head_output_with_global_average (bool): if True, perform global averaging on
            the head output.

    Returns:
        (nn.Module): basic resnet.

    Raises:
        AssertionError: if model_depth is not one of the supported depths.
    """

    torch._C._log_api_usage_once("PYTORCHVIDEO.model.create_resnet")

    # Number of blocks for different stages given the model depth.
    _MODEL_STAGE_DEPTH = {
        50: (3, 4, 6, 3),
        101: (3, 4, 23, 3),
        152: (3, 8, 36, 3)
    }

    # Given a model depth, get the number of blocks for each stage.
    assert (model_depth in _MODEL_STAGE_DEPTH
            ), f"{model_depth} is not in {_MODEL_STAGE_DEPTH.keys()}"
    stage_depths = _MODEL_STAGE_DEPTH[model_depth]
    num_stages = len(stage_depths)

    # Broadcast single element to tuple if given.
    if isinstance(stage_conv_a_kernel_size[0], int):
        stage_conv_a_kernel_size = (stage_conv_a_kernel_size, ) * num_stages

    if isinstance(stage_conv_b_kernel_size[0], int):
        stage_conv_b_kernel_size = (stage_conv_b_kernel_size, ) * num_stages

    if isinstance(stage_conv_b_dilation[0], int):
        stage_conv_b_dilation = (stage_conv_b_dilation, ) * num_stages

    if isinstance(bottleneck, Callable):
        bottleneck = [bottleneck] * num_stages

    blocks = []
    # Create stem for resnet.
    stem_block = stem(
        in_channels=input_channel,
        out_channels=stem_dim_out,
        conv_kernel_size=stem_conv_kernel_size,
        conv_stride=stem_conv_stride,
        conv_padding=[size // 2 for size in stem_conv_kernel_size],
        pool=stem_pool,
        pool_kernel_size=stem_pool_kernel_size,
        pool_stride=stem_pool_stride,
        pool_padding=[size // 2 for size in stem_pool_kernel_size],
        norm=norm,
        activation=activation,
    )
    blocks.append(stem_block)

    stage_dim_in = stem_dim_out
    stage_dim_out = stage_dim_in * 4

    # Create each stage for resnet.
    for idx, depth in enumerate(stage_depths):
        # The inner (bottleneck) width is a quarter of the stage output width.
        stage_dim_inner = stage_dim_out // 4

        stage_conv_a_kernel = stage_conv_a_kernel_size[idx]
        stage_conv_a_stride = (stage_temporal_stride[idx], 1, 1)
        # conv_a kernel may be a single kernel (tuple of ints) or a sequence
        # of kernels; compute a padding of size // 2 for either layout.
        if isinstance(stage_conv_a_kernel[0], int):
            stage_conv_a_padding = [size // 2 for size in stage_conv_a_kernel]
        else:
            stage_conv_a_padding = [[size // 2 for size in sizes]
                                    for sizes in stage_conv_a_kernel]

        stage_conv_b_kernel = stage_conv_b_kernel_size[idx]
        stage_conv_b_stride = (
            1,
            stage_spatial_h_stride[idx],
            stage_spatial_w_stride[idx],
        )
        # A dilated kernel spans dilation * (size - 1) + 1 positions, so
        # size // 2 only keeps the feature-map size when dilation is 1. For
        # larger dilations pad by half of the dilated extent instead.
        stage_conv_b_padding = [
            dilation * (size - 1) // 2 if dilation > 1 else size // 2
            for size, dilation in zip(stage_conv_b_kernel,
                                      stage_conv_b_dilation[idx])
        ]

        stage = create_res_stage(
            depth=depth,
            dim_in=stage_dim_in,
            dim_inner=stage_dim_inner,
            dim_out=stage_dim_out,
            bottleneck=bottleneck[idx],
            conv_a_kernel_size=stage_conv_a_kernel,
            conv_a_stride=stage_conv_a_stride,
            conv_a_padding=stage_conv_a_padding,
            conv_b_kernel_size=stage_conv_b_kernel,
            conv_b_stride=stage_conv_b_stride,
            conv_b_padding=stage_conv_b_padding,
            conv_b_num_groups=stage_conv_b_num_groups[idx],
            conv_b_dilation=stage_conv_b_dilation[idx],
            norm=norm,
            activation=activation,
        )

        blocks.append(stage)
        # The next stage consumes this stage's output and doubles the width.
        stage_dim_in = stage_dim_out
        stage_dim_out = stage_dim_out * 2

        # Optional pooling layer right after the first stage; its stride
        # equals its kernel size.
        if idx == 0 and stage1_pool is not None:
            blocks.append(
                stage1_pool(
                    kernel_size=stage1_pool_kernel_size,
                    stride=stage1_pool_kernel_size,
                    padding=(0, 0, 0),
                ))

    # stage_dim_in now holds the output width of the last stage.
    head = create_res_basic_head(
        in_features=stage_dim_in,
        out_features=model_num_class,
        pool=head_pool,
        output_size=head_output_size,
        pool_kernel_size=head_pool_kernel_size,
        dropout_rate=dropout_rate,
        activation=head_activation,
        output_with_global_average=head_output_with_global_average,
    )
    blocks.append(head)
    return Net(blocks=nn.ModuleList(blocks))