def slow_r50(
    pretrained: bool = False, progress: bool = True, **kwargs: Any
) -> nn.Module:
    r"""
    Slow R50 model architecture [1] with pretrained weights based on 8x8 setting
    on the Kinetics dataset. Model with pretrained weights has top1 accuracy of 74.58.

    [1] Christoph Feichtenhofer et al, "SlowFast Networks for Video Recognition"
    https://arxiv.org/pdf/1812.03982.pdf

    Args:
        pretrained (bool): If True, returns a model pre-trained on the Kinetics dataset
        progress (bool): If True, displays a progress bar of the download to stderr
        kwargs: use these to modify any of the other model settings. All the
            options are defined in pytorchvideo/models/resnet.py

    Returns:
        The module built by ``create_resnet``; when ``pretrained`` is True its
        parameters have been loaded from the ``"slow_r50"`` checkpoint.

    NOTE: to use the pretrained model, do not modify the model configuration
    via the kwargs. Only modify settings via kwargs to initialize a new model
    without pretrained weights.
    """
    model = create_resnet(
        stem_conv_kernel_size=(1, 7, 7),
        head_pool_kernel_size=(8, 7, 7),
        model_depth=50,
        **kwargs,
    )

    if pretrained:
        path = checkpoint_paths["slow_r50"]
        # Deserialize onto the CPU so that loading does not depend on the
        # device the checkpoint was saved from; load_state_dict copies the
        # values into the model's own parameters afterwards.
        checkpoint = load_state_dict_from_url(
            path, progress=progress, map_location="cpu"
        )
        state_dict = checkpoint["model_state"]
        model.load_state_dict(state_dict)

    return model
def make_kinetics_resnet():
    """Build a ResNet through ``resnet.create_resnet`` with fixed settings.

    The model is created with ``input_channel=3``, ``model_depth=50`` and
    ``model_num_class=3``, using ``nn.BatchNorm3d`` as the normalisation
    layer and ``nn.ReLU`` as the activation.

    Returns:
        The model returned by ``resnet.create_resnet``.
    """
    model_settings = {
        "input_channel": 3,
        "model_depth": 50,
        "model_num_class": 3,
        "norm": nn.BatchNorm3d,
        "activation": nn.ReLU,
    }
    return resnet.create_resnet(**model_settings)
def _construct_network(self, cfg):
    """
    Assemble the single pathway ResNet described by ``cfg``.

    The network returned by ``create_resnet`` is stored on ``self.model`` and
    the head activation selected by ``cfg.MODEL.HEAD_ACT`` is stored on
    ``self.post_act``.

    Args:
        cfg (CfgNode): model building configs, details are in the comments
            of the config file.
    """
    # Values pulled from the config and the per-architecture lookup tables.
    norm_layer = get_norm(cfg)
    post_activation = get_head_act(cfg.MODEL.HEAD_ACT)
    arch = cfg.MODEL.ARCH
    pool_sizes = _POOL1[arch]
    groups = cfg.RESNET.NUM_GROUPS
    dilations = cfg.RESNET.SPATIAL_DILATIONS
    strides = cfg.RESNET.SPATIAL_STRIDES
    temporal_basis = _TEMPORAL_KERNEL_BASIS[arch]

    # A stage-1 pool is only inserted when its kernel is not all ones.
    first_pool = pool_sizes[0]
    needs_stage1_pool = first_pool[0] != 1 or len(set(first_pool)) > 1

    # One spatial stride per stage, shared by the height and width axes.
    per_stage_stride = tuple(strides[idx][0] for idx in range(4))

    if arch == "i3d":
        conv_a_kernels = (
            (3, 1, 1),
            [(3, 1, 1), (1, 1, 1)],
            [(3, 1, 1), (1, 1, 1)],
            [(1, 1, 1), (3, 1, 1)],
        )
    else:
        # Entries 1..4 of the temporal basis map to the four stages
        # (entry 0 is used for the stem below).
        conv_a_kernels = tuple(
            (temporal_basis[idx][0][0], 1, 1) for idx in range(1, 5)
        )

    conv_b_kernels = ((1, 3, 3),) * 4
    conv_b_groups = (groups,) * 4
    conv_b_dilations = tuple(
        (1, dilations[idx][0], dilations[idx][0]) for idx in range(4)
    )

    # Head pooling kernel: frame count and crop size // 32, each divided by
    # the matching entry of the first pool size.
    head_spatial = cfg.DATA.TRAIN_CROP_SIZE // 32
    head_kernel = (
        cfg.DATA.NUM_FRAMES // first_pool[0],
        head_spatial // first_pool[1],
        head_spatial // first_pool[2],
    )

    self.model = create_resnet(
        # Input clip configs.
        input_channel=cfg.DATA.INPUT_CHANNEL_NUM[0],
        # Model configs.
        model_depth=cfg.RESNET.DEPTH,
        model_num_class=cfg.MODEL.NUM_CLASSES,
        dropout_rate=cfg.MODEL.DROPOUT_RATE,
        # Normalization configs.
        norm=norm_layer,
        # Activation configs.
        activation=partial(nn.ReLU, inplace=cfg.RESNET.INPLACE_RELU),
        # Stem configs.
        stem_dim_out=cfg.RESNET.WIDTH_PER_GROUP,
        stem_conv_kernel_size=(temporal_basis[0][0][0], 7, 7),
        stem_conv_stride=(1, 2, 2),
        stem_pool=nn.MaxPool3d,
        stem_pool_kernel_size=(1, 3, 3),
        stem_pool_stride=(1, 2, 2),
        # Stage configs.
        stage1_pool=nn.MaxPool3d if needs_stage1_pool else None,
        stage1_pool_kernel_size=first_pool,
        stage_conv_a_kernel_size=conv_a_kernels,
        stage_conv_b_kernel_size=conv_b_kernels,
        stage_conv_b_num_groups=conv_b_groups,
        stage_conv_b_dilation=conv_b_dilations,
        stage_spatial_h_stride=per_stage_stride,
        stage_spatial_w_stride=per_stage_stride,
        stage_temporal_stride=(1, 1, 1, 1),
        bottleneck=create_bottleneck_block,
        # Head configs.
        head_pool=nn.AvgPool3d,
        head_pool_kernel_size=head_kernel,
        head_activation=None,
        head_output_with_global_average=False,
    )

    self.post_act = post_activation