def test_create_slowfast_with_callable(self):
    """
    Test builder `create_slowfast` with callable inputs.

    Every combination of norm in (nn.BatchNorm3d, None) and activation in
    (nn.ReLU, nn.Sigmoid, None) is used to build a depth-18, 400-class model,
    which is then run on the inputs from `TestSlowFast._get_inputs`.
    """
    clip_length = 32
    crop_size = 224
    num_channels = 3
    norm_choices = (nn.BatchNorm3d, None)
    activation_choices = (nn.ReLU, nn.Sigmoid, None)

    for norm, activation in itertools.product(norm_choices, activation_choices):
        model = create_slowfast(
            slowfast_channel_reduction_ratio=8,
            slowfast_conv_channel_fusion_ratio=2,
            slowfast_fusion_conv_kernel_size=(7, 1, 1),
            slowfast_fusion_conv_stride=(4, 1, 1),
            input_channels=(num_channels,) * 2,
            model_depth=18,
            model_num_class=400,
            dropout_rate=0,
            norm=norm,
            activation=activation,
        )

        # Test forwarding.
        candidates = TestSlowFast._get_inputs(num_channels, clip_length, crop_size)
        for tensor in candidates:
            with torch.no_grad():
                if tensor[0].shape[1] == num_channels:
                    model(tensor)
                else:
                    # A channel-count mismatch on the first tensor is expected
                    # to make the forward pass fail.
                    with self.assertRaises(RuntimeError):
                        model(tensor)
def _slowfast(
    pretrained: bool = False,
    progress: bool = True,
    checkpoint_path: str = "",
    **kwargs: Any,
) -> nn.Module:
    """
    Build a SlowFast model and optionally load weights from a checkpoint.

    Args:
        pretrained (bool): if True, fetch the checkpoint at `checkpoint_path`
            and load its "model_state" entry into the model.
        progress (bool): forwarded to `load_state_dict_from_url`.
        checkpoint_path (str): location passed to `load_state_dict_from_url`;
            only used when `pretrained` is True.
        **kwargs: forwarded unchanged to `create_slowfast`.

    Returns:
        nn.Module: the constructed model, with weights loaded when requested.
    """
    model = create_slowfast(**kwargs)
    if pretrained:
        # Map every tensor onto CPU while deserializing so that a checkpoint
        # saved from GPU tensors can still be loaded on a CPU-only machine.
        checkpoint = load_state_dict_from_url(
            checkpoint_path, progress=progress, map_location="cpu"
        )
        state_dict = checkpoint["model_state"]
        model.load_state_dict(state_dict)
    return model
def _slowfast(
    pretrained: bool = False,
    progress: bool = True,
    checkpoint_path: str = "",
    **kwargs: Any,
) -> nn.Module:
    """
    Create a SlowFast network, loading checkpoint weights when asked to.

    Args:
        pretrained (bool): when True, the checkpoint at `checkpoint_path` is
            fetched and its "model_state" entry is loaded into the network.
        progress (bool): passed through to `load_state_dict_from_url`.
        checkpoint_path (str): checkpoint location; ignored unless
            `pretrained` is True.
        **kwargs: passed through to `create_slowfast`.

    Returns:
        nn.Module: the network built by `create_slowfast`.
    """
    net = create_slowfast(**kwargs)
    if not pretrained:
        return net

    # All models are loaded onto CPU by default
    weights = load_state_dict_from_url(
        checkpoint_path, progress=progress, map_location="cpu"
    )["model_state"]
    net.load_state_dict(weights)
    return net
def _construct_network(self, cfg):
    """
    Builds a SlowFast model.

    The network is created with `create_slowfast` and stored on `self.model`;
    the post-activation returned by `get_head_act` is stored on
    `self.post_act`.

    Most per-stage arguments are a pair of 4-element tuples. The config lists
    are indexed as `[stage][pathway]`, and the pathway index (0 or 1) selects
    which tuple of the pair the value goes into.

    Args:
        cfg (CfgNode): model building configs, details are in the
            comments of the config file.
    """
    # Params from configs.
    norm_module = get_norm(cfg)
    pool_size = _POOL1[cfg.MODEL.ARCH]
    num_groups = cfg.RESNET.NUM_GROUPS
    width_per_group = cfg.RESNET.WIDTH_PER_GROUP
    spatial_dilations = cfg.RESNET.SPATIAL_DILATIONS
    spatial_strides = cfg.RESNET.SPATIAL_STRIDES
    temp_kernel = _TEMPORAL_KERNEL_BASIS[cfg.MODEL.ARCH]

    self.model = create_slowfast(
        # SlowFast configs.
        slowfast_channel_reduction_ratio=cfg.SLOWFAST.BETA_INV,
        slowfast_conv_channel_fusion_ratio=cfg.SLOWFAST.FUSION_CONV_CHANNEL_RATIO,
        slowfast_fusion_conv_kernel_size=(
            cfg.SLOWFAST.FUSION_KERNEL_SZ,
            1,
            1,
        ),
        slowfast_fusion_conv_stride=(cfg.SLOWFAST.ALPHA, 1, 1),
        # Input clip configs.
        input_channels=cfg.DATA.INPUT_CHANNEL_NUM,
        # Model configs.
        model_depth=cfg.RESNET.DEPTH,
        model_num_class=cfg.MODEL.NUM_CLASSES,
        dropout_rate=cfg.MODEL.DROPOUT_RATE,
        # Normalization configs.
        norm=norm_module,
        # Activation configs.
        activation=partial(nn.ReLU, inplace=cfg.RESNET.INPLACE_RELU),
        # Stem configs.
        stem_dim_outs=(
            width_per_group,
            width_per_group // cfg.SLOWFAST.BETA_INV,
        ),
        stem_conv_kernel_sizes=(
            (temp_kernel[0][0][0], 7, 7),
            (temp_kernel[0][1][0], 7, 7),
        ),
        stem_conv_strides=((1, 2, 2), (1, 2, 2)),
        stem_pool=nn.MaxPool3d,
        stem_pool_kernel_sizes=((1, 3, 3), (1, 3, 3)),
        stem_pool_strides=((1, 2, 2), (1, 2, 2)),
        # Stage configs.
        stage_conv_a_kernel_sizes=(
            (
                (temp_kernel[1][0][0], 1, 1),
                (temp_kernel[2][0][0], 1, 1),
                (temp_kernel[3][0][0], 1, 1),
                (temp_kernel[4][0][0], 1, 1),
            ),
            (
                (temp_kernel[1][1][0], 1, 1),
                (temp_kernel[2][1][0], 1, 1),
                (temp_kernel[3][1][0], 1, 1),
                (temp_kernel[4][1][0], 1, 1),
            ),
        ),
        stage_conv_b_kernel_sizes=(
            ((1, 3, 3), (1, 3, 3), (1, 3, 3), (1, 3, 3)),
            ((1, 3, 3), (1, 3, 3), (1, 3, 3), (1, 3, 3)),
        ),
        stage_conv_b_num_groups=(
            (num_groups, num_groups, num_groups, num_groups),
            (num_groups, num_groups, num_groups, num_groups),
        ),
        # Each stage reads its own dilation entry for both pathways, mirroring
        # how stage_spatial_strides is indexed below.
        stage_conv_b_dilations=(
            (
                (1, spatial_dilations[0][0], spatial_dilations[0][0]),
                (1, spatial_dilations[1][0], spatial_dilations[1][0]),
                (1, spatial_dilations[2][0], spatial_dilations[2][0]),
                (1, spatial_dilations[3][0], spatial_dilations[3][0]),
            ),
            (
                (1, spatial_dilations[0][1], spatial_dilations[0][1]),
                (1, spatial_dilations[1][1], spatial_dilations[1][1]),
                (1, spatial_dilations[2][1], spatial_dilations[2][1]),
                (1, spatial_dilations[3][1], spatial_dilations[3][1]),
            ),
        ),
        stage_spatial_strides=(
            (
                spatial_strides[0][0],
                spatial_strides[1][0],
                spatial_strides[2][0],
                spatial_strides[3][0],
            ),
            (
                spatial_strides[0][1],
                spatial_strides[1][1],
                spatial_strides[2][1],
                spatial_strides[3][1],
            ),
        ),
        stage_temporal_strides=((1, 1, 1, 1), (1, 1, 1, 1)),
        bottleneck=create_bottleneck_block,
        # Head configs.
        head_pool=nn.AvgPool3d,
        # Kernel = (temporal, height, width). The first pathway's temporal
        # extent is NUM_FRAMES // ALPHA, the second uses NUM_FRAMES directly;
        # the spatial extent is TRAIN_CROP_SIZE // 32. Each is then divided by
        # the matching pool_size entry.
        head_pool_kernel_sizes=(
            (
                cfg.DATA.NUM_FRAMES // cfg.SLOWFAST.ALPHA // pool_size[0][0],
                cfg.DATA.TRAIN_CROP_SIZE // 32 // pool_size[0][1],
                cfg.DATA.TRAIN_CROP_SIZE // 32 // pool_size[0][2],
            ),
            (
                cfg.DATA.NUM_FRAMES // pool_size[1][0],
                cfg.DATA.TRAIN_CROP_SIZE // 32 // pool_size[1][1],
                cfg.DATA.TRAIN_CROP_SIZE // 32 // pool_size[1][2],
            ),
        ),
        head_activation=None,
        head_output_with_global_average=False,
    )

    self.post_act = get_head_act(cfg.MODEL.HEAD_ACT)
def _construct_network(self, cfg):
    """
    Builds a SlowFast model.

    The backbone is created with `create_slowfast` and stored on
    `self.model`. When `cfg.DETECTION.ENABLE` is set, an ROI pooling head is
    also created and stored on `self.detection_head`. The post-activation
    returned by `get_head_act` is stored on `self.post_act`.

    Args:
        cfg (CfgNode): model building configs, details are in the
            comments of the config file.

    Raises:
        KeyError: if `cfg.RESNET.DEPTH` is not a key of `_MODEL_STAGE_DEPTH`
            (only 50 and 101 are listed).
    """
    # Number of blocks in each of the 4 stages, keyed by model depth.
    _MODEL_STAGE_DEPTH = {50: (3, 4, 6, 3), 101: (3, 4, 23, 3)}

    # Params from configs.
    norm_module = get_norm(cfg)
    pool_size = _POOL1[cfg.MODEL.ARCH]
    num_groups = cfg.RESNET.NUM_GROUPS
    width_per_group = cfg.RESNET.WIDTH_PER_GROUP
    spatial_dilations = cfg.RESNET.SPATIAL_DILATIONS
    spatial_strides = cfg.RESNET.SPATIAL_STRIDES
    temp_kernel = _TEMPORAL_KERNEL_BASIS[cfg.MODEL.ARCH]
    num_block_temp_kernel = cfg.RESNET.NUM_BLOCK_TEMP_KERNEL
    stage_depth = _MODEL_STAGE_DEPTH[cfg.RESNET.DEPTH]

    # Per pathway and stage: the first num_block_temp_kernel[stage][pathway]
    # blocks use the configured temporal kernel, and the remaining blocks of
    # the stage fall back to a (1, 1, 1) kernel.
    stage_conv_a_kernel_sizes = [[], []]
    for pathway in range(2):
        for stage in range(4):
            num_temp_blocks = num_block_temp_kernel[stage][pathway]
            stage_conv_a_kernel_sizes[pathway].append(
                ((temp_kernel[stage + 1][pathway][0], 1, 1),) * num_temp_blocks
                + ((1, 1, 1),) * (stage_depth[stage] - num_temp_blocks)
            )

    # Head from config
    # Number of stages = 4
    stage_dim_in = cfg.RESNET.WIDTH_PER_GROUP * 2 ** (4 + 1)
    head_in_features = stage_dim_in + stage_dim_in // cfg.SLOWFAST.BETA_INV

    if cfg.DETECTION.ENABLE:
        self.detection_head = create_res_roi_pooling_head(
            in_features=head_in_features,
            out_features=cfg.MODEL.NUM_CLASSES,
            pool=None,
            output_size=(1, 1, 1),
            dropout_rate=cfg.MODEL.DROPOUT_RATE,
            activation=None,
            output_with_global_average=False,
            pool_spatial=nn.MaxPool2d,
            resolution=[cfg.DETECTION.ROI_XFORM_RESOLUTION] * 2,
            spatial_scale=1.0 / float(cfg.DETECTION.SPATIAL_SCALE_FACTOR),
            sampling_ratio=0,
            roi=ROIAlign,
        )
        # In detection mode the backbone pools over time only; the spatial
        # kernel stays at 1 x 1.
        head_pool_kernel_sizes = (
            (
                cfg.DATA.NUM_FRAMES // cfg.SLOWFAST.ALPHA // pool_size[0][0],
                1,
                1,
            ),
            (cfg.DATA.NUM_FRAMES // pool_size[1][0], 1, 1),
        )
    else:
        head_pool_kernel_sizes = (
            (
                cfg.DATA.NUM_FRAMES // cfg.SLOWFAST.ALPHA // pool_size[0][0],
                cfg.DATA.TRAIN_CROP_SIZE // 32 // pool_size[0][1],
                cfg.DATA.TRAIN_CROP_SIZE // 32 // pool_size[0][2],
            ),
            (
                cfg.DATA.NUM_FRAMES // pool_size[1][0],
                cfg.DATA.TRAIN_CROP_SIZE // 32 // pool_size[1][1],
                cfg.DATA.TRAIN_CROP_SIZE // 32 // pool_size[1][2],
            ),
        )

    self.model = create_slowfast(
        # SlowFast configs.
        slowfast_channel_reduction_ratio=cfg.SLOWFAST.BETA_INV,
        slowfast_conv_channel_fusion_ratio=cfg.SLOWFAST.FUSION_CONV_CHANNEL_RATIO,
        slowfast_fusion_conv_kernel_size=(
            cfg.SLOWFAST.FUSION_KERNEL_SZ,
            1,
            1,
        ),
        slowfast_fusion_conv_stride=(cfg.SLOWFAST.ALPHA, 1, 1),
        # Input clip configs.
        input_channels=cfg.DATA.INPUT_CHANNEL_NUM,
        # Model configs.
        model_depth=cfg.RESNET.DEPTH,
        model_num_class=cfg.MODEL.NUM_CLASSES,
        dropout_rate=cfg.MODEL.DROPOUT_RATE,
        # Normalization configs.
        norm=norm_module,
        # Activation configs.
        activation=partial(nn.ReLU, inplace=cfg.RESNET.INPLACE_RELU),
        # Stem configs.
        stem_dim_outs=(
            width_per_group,
            width_per_group // cfg.SLOWFAST.BETA_INV,
        ),
        stem_conv_kernel_sizes=(
            (temp_kernel[0][0][0], 7, 7),
            (temp_kernel[0][1][0], 7, 7),
        ),
        stem_conv_strides=((1, 2, 2), (1, 2, 2)),
        stem_pool=nn.MaxPool3d,
        stem_pool_kernel_sizes=((1, 3, 3), (1, 3, 3)),
        stem_pool_strides=((1, 2, 2), (1, 2, 2)),
        # Stage configs.
        stage_conv_a_kernel_sizes=stage_conv_a_kernel_sizes,
        stage_conv_b_kernel_sizes=(
            ((1, 3, 3), (1, 3, 3), (1, 3, 3), (1, 3, 3)),
            ((1, 3, 3), (1, 3, 3), (1, 3, 3), (1, 3, 3)),
        ),
        stage_conv_b_num_groups=(
            (num_groups, num_groups, num_groups, num_groups),
            (num_groups, num_groups, num_groups, num_groups),
        ),
        # Each stage reads its own dilation entry for both pathways, mirroring
        # how stage_spatial_strides is indexed below.
        stage_conv_b_dilations=(
            (
                (1, spatial_dilations[0][0], spatial_dilations[0][0]),
                (1, spatial_dilations[1][0], spatial_dilations[1][0]),
                (1, spatial_dilations[2][0], spatial_dilations[2][0]),
                (1, spatial_dilations[3][0], spatial_dilations[3][0]),
            ),
            (
                (1, spatial_dilations[0][1], spatial_dilations[0][1]),
                (1, spatial_dilations[1][1], spatial_dilations[1][1]),
                (1, spatial_dilations[2][1], spatial_dilations[2][1]),
                (1, spatial_dilations[3][1], spatial_dilations[3][1]),
            ),
        ),
        stage_spatial_strides=(
            (
                spatial_strides[0][0],
                spatial_strides[1][0],
                spatial_strides[2][0],
                spatial_strides[3][0],
            ),
            (
                spatial_strides[0][1],
                spatial_strides[1][1],
                spatial_strides[2][1],
                spatial_strides[3][1],
            ),
        ),
        stage_temporal_strides=((1, 1, 1, 1), (1, 1, 1, 1)),
        bottleneck=create_bottleneck_block,
        # Head configs.
        # NOTE(review): self.detection_mode is read here but never assigned in
        # this method; presumably it is set from cfg.DETECTION.ENABLE before
        # this call. Confirm against the constructor.
        head=create_res_basic_head if not self.detection_mode else None,
        head_pool=nn.AvgPool3d,
        head_pool_kernel_sizes=head_pool_kernel_sizes,
        head_activation=None,
        head_output_with_global_average=False,
    )

    self.post_act = get_head_act(cfg.MODEL.HEAD_ACT)