def _construct_network(self, cfg):
    """
    Builds a SlowFast model.

    Sets ``self.model`` and ``self.post_act``. When ``cfg.DETECTION.ENABLE``
    is true, ``self.detection_head`` is created as well. ``self.detection_mode``
    is read here but assigned elsewhere.

    Args:
        cfg (CfgNode): model building configs, details are in the
            comments of the config file.

    Raises:
        KeyError: if ``cfg.RESNET.DEPTH`` is neither 50 nor 101.
    """
    _MODEL_STAGE_DEPTH = {50: (3, 4, 6, 3), 101: (3, 4, 23, 3)}
    num_pathways = 2
    num_stages = 4

    # Params from configs.
    norm_module = get_norm(cfg)
    pool_size = _POOL1[cfg.MODEL.ARCH]
    num_groups = cfg.RESNET.NUM_GROUPS
    width_per_group = cfg.RESNET.WIDTH_PER_GROUP
    spatial_dilations = cfg.RESNET.SPATIAL_DILATIONS
    spatial_strides = cfg.RESNET.SPATIAL_STRIDES
    temp_kernel = _TEMPORAL_KERNEL_BASIS[cfg.MODEL.ARCH]
    num_block_temp_kernel = cfg.RESNET.NUM_BLOCK_TEMP_KERNEL
    stage_depth = _MODEL_STAGE_DEPTH[cfg.RESNET.DEPTH]

    # Per pathway and per stage: the first `num_block_temp_kernel` blocks use
    # the temporal kernel from the basis table (index 0 of the table is the
    # stem, hence `stage + 1`); the remaining blocks of the stage use
    # (1, 1, 1).
    stage_conv_a_kernel_sizes = [[], []]
    for pathway in range(num_pathways):
        for stage in range(num_stages):
            num_temporal_blocks = num_block_temp_kernel[stage][pathway]
            stage_conv_a_kernel_sizes[pathway].append(
                ((temp_kernel[stage + 1][pathway][0], 1, 1),)
                * num_temporal_blocks
                + ((1, 1, 1),) * (stage_depth[stage] - num_temporal_blocks)
            )

    # Head from config
    stage_dim_in = width_per_group * 2 ** (num_stages + 1)
    head_in_features = stage_dim_in + stage_dim_in // cfg.SLOWFAST.BETA_INV

    # Temporal extent of the head pooling kernel for each pathway.
    head_pool_frames = (
        cfg.DATA.NUM_FRAMES // cfg.SLOWFAST.ALPHA // pool_size[0][0],
        cfg.DATA.NUM_FRAMES // pool_size[1][0],
    )

    if cfg.DETECTION.ENABLE:
        self.detection_head = create_res_roi_pooling_head(
            in_features=head_in_features,
            out_features=cfg.MODEL.NUM_CLASSES,
            pool=None,
            output_size=(1, 1, 1),
            dropout_rate=cfg.MODEL.DROPOUT_RATE,
            activation=None,
            output_with_global_average=False,
            pool_spatial=nn.MaxPool2d,
            resolution=[cfg.DETECTION.ROI_XFORM_RESOLUTION] * 2,
            spatial_scale=1.0 / float(cfg.DETECTION.SPATIAL_SCALE_FACTOR),
            sampling_ratio=0,
            roi=ROIAlign,
        )
        # Only pool over time here; the spatial kernel size stays 1.
        head_pool_kernel_sizes = (
            (head_pool_frames[0], 1, 1),
            (head_pool_frames[1], 1, 1),
        )
    else:
        head_pool_kernel_sizes = (
            (
                head_pool_frames[0],
                cfg.DATA.TRAIN_CROP_SIZE // 32 // pool_size[0][1],
                cfg.DATA.TRAIN_CROP_SIZE // 32 // pool_size[0][2],
            ),
            (
                head_pool_frames[1],
                cfg.DATA.TRAIN_CROP_SIZE // 32 // pool_size[1][1],
                cfg.DATA.TRAIN_CROP_SIZE // 32 // pool_size[1][2],
            ),
        )

    # The config tables are indexed [stage][pathway]; the builder arguments
    # are indexed [pathway][stage]. Every stage of every pathway reads its
    # own entry, so a dilation set on a later stage reaches both pathways.
    stage_conv_b_dilations = tuple(
        tuple(
            (
                1,
                spatial_dilations[stage][pathway],
                spatial_dilations[stage][pathway],
            )
            for stage in range(num_stages)
        )
        for pathway in range(num_pathways)
    )
    stage_spatial_strides = tuple(
        tuple(spatial_strides[stage][pathway] for stage in range(num_stages))
        for pathway in range(num_pathways)
    )

    self.model = create_slowfast(
        # SlowFast configs.
        slowfast_channel_reduction_ratio=cfg.SLOWFAST.BETA_INV,
        slowfast_conv_channel_fusion_ratio=cfg.SLOWFAST.FUSION_CONV_CHANNEL_RATIO,
        slowfast_fusion_conv_kernel_size=(
            cfg.SLOWFAST.FUSION_KERNEL_SZ,
            1,
            1,
        ),
        slowfast_fusion_conv_stride=(cfg.SLOWFAST.ALPHA, 1, 1),
        # Input clip configs.
        input_channels=cfg.DATA.INPUT_CHANNEL_NUM,
        # Model configs.
        model_depth=cfg.RESNET.DEPTH,
        model_num_class=cfg.MODEL.NUM_CLASSES,
        dropout_rate=cfg.MODEL.DROPOUT_RATE,
        # Normalization configs.
        norm=norm_module,
        # Activation configs.
        activation=partial(nn.ReLU, inplace=cfg.RESNET.INPLACE_RELU),
        # Stem configs.
        stem_dim_outs=(
            width_per_group,
            width_per_group // cfg.SLOWFAST.BETA_INV,
        ),
        stem_conv_kernel_sizes=(
            (temp_kernel[0][0][0], 7, 7),
            (temp_kernel[0][1][0], 7, 7),
        ),
        stem_conv_strides=((1, 2, 2), (1, 2, 2)),
        stem_pool=nn.MaxPool3d,
        stem_pool_kernel_sizes=((1, 3, 3), (1, 3, 3)),
        stem_pool_strides=((1, 2, 2), (1, 2, 2)),
        # Stage configs.
        stage_conv_a_kernel_sizes=stage_conv_a_kernel_sizes,
        stage_conv_b_kernel_sizes=(
            ((1, 3, 3), (1, 3, 3), (1, 3, 3), (1, 3, 3)),
            ((1, 3, 3), (1, 3, 3), (1, 3, 3), (1, 3, 3)),
        ),
        stage_conv_b_num_groups=(
            (num_groups, num_groups, num_groups, num_groups),
            (num_groups, num_groups, num_groups, num_groups),
        ),
        stage_conv_b_dilations=stage_conv_b_dilations,
        stage_spatial_strides=stage_spatial_strides,
        stage_temporal_strides=((1, 1, 1, 1), (1, 1, 1, 1)),
        bottleneck=create_bottleneck_block,
        # Head configs.
        head=create_res_basic_head if not self.detection_mode else None,
        head_pool=nn.AvgPool3d,
        head_pool_kernel_sizes=head_pool_kernel_sizes,
        head_activation=None,
        head_output_with_global_average=False,
    )

    self.post_act = get_head_act(cfg.MODEL.HEAD_ACT)
def _construct_network(self, cfg):
    """
    Builds a single pathway ResNet model.

    Assigns ``self.model`` and ``self.post_act``, plus
    ``self.detection_head`` when ``cfg.DETECTION.ENABLE`` is true.

    Args:
        cfg (CfgNode): model building configs, details are in the
            comments of the config file.
    """
    stage_ids = range(4)

    # Values looked up from the config.
    norm_layer = get_norm(cfg)
    post_activation = get_head_act(cfg.MODEL.HEAD_ACT)
    pool1 = _POOL1[cfg.MODEL.ARCH]
    groups = cfg.RESNET.NUM_GROUPS
    dilations = cfg.RESNET.SPATIAL_DILATIONS
    strides = cfg.RESNET.SPATIAL_STRIDES
    kernel_basis = _TEMPORAL_KERNEL_BASIS[cfg.MODEL.ARCH]

    # A stage-1 pool is inserted unless its kernel is all ones.
    first_pool = pool1[0]
    use_stage1_pool = first_pool[0] != 1 or len(set(first_pool)) > 1

    per_stage_stride = tuple(strides[s][0] for s in stage_ids)
    per_stage_dilation = tuple(
        (1, dilations[s][0], dilations[s][0]) for s in stage_ids
    )

    if cfg.MODEL.ARCH != "i3d":
        # Entry 0 of the basis table belongs to the stem, so stages start
        # at index 1.
        conv_a_kernels = tuple(
            (kernel_basis[s + 1][0][0], 1, 1) for s in stage_ids
        )
    else:
        # "i3d" uses hard-coded kernels; list entries hold more than one
        # kernel for a stage.
        conv_a_kernels = (
            (3, 1, 1),
            [(3, 1, 1), (1, 1, 1)],
            [(3, 1, 1), (1, 1, 1)],
            [(1, 1, 1), (3, 1, 1)],
        )

    # Temporal size shared by both possible head pooling kernels.
    head_frames = cfg.DATA.NUM_FRAMES // first_pool[0]

    # Head from config
    if cfg.DETECTION.ENABLE:
        self.detection_head = create_res_roi_pooling_head(
            in_features=cfg.RESNET.WIDTH_PER_GROUP * 2 ** (4 + 1),
            out_features=cfg.MODEL.NUM_CLASSES,
            pool=nn.AvgPool3d,
            output_size=(1, 1, 1),
            pool_kernel_size=(head_frames, 1, 1),
            dropout_rate=cfg.MODEL.DROPOUT_RATE,
            activation=None,
            output_with_global_average=False,
            pool_spatial=nn.MaxPool2d,
            resolution=[cfg.DETECTION.ROI_XFORM_RESOLUTION] * 2,
            spatial_scale=1.0 / float(cfg.DETECTION.SPATIAL_SCALE_FACTOR),
            sampling_ratio=0,
            roi=ROIAlign,
        )

    self.model = create_resnet(
        # Input clip configs.
        input_channel=cfg.DATA.INPUT_CHANNEL_NUM[0],
        # Model configs.
        model_depth=cfg.RESNET.DEPTH,
        model_num_class=cfg.MODEL.NUM_CLASSES,
        dropout_rate=cfg.MODEL.DROPOUT_RATE,
        # Normalization configs.
        norm=norm_layer,
        # Activation configs.
        activation=partial(nn.ReLU, inplace=cfg.RESNET.INPLACE_RELU),
        # Stem configs.
        stem_dim_out=cfg.RESNET.WIDTH_PER_GROUP,
        stem_conv_kernel_size=(kernel_basis[0][0][0], 7, 7),
        stem_conv_stride=(1, 2, 2),
        stem_pool=nn.MaxPool3d,
        stem_pool_kernel_size=(1, 3, 3),
        stem_pool_stride=(1, 2, 2),
        # Stage configs.
        stage1_pool=nn.MaxPool3d if use_stage1_pool else None,
        stage1_pool_kernel_size=first_pool,
        stage_conv_a_kernel_size=conv_a_kernels,
        stage_conv_b_kernel_size=((1, 3, 3),) * 4,
        stage_conv_b_num_groups=(groups,) * 4,
        stage_conv_b_dilation=per_stage_dilation,
        stage_spatial_h_stride=per_stage_stride,
        stage_spatial_w_stride=per_stage_stride,
        stage_temporal_stride=(1, 1, 1, 1),
        bottleneck=create_bottleneck_block,
        # Head configs.
        head=None if self.detection_mode else create_res_basic_head,
        head_pool=nn.AvgPool3d,
        head_pool_kernel_size=(
            head_frames,
            cfg.DATA.TRAIN_CROP_SIZE // 32 // first_pool[1],
            cfg.DATA.TRAIN_CROP_SIZE // 32 // first_pool[2],
        ),
        head_activation=None,
        head_output_with_global_average=False,
    )

    self.post_act = post_activation
def test_build_head_with_callable(self):
    """
    Check that `create_res_roi_pooling_head` matches a hand-assembled
    `ResNetRoIHead` for every pool / activation combination.
    """
    # ROI layer configs
    roi_resolution = (10, 15)
    roi_scale = 1.0 / 5.0
    roi_sampling = 0
    reference_roi = RoIAlign(
        roi_resolution, spatial_scale=roi_scale, sampling_ratio=roi_sampling
    )

    pool_options = (nn.AvgPool3d, nn.MaxPool3d, nn.AdaptiveAvgPool3d, None)
    activation_options = (nn.ReLU, nn.Softmax, nn.Sigmoid, None)

    for pool_cls, act_cls in itertools.product(pool_options, activation_options):
        # Instantiate the modules the reference head is assembled from.
        if act_cls is None:
            reference_act = None
        else:
            reference_act = (
                act_cls(dim=1) if act_cls == nn.Softmax else act_cls()
            )

        if pool_cls is None:
            reference_pool = None
        else:
            reference_pool = (
                pool_cls(1)
                if pool_cls == nn.AdaptiveAvgPool3d
                else pool_cls(kernel_size=[5, 1, 1], stride=[1, 1, 1])
            )

        built_head = create_res_roi_pooling_head(
            in_features=16,
            out_features=32,
            resolution=roi_resolution,
            spatial_scale=roi_scale,
            sampling_ratio=roi_sampling,
            roi=RoIAlign,
            pool=pool_cls,
            pool_spatial=nn.MaxPool2d,
            pool_kernel_size=(5, 1, 1),
            output_size=(1, 1, 1),
            dropout_rate=0.0,
            activation=act_cls,
            output_with_global_average=True,
        )
        reference_head = ResNetRoIHead(
            proj=nn.Linear(16, 32),
            activation=reference_act,
            pool=reference_pool,
            pool_spatial=nn.MaxPool2d(roi_resolution, stride=1),
            roi_layer=reference_roi,
            dropout=None,
            output_pool=nn.AdaptiveAvgPool3d(1),
        )

        # Copy the reference weights over; strict mode is explicit so any
        # key mismatch between the two heads fails the test.
        built_head.load_state_dict(reference_head.state_dict(), strict=True)

        # Test forwarding.
        for clip, boxes in TestRoIHeadHelper._get_inputs(input_dim=16):
            with torch.no_grad():
                expect_failure = (
                    clip.shape[1] != 16
                    or pool_cls is None
                    or (
                        clip.shape[-3] != 5
                        and pool_cls != nn.AdaptiveAvgPool3d
                    )
                )
                if expect_failure:
                    with self.assertRaises(Exception):
                        built_head(clip, boxes)
                    continue

                actual = built_head(clip, boxes)
                expected = reference_head(clip, boxes)

            self.assertEqual(
                actual.shape,
                expected.shape,
                "Output shape {} is different from expected shape {}".format(
                    actual.shape, expected.shape
                ),
            )
            self.assertTrue(np.allclose(actual.numpy(), expected.numpy()))