def segm_resnet50(segm_input_dim=(256, 256), segm_inter_dim=(256, 256), backbone_pretrained=True, topk_pos=3, topk_neg=3, mixer_channels=2):
    """Construct a segmentation network with a ResNet-50 backbone.

    NOTE(review): the ``segm_input_dim`` and ``segm_inter_dim`` arguments are
    ignored — both are unconditionally overwritten below with values matching
    the ResNet-50 feature channels. Confirm whether these parameters should be
    honoured or removed from the signature.
    """
    # backbone
    backbone_net = backbones.resnet50(pretrained=backbone_pretrained)
    # segmentation dimensions (hard-wired to ResNet-50's conv1/layer1/layer2/layer3 channels)
    segm_input_dim = (64, 256, 512, 1024)
    segm_inter_dim = (4, 16, 32, 64)
    # convolutions before cosine similarity
    segm_dim = (64, 64)
    # segmentation predictor head
    segm_predictor = segmmodels.SegmNet(segm_input_dim=segm_input_dim, segm_inter_dim=segm_inter_dim, segm_dim=segm_dim, topk_pos=topk_pos, topk_neg=topk_neg, mixer_channels=mixer_channels)
    # Full network; the backbone is frozen (extractor_grad=False).
    net = SegmNet(feature_extractor=backbone_net, segm_predictor=segm_predictor, segm_layers=['conv1', 'layer1', 'layer2', 'layer3'], extractor_grad=False)  # extractor_grad=False
    return net
def drnet_resnet50(iou_input_dim=(512, 1024), iou_inter_dim=(256, 256), backbone_pretrained=True,
                   pretrained_weights_path='/mnt/lustre/baishuai/experiment/pytracking_networks/rpn_r50_c4_2x-3d4c1e14.pth'):
    """Construct a DRNet with a ResNet-50 backbone and a direct bb regressor.

    Args:
        iou_input_dim: Input feature dimensions for the regressor head.
        iou_inter_dim: Intermediate dimensions inside the regressor head.
        backbone_pretrained: If True, load detector-pretrained backbone weights
            from ``pretrained_weights_path``.
        pretrained_weights_path: Checkpoint holding a ``'state_dict'`` entry.
            Previously this path was hard-coded inside the function body; it is
            now a parameter (default unchanged for backward compatibility).

    Returns:
        A DRNet with a frozen feature extractor.
    """
    # backbone
    backbone_net = backbones.resnet50(pretrained=backbone_pretrained)

    if backbone_pretrained:
        mod = torch.load(pretrained_weights_path)['state_dict']
        model_dict = backbone_net.state_dict()
        pretrained_dict = {}
        for k, v in mod.items():
            # Drop the leading module prefix (e.g. 'backbone.') from the checkpoint key
            # so it lines up with the backbone's own parameter names.
            name = '.'.join(k.split('.')[1:])
            # Skip RPN-head weights; keep only keys the backbone actually has.
            if name in model_dict and k.split('.')[0] != "rpn_head":
                pretrained_dict[name] = v
        model_dict.update(pretrained_dict)
        backbone_net.load_state_dict(model_dict, strict=True)

    # Bounding box regressor
    iou_predictor = bbmodels.DirectReg(input_dim=iou_input_dim, pred_inter_dim=iou_inter_dim)

    net = DRNet(feature_extractor=backbone_net, bb_regressor=iou_predictor,
                bb_regressor_layer=['layer2', 'layer3'], extractor_grad=False)
    return net
def SBDT_resnet50(input_dim=(512, 1024), locator_inter_dim=(128, 256), iou_input_dim=(256, 256), iou_inter_dim=(256, 256), backbone_pretrained=True):
    """Build an SBDT tracker: ResNet-50 features, an ATOM IoU bounding-box
    regressor, and an online target locator, all sharing layer2/layer3 features."""
    # Shared ResNet-50 feature extractor (kept frozen).
    features = backbones.resnet50(pretrained=backbone_pretrained)

    # IoU-prediction bounding-box regression head.
    regressor = bbmodels.AtomIoUNet(input_dim=input_dim, pred_input_dim=iou_input_dim, pred_inter_dim=iou_inter_dim)

    # Online target-location prediction head.
    locator = locmodels.OnlineRRNet50(input_dim=input_dim, pred_input_dim=locator_inter_dim)

    # Assemble the full network.
    return SBDTNet(feature_extractor=features, feature_layer=['layer2', 'layer3'],
                   bb_regressor=regressor, location_predictor=locator, extractor_grad=False)
def __init__(self, output_layers, pretrained, frozen_layers):
    """Wrap a ResNet-50 backbone for the parent feature-extractor class.

    1024 is passed as the channel count — presumably the channel width of the
    deepest returned layer (layer3 of ResNet-50); TODO confirm against parent.
    """
    net = backbones.resnet50(output_layers=output_layers, pretrained=pretrained,
                             frozen_layers=frozen_layers)
    super().__init__(net, 1024)
def steepest_descent_learn_filter_resnet50_newiou(filter_size=1, optim_iter=3, optim_init_step=1.0, optim_init_reg=0.01, output_activation=None, classification_layer='layer3', backbone_pretrained=False, clf_feat_blocks=1, clf_feat_norm=True, init_filter_norm=False, final_conv=False, out_feature_dim=256, init_gauss_sigma=1.0, num_dist_bins=5, bin_displacement=1.0, test_loss=None, mask_init_factor=4.0, iou_input_dim=(256,256), iou_inter_dim=(256,256), jitter_sigma_factor=None):
    """Build an OptimTracker whose classifier features take a combined (doubled-
    channel, e.g. RGB+TIR) input, then initialize backbone/classifier/regressor
    from an RGB-pretrained checkpoint, duplicating the first classifier conv's
    weights along the input-channel axis to match the doubled input.
    """
    # backbone
    backbone_net = backbones.resnet50(pretrained=backbone_pretrained)
    # Normalization scale for the classifier features.
    norm_scale = math.sqrt(1.0 / (out_feature_dim * filter_size * filter_size))
    # classifier
    clf_feature_extractor = clf_features.residual_bottleneck_comb(num_blocks=clf_feat_blocks, l2norm=clf_feat_norm, final_conv=final_conv, norm_scale=norm_scale, out_dim=out_feature_dim)
    initializer = clf_initializer.FilterInitializerLinear(filter_size=filter_size, filter_norm=init_filter_norm, feature_dim=out_feature_dim)
    optimizer = clf_optimizer.SteepestDescentLearn(num_iter=optim_iter, filter_size=filter_size, init_step_length=optim_init_step, init_filter_reg=optim_init_reg, feature_dim=out_feature_dim, init_gauss_sigma=init_gauss_sigma, num_dist_bins=num_dist_bins, bin_displacement=bin_displacement, test_loss=test_loss, mask_init_factor=mask_init_factor)
    classifier = target_clf.LinearFilter(filter_size=filter_size, filter_initializer=initializer, filter_optimizer=optimizer, feature_extractor=clf_feature_extractor, output_activation=output_activation, jitter_sigma_factor=jitter_sigma_factor)
    # Bounding box regressor
    # combine RGB and TIR by 2*
    bb_regressor = bbmodels.AtomIoUNet(input_dim=(4*128,4*256), pred_input_dim=iou_input_dim, pred_inter_dim=iou_inter_dim)
    # load pretrained model
    # NOTE(review): hard-coded absolute path — will not resolve on other machines.
    pretrainmodel_path='/home/lichao/projects/pytracking_lichao/pytracking/DiMP_nets/sdlearn_300_onlytestloss_lr_causal_mg30_iou_nocf_res50_lfilt512_coco/OptimTracker_ep0040.pth.tar'
    pretrainmodel = loading.torch_load_legacy(pretrainmodel_path)['net']
    # Hard-coded switches controlling which sub-modules get the pretrained weights.
    usepretrain = True; updback = True; updcls = True; updbb = True
    if usepretrain:
        if updback:
            # update backbone: keep checkpoint entries whose 'feature_extractor.'-stripped
            # key exists in the backbone's own state dict.
            backbone_dict = backbone_net.state_dict()
            pretrain_dict = {k[len('feature_extractor.'):]: v for k, v in pretrainmodel.items() if k[len('feature_extractor.'):] in backbone_dict}
            backbone_net.load_state_dict(pretrain_dict)
        if updcls:
            # update classifier
            # Duplicate the first conv's weights along dim 1 (input channels) so the
            # RGB-pretrained filter accepts the doubled combined input.
            pretrainmodel['classifier.feature_extractor.0.weight']=torch.cat((pretrainmodel['classifier.feature_extractor.0.weight'],pretrainmodel['classifier.feature_extractor.0.weight']),1)
            classifier_dict = classifier.state_dict()
            pretrain_dict = {k[len('classifier.'):]: v for k, v in pretrainmodel.items() if k[len('classifier.'):] in classifier_dict}
            #classifier_dict.update(pretrain_dict)
            # NOTE(review): load_state_dict defaults to strict=True, so this raises if
            # pretrain_dict does not cover every classifier parameter — confirm intended.
            classifier.load_state_dict(pretrain_dict)
        if updbb:
            # update Bounding box regressor
            bb_regressor_dict = bb_regressor.state_dict()
            pretrain_dict = {k[len('bb_regressor.'):]: v for k, v in pretrainmodel.items() if k[len('bb_regressor.'):] in bb_regressor_dict}
            bb_regressor.load_state_dict(pretrain_dict)
    net = OptimTracker(feature_extractor=backbone_net, classifier=classifier, bb_regressor=bb_regressor, classification_layer=classification_layer, bb_regressor_layer=['layer2', 'layer3'])
    return net
def dimpnet50(filter_size=1, optim_iter=5, optim_init_step=1.0, optim_init_reg=0.01,
              classification_layer='layer3', feat_stride=16, backbone_pretrained=True,
              clf_feat_blocks=0, clf_feat_norm=True, init_filter_norm=False, final_conv=True,
              out_feature_dim=512, init_gauss_sigma=1.0, num_dist_bins=5, bin_displacement=1.0,
              mask_init_factor=4.0, iou_input_dim=(256, 256), iou_inter_dim=(256, 256),
              score_act='relu', act_param=None, target_mask_act='sigmoid',
              detach_length=float('Inf'), frozen_backbone_layers=()):
    """Construct a ResNet-50 DiMP tracker whose classifier uses a transformer module.

    Raises:
        ValueError: If ``classification_layer`` is not ``'layer3'`` or ``'layer4'``.
            (Was a bare ``raise Exception``; ``ValueError`` is a subclass, so any
            existing ``except Exception`` handler still catches it.)
    """
    # Backbone
    backbone_net = backbones.resnet50(pretrained=backbone_pretrained, frozen_layers=frozen_backbone_layers)

    # Feature normalization
    norm_scale = math.sqrt(1.0 / (out_feature_dim * filter_size * filter_size))

    # Classifier features: input channel count depends on which backbone layer is used.
    if classification_layer == 'layer3':
        feature_dim = 256
    elif classification_layer == 'layer4':
        feature_dim = 512
    else:
        raise ValueError('Unsupported classification_layer: {!r} (expected layer3 or layer4)'.format(classification_layer))

    clf_feature_extractor = clf_features.residual_bottleneck(feature_dim=feature_dim, num_blocks=clf_feat_blocks, l2norm=clf_feat_norm,
                                                             final_conv=final_conv, norm_scale=norm_scale, out_dim=out_feature_dim)

    # Initializer for the DiMP classifier
    initializer = clf_initializer.FilterInitializerLinear(filter_size=filter_size, filter_norm=init_filter_norm,
                                                          feature_dim=out_feature_dim)

    # Optimizer for the DiMP classifier
    optimizer = clf_optimizer.DiMPSteepestDescentGN(num_iter=optim_iter, feat_stride=feat_stride,
                                                    init_step_length=optim_init_step, init_filter_reg=optim_init_reg,
                                                    init_gauss_sigma=init_gauss_sigma, num_dist_bins=num_dist_bins,
                                                    bin_displacement=bin_displacement, mask_init_factor=mask_init_factor,
                                                    score_act=score_act, act_param=act_param, mask_act=target_mask_act,
                                                    detach_length=detach_length)

    # Transformer used inside the classifier.
    init_transformer = transformer.Transformer(d_model=512, nhead=1, num_layers=1)

    # The classifier module
    classifier = target_clf.LinearFilter(filter_size=filter_size, filter_initializer=initializer,
                                         filter_optimizer=optimizer, feature_extractor=clf_feature_extractor,
                                         transformer=init_transformer)

    # Bounding box regressor
    bb_regressor = bbmodels.AtomIoUNet(input_dim=(4 * 128, 4 * 256), pred_input_dim=iou_input_dim,
                                       pred_inter_dim=iou_inter_dim)

    # DiMP network
    net = DiMPnet(feature_extractor=backbone_net, classifier=classifier, bb_regressor=bb_regressor,
                  classification_layer=classification_layer, bb_regressor_layer=['layer2', 'layer3'])
    return net
def atom_resnet50(iou_input_dim=(256,256), iou_inter_dim=(256,256), backbone_pretrained=True):
    """ATOM tracker: ResNet-50 backbone plus an IoU-prediction bb regressor."""
    # Feature extractor (frozen during training).
    features = backbones.resnet50(pretrained=backbone_pretrained)
    # IoU predictor consuming pooled layer2/layer3 features.
    regressor = bbmodels.AtomIoUNet(input_dim=(4 * 128, 4 * 256),
                                    pred_input_dim=iou_input_dim,
                                    pred_inter_dim=iou_inter_dim)
    return ATOMnet(feature_extractor=features, bb_regressor=regressor,
                   bb_regressor_layer=['layer2', 'layer3'], extractor_grad=False)
def klcedimpnet50(filter_size=1, optim_iter=5, optim_init_step=1.0, optim_init_reg=0.01, classification_layer='layer3', feat_stride=16, backbone_pretrained=True, clf_feat_blocks=0, clf_feat_norm=True, init_filter_norm=False, final_conv=True, out_feature_dim=512, gauss_sigma=1.0, iou_input_dim=(256, 256), iou_inter_dim=(256, 256), detach_length=float('Inf'), alpha_eps=0.0, train_feature_extractor=True, init_uni_weight=None, optim_min_reg=1e-3, init_initializer='default', normalize_label=False, label_shrink=0, softmax_reg=None, label_threshold=0, final_relu=False, frozen_backbone_layers=()):
    """Construct a KL/CE (PrDiMP-style) DiMP-50 tracker.

    Builds the ResNet-50 backbone, a residual-bottleneck classifier feature head,
    a linear filter initializer, a PrDiMP steepest-descent-Newton filter optimizer,
    and an ATOM IoU bounding-box regressor, then assembles them into a DiMPnet.
    """
    # Freezing the whole backbone is how feature-extractor training is disabled.
    if not train_feature_extractor:
        frozen_backbone_layers = 'all'
    # Backbone
    backbone_net = backbones.resnet50(pretrained=backbone_pretrained, frozen_layers=frozen_backbone_layers)
    # Feature normalization
    norm_scale = math.sqrt(1.0 / (out_feature_dim * filter_size * filter_size))
    # Classifier features
    clf_feature_extractor = clf_features.residual_bottleneck(num_blocks=clf_feat_blocks, l2norm=clf_feat_norm, final_conv=final_conv, norm_scale=norm_scale, out_dim=out_feature_dim, final_relu=final_relu)
    # Initializer for the DiMP classifier
    initializer = clf_initializer.FilterInitializerLinear(filter_size=filter_size, filter_norm=init_filter_norm, feature_dim=out_feature_dim, init_weights=init_initializer)
    # Optimizer for the DiMP classifier (probabilistic / KL-CE formulation)
    optimizer = clf_optimizer.PrDiMPSteepestDescentNewton(num_iter=optim_iter, feat_stride=feat_stride, init_step_length=optim_init_step, init_filter_reg=optim_init_reg, gauss_sigma=gauss_sigma, detach_length=detach_length, alpha_eps=alpha_eps, init_uni_weight=init_uni_weight, min_filter_reg=optim_min_reg, normalize_label=normalize_label, label_shrink=label_shrink, softmax_reg=softmax_reg, label_threshold=label_threshold)
    # The classifier module
    classifier = target_clf.LinearFilter(filter_size=filter_size, filter_initializer=initializer, filter_optimizer=optimizer, feature_extractor=clf_feature_extractor)
    # Bounding box regressor
    bb_regressor = bbmodels.AtomIoUNet(input_dim=(4*128,4*256), pred_input_dim=iou_input_dim, pred_inter_dim=iou_inter_dim)
    # DiMP network
    net = DiMPnet(feature_extractor=backbone_net, classifier=classifier, bb_regressor=bb_regressor, classification_layer=classification_layer, bb_regressor_layer=['layer2', 'layer3'])
    return net
def drnet_resnet50(iou_input_dim=(512, 1024), iou_inter_dim=(256, 256), backbone_pretrained=True):
    """DRNet with a ResNet-50 backbone; both the feature extractor and the
    regressor are frozen.

    NOTE(review): if this file also contains the earlier ``drnet_resnet50``
    definition, this one shadows it — confirm which is intended.
    """
    # Backbone exposing the intermediate layers the regressor reads.
    features = backbones.resnet50(
        output_layers=['conv1', 'layer1', 'layer2', 'layer3'],
        pretrained=backbone_pretrained)
    # Direct bounding-box regression head.
    regressor = bbmodels.DirectReg(input_dim=iou_input_dim, pred_inter_dim=iou_inter_dim)
    return DRNet(feature_extractor=features, bb_regressor=regressor,
                 bb_regressor_layer=['layer2', 'layer3'],
                 extractor_grad=False, regressor_grad=False)
def atom_resnet50_mul_fpn(iou_input_dim=(256, 256), iou_inter_dim=(256, 256), backbone_pretrained=True, share_rt=False):
    """ATOM variant whose IoU head fuses multi-level FPN features from ResNet-50."""
    # Frozen ResNet-50 feature extractor.
    features = backbones.resnet50(pretrained=backbone_pretrained)
    # Multi-level FPN IoU-prediction head.
    regressor = bbmodels.AtomMulFPNIoUNet(input_dim=(512, 1024),
                                          pred_input_dim=iou_input_dim,
                                          pred_inter_dim=iou_inter_dim,
                                          share_rt=share_rt)
    return ATOMnet(feature_extractor=features, bb_regressor=regressor,
                   bb_regressor_layer=['layer2', 'layer3'], extractor_grad=False)
def depth_atom_resnet50(iou_input_dim=(256, 256), iou_inter_dim=(256, 256), backbone_pretrained=True):
    """ATOM tracker augmented with a depth-feature branch feeding the IoU head."""
    # RGB feature extractor (frozen).
    rgb_features = backbones.resnet50(pretrained=backbone_pretrained)
    # Depth feature extractor.
    depth_features = depth.depthResnet50()
    # IoU head consuming both RGB and depth features.
    regressor = depthModels.DepthAtomIoUNet(input_dim=(4 * 256, 4 * 512),
                                            pred_input_dim=iou_input_dim,
                                            pred_inter_dim=iou_inter_dim)
    return DepthATOMnet(feature_extractor=rgb_features,
                        depth_feature_extractor=depth_features,
                        bb_regressor=regressor,
                        bb_regressor_layer=['layer2', 'layer3'],
                        extractor_grad=False)
def dimpnet50(filter_size=1, optim_iter=5, optim_init_step=1.0, optim_init_reg=0.01, classification_layer='layer3', feat_stride=16, backbone_pretrained=True, clf_feat_blocks=0, clf_feat_norm=True, init_filter_norm=False, final_conv=True, out_feature_dim=512, init_gauss_sigma=1.0, num_dist_bins=5, bin_displacement=1.0, mask_init_factor=4.0, iou_input_dim=(256, 256), iou_inter_dim=(256, 256), score_act='relu', act_param=None, target_mask_act='sigmoid', detach_length=float('Inf')):
    """Construct the RGB-D DiMP-50 tracker (DiMPnet_rgbd_blend1 variant).

    NOTE(review): this function reads a name ``settings`` that is NOT among its
    parameters — unless a module-level/global ``settings`` exists, every call
    below raises NameError. Confirm whether a ``settings`` parameter was lost.
    """
    # Backbone
    backbone_net = backbones.resnet50(pretrained=backbone_pretrained)
    # Feature normalization
    norm_scale = math.sqrt(1.0 / (out_feature_dim * filter_size * filter_size))
    # Classifier features
    clf_feature_extractor = clf_features.residual_bottleneck(num_blocks=clf_feat_blocks, l2norm=clf_feat_norm, final_conv=final_conv, norm_scale=norm_scale, out_dim=out_feature_dim)
    # Initializer for the DiMP classifier
    initializer = clf_initializer.FilterInitializerLinear(settings=settings, filter_size=filter_size, filter_norm=init_filter_norm, feature_dim=out_feature_dim)
    # Optimizer for the DiMP classifier
    optimizer = clf_optimizer.DiMPSteepestDescentGN(settings=settings, num_iter=optim_iter, feat_stride=feat_stride, init_step_length=optim_init_step, init_filter_reg=optim_init_reg, init_gauss_sigma=init_gauss_sigma, num_dist_bins=num_dist_bins, bin_displacement=bin_displacement, mask_init_factor=mask_init_factor, score_act=score_act, act_param=act_param, mask_act=target_mask_act, detach_length=detach_length)
    # NOTE(review): leftover debug print — consider removing or routing through logging.
    print('Song in ltr.models.tracking.DiMPnet_rgbd_blend1.py line 233, before classifier, target_clf.LinearFilter ...')
    # The classifier module
    classifier = target_clf.LinearFilter(settings=settings, filter_size=filter_size, filter_initializer=initializer, filter_optimizer=optimizer, feature_extractor=clf_feature_extractor)
    # Bounding box regressor for rgb
    bb_regressor = bbmodels.AtomIoUNet(settings=settings, input_dim=(4 * 128, 4 * 256), pred_input_dim=iou_input_dim, pred_inter_dim=iou_inter_dim)
    # NOTE(review): leftover debug print — consider removing or routing through logging.
    print('Song in ltr.models.tracking.DiMPnet_rgbd_blend1.py line 240, dimpnet50 model_constructor ...')
    # DiMP network
    net = DiMPnet_rgbd_blend1(settings=settings, feature_extractor=backbone_net, classifier=classifier, bb_regressor=bb_regressor, classification_layer=classification_layer, bb_regressor_layer=['layer2', 'layer3'])
    return net
def steepest_descent_resnet50(filter_size=1, num_filters=1, optim_iter=3, optim_init_reg=0.01,
                              backbone_pretrained=False, clf_feat_blocks=1, clf_feat_norm=True,
                              final_conv=False, out_feature_dim=512,
                              target_model_input_layer='layer3',
                              decoder_input_layers=("layer4", "layer3", "layer2", "layer1",),
                              detach_length=float('Inf'), label_encoder_dims=(1, 1),
                              frozen_backbone_layers=(), decoder_mdim=64, filter_groups=1,
                              use_bn_in_label_enc=True, dilation_factors=None,
                              backbone_type='imagenet'):
    """Construct an LWTL (learning-what-to-learn) segmentation network on ResNet-50.

    Raises:
        ValueError: If ``backbone_type`` is not ``'imagenet'`` or ``'mrcnn'``.
            (Was a bare ``raise Exception``; ``ValueError`` is a subclass, so
            existing ``except Exception`` handlers still catch it.)
    """
    # backbone feature extractor F
    if backbone_type == 'imagenet':
        backbone_net = backbones.resnet50(pretrained=backbone_pretrained, frozen_layers=frozen_backbone_layers)
    elif backbone_type == 'mrcnn':
        backbone_net = mrcnn_backbones.resnet50(pretrained=False, frozen_layers=frozen_backbone_layers)
    else:
        raise ValueError('Unknown backbone_type: {!r} (expected imagenet or mrcnn)'.format(backbone_type))

    norm_scale = math.sqrt(1.0 / (out_feature_dim * filter_size * filter_size))
    layer_channels = backbone_net.out_feature_channels()

    # Extracts features input to the target model
    target_model_feature_extractor = clf_features.residual_basic_block(
        feature_dim=layer_channels[target_model_input_layer], num_blocks=clf_feat_blocks,
        l2norm=clf_feat_norm, final_conv=final_conv, norm_scale=norm_scale, out_dim=out_feature_dim)

    # Few-shot label generator and weight predictor
    label_encoder = seg_label_encoder.ResidualDS16SW(
        layer_dims=label_encoder_dims + (num_filters,), use_bn=use_bn_in_label_enc)

    # Predicts initial target model parameters
    initializer = seg_initializer.FilterInitializerZero(
        filter_size=filter_size, num_filters=num_filters, feature_dim=out_feature_dim,
        filter_groups=filter_groups)

    # Computes few-shot learning loss
    residual_module = loss_residual_modules.LWTLResidual(
        init_filter_reg=optim_init_reg, filter_dilation_factors=dilation_factors)

    # Iteratively updates the target model parameters by minimizing the few-shot learning loss
    optimizer = steepestdescent.GNSteepestDescent(
        residual_module=residual_module, num_iter=optim_iter, detach_length=detach_length,
        residual_batch_dim=1, compute_losses=True)

    # Target model and Few-shot learner
    target_model = target_clf.LinearFilter(
        filter_size=filter_size, filter_initializer=initializer, filter_optimizer=optimizer,
        feature_extractor=target_model_feature_extractor, filter_dilation_factors=dilation_factors)

    # Decoder
    decoder_input_layers_channels = {L: layer_channels[L] for L in decoder_input_layers}
    decoder = lwtl_decoder.LWTLDecoder(num_filters, decoder_mdim, decoder_input_layers_channels, use_bn=True)

    net = LWTLNet(feature_extractor=backbone_net, target_model=target_model, decoder=decoder,
                  label_encoder=label_encoder, target_model_input_layer=target_model_input_layer,
                  decoder_input_layers=decoder_input_layers)
    return net
def __init__(self, settings=None, filter_size=1, num_filters=1, optim_iter=3, optim_init_reg=0.01, backbone_pretrained=False, clf_feat_blocks=1, clf_feat_norm=True, final_conv=False, out_feature_dim=512, target_model_input_layer='layer3', decoder_input_layers=("layer4", "layer3", "layer2", "layer1",), detach_length=float('Inf'), label_encoder_dims=(1, 1), frozen_backbone_layers=(), decoder_mdim=64, filter_groups=1, use_bn_in_label_enc=True, dilation_factors=None, backbone_type='imagenet'):
    """Build an LWL segmentation model plus its training objective/weights, and
    load pre-trained weights via ``self._load_pretrained_weights``.

    ``settings`` is required (raises if None) and is stored on the instance.
    """
    super().__init__()
    # NOTE(review): bare Exception — a ValueError would be more precise.
    if settings is None:
        raise Exception("settings cannot be None")
    self.settings = settings
    ############## BUILD NET ###################
    # backbone feature extractor F
    if backbone_type == 'imagenet':
        backbone_net = backbones.resnet50(pretrained=backbone_pretrained, frozen_layers=frozen_backbone_layers)
    elif backbone_type == 'mrcnn':
        backbone_net = mrcnn_backbones.resnet50(pretrained=False, frozen_layers=frozen_backbone_layers)
    else:
        # NOTE(review): bare `raise Exception` with no message — consider ValueError.
        raise Exception
    # Normalization scale for the target-model features.
    norm_scale = math.sqrt(1.0 / (out_feature_dim * filter_size * filter_size))
    layer_channels = backbone_net.out_feature_channels()
    # Extracts features input to the target model
    target_model_feature_extractor = clf_features.residual_basic_block(feature_dim=layer_channels[target_model_input_layer], num_blocks=clf_feat_blocks, l2norm=clf_feat_norm, final_conv=final_conv, norm_scale=norm_scale, out_dim=out_feature_dim)
    # Few-shot label generator and weight predictor
    label_encoder = seg_label_encoder.ResidualDS16SW(layer_dims=label_encoder_dims + (num_filters,), use_bn=use_bn_in_label_enc)
    # Predicts initial target model parameters
    initializer = seg_initializer.FilterInitializerZero(filter_size=filter_size, num_filters=num_filters, feature_dim=out_feature_dim, filter_groups=filter_groups)
    # Computes few-shot learning loss
    residual_module = loss_residual_modules.LWTLResidual(init_filter_reg=optim_init_reg, filter_dilation_factors=dilation_factors)
    # Iteratively updates the target model parameters by minimizing the few-shot learning loss
    optimizer = steepestdescent.GNSteepestDescent(residual_module=residual_module, num_iter=optim_iter, detach_length=detach_length, residual_batch_dim=1, compute_losses=True)
    # Target model and Few-shot learner
    target_model = target_clf.LinearFilter(filter_size=filter_size, filter_initializer=initializer, filter_optimizer=optimizer, feature_extractor=target_model_feature_extractor, filter_dilation_factors=dilation_factors)
    # Decoder
    decoder_input_layers_channels = {L: layer_channels[L] for L in decoder_input_layers}
    decoder = lwtl_decoder.LWTLDecoder(num_filters, decoder_mdim, decoder_input_layers_channels, use_bn=True)
    # build lwl model
    self.net = LWTLNet(feature_extractor=backbone_net, target_model=target_model, decoder=decoder, label_encoder=label_encoder, target_model_input_layer=target_model_input_layer, decoder_input_layers=decoder_input_layers)
    ############## BUILD NET ###################
    # Loss function
    self.objective = {
        'segm': LovaszSegLoss(per_image=False),
    }
    self.loss_weight = {
        'segm': 100.0
    }
    # actor initialization
    self.num_refinement_iter = 2
    self.disable_backbone_bn = False
    self.disable_all_bn = True
    # Load pre-trained maskrcnn weights
    self._load_pretrained_weights(settings)
def steepest_descent_learn_filter_resnet50_newiou(
        filter_size=1, optim_iter=3, optim_init_step=1.0, optim_init_reg=0.01,
        output_activation=None, classification_layer='layer3',
        backbone_pretrained=False, clf_feat_blocks=1, clf_feat_norm=True,
        init_filter_norm=False, final_conv=False, out_feature_dim=256,
        init_gauss_sigma=1.0, num_dist_bins=5, bin_displacement=1.0,
        test_loss=None, mask_init_factor=4.0, iou_input_dim=(256, 256),
        iou_inter_dim=(256, 256), jitter_sigma_factor=None):
    """Build an OptimTracker: ResNet-50 backbone, a steepest-descent learned
    classifier filter, and an ATOM IoU bounding-box regressor."""
    # Backbone feature extractor.
    backbone = backbones.resnet50(pretrained=backbone_pretrained)

    # Normalization scale for the classifier features.
    feat_norm_scale = math.sqrt(1.0 / (out_feature_dim * filter_size * filter_size))

    # Classifier branch: feature head, filter initializer, filter optimizer.
    clf_feat = clf_features.residual_bottleneck(
        num_blocks=clf_feat_blocks, l2norm=clf_feat_norm, final_conv=final_conv,
        norm_scale=feat_norm_scale, out_dim=out_feature_dim)
    filt_init = clf_initializer.FilterInitializerLinear(
        filter_size=filter_size, filter_norm=init_filter_norm,
        feature_dim=out_feature_dim)
    filt_optim = clf_optimizer.SteepestDescentLearn(
        num_iter=optim_iter, filter_size=filter_size,
        init_step_length=optim_init_step, init_filter_reg=optim_init_reg,
        feature_dim=out_feature_dim, init_gauss_sigma=init_gauss_sigma,
        num_dist_bins=num_dist_bins, bin_displacement=bin_displacement,
        test_loss=test_loss, mask_init_factor=mask_init_factor)
    clf = target_clf.LinearFilter(
        filter_size=filter_size, filter_initializer=filt_init,
        filter_optimizer=filt_optim, feature_extractor=clf_feat,
        output_activation=output_activation,
        jitter_sigma_factor=jitter_sigma_factor)

    # IoU-based bounding-box regression head.
    regressor = bbmodels.AtomIoUNet(
        input_dim=(4 * 128, 4 * 256), pred_input_dim=iou_input_dim,
        pred_inter_dim=iou_inter_dim)

    return OptimTracker(feature_extractor=backbone, classifier=clf,
                        bb_regressor=regressor,
                        classification_layer=classification_layer,
                        bb_regressor_layer=['layer2', 'layer3'])
def kysnet_res50(filter_size=4, optim_iter=3, appearance_feature_dim=512, optim_init_step=0.9, optim_init_reg=0.1, classification_layer='layer3', backbone_pretrained=True, clf_feat_blocks=0, clf_feat_norm=True, final_conv=True, init_filter_norm=False, mask_init_factor=3.0, score_act='relu', target_mask_act='sigmoid', num_dist_bins=100, bin_displacement=0.1, detach_length=float('Inf'), train_feature_extractor=True, train_iounet=True, iou_input_dim=(256, 256), iou_inter_dim=(256, 256), cv_kernel_size=3, cv_max_displacement=9, cv_stride=1, init_gauss_sigma=1.0, state_dim=8, representation_predictor_dims=(64, 32), gru_ksz=3, conf_measure='max', dimp_thresh=None):
    """Construct a KYS tracker on ResNet-50: a DiMP classifier, an ATOM IoU
    regressor, and a cost-volume-based motion/response predictor."""
    # ######################## backbone ########################
    backbone_net = backbones.resnet50(pretrained=backbone_pretrained)
    # Normalization scale for the appearance (classifier) features.
    norm_scale = math.sqrt(1.0 / (appearance_feature_dim * filter_size * filter_size))
    # ######################## classifier ########################
    clf_feature_extractor = clf_features.residual_bottleneck(num_blocks=clf_feat_blocks, l2norm=clf_feat_norm, final_conv=final_conv, norm_scale=norm_scale, out_dim=appearance_feature_dim)
    # Initializer for the DiMP classifier
    initializer = clf_initializer.FilterInitializerLinear(filter_size=filter_size, filter_norm=init_filter_norm, feature_dim=appearance_feature_dim)
    # Optimizer for the DiMP classifier (feat_stride fixed to 16 here)
    optimizer = clf_optimizer.DiMPSteepestDescentGN(num_iter=optim_iter, feat_stride=16, init_step_length=optim_init_step, init_filter_reg=optim_init_reg, init_gauss_sigma=init_gauss_sigma, num_dist_bins=num_dist_bins, bin_displacement=bin_displacement, mask_init_factor=mask_init_factor, score_act=score_act, act_param=None, mask_act=target_mask_act, detach_length=detach_length)
    # The classifier module
    classifier = target_clf.LinearFilter(filter_size=filter_size, filter_initializer=initializer, filter_optimizer=optimizer, feature_extractor=clf_feature_extractor)
    # Bounding box regressor
    bb_regressor = bbmodels.AtomIoUNet(input_dim=(4 * 128, 4 * 256), pred_input_dim=iou_input_dim, pred_inter_dim=iou_inter_dim)
    # Cost volume between consecutive frames, feeding the motion predictor.
    cost_volume_layer = cost_volume.CostVolume(cv_kernel_size, cv_max_displacement, stride=cv_stride, abs_coordinate_output=True)
    # Motion/response predictor operating on the cost volume.
    motion_response_predictor = resp_pred.ResponsePredictor(state_dim=state_dim, representation_predictor_dims=representation_predictor_dims, gru_ksz=gru_ksz, conf_measure=conf_measure, dimp_thresh=dimp_thresh)
    response_predictor = predictor_wrappers.PredictorWrapper(cost_volume_layer, motion_response_predictor)
    net = KYSNet(backbone_feature_extractor=backbone_net, dimp_classifier=classifier, predictor=response_predictor, bb_regressor=bb_regressor, classification_layer=classification_layer, bb_regressor_layer=['layer2', 'layer3'], train_feature_extractor=train_feature_extractor, train_iounet=train_iounet)
    return net
def fcotnet(clf_filter_size=4, reg_filter_size=3, optim_iter=5, optim_init_step=1.0, optim_init_reg=0.01, classification_layer='layer3', feat_stride=16, backbone_pretrained=True, clf_feat_blocks=0, clf_feat_norm=True, init_filter_norm=False, final_conv=True, out_feature_dim=512, norm_scale_coef=2, init_gauss_sigma=1.0, num_dist_bins=5, bin_displacement=1.0, mask_init_factor=4.0, score_act='relu', act_param=None, target_mask_act='sigmoid', detach_length=float('Inf'), train_cls_72_and_reg_init=True, train_reg_optimizer=False, train_cls_18=False):
    """Construct an FCOT network: ResNet-50 backbone with an FPN-style pyramid,
    two classifiers at different resolutions (18 and 72), and a regression
    filter at the 72 resolution."""
    # Backbone
    backbone_net = backbones.resnet50(pretrained=backbone_pretrained)
    # FPN-style upsampling pyramid over the backbone features.
    pyramid_first_conv = FPNUpBlock(res_channels=1024, planes=256, smooth_output=False, first_conv=True)
    up_36 = FPNUpBlock(res_channels=512, planes=256, smooth_output=False, first_conv=False)
    up_72 = FPNUpBlock(res_channels=256, planes=256, smooth_output=True, first_conv=False)
    # classifier_72 (high-resolution branch, feat_stride 4)
    norm_scale_72 = math.sqrt(norm_scale_coef / (256 * clf_filter_size * clf_filter_size))
    clf_head_72 = clf_features.clf_head_72(feature_dim=256, l2norm=clf_feat_norm, norm_scale=norm_scale_72, out_dim=256, inner_dim=128)
    initializer_72 = clf_initializer.FilterInitializerLinear(filter_size=clf_filter_size, filter_norm=init_filter_norm, feature_dim=256, feature_stride=4)
    optimizer_72 = clf_optimizer.DiMPSteepestDescentGN(num_iter=optim_iter, feat_stride=4, init_step_length=optim_init_step, init_filter_reg=optim_init_reg, init_gauss_sigma=init_gauss_sigma, num_dist_bins=num_dist_bins, bin_displacement=bin_displacement, mask_init_factor=mask_init_factor, score_act=score_act, act_param=act_param, mask_act=target_mask_act, detach_length=detach_length)
    classifier_72 = target_clf.LinearFilter(filter_size=clf_filter_size, filter_initializer=initializer_72, filter_optimizer=optimizer_72, feature_extractor=clf_head_72)
    # classifier_18 (We use the same architecture of classifier_18 with DiMP.)
    norm_scale_18 = math.sqrt(1.0 / (out_feature_dim * clf_filter_size * clf_filter_size))
    clf_head_18 = clf_features.clf_head_18(num_blocks=clf_feat_blocks, l2norm=clf_feat_norm, final_conv=final_conv, norm_scale=norm_scale_18, out_dim=out_feature_dim)
    initializer_18 = clf_initializer.FilterInitializerLinear(filter_size=clf_filter_size, filter_norm=init_filter_norm, feature_dim=out_feature_dim)
    optimizer_18 = clf_optimizer.DiMPSteepestDescentGN(num_iter=optim_iter, feat_stride=feat_stride, init_step_length=optim_init_step, init_filter_reg=optim_init_reg, init_gauss_sigma=init_gauss_sigma, num_dist_bins=num_dist_bins, bin_displacement=bin_displacement, mask_init_factor=mask_init_factor, score_act=score_act, act_param=act_param, mask_act=target_mask_act, detach_length=detach_length)
    classifier_18 = target_clf.LinearFilter(filter_size=clf_filter_size, filter_initializer=initializer_18, filter_optimizer=optimizer_18, feature_extractor=clf_head_18)
    # regressor_72 (regression filter on the high-resolution branch)
    reg_optimizer_72 = reg_optimizer.RegSteepestDescentGN(num_iter=optim_iter, feat_stride=4, init_step_length=1.0, init_filter_reg=optim_init_reg, detach_length=detach_length)
    regressor_72 = RegFilter(pool_size=reg_filter_size, filter_dim=4, filter_channel=256, input_features_size=72, input_features_channel=256, inner_channel=128, filter_optimizer=reg_optimizer_72, train_reg_optimizer=train_reg_optimizer, train_cls_72_and_reg_init=train_cls_72_and_reg_init)
    # FCOT network
    net = FCOTNet(feature_extractor=backbone_net, classification_layer=classification_layer, pyramid_first_conv=pyramid_first_conv, pyramid_36=up_36, pyramid_72=up_72, classifier_18=classifier_18, classifier_72=classifier_72, regressor_72=regressor_72, train_reg_optimizer=train_reg_optimizer, train_cls_18=train_cls_18, train_cls_72_and_reg_init=train_cls_72_and_reg_init)
    return net