def __init__(self,
             num_iter=1,
             filter_size=1,
             feature_dim=256,
             feat_stride=16,
             init_step_length=1.0,
             init_filter_reg=1e-2,
             init_gauss_sigma=1.0,
             num_dist_bins=5,
             bin_displacement=1.0,
             mask_init_factor=4.0,
             test_loss=None):
    """Create the learnable step length, regularizer and 1x1 predictors.

    Args:
        num_iter: Stored unchanged as ``self.num_iter``.
        filter_size: Not read by this constructor.
        feature_dim: Not read by this constructor.
        feat_stride: Stored unchanged as ``self.feat_stride``.
        init_step_length: Initial step length; its natural logarithm is kept
            as the learnable parameter ``self.log_step_length``.
        init_filter_reg: Initial value of the learnable ``self.filter_reg``.
        init_gauss_sigma: Width of the Gaussian used to initialize the label
            map predictor weights. ``0`` selects a one-hot profile on the
            first distance bin instead.
        num_dist_bins: Number of distance bins, i.e. the number of input
            channels of each 1x1 predictor.
        bin_displacement: Spacing between consecutive distance bins.
        mask_init_factor: Scale applied to ``tanh(2 - d)`` when initializing
            the target mask predictor weights.
        test_loss: Loss stored as ``self.test_loss``. Defaults to
            ``ltr_losses.LBHinge(threshold=0.05)`` when ``None``.
    """
    super().__init__()

    if test_loss is None:
        test_loss = ltr_losses.LBHinge(threshold=0.05)

    # The step length is learned in the log domain.
    self.log_step_length = nn.Parameter(
        math.log(init_step_length) * torch.ones(1))
    self.num_iter = num_iter
    self.test_loss = test_loss
    self.filter_reg = nn.Parameter(init_filter_reg * torch.ones(1))
    self.feat_stride = feat_stride
    self.distance_map = DistanceMap(num_dist_bins, bin_displacement)

    # Distance value of every bin, laid out like a 1x1 conv weight
    # of shape (1, num_dist_bins, 1, 1).
    bin_index = torch.arange(num_dist_bins, dtype=torch.float32)
    bin_dist = bin_index.reshape(1, -1, 1, 1) * bin_displacement

    if init_gauss_sigma != 0:
        gauss_profile = torch.exp(-0.5 * (bin_dist / init_gauss_sigma) ** 2)
    else:
        # Degenerate Gaussian: all mass on the first bin.
        gauss_profile = torch.zeros_like(bin_dist)
        gauss_profile[0, 0, 0, 0] = 1

    # Label map predictor: Gaussian profile shifted so its minimum is zero.
    self.label_map_predictor = nn.Conv2d(num_dist_bins, 1,
                                         kernel_size=1, bias=False)
    self.label_map_predictor.weight.data = gauss_profile - gauss_profile.min()

    # Target mask predictor: 1x1 conv followed by a sigmoid.
    mask_conv = nn.Conv2d(num_dist_bins, 1, kernel_size=1, bias=False)
    mask_conv.weight.data = mask_init_factor * torch.tanh(2.0 - bin_dist)
    self.target_mask_predictor = nn.Sequential(mask_conv, nn.Sigmoid())

    # Spatial weight predictor: starts with every weight equal to one.
    self.spatial_weight_predictor = nn.Conv2d(num_dist_bins, 1,
                                              kernel_size=1, bias=False)
    self.spatial_weight_predictor.weight.data.fill_(1.0)
def run(settings):
    """Configure and launch the 'First training with gradient descent.' run.

    Writes the hyper-parameters below onto ``settings``, builds the
    ``Got10k_i`` train/val samplers and loaders, creates the network, the
    ``OptimTrackerActor``, an Adam optimizer with per-module learning rates
    and a ``StepLR`` schedule, and finally trains for 50 epochs.

    Args:
        settings: Training settings object. ``settings.env.got10k_dir`` is
            read; every other attribute used here is assigned in this
            function.
    """
    # ------------------------------------------------------------------
    # Hyper-parameters
    # ------------------------------------------------------------------
    settings.description = 'First training with gradient descent.'
    settings.batch_size = 6
    settings.num_workers = 16
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.output_sigma_factor = 0.25
    settings.target_filter_sz = 4
    settings.feature_sz = 18
    settings.output_sz = 16 * settings.feature_sz
    settings.center_jitter_factor = dict(train=3, test=4.5)
    settings.scale_jitter_factor = dict(train=0.25, test=0.5)
    settings.hinge_threshold = 0.05
    settings.print_stats = [
        'Loss/total',
        'Loss/iou',
        'ClfTrain/init_loss',
        'ClfTrain/train_loss',
        'ClfTrain/iter_loss',
        'ClfTrain/test_loss',
        'ClfTrain/test_init_loss',
        'ClfTrain/test_iter_loss',
    ]

    # ------------------------------------------------------------------
    # Datasets (only Got10k_i is enabled for both training and validation)
    # ------------------------------------------------------------------
    got10k_train = Got10k_i(settings.env.got10k_dir, split='train')
    got10k_val = Got10k_i(settings.env.got10k_dir, split='val')

    # ------------------------------------------------------------------
    # Transforms
    # ------------------------------------------------------------------
    norm_args = dict(mean=settings.normalize_mean,
                     std=settings.normalize_std)
    transform_joint = dltransforms.ToGrayscale(probability=0.05)
    transform_train = torchvision.transforms.Compose([
        dltransforms.ToTensorAndJitter(0.2),
        torchvision.transforms.Normalize(**norm_args),
    ])
    transform_val = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(**norm_args),
    ])

    # ------------------------------------------------------------------
    # Processing of the sampled tracking pairs
    # ------------------------------------------------------------------
    output_sigma = settings.output_sigma_factor / settings.search_area_factor
    proposal_params = dict(min_iou=0.1,
                           boxes_per_frame=8,
                           sigma_factor=[0.01, 0.05, 0.1, 0.2, 0.3])
    label_params = dict(feature_sz=settings.feature_sz,
                        sigma_factor=output_sigma,
                        kernel_sz=settings.target_filter_sz)

    # Everything except the per-split image transform is shared.
    processing_args = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        label_function_params=label_params,
        joint_transform=transform_joint,
    )
    data_processing_train = processing.TrackingProcessing(
        transform=transform_train, **processing_args)
    data_processing_val = processing.TrackingProcessing(
        transform=transform_val, **processing_args)

    # ------------------------------------------------------------------
    # Samplers and loaders
    # ------------------------------------------------------------------
    sampler_args = dict(
        max_gap=30,
        frame_sample_mode='causal',
        num_seq_test_frames=3,
        num_class_distractor_frames=0,
        num_seq_train_frames=3,
        num_class_distractor_train_frames=0,
    )
    loader_args = dict(
        batch_size=settings.batch_size,
        num_workers=settings.num_workers,
        drop_last=True,
        stack_dim=1,
    )

    dataset_train = sampler.RandomSequenceWithDistractors(
        [got10k_train], [1],
        samples_per_epoch=26000,
        processing=data_processing_train,
        **sampler_args)
    loader_train = LTRLoader('train', dataset_train,
                             training=True, shuffle=True, **loader_args)

    dataset_val = sampler.RandomSequenceWithDistractors(
        [got10k_val], [1],
        samples_per_epoch=5000,
        processing=data_processing_val,
        **sampler_args)
    loader_val = LTRLoader('val', dataset_val,
                           training=False, shuffle=False,
                           epoch_interval=5, **loader_args)

    # ------------------------------------------------------------------
    # Network, objective and actor
    # ------------------------------------------------------------------
    network = optim_tracker_models.steepest_descent_learn_filter_resnet50_newiou(
        filter_size=settings.target_filter_sz,
        backbone_pretrained=True,
        optim_iter=5,
        clf_feat_norm=True,
        clf_feat_blocks=0,
        final_conv=True,
        out_feature_dim=512,
        optim_init_step=0.9,
        optim_init_reg=0.1,
        init_gauss_sigma=output_sigma * settings.feature_sz,
        num_dist_bins=10,
        bin_displacement=0.5,
        mask_init_factor=3.0)

    objective = {
        'iou': nn.MSELoss(),
        'test_clf': ltr_losses.LBHinge(threshold=settings.hinge_threshold),
    }
    loss_weight = {
        'iou': 1,
        'test_clf': 100,
        'train_clf': 0,
        'init_clf': 0,
        'test_init_clf': 100,
        'test_iter_clf': 400,
    }
    actor = actors.OptimTrackerActor(net=network,
                                     objective=objective,
                                     loss_weight=loss_weight)

    # ------------------------------------------------------------------
    # Optimizer, schedule and trainer
    # ------------------------------------------------------------------
    # Groups without an explicit 'lr' use the Adam default given below.
    param_groups = [
        {'params': actor.net.classifier.filter_initializer.parameters(),
         'lr': 5e-5},
        {'params': actor.net.classifier.filter_optimizer.parameters(),
         'lr': 5e-4},
        {'params': actor.net.classifier.feature_extractor.parameters(),
         'lr': 5e-5},
        {'params': actor.net.bb_regressor.parameters()},
        {'params': actor.net.feature_extractor.parameters(),
         'lr': 2e-5},
    ]
    adam = optim.Adam(param_groups, lr=2e-4)
    schedule = optim.lr_scheduler.StepLR(adam, step_size=15, gamma=0.2)

    trainer = LTRTrainer(actor, [loader_train, loader_val], adam,
                         settings, schedule)
    trainer.train(50, load_latest=True, fail_safe=True)
def run(settings):
    """Configure and launch DiMP-50 training on depth-augmented datasets.

    Writes the hyper-parameters below onto ``settings``, builds train loaders
    from ``MSCOCOSeq_depth`` and ``Lasot_depth`` and a validation loader from
    ``DepthTrack``, creates ``dimpnet.dimpnet50`` with a ``DiMPActor``, an
    Adam optimizer with per-module learning rates and a ``StepLR`` schedule,
    and trains for 50 epochs.

    Args:
        settings: Training settings object. The ``settings.env`` directories
            ``cocodepth_dir``, ``lasotdepth_dir``, ``lasot_dir`` and
            ``depthtrack_dir`` are read; every other attribute used here is
            assigned in this function.
    """
    # ------------------------------------------------------------------
    # Hyper-parameters
    # ------------------------------------------------------------------
    settings.description = 'Default train settings for DiMP with ResNet50 as backbone.'
    settings.batch_size = 10
    settings.num_workers = 8
    settings.multi_gpu = False
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.output_sigma_factor = 0.25
    settings.target_filter_sz = 4
    settings.feature_sz = 18
    settings.output_sz = 16 * settings.feature_sz
    settings.center_jitter_factor = dict(train=3, test=4.5)
    settings.scale_jitter_factor = dict(train=0.25, test=0.5)
    settings.hinge_threshold = 0.05

    # Depth input encodings listed by the original author:
    #   1) raw_depth (marked X)    2) norm_depth    3) centered_norm_depth
    #   4) centered_raw_depth (X)  5) colormap      6) centered_colormap
    # 'norm_depth' and 'colormap' were tried before; 'hha' is the active one.
    depth_inputs = 'hha'

    # ------------------------------------------------------------------
    # Datasets
    # ------------------------------------------------------------------
    coco_train = MSCOCOSeq_depth(settings.env.cocodepth_dir,
                                 dtype=depth_inputs)
    lasot_depth_train = Lasot_depth(root=settings.env.lasotdepth_dir,
                                    rgb_root=settings.env.lasot_dir,
                                    dtype=depth_inputs)
    depthtrack_val = DepthTrack(root=settings.env.depthtrack_dir,
                                split='val',
                                dtype=depth_inputs)

    # ------------------------------------------------------------------
    # Transforms
    # ------------------------------------------------------------------
    norm_args = dict(mean=settings.normalize_mean,
                     std=settings.normalize_std)
    transform_joint = tfm.Transform(tfm.ToGrayscale(probability=0.05))
    transform_train = tfm.Transform(tfm.ToTensorAndJitter(0.2),
                                    tfm.Normalize(**norm_args))
    transform_val = tfm.Transform(tfm.ToTensor(),
                                  tfm.Normalize(**norm_args))

    # ------------------------------------------------------------------
    # Processing of the sampled tracking pairs
    # ------------------------------------------------------------------
    output_sigma = settings.output_sigma_factor / settings.search_area_factor
    proposal_params = dict(min_iou=0.1,
                           boxes_per_frame=8,
                           sigma_factor=[0.01, 0.05, 0.1, 0.2, 0.3])
    label_params = dict(feature_sz=settings.feature_sz,
                        sigma_factor=output_sigma,
                        kernel_sz=settings.target_filter_sz)

    # Everything except the per-split image transform is shared.
    processing_args = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        label_function_params=label_params,
        joint_transform=transform_joint,
    )
    data_processing_train = processing.DiMPProcessing(
        transform=transform_train, **processing_args)
    data_processing_val = processing.DiMPProcessing(
        transform=transform_val, **processing_args)

    # ------------------------------------------------------------------
    # Samplers and loaders
    # ------------------------------------------------------------------
    sampler_args = dict(max_gap=30, num_test_frames=3, num_train_frames=3)
    loader_args = dict(batch_size=settings.batch_size,
                       num_workers=settings.num_workers,
                       drop_last=True,
                       stack_dim=1)

    dataset_train = sampler.DiMPSampler(
        [coco_train, lasot_depth_train], [1, 1],
        samples_per_epoch=26000,
        processing=data_processing_train,
        **sampler_args)
    loader_train = LTRLoader('train', dataset_train,
                             training=True, shuffle=True, **loader_args)

    dataset_val = sampler.DiMPSampler(
        [depthtrack_val], [1],
        samples_per_epoch=5000,
        processing=data_processing_val,
        **sampler_args)
    loader_val = LTRLoader('val', dataset_val,
                           training=False, shuffle=False,
                           epoch_interval=5, **loader_args)

    # ------------------------------------------------------------------
    # Network, objective and actor
    # ------------------------------------------------------------------
    # The original kept a commented-out variant of this call with
    # backbone_pretrained=False; the pretrained backbone is the active one.
    network = dimpnet.dimpnet50(
        filter_size=settings.target_filter_sz,
        backbone_pretrained=True,
        optim_iter=5,
        clf_feat_norm=True,
        clf_feat_blocks=0,
        final_conv=True,
        out_feature_dim=512,
        optim_init_step=0.9,
        optim_init_reg=0.1,
        init_gauss_sigma=output_sigma * settings.feature_sz,
        num_dist_bins=100,
        bin_displacement=0.1,
        mask_init_factor=3.0,
        target_mask_act='sigmoid',
        score_act='relu')

    # Wrap the network for multi GPU training
    if settings.multi_gpu:
        network = MultiGPU(network, dim=1)

    objective = {
        'iou': nn.MSELoss(),
        'test_clf': ltr_losses.LBHinge(threshold=settings.hinge_threshold),
    }
    loss_weight = {
        'iou': 1,
        'test_clf': 100,
        'test_init_clf': 100,
        'test_iter_clf': 400,
    }
    actor = actors.DiMPActor(net=network,
                             objective=objective,
                             loss_weight=loss_weight)

    # ------------------------------------------------------------------
    # Optimizer, schedule and trainer
    # ------------------------------------------------------------------
    # Groups without an explicit 'lr' use the Adam default given below.
    param_groups = [
        {'params': actor.net.classifier.filter_initializer.parameters(),
         'lr': 5e-5},
        {'params': actor.net.classifier.filter_optimizer.parameters(),
         'lr': 5e-4},
        {'params': actor.net.classifier.feature_extractor.parameters(),
         'lr': 5e-5},
        {'params': actor.net.bb_regressor.parameters()},
        {'params': actor.net.feature_extractor.parameters(),
         'lr': 2e-5},
    ]
    adam = optim.Adam(param_groups, lr=2e-4)
    schedule = optim.lr_scheduler.StepLR(adam, step_size=15, gamma=0.2)

    trainer = LTRTrainer(actor, [loader_train, loader_val], adam,
                         settings, schedule)
    trainer.train(50, load_latest=True, fail_safe=True)
def run(settings):
    """Configure and launch one stage of FCOT training (ResNet50 backbone).

    Besides the attributes assigned at the top of this function, the
    following attributes must already be set on ``settings`` by the caller:
    ``batch_size``, ``num_workers``, ``lasot_rate``, ``samples_per_epoch``,
    ``norm_scale_coef``, ``devices_id``, ``total_epochs``,
    ``use_pretrained_dimp`` / ``pretrained_dimp50``,
    ``load_model`` / ``fcot_model`` and the stage flags
    ``train_cls_72_and_reg_init``, ``train_cls_18``, ``train_reg_optimizer``.

    The stage flags select the actor, loss weights, optimizer parameter
    groups and LR milestones:

    * ``train_cls_72_and_reg_init`` and ``train_cls_18``: joint training.
    * ``train_cls_72_and_reg_init`` only: backbone, cls_72 and regression
      (without the regression optimizer).
    * ``train_reg_optimizer``: regression optimizer only.
    * ``train_cls_18``: cls_18 branch only.

    Args:
        settings: Training settings object (see above).

    Raises:
        Exception: If none of the stage flags is set.
        AssertionError: If a checkpoint is requested but its path is ``None``.
    """
    settings.description = 'Default train settings for FCOT with ResNet50 as backbone.'
    settings.multi_gpu = True
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.output_sigma_factor = 1 / 4
    settings.clf_target_filter_sz = 4
    settings.reg_target_filter_sz = 3
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 3, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0.25, 'test': 0.5}
    settings.hinge_threshold = 0.05
    settings.logging_file = 'fcot_log.txt'

    # Train datasets
    lasot_train = Lasot(settings.env.lasot_dir, split='train')
    got10k_train = Got10k(settings.env.got10k_dir, split='vottrain')
    trackingnet_train = TrackingNet(settings.env.trackingnet_dir,
                                    set_ids=list(range(4)))
    coco_train = MSCOCOSeq(settings.env.coco_dir)

    # Validation datasets
    got10k_val = Got10k(settings.env.got10k_dir, split='votval')

    # Data transform
    transform_joint = dltransforms.ToGrayscale(probability=0.05)
    transform_train = torchvision.transforms.Compose([
        dltransforms.ToTensorAndJitter(0.2),
        torchvision.transforms.Normalize(mean=settings.normalize_mean,
                                         std=settings.normalize_std),
    ])
    transform_val = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=settings.normalize_mean,
                                         std=settings.normalize_std),
    ])

    # The tracking pairs processing module
    output_sigma = settings.output_sigma_factor / settings.search_area_factor
    proposal_params = {
        'min_iou': 0.1,
        'boxes_per_frame': 8,
        'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3],
    }
    label_params = {
        'feature_sz': settings.feature_sz,
        'sigma_factor': output_sigma,
        'kernel_sz': settings.clf_target_filter_sz,
    }

    # Train and validation processing differ only in the image transform.
    processing_kwargs = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        output_spatial_scale=72 / 288.,
        proposal_params=proposal_params,
        label_function_params=label_params,
        joint_transform=transform_joint,
    )
    data_processing_train = processing_fcot.AnchorFreeProcessing(
        transform=transform_train, **processing_kwargs)
    data_processing_val = processing_fcot.AnchorFreeProcessing(
        transform=transform_val, **processing_kwargs)

    # Train sampler and loader
    dataset_train = sampler.FCOTSampler(
        [lasot_train, got10k_train, trackingnet_train, coco_train],
        [settings.lasot_rate, 1, 1, 1],
        samples_per_epoch=settings.samples_per_epoch,
        max_gap=30,
        num_test_frames=3,
        num_train_frames=3,
        processing=data_processing_train)

    loader_train = LTRLoader('train',
                             dataset_train,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=1)

    # Validation samplers and loaders
    dataset_val = sampler.FCOTSampler([got10k_val], [1],
                                      samples_per_epoch=5000,
                                      max_gap=30,
                                      num_test_frames=3,
                                      num_train_frames=3,
                                      processing=data_processing_val)

    loader_val = LTRLoader('val',
                           dataset_val,
                           training=False,
                           batch_size=settings.batch_size,
                           epoch_interval=5,
                           num_workers=settings.num_workers,
                           shuffle=False,
                           drop_last=True,
                           stack_dim=1)

    # Create network
    net = fcotnet.fcotnet(
        clf_filter_size=settings.clf_target_filter_sz,
        reg_filter_size=settings.reg_target_filter_sz,
        backbone_pretrained=True,
        optim_iter=5,
        norm_scale_coef=settings.norm_scale_coef,
        clf_feat_norm=True,
        clf_feat_blocks=0,
        final_conv=True,
        out_feature_dim=512,
        optim_init_step=0.9,
        optim_init_reg=0.1,
        init_gauss_sigma=output_sigma * settings.feature_sz,
        num_dist_bins=100,
        bin_displacement=0.1,
        mask_init_factor=3.0,
        target_mask_act='sigmoid',
        score_act='relu',
        train_reg_optimizer=settings.train_reg_optimizer,
        train_cls_72_and_reg_init=settings.train_cls_72_and_reg_init,
        train_cls_18=settings.train_cls_18)

    # First configured GPU if CUDA is available, otherwise the CPU.
    device = torch.device('cuda:{}'.format(settings.devices_id[0])
                          if torch.cuda.is_available() else 'cpu')

    def _submodule_state(state, top_level_name):
        """Return the entries of `state` under `top_level_name`, prefix removed."""
        selected = collections.OrderedDict()
        for key, value in state.items():
            head, _, tail = key.partition('.')
            if head == top_level_name:
                selected[tail] = value
        return selected

    # Load dimp-model as initial weights
    if settings.use_pretrained_dimp:
        assert settings.pretrained_dimp50 is not None, \
            "settings.pretrained_dimp50 must be set when use_pretrained_dimp is enabled"
        dimp50 = torch.load(settings.pretrained_dimp50, map_location=device)
        net.feature_extractor.load_state_dict(
            _submodule_state(dimp50['net'], 'feature_extractor'))
        net.classifier_18.load_state_dict(
            _submodule_state(dimp50['net'], 'classifier'))
        print("loading backbone and Classifier modules from DiMP50 done.")

    # Load fcot-model trained in the previous stage
    if settings.load_model:
        assert settings.fcot_model is not None, \
            "settings.fcot_model must be set when load_model is enabled"
        # map_location keeps the checkpoint off the device it was saved from,
        # matching how the DiMP checkpoint is loaded above. Without it the
        # load fails on machines lacking that device (e.g. CPU-only) or
        # allocates memory on an unrelated GPU.
        load_dict = torch.load(settings.fcot_model, map_location=device)
        fcot_dict = net.state_dict()
        # Only keys that exist in the current network are taken over.
        load_fcotnet_dict = {
            k: v for k, v in load_dict['net'].items() if k in fcot_dict
        }
        fcot_dict.update(load_fcotnet_dict)
        net.load_state_dict(fcot_dict)
        print("loading FCOT model done.")

    # Wrap the network for multi GPU training
    if settings.multi_gpu:
        net = MultiGPU(net, device_ids=settings.devices_id, dim=1).to(device)

    # Loss for cls_72, cls_18 and regression
    objective = {
        'test_clf_72': ltr_losses.LBHinge(threshold=settings.hinge_threshold),
        'test_clf_18': ltr_losses.LBHinge(threshold=settings.hinge_threshold),
        'reg_72': REGLoss(dim=4),
    }

    # Create actor and adam-optimizer
    if settings.train_cls_72_and_reg_init and settings.train_cls_18:
        # Train regression branch and classification branches jointly,
        # except for regression optimizer (TODO: fix)
        print("train cls_72, cls_18 and reg_init jointly...")
        loss_weight = {
            'test_clf_72': 100,
            'test_init_clf_72': 100,
            'test_iter_clf_72': 400,
            'test_clf_18': 100,
            'test_init_clf_18': 100,
            'test_iter_clf_18': 400,
            'reg_72': 1,
        }
        actor = actors.FcotActor(net=net,
                                 objective=objective,
                                 loss_weight=loss_weight,
                                 device=device)
        # Groups without an explicit 'lr' use the Adam default given below.
        optimizer = optim.Adam(
            [
                {'params': actor.net.classifier_72.filter_initializer.parameters(),
                 'lr': 5e-5},
                {'params': actor.net.classifier_72.filter_optimizer.parameters(),
                 'lr': 5e-4},
                {'params': actor.net.classifier_72.feature_extractor.parameters(),
                 'lr': 5e-5},
                {'params': actor.net.classifier_18.filter_initializer.parameters(),
                 'lr': 5e-5},
                {'params': actor.net.classifier_18.filter_optimizer.parameters(),
                 'lr': 5e-4},
                {'params': actor.net.classifier_18.feature_extractor.parameters(),
                 'lr': 5e-5},
                {'params': actor.net.regressor_72.parameters()},
                {'params': actor.net.pyramid_first_conv.parameters()},
                {'params': actor.net.pyramid_36.parameters()},
                {'params': actor.net.pyramid_72.parameters()},
                {'params': actor.net.feature_extractor.parameters(),
                 'lr': 2e-5},
            ],
            lr=2e-4)
        lr_scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=[35, 46, 60], gamma=0.2)

    elif settings.train_cls_72_and_reg_init:
        # Setting of the first training stage: train backbone, cls_72 and
        # regression (except for regression optimizer) branch.
        print("train cls_72 and reg_init...")
        loss_weight = {
            'test_clf_72': 100,
            'test_init_clf_72': 10,
            'test_iter_clf_72': 400,
            'test_clf_18': 0,
            'test_init_clf_18': 0,
            'test_iter_clf_18': 0,
            'reg_72': 0.3,
        }
        actor = actors.FcotCls72AndRegInitActor(net=net,
                                                objective=objective,
                                                loss_weight=loss_weight,
                                                device=device)
        optimizer = optim.Adam(
            [
                {'params': actor.net.classifier_72.filter_initializer.parameters(),
                 'lr': 5e-5},
                {'params': actor.net.classifier_72.filter_optimizer.parameters(),
                 'lr': 5e-4},
                {'params': actor.net.classifier_72.feature_extractor.parameters(),
                 'lr': 5e-5},
                {'params': actor.net.regressor_72.parameters()},
                {'params': actor.net.pyramid_first_conv.parameters()},
                {'params': actor.net.pyramid_36.parameters()},
                {'params': actor.net.pyramid_72.parameters()},
                {'params': actor.net.feature_extractor.parameters(),
                 'lr': 2e-5},
            ],
            lr=2e-4)
        lr_scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=[35, 45, 69], gamma=0.2)

    elif settings.train_reg_optimizer:
        # Setting of the second training stage: train regression optimizer.
        print("train regression optimizer...")
        loss_weight = {
            'test_reg_72': 1,
            'test_init_reg_72': 0,
            'test_iter_reg_72': 1,
        }
        actor = actors.FcotOnlineRegressionActor(net=net,
                                                 objective=objective,
                                                 loss_weight=loss_weight,
                                                 device=device)
        optimizer = optim.Adam(
            [{'params': actor.net.regressor_72.filter_optimizer.parameters()}],
            lr=5e-4)
        lr_scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=[2], gamma=0.2)

    elif settings.train_cls_18:
        print("train cls_18...")
        # Setting of the third training stage: train cls_18 branch.
        loss_weight = {
            'test_clf_18': 100,
            'test_init_clf_18': 100,
            'test_iter_clf_18': 400,
        }
        actor = actors.FcotCls18Actor(net=net,
                                      objective=objective,
                                      loss_weight=loss_weight,
                                      device=device)
        optimizer = optim.Adam(
            [
                {'params': actor.net.classifier_18.filter_initializer.parameters(),
                 'lr': 5e-5},
                {'params': actor.net.classifier_18.filter_optimizer.parameters(),
                 'lr': 5e-4},
                {'params': actor.net.classifier_18.feature_extractor.parameters(),
                 'lr': 5e-5},
            ],
            lr=2e-4)
        lr_scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=[25], gamma=0.2)

    else:
        # TODO: train jointly
        raise Exception("Please run training in correct way.")

    trainer = LTRFcotTrainer(actor, [loader_train, loader_val],
                             optimizer,
                             settings,
                             device,
                             lr_scheduler,
                             logging_file=settings.logging_file)

    trainer.train(settings.total_epochs, load_latest=True, fail_safe=True)
def run(settings):
    """Configure and launch RGB-D DiMP-50 training with an occlusion head.

    Writes the hyper-parameters below onto ``settings``, builds train and
    validation loaders from ``PrincetonRGBD(split='validation')``, creates
    ``dimpnet_rgbd_locc.dimpnet50`` with a ``DiMPActor_OCC``, an Adam
    optimizer and a ``StepLR`` schedule, and trains for 50 epochs starting
    from ``'./checkpoints/dimp50.pth'``.

    In the optimizer, the classifier, box regressor and RGB feature extractor
    groups get a learning rate of zero; only the ``occ_classifer`` and
    ``feature_extractor_depth`` groups have non-zero learning rates.

    Args:
        settings: Training settings object; every attribute used here is
            assigned in this function.
    """
    # ------------------------------------------------------------------
    # Hyper-parameters (normalization has a fourth channel: mean 0, std 1)
    # ------------------------------------------------------------------
    settings.description = 'Default train settings for DiMP with ResNet50 as backbone.'
    settings.batch_size = 4
    settings.num_workers = 8
    settings.multi_gpu = False
    settings.print_interval = 5
    settings.normalize_mean = [0.485, 0.456, 0.406, 0]
    settings.normalize_std = [0.229, 0.224, 0.225, 1.0]
    settings.search_area_factor = 5.0
    settings.output_sigma_factor = 0.25
    settings.target_filter_sz = 4
    settings.feature_sz = 18
    settings.output_sz = 16 * settings.feature_sz
    settings.center_jitter_factor = dict(train=3, test=4.5)
    settings.scale_jitter_factor = dict(train=0.25, test=0.5)
    settings.hinge_threshold = 0.05

    # ------------------------------------------------------------------
    # Datasets: the same PrincetonRGBD split is used for train and val.
    # ------------------------------------------------------------------
    ptb_train = PrincetonRGBD(split='validation')
    ptb_val = PrincetonRGBD(split='validation')

    # ------------------------------------------------------------------
    # Transforms
    # ------------------------------------------------------------------
    norm_args = dict(mean=settings.normalize_mean,
                     std=settings.normalize_std)
    transform_joint = dltransforms.ToGrayscale(probability=0.05)
    transform_train = torchvision.transforms.Compose([
        dltransforms.ToTensorAndJitter(0.2),
        torchvision.transforms.Normalize(**norm_args),
    ])
    transform_val = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(**norm_args),
    ])

    # ------------------------------------------------------------------
    # Processing of the sampled tracking pairs
    # ------------------------------------------------------------------
    output_sigma = settings.output_sigma_factor / settings.search_area_factor
    proposal_params = dict(min_iou=0.1,
                           boxes_per_frame=8,
                           sigma_factor=[0.01, 0.05, 0.1, 0.2, 0.3])
    label_params = dict(feature_sz=settings.feature_sz,
                        sigma_factor=output_sigma,
                        kernel_sz=settings.target_filter_sz)

    # Everything except the per-split image transform is shared.
    processing_args = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        label_function_params=label_params,
        joint_transform=transform_joint,
    )
    data_processing_train = processing.DiMPProcessing(
        transform=transform_train, **processing_args)
    data_processing_val = processing.DiMPProcessing(
        transform=transform_val, **processing_args)

    # ------------------------------------------------------------------
    # Samplers and loaders
    # ------------------------------------------------------------------
    sampler_args = dict(max_gap=30, num_test_frames=3, num_train_frames=3)
    loader_args = dict(batch_size=settings.batch_size,
                       num_workers=settings.num_workers,
                       drop_last=True,
                       stack_dim=1)

    dataset_train = sampler.DiMPSampler([ptb_train], [1],
                                        samples_per_epoch=26000,
                                        processing=data_processing_train,
                                        **sampler_args)
    loader_train = LTRLoader('train', dataset_train,
                             training=True, shuffle=True, **loader_args)

    dataset_val = sampler.DiMPSampler([ptb_val], [1],
                                      samples_per_epoch=5000,
                                      processing=data_processing_val,
                                      **sampler_args)
    loader_val = LTRLoader('val', dataset_val,
                           training=False, shuffle=False,
                           epoch_interval=5, **loader_args)

    # ------------------------------------------------------------------
    # Network, objective and actor
    # ------------------------------------------------------------------
    network = dimpnet_rgbd_locc.dimpnet50(
        filter_size=settings.target_filter_sz,
        backbone_pretrained=True,
        optim_iter=5,
        clf_feat_norm=True,
        clf_feat_blocks=0,
        final_conv=True,
        out_feature_dim=512,
        optim_init_step=0.9,
        optim_init_reg=0.1,
        init_gauss_sigma=output_sigma * settings.feature_sz,
        num_dist_bins=100,
        bin_displacement=0.1,
        mask_init_factor=3.0,
        target_mask_act='sigmoid',
        score_act='relu')

    # Wrap the network for multi GPU training
    if settings.multi_gpu:
        network = MultiGPU(network, dim=1)

    objective = {
        'iou': nn.MSELoss(),
        'occ': nn.SmoothL1Loss(),
        'test_clf': ltr_losses.LBHinge(threshold=settings.hinge_threshold),
    }
    loss_weight = {
        'iou': 1,
        'occ': 1,
        'test_clf': 100,
        'test_init_clf': 100,
        'test_iter_clf': 400,
    }
    actor = actors.DiMPActor_OCC(net=network,
                                 objective=objective,
                                 loss_weight=loss_weight)

    # ------------------------------------------------------------------
    # Optimizer, schedule and trainer
    # ------------------------------------------------------------------
    # A factor of 0 gives that group a zero learning rate.
    param_groups = [
        {'params': actor.net.classifier.filter_initializer.parameters(),
         'lr': 0*5e-5},
        {'params': actor.net.classifier.filter_optimizer.parameters(),
         'lr': 0*5e-4},
        {'params': actor.net.classifier.feature_extractor.parameters(),
         'lr': 0*5e-5},
        {'params': actor.net.occ_classifer.parameters(),
         'lr': 2e-3},
        {'params': actor.net.bb_regressor.parameters(),
         'lr': 0*2e-4},
        {'params': actor.net.feature_extractor.parameters(),
         'lr': 0*2e-5},
        {'params': actor.net.feature_extractor_depth.parameters(),
         'lr': 0.1*2e-5},
    ]
    adam = optim.Adam(param_groups, lr=0.1*2e-4)
    schedule = optim.lr_scheduler.StepLR(adam, step_size=15, gamma=0.2)

    trainer = LTRTrainer(actor, [loader_train, loader_val], adam,
                         settings, schedule)
    trainer.train(50,
                  load_latest=True,
                  fail_safe=True,
                  path_pretrained='./checkpoints/dimp50.pth')
def run(settings):
    """Configure and launch training of the transformer-assisted tracker.

    Writes the hyper-parameters below onto ``settings``, builds train loaders
    from ``Lasot``, ``Got10k``, ``TrackingNet`` and ``MSCOCOSeq`` and a
    validation loader from ``Got10k``, creates ``dimpnet.dimpnet50`` with
    ``conv1``, ``bn1``, ``layer1`` and ``layer2`` passed as
    ``frozen_backbone_layers``, wraps it in a ``KLDiMPActor``, sets up Adam
    with per-module learning rates and a ``StepLR`` schedule, and trains for
    50 epochs.

    Args:
        settings: Training settings object. The ``settings.env`` directories
            ``lasot_dir``, ``got10k_dir``, ``trackingnet_dir`` and
            ``coco_dir`` are read; every other attribute used here is
            assigned in this function.
    """
    # ------------------------------------------------------------------
    # Hyper-parameters
    # ------------------------------------------------------------------
    settings.description = 'Transformer-assisted tracker. Our baseline approach is SuperDiMP'
    settings.batch_size = 40
    settings.num_workers = 8
    settings.multi_gpu = True
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 6.0
    settings.output_sigma_factor = 0.25
    settings.target_filter_sz = 4
    settings.feature_sz = 22
    settings.output_sz = 16 * settings.feature_sz
    settings.center_jitter_factor = dict(train=3, test=5.5)
    settings.scale_jitter_factor = dict(train=0.25, test=0.5)
    settings.hinge_threshold = 0.05

    # ------------------------------------------------------------------
    # Datasets
    # ------------------------------------------------------------------
    lasot_train = Lasot(settings.env.lasot_dir, split='train')
    got10k_train = Got10k(settings.env.got10k_dir, split='vottrain')
    trackingnet_train = TrackingNet(settings.env.trackingnet_dir,
                                    set_ids=list(range(4)))
    coco_train = MSCOCOSeq(settings.env.coco_dir)
    got10k_val = Got10k(settings.env.got10k_dir, split='votval')

    # ------------------------------------------------------------------
    # Transforms
    # ------------------------------------------------------------------
    norm_args = dict(mean=settings.normalize_mean,
                     std=settings.normalize_std)
    transform_joint = tfm.Transform(
        tfm.ToGrayscale(probability=0.05),
        tfm.RandomHorizontalFlip(probability=0.5))
    transform_train = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.RandomHorizontalFlip(probability=0.5),
        tfm.Normalize(**norm_args))
    transform_val = tfm.Transform(
        tfm.ToTensor(),
        tfm.Normalize(**norm_args))

    # ------------------------------------------------------------------
    # Processing of the sampled tracking pairs
    # ------------------------------------------------------------------
    output_sigma = settings.output_sigma_factor / settings.search_area_factor
    proposal_params = {
        'boxes_per_frame': 128,
        'gt_sigma': (0.05, 0.05),
        'proposal_sigma': [(0.05, 0.05), (0.5, 0.5)],
    }
    label_params = dict(feature_sz=settings.feature_sz,
                        sigma_factor=output_sigma,
                        kernel_sz=settings.target_filter_sz)
    # Same contents as label_params, but kept as its own dict object.
    label_density_params = dict(feature_sz=settings.feature_sz,
                                sigma_factor=output_sigma,
                                kernel_sz=settings.target_filter_sz)

    # Everything except the per-split image transform is shared.
    processing_args = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        crop_type='inside_major',
        max_scale_change=1.5,
        mode='sequence',
        proposal_params=proposal_params,
        label_function_params=label_params,
        label_density_params=label_density_params,
        joint_transform=transform_joint,
    )
    data_processing_train = processing.KLDiMPProcessing(
        transform=transform_train, **processing_args)
    data_processing_val = processing.KLDiMPProcessing(
        transform=transform_val, **processing_args)

    # ------------------------------------------------------------------
    # Samplers and loaders
    # ------------------------------------------------------------------
    sampler_args = dict(max_gap=500, num_test_frames=3, num_train_frames=3)
    loader_args = dict(batch_size=settings.batch_size,
                       num_workers=settings.num_workers,
                       drop_last=True,
                       stack_dim=1)

    dataset_train = sampler.DiMPSampler(
        [lasot_train, got10k_train, trackingnet_train, coco_train],
        [1, 1, 1, 1],
        samples_per_epoch=50000,
        processing=data_processing_train,
        **sampler_args)
    loader_train = LTRLoader('train', dataset_train,
                             training=True, shuffle=True, **loader_args)

    dataset_val = sampler.DiMPSampler(
        [got10k_val], [1],
        samples_per_epoch=10000,
        processing=data_processing_val,
        **sampler_args)
    loader_val = LTRLoader('val', dataset_val,
                           training=False, shuffle=False,
                           epoch_interval=5, **loader_args)

    # ------------------------------------------------------------------
    # Network, objective and actor
    # ------------------------------------------------------------------
    network = dimpnet.dimpnet50(
        filter_size=settings.target_filter_sz,
        backbone_pretrained=True,
        optim_iter=5,
        clf_feat_norm=True,
        clf_feat_blocks=0,
        final_conv=True,
        out_feature_dim=512,
        optim_init_step=0.9,
        optim_init_reg=0.1,
        init_gauss_sigma=output_sigma * settings.feature_sz,
        num_dist_bins=100,
        bin_displacement=0.1,
        mask_init_factor=3.0,
        target_mask_act='sigmoid',
        score_act='relu',
        frozen_backbone_layers=['conv1', 'bn1', 'layer1', 'layer2'])

    # Wrap the network for multi GPU training
    if settings.multi_gpu:
        network = MultiGPU(network, dim=1)

    objective = {
        'bb_ce': klreg_losses.KLRegression(),
        'test_clf': ltr_losses.LBHinge(threshold=settings.hinge_threshold),
    }
    loss_weight = {
        'bb_ce': 0.01,
        'test_clf': 100,
        'test_init_clf': 100,
        'test_iter_clf': 400,
    }
    actor = tracking_actors.KLDiMPActor(net=network,
                                        objective=objective,
                                        loss_weight=loss_weight)

    # ------------------------------------------------------------------
    # Optimizer, schedule and trainer
    # ------------------------------------------------------------------
    # Of the backbone, only layer3 is handed to the optimizer.
    param_groups = [
        {'params': actor.net.classifier.filter_initializer.parameters(),
         'lr': 5e-5},
        {'params': actor.net.classifier.filter_optimizer.parameters(),
         'lr': 5e-4},
        {'params': actor.net.classifier.feature_extractor.parameters(),
         'lr': 5e-5},
        {'params': actor.net.classifier.transformer.parameters(),
         'lr': 1e-3},
        {'params': actor.net.bb_regressor.parameters(),
         'lr': 1e-3},
        {'params': actor.net.feature_extractor.layer3.parameters(),
         'lr': 2e-5},
    ]
    adam = optim.Adam(param_groups, lr=2e-4)
    schedule = optim.lr_scheduler.StepLR(adam, step_size=15, gamma=0.2)

    trainer = LTRTrainer(actor, [loader_train, loader_val], adam,
                         settings, schedule)
    trainer.train(50, load_latest=True, fail_safe=True)