def run(settings):
    """Set up and run distillation training of a small ATOM IoUNet.

    Fills in the data-related fields on ``settings``, builds the training and
    validation loaders, loads an ``atom_resnet18`` teacher from a checkpoint,
    creates an ``atom_resnet18small`` student and trains the student through
    ``LTRDistillationTrainer`` for 50 epochs.

    Args:
        settings: Training settings object. ``settings.env`` must provide
            ``lasot_dir``, ``trackingnet_dir`` and ``coco_dir``. An optional
            ``settings.teacher_path`` attribute overrides the location of the
            teacher checkpoint; when it is absent the original hard-coded
            path is used, so existing callers are unaffected.
    """
    # Most common settings are assigned in the settings struct
    settings.description = 'distilled ATOM IoUNet with default settings according to the paper.'
    settings.batch_size = 32
    settings.num_workers = 8
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 0, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.5}

    # Train datasets
    lasot_train = Lasot(settings.env.lasot_dir, split='train')
    trackingnet_train = TrackingNet(settings.env.trackingnet_dir,
                                    set_ids=list(range(11)))
    coco_train = MSCOCOSeq(settings.env.coco_dir)

    # Validation datasets
    trackingnet_val = TrackingNet(settings.env.trackingnet_dir,
                                  set_ids=list(range(11, 12)))

    # The joint augmentation transform, that is applied to the pairs jointly
    transform_joint = tfm.Transform(tfm.ToGrayscale(probability=0.05))

    # The augmentation transform applied to the training set
    # (individually to each image in the pair)
    transform_train = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))

    # The augmentation transform applied to the validation set
    # (individually to each image in the pair)
    transform_val = tfm.Transform(
        tfm.ToTensor(),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))

    # Data processing to do on the training pairs
    proposal_params = {
        'min_iou': 0.1,
        'boxes_per_frame': 16,
        'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3],
    }
    data_processing_train = processing.ATOMProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        transform=transform_train,
        joint_transform=transform_joint)

    # Data processing to do on the validation pairs
    data_processing_val = processing.ATOMProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        transform=transform_val,
        joint_transform=transform_joint)

    # The sampler for training
    dataset_train = sampler.ATOMSampler(
        [lasot_train, trackingnet_train, coco_train], [1, 1, 1],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=50,
        processing=data_processing_train)

    # The loader for training
    loader_train = LTRLoader('train',
                             dataset_train,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=1)

    # The sampler for validation
    dataset_val = sampler.ATOMSampler(
        [trackingnet_val], [1],
        samples_per_epoch=500 * settings.batch_size,
        max_gap=50,
        processing=data_processing_val)

    # The loader for validation
    loader_val = LTRLoader('val',
                           dataset_val,
                           training=False,
                           batch_size=settings.batch_size,
                           num_workers=settings.num_workers,
                           shuffle=False,
                           drop_last=True,
                           epoch_interval=5,
                           stack_dim=1)

    # Load teacher network.
    # The checkpoint location used to be fixed to one developer's home
    # directory; it can now be overridden via ``settings.teacher_path`` while
    # the previous value stays the default for backward compatibility.
    default_teacher_path = '/home/ddanier/CFKD/pytracking/networks/atom_default.pth'
    teacher_path = getattr(settings, 'teacher_path', default_teacher_path)
    teacher_net = atom_models.atom_resnet18(backbone_pretrained=True)
    teacher_net = loading.load_weights(teacher_net, teacher_path, strict=True)
    print(
        '*******************Teacher net loaded successfully*******************'
    )

    # Create student network and actor
    student_net = atom_models.atom_resnet18small(backbone_pretrained=False)
    objective = distillation.CFKDLoss(
        reg_loss=nn.MSELoss(),
        w_cf=0.01,
        w_fd=100,
        cf_layers=['conv1', 'layer1', 'layer2', 'layer3'])
    actor = actors.AtomCompressionActor(student_net, teacher_net, objective)

    # Optimizer: only the student's sub-modules are handed to Adam.
    optimizer = optim.Adam(
        [
            {'params': actor.student_net.feature_extractor.parameters()},
            {'params': actor.student_net.bb_regressor.parameters()},
        ],
        lr=1e-2)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.1)

    # Create trainer
    trainer = LTRDistillationTrainer(actor, [loader_train, loader_val],
                                     optimizer, settings, lr_scheduler)

    # Run training (set fail_safe=False if you are debugging)
    trainer.train(50, load_latest=False, fail_safe=True)
def run(settings):
    """Train ``atom_resnet18_DeT`` (merge_type='max') on depth-augmented data.

    Populates the data settings on ``settings``, builds one training sampler
    over five 'rgbcolormap' datasets and one validation sampler over CDTB,
    then trains with ``LTRTrainer`` for 80 epochs, resuming from the latest
    checkpoint.

    Args:
        settings: Training settings object; ``settings.env`` must provide
            ``cocodepth_dir``, ``lasotdepth_dir``, ``depthtrack_dir``,
            ``depthtrack_horizontal_dir``, ``depthtrack_vertical_dir`` and
            ``cdtb_dir``.
    """
    # ---- general settings -------------------------------------------------
    # NOTE(review): the description mentions GOT10k, but no GOT10k dataset is
    # constructed anywhere in this function.
    settings.description = 'ATOM IoUNet with default settings, but additionally using GOT10k for training.'
    settings.batch_size = 64
    settings.num_workers = 8
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 0, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.5}

    env = settings.env
    image_dtype = 'rgbcolormap'

    # ---- datasets ---------------------------------------------------------
    coco_ds = MSCOCOSeq_depth(env.cocodepth_dir, dtype=image_dtype)
    lasot_ds = Lasot_depth(root=env.lasotdepth_dir, dtype=image_dtype)
    depthtrack_ds = DepthTrack(root=env.depthtrack_dir, dtype=image_dtype)
    depthtrack_h_ds = DepthTrack(root=env.depthtrack_horizontal_dir,
                                 dtype=image_dtype)
    depthtrack_v_ds = DepthTrack(root=env.depthtrack_vertical_dir,
                                 dtype=image_dtype)
    cdtb_val_ds = CDTB(env.cdtb_dir, split='val', dtype=image_dtype)

    # ---- transforms -------------------------------------------------------
    # Joint transform: applied to the pairs jointly.
    joint_tf = tfm.Transform(tfm.ToGrayscale(probability=0.05))
    # Per-image transforms for training and validation respectively.
    train_tf = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))
    val_tf = tfm.Transform(
        tfm.ToTensor(),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))

    # ---- pair processing --------------------------------------------------
    # Everything except the per-image transform is shared between the
    # training and the validation processing objects.
    shared_processing_args = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params={
            'min_iou': 0.1,
            'boxes_per_frame': 16,
            'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3],
        },
        joint_transform=joint_tf,
    )
    train_processing = processing.ATOMProcessing(transform=train_tf,
                                                 **shared_processing_args)
    val_processing = processing.ATOMProcessing(transform=val_tf,
                                               **shared_processing_args)

    # ---- samplers and loaders ---------------------------------------------
    train_sources = [
        lasot_ds, depthtrack_ds, depthtrack_h_ds, depthtrack_v_ds, coco_ds
    ]
    train_sampler = sampler.ATOMSampler(
        train_sources, [1, 1, 1, 1, 1],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=50,
        processing=train_processing)
    train_loader = LTRLoader('train', train_sampler,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=1)

    val_sampler = sampler.ATOMSampler(
        [cdtb_val_ds], [1],
        samples_per_epoch=500 * settings.batch_size,
        max_gap=50,
        processing=val_processing)
    val_loader = LTRLoader('val', val_sampler,
                           training=False,
                           batch_size=settings.batch_size,
                           num_workers=settings.num_workers,
                           shuffle=False,
                           drop_last=True,
                           epoch_interval=5,
                           stack_dim=1)

    # ---- network, loss and actor ------------------------------------------
    network = atom_models.atom_resnet18_DeT(backbone_pretrained=True,
                                            merge_type='max')
    actor = actors.AtomActor(net=network, objective=nn.MSELoss())

    # ---- optimisation -----------------------------------------------------
    # The box regressor uses the optimizer-wide lr; both feature extractors
    # get their own, smaller learning rate.
    regressor_group = {'params': actor.net.bb_regressor.parameters()}
    rgb_group = {
        'params': actor.net.feature_extractor.parameters(),
        'lr': 2e-5,
    }
    depth_group = {
        'params': actor.net.feature_extractor_depth.parameters(),
        'lr': 2e-5,
    }
    optimizer = optim.Adam([regressor_group, rgb_group, depth_group], lr=1e-3)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.2)

    # ---- training ---------------------------------------------------------
    # Set fail_safe=False when debugging.
    LTRTrainer(actor, [train_loader, val_loader], optimizer, settings,
               scheduler).train(80, load_latest=True, fail_safe=True)
def run(settings):
    """Train ``transt_resnet50`` with the default TransT settings.

    Writes device, data and transformer hyper-parameters onto ``settings``,
    builds a single training loader over LaSOT, GOT-10k, COCO and TrackingNet
    and trains with ``LTRTrainer`` for 1000 epochs, resuming from the latest
    checkpoint. No validation loader is created.

    Args:
        settings: Training settings object; ``settings.env`` must provide
            ``lasot_dir``, ``got10k_dir``, ``trackingnet_dir`` and
            ``coco_dir``.
    """
    # ---- general settings -------------------------------------------------
    settings.device = 'cuda'
    settings.description = 'TransT with default settings.'
    settings.batch_size = 38
    settings.num_workers = 8
    settings.multi_gpu = True
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]

    # ---- crop geometry ----------------------------------------------------
    settings.search_area_factor = 4.0
    settings.template_area_factor = 2.0
    settings.search_feature_sz = 32
    settings.template_feature_sz = 16
    settings.search_sz = settings.search_feature_sz * 8
    settings.temp_sz = settings.template_feature_sz * 8
    settings.center_jitter_factor = {'search': 3, 'template': 0}
    settings.scale_jitter_factor = {'search': 0.25, 'template': 0}

    # ---- transformer ------------------------------------------------------
    settings.position_embedding = 'sine'
    settings.hidden_dim = 256
    settings.dropout = 0.1
    settings.nheads = 8
    settings.dim_feedforward = 2048
    settings.featurefusion_layers = 4

    # ---- datasets ---------------------------------------------------------
    env = settings.env
    lasot_ds = Lasot(env.lasot_dir, split='train')
    got10k_ds = Got10k(env.got10k_dir, split='vottrain')
    trackingnet_ds = TrackingNet(env.trackingnet_dir, set_ids=list(range(4)))
    coco_ds = MSCOCOSeq(env.coco_dir)

    # ---- transforms and processing ----------------------------------------
    # Joint transform is applied to the pairs jointly; the train transform is
    # applied individually to each image in the pair.
    joint_tf = tfm.Transform(tfm.ToGrayscale(probability=0.05))
    train_tf = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))

    train_processing = processing.TransTProcessing(
        search_area_factor=settings.search_area_factor,
        template_area_factor=settings.template_area_factor,
        search_sz=settings.search_sz,
        temp_sz=settings.temp_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        transform=train_tf,
        joint_transform=joint_tf)

    # ---- sampler and loader -----------------------------------------------
    train_sampler = sampler.TransTSampler(
        [lasot_ds, got10k_ds, coco_ds, trackingnet_ds], [1, 1, 1, 1],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=100,
        processing=train_processing)
    train_loader = LTRLoader('train', train_sampler,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=0)

    # ---- network, loss and actor ------------------------------------------
    model = transt_models.transt_resnet50(settings)
    if settings.multi_gpu:
        # Wrap the network for multi GPU training
        model = MultiGPU(model, dim=0)

    objective = transt_models.transt_loss(settings)

    trainable_count = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_count += param.numel()
    print('number of params:', trainable_count)

    actor = actors.TranstActor(net=model, objective=objective)

    # ---- optimisation -----------------------------------------------------
    # Split the trainable parameters in one pass: parameters whose name
    # contains "backbone" get lr=1e-5, all others use the optimizer default.
    other_params = []
    backbone_params = []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if "backbone" in name:
            backbone_params.append(param)
        else:
            other_params.append(param)

    optimizer = torch.optim.AdamW(
        [
            {"params": other_params},
            {"params": backbone_params, "lr": 1e-5},
        ],
        lr=1e-4,
        weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 500)

    # ---- training ---------------------------------------------------------
    # Set fail_safe=False when debugging.
    trainer = LTRTrainer(actor, [train_loader], optimizer, settings, scheduler)
    trainer.train(1000, load_latest=True, fail_safe=True)
def run(settings):
    """Train ``depth_atom_resnet50`` for depth-based RGB-D tracking.

    Populates the data settings on ``settings``, builds training and
    validation loaders that both draw from the same ``CDTB()`` dataset
    instance, and optimises only the box regressor with Adam for 50 epochs,
    resuming from the latest checkpoint.

    Args:
        settings: Training settings object that is filled in here and handed
            to ``LTRTrainer``.
    """
    # ---- general settings -------------------------------------------------
    settings.description = 'construct training settings for depth-based rgbd visual tracking'
    settings.batch_size = 2
    settings.num_workers = 2
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 0, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.5}

    # ---- dataset ----------------------------------------------------------
    # The same dataset object feeds both the training and validation sampler.
    cdtb_ds = CDTB()

    # ---- transforms -------------------------------------------------------
    # Joint transform: applied to the pairs jointly.
    joint_tf = tfm.Transform(tfm.ToGrayscale(probability=0.05))
    # Per-image transforms for training and validation respectively.
    train_tf = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))
    val_tf = tfm.Transform(
        tfm.ToTensor(),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))

    # ---- pair processing --------------------------------------------------
    # Only the per-image transform differs between train and validation.
    shared_processing_args = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params={
            'min_iou': 0.1,
            'boxes_per_frame': 16,
            'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3],
        },
        joint_transform=joint_tf,
    )
    train_processing = processing.DepthProcessing(transform=train_tf,
                                                  **shared_processing_args)
    val_processing = processing.DepthProcessing(transform=val_tf,
                                                **shared_processing_args)

    # ---- samplers and loaders ---------------------------------------------
    train_sampler = sampler.DepthSampler(
        [cdtb_ds],
        p_datasets=None,
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=50,
        processing=train_processing)
    train_loader = LTRLoader('train', train_sampler,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=1)

    val_sampler = sampler.DepthSampler(
        [cdtb_ds], [1],
        samples_per_epoch=500 * settings.batch_size,
        max_gap=50,
        processing=val_processing)
    val_loader = LTRLoader('val', val_sampler,
                           training=False,
                           batch_size=settings.batch_size,
                           num_workers=settings.num_workers,
                           shuffle=False,
                           drop_last=True,
                           epoch_interval=5,
                           stack_dim=1)

    # ---- network, loss and actor ------------------------------------------
    network = depth_models.depth_atom_resnet50(backbone_pretrained=True)
    actor = actors.AtomActor(net=network, objective=nn.MSELoss())

    # ---- optimisation -----------------------------------------------------
    # Only the box-regressor parameters are passed to the optimizer.
    optimizer = optim.Adam(actor.net.bb_regressor.parameters(), lr=1e-3)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.2)

    # ---- training ---------------------------------------------------------
    # Set fail_safe=False when debugging.
    LTRTrainer(actor, [train_loader, val_loader], optimizer, settings,
               scheduler).train(50, load_latest=True, fail_safe=True)
def run(settings):
    """Train the sequence ("circuit") variant of TransT with a frozen backbone.

    Writes device, data, sequence and transformer settings onto ``settings``,
    builds one training loader over LaSOT, GOT-10k and TrackingNet, wraps
    ``transt_resnet50`` in ``OldCircuitTranstActor``, clears ``requires_grad``
    on every parameter whose name contains "backbone", and trains with AdamW
    through ``LTRTrainer`` for 1000 epochs, resuming from the latest
    checkpoint.

    Args:
        settings: Training settings object; ``settings.env`` must provide
            ``lasot_dir``, ``got10k_dir`` and ``trackingnet_dir``.
    """
    # ---- run-size settings ------------------------------------------------
    # Flip ``debug`` to True for a small single-GPU, zero-worker run.
    debug = False
    if debug:
        batch_size, num_workers, multi_gpu = 8, 0, False
    else:
        batch_size, num_workers, multi_gpu = 64, 20, True
    settings.batch_size = batch_size
    settings.num_workers = num_workers
    settings.multi_gpu = multi_gpu

    # ---- general settings -------------------------------------------------
    settings.device = 'cuda'
    settings.description = 'TransT with default settings.'
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]

    # ---- crop geometry ----------------------------------------------------
    settings.search_area_factor = 4.0
    settings.template_area_factor = 2.
    settings.search_feature_sz = 32
    settings.template_feature_sz = 16
    settings.search_sz = settings.search_feature_sz * 8
    settings.temp_sz = settings.template_feature_sz * 8
    settings.center_jitter_factor = {'search': 3.0, 'template': 0}
    settings.scale_jitter_factor = {'search': 0.25, 'template': 0}

    # ---- sequence sampling ------------------------------------------------
    settings.sequence_length = 8
    settings.rand = True
    settings.init_ckpt = "pytracking/networks/transt.pth"

    # ---- transformer ------------------------------------------------------
    settings.position_embedding = 'sine'
    settings.hidden_dim = 256
    settings.dropout = 0.1
    settings.nheads = 8
    settings.dim_feedforward = 2048
    settings.featurefusion_layers = 4

    settings.move_data_to_gpu = True

    # ---- datasets ---------------------------------------------------------
    env = settings.env
    lasot_ds = Lasot(env.lasot_dir, split='train')
    got10k_ds = Got10k(env.got10k_dir, split='vottrain')
    trackingnet_ds = TrackingNet(env.trackingnet_dir, set_ids=list(range(4)))

    # ---- transforms and processing ----------------------------------------
    # Joint transform is applied to the pairs jointly; the train transform is
    # applied individually to each image in the pair.
    joint_tf = tfm.Transform(tfm.ToGrayscale(probability=0.05))
    train_tf = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))

    train_processing = processing.TransTProcessing(
        search_area_factor=settings.search_area_factor,
        template_area_factor=settings.template_area_factor,
        search_sz=settings.search_sz,
        temp_sz=settings.temp_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        joint=False,  # Whether or not to apply same transform to every image
        transform=train_tf,
        rand=settings.rand,
        label_function_params=None,
        joint_transform=joint_tf)

    # ---- sampler and loader -----------------------------------------------
    train_sampler = sampler.TransTSampler(
        [lasot_ds, got10k_ds, trackingnet_ds], [1, 1, 1],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=settings.sequence_length * 4,
        processing=train_processing,
        num_search_frames=settings.sequence_length,
        frame_sample_mode="rnn_interval")

    # Memory is pinned only when the data is not moved to the GPU.
    train_loader = LTRLoader('train', train_sampler,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=0,
                             pin_memory=not settings.move_data_to_gpu)

    # ---- network, loss and actor ------------------------------------------
    model = transt_models.transt_resnet50(settings)
    if settings.multi_gpu:
        # Wrap the network for multi GPU training
        model = MultiGPU(model, dim=0)

    objective = transt_models.transt_loss(settings)

    # Counted before the backbone is frozen further down.
    trainable_count = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_count += param.numel()
    print('number of params:', trainable_count)

    actor = actors.OldCircuitTranstActor(net=model, objective=objective)

    # ---- optimisation -----------------------------------------------------
    pretrained_tags = ("encoder", "input_proj", "decoder", "backbone", "embed")

    # NOTE(review): a parameter is excluded from this group only if its name
    # contains ALL of the tags above, so in practice every parameter ends up
    # in the optimizer. The "Higher lr:" listing below instead shows names
    # containing NONE of the tags. Confirm which selection is intended before
    # changing either one.
    optimizer_params = [
        param for name, param in model.named_parameters()
        if not all(tag in name for tag in pretrained_tags)
    ]
    param_dicts = [{"params": optimizer_params, "lr": 1e-4}]

    print("Higher lr:")
    untagged_names = [
        name for name, _ in model.named_parameters()
        if not any(tag in name for tag in pretrained_tags)
    ]
    print(untagged_names)
    print("*" * 60)

    # Freeze the backbone; everything else stays trainable.
    for name, param in model.named_parameters():
        if "backbone" in name:
            param.requires_grad = False
            print("Removing grad on {}".format(name))
        else:
            print("TRAINING {}".format(name))

    optimizer = torch.optim.AdamW(param_dicts, lr=1e-4, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 500)

    # ---- training ---------------------------------------------------------
    # Set fail_safe=False when debugging.
    trainer = LTRTrainer(actor, [train_loader], optimizer, settings, scheduler)
    trainer.train(1000, load_latest=True, fail_safe=True)
def run(settings):
    """Train ``dimpnet50`` on CDTB 'colormap' data, validating on DepthTrack.

    Populates the data settings on ``settings``, builds DiMP samplers and
    loaders, creates the network, losses and per-module learning-rate groups,
    and trains with ``LTRTrainer`` for 150 epochs, resuming from the latest
    checkpoint.

    Args:
        settings: Training settings object; ``settings.env`` must provide
            ``cdtb_dir``, ``cdtb_st_dir``, ``cocodepth_dir``,
            ``got10kdepth_dir``, ``lasotdepth_dir``, ``lasot_dir`` and
            ``depthtrack_dir``.
    """
    # ---- general settings -------------------------------------------------
    settings.description = 'Default train settings for DiMP with ResNet50 as backbone.'
    settings.batch_size = 10
    settings.num_workers = 8
    settings.multi_gpu = False
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.output_sigma_factor = 1 / 4
    settings.target_filter_sz = 4
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 3, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0.25, 'test': 0.5}
    settings.hinge_threshold = 0.05

    # ---- datasets ---------------------------------------------------------
    env = settings.env
    dtype = 'colormap'

    cdtb_ds = CDTB(root=env.cdtb_dir, split=None, dtype=dtype)
    # The next four datasets are constructed but not passed to any sampler
    # below; only ``cdtb_ds`` is used for training.
    cdtb_st_ds = CDTB_ST(root=env.cdtb_st_dir, split=None, dtype=dtype)
    coco_ds = MSCOCOSeq_depth(env.cocodepth_dir, dtype=dtype)
    got10k_ds = Got10k_depth(env.got10kdepth_dir, dtype=dtype)
    lasot_ds = Lasot_depth(root=env.lasotdepth_dir,
                           rgb_root=env.lasot_dir,
                           dtype=dtype)

    # Validation dataset
    depthtrack_val_ds = DepthTrack(root=env.depthtrack_dir,
                                   split='val',
                                   dtype=dtype)

    # ---- transforms -------------------------------------------------------
    joint_tf = tfm.Transform(tfm.ToGrayscale(probability=0.05))
    train_tf = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))
    val_tf = tfm.Transform(
        tfm.ToTensor(),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))

    # ---- pair processing --------------------------------------------------
    output_sigma = settings.output_sigma_factor / settings.search_area_factor
    shared_processing_args = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params={
            'min_iou': 0.1,
            'boxes_per_frame': 8,
            'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3],
        },
        label_function_params={
            'feature_sz': settings.feature_sz,
            'sigma_factor': output_sigma,
            'kernel_sz': settings.target_filter_sz,
        },
        joint_transform=joint_tf,
    )
    train_processing = processing.DiMPProcessing(transform=train_tf,
                                                 **shared_processing_args)
    val_processing = processing.DiMPProcessing(transform=val_tf,
                                               **shared_processing_args)

    # ---- samplers and loaders ---------------------------------------------
    train_sampler = sampler.DiMPSampler([cdtb_ds], [1],
                                        samples_per_epoch=26000,
                                        max_gap=30,
                                        num_test_frames=3,
                                        num_train_frames=3,
                                        processing=train_processing)
    train_loader = LTRLoader('train', train_sampler,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=1)

    val_sampler = sampler.DiMPSampler([depthtrack_val_ds], [1],
                                      samples_per_epoch=5000,
                                      max_gap=30,
                                      num_test_frames=3,
                                      num_train_frames=3,
                                      processing=val_processing)
    val_loader = LTRLoader('val', val_sampler,
                           training=False,
                           batch_size=settings.batch_size,
                           num_workers=settings.num_workers,
                           shuffle=False,
                           drop_last=True,
                           epoch_interval=5,
                           stack_dim=1)

    # ---- network ----------------------------------------------------------
    net = dimpnet.dimpnet50(filter_size=settings.target_filter_sz,
                            backbone_pretrained=True,
                            optim_iter=5,
                            clf_feat_norm=True,
                            clf_feat_blocks=0,
                            final_conv=True,
                            out_feature_dim=512,
                            optim_init_step=0.9,
                            optim_init_reg=0.1,
                            init_gauss_sigma=output_sigma * settings.feature_sz,
                            num_dist_bins=100,
                            bin_displacement=0.1,
                            mask_init_factor=3.0,
                            target_mask_act='sigmoid',
                            score_act='relu')
    if settings.multi_gpu:
        # Wrap the network for multi GPU training
        net = MultiGPU(net, dim=1)

    # ---- losses and actor -------------------------------------------------
    objective = {
        'iou': nn.MSELoss(),
        'test_clf': ltr_losses.LBHinge(threshold=settings.hinge_threshold),
    }
    loss_weight = {
        'iou': 1,
        'test_clf': 100,
        'test_init_clf': 100,
        'test_iter_clf': 400,
    }
    actor = actors.DiMPActor(net=net,
                             objective=objective,
                             loss_weight=loss_weight)

    # ---- optimisation -----------------------------------------------------
    # One parameter group per sub-module; the box regressor has no explicit
    # lr and therefore uses the optimizer-wide value.
    classifier = actor.net.classifier
    param_groups = [
        {'params': classifier.filter_initializer.parameters(), 'lr': 5e-5},
        {'params': classifier.filter_optimizer.parameters(), 'lr': 5e-4},
        {'params': classifier.feature_extractor.parameters(), 'lr': 5e-5},
        {'params': actor.net.bb_regressor.parameters()},
        {'params': actor.net.feature_extractor.parameters(), 'lr': 2e-5},
    ]
    optimizer = optim.Adam(param_groups, lr=2e-4)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.2)

    # ---- training ---------------------------------------------------------
    trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer,
                         settings, scheduler)
    trainer.train(150, load_latest=True, fail_safe=True)
def run(settings):
    """Train ``siamfc_alexnet`` on ImageNet VID inside a dygraph guard.

    Populates the data settings on ``settings``, builds training and
    validation samplers/loaders over the same ``ImagenetVID()`` instance, and
    trains with a Momentum optimizer and exponentially decayed learning rate
    for 50 epochs. Only the training loader is handed to ``LTRTrainer``.

    Args:
        settings: Training settings object that is filled in here and handed
            to ``LTRTrainer``.
    """
    # ---- general settings -------------------------------------------------
    settings.description = 'SiamFC with Alexnet backbone and trained with vid'
    settings.print_interval = 100  # How often to print loss and other info
    settings.batch_size = 8
    settings.num_workers = 8  # Number of workers for image loading
    settings.normalize_mean = [0.] * 3
    settings.normalize_std = [1 / 255.] * 3
    # roughly the same as SiamFC
    settings.search_area_factor = {'train': 1.0, 'test': 2.0078740157480315}
    settings.output_sz = {'train': 127, 'test': 255}
    settings.scale_type = 'context'
    settings.border_type = 'meanpad'

    # Settings for the image sample and proposal generation
    settings.center_jitter_factor = {'train': 0, 'test': 0}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.}

    # ---- datasets ---------------------------------------------------------
    # Validation reuses the training dataset object.
    vid_ds = ImagenetVID()
    val_ds = vid_ds

    # ---- transforms -------------------------------------------------------
    # Joint transform: applied to the pairs jointly.
    joint_tf = dltransforms.ToGrayscale(probability=0.25)

    # Per-image transforms; the instance pipeline additionally runs DataAug.
    exemplar_tf = dltransforms.Compose([
        dltransforms.ToArray(),
        dltransforms.Normalize(mean=settings.normalize_mean,
                               std=settings.normalize_std),
    ])
    instance_tf = dltransforms.Compose([
        DataAug(),
        dltransforms.ToArray(),
        dltransforms.Normalize(mean=settings.normalize_mean,
                               std=settings.normalize_std),
    ])

    # ---- pair processing --------------------------------------------------
    # Arguments common to the training and validation processing objects.
    shared_processing_args = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        scale_type=settings.scale_type,
        border_type=settings.border_type,
        mode='sequence',
        joint_transform=joint_tf,
    )
    train_processing = processing.SiamFCProcessing(
        train_transform=exemplar_tf,
        test_transform=instance_tf,
        **shared_processing_args)
    val_processing = processing.SiamFCProcessing(
        transform=exemplar_tf,
        **shared_processing_args)

    # ---- samplers and loaders ---------------------------------------------
    train_sampler = sampler.ATOMSampler(
        [vid_ds], [1],
        samples_per_epoch=6650 * settings.batch_size,
        max_gap=100,
        processing=train_processing)
    train_loader = loader.LTRLoader('train', train_sampler,
                                    training=True,
                                    batch_size=settings.batch_size,
                                    num_workers=settings.num_workers,
                                    stack_dim=1)

    val_sampler = sampler.ATOMSampler(
        [val_ds], [1],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=100,
        processing=val_processing)
    # Built but not passed to the trainer below.
    val_loader = loader.LTRLoader('val', val_sampler,
                                  training=False,
                                  batch_size=settings.batch_size,
                                  num_workers=settings.num_workers,
                                  epoch_interval=5,
                                  stack_dim=1)

    # ---- network, optimizer, scheduler and trainer ------------------------
    with dygraph.guard():
        net = siamfc_alexnet()

        # The actor wraps the network; no separate objective is supplied.
        actor = actors.SiamFCActor(net=net,
                                   objective=None,
                                   batch_size=settings.batch_size,
                                   shape=(17, 17),
                                   radius=16,
                                   stride=8)
        # Set to training mode
        actor.train()

        # Learning-rate schedule, weight regularizer and optimizer.
        lr_schedule = fluid.layers.exponential_decay(learning_rate=0.01,
                                                     decay_steps=6650,
                                                     decay_rate=0.8685,
                                                     staircase=True)
        l2_decay = fluid.regularizer.L2DecayRegularizer(
            regularization_coeff=0.0005)
        optimizer = fluid.optimizer.Momentum(momentum=0.9,
                                             regularization=l2_decay,
                                             parameter_list=net.parameters(),
                                             learning_rate=lr_schedule)

        trainer = LTRTrainer(actor, [train_loader], optimizer, settings,
                             lr_schedule)
        trainer.train(50, load_latest=False, fail_safe=False)
def run(settings):
    """Train the ATOM bounding-box regressor with the probabilistic ML loss.

    Writes the run's hyper-parameters onto ``settings``, builds the LaSOT /
    GOT-10k / TrackingNet / COCO training pipeline and the GOT-10k validation
    pipeline, then trains the ``bb_regressor`` of an ATOM ResNet-18 network
    for 50 epochs with ``LTRTrainer``.

    Args:
        settings: Settings struct. Dataset roots are read from
            ``settings.env``; the attributes assigned below are written onto
            it, and it is then handed to ``LTRTrainer``.
    """
    # Most common settings are assigned in the settings struct.
    # The trailing space after 'bounding-box' matters: the two literals are
    # concatenated, and without it the text reads 'bounding-boxregression'.
    settings.description = (
        'ATOM using the probabilistic maximum likelihood trained regression model for bounding-box '
        'regression presented in [https://arxiv.org/abs/1909.12297].')
    settings.batch_size = 64
    settings.num_workers = 8
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 0, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.5}

    # Train datasets
    lasot_train = Lasot(settings.env.lasot_dir, split='train')
    got10k_train = Got10k(settings.env.got10k_dir, split='vottrain')
    trackingnet_train = TrackingNet(settings.env.trackingnet_dir,
                                    set_ids=list(range(4)))
    coco_train = MSCOCOSeq(settings.env.coco_dir)

    # Validation datasets
    got10k_val = Got10k(settings.env.got10k_dir, split='votval')

    # The joint augmentation transform, applied to the pairs jointly
    transform_joint = tfm.Transform(tfm.ToGrayscale(probability=0.05))

    # The augmentation transform applied to the training set
    # (individually to each image in the pair)
    transform_train = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))

    # The transform applied to the validation set
    # (individually to each image in the pair)
    transform_val = tfm.Transform(
        tfm.ToTensor(),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))

    # Proposal generation parameters, shared by training and validation
    proposal_params = {
        'boxes_per_frame': 128,
        'gt_sigma': (0, 0),
        'proposal_sigma': [(0.05, 0.05), (0.5, 0.5)],
        'add_mean_box': True
    }

    # Data processing to do on the training pairs
    data_processing_train = processing.KLBBregProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        transform=transform_train,
        joint_transform=transform_joint)

    # Data processing to do on the validation pairs
    data_processing_val = processing.KLBBregProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        transform=transform_val,
        joint_transform=transform_joint)

    # The sampler for training: the four datasets with equal weights
    dataset_train = sampler.ATOMSampler(
        [lasot_train, got10k_train, trackingnet_train, coco_train],
        [1, 1, 1, 1],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=200,
        processing=data_processing_train)

    # The loader for training
    loader_train = LTRLoader('train',
                             dataset_train,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=1)

    # The sampler for validation
    dataset_val = sampler.ATOMSampler(
        [got10k_val], [1],
        samples_per_epoch=500 * settings.batch_size,
        max_gap=200,
        processing=data_processing_val)

    # The loader for validation
    loader_val = LTRLoader('val',
                           dataset_val,
                           training=False,
                           batch_size=settings.batch_size,
                           num_workers=settings.num_workers,
                           shuffle=False,
                           drop_last=True,
                           epoch_interval=5,
                           stack_dim=1)

    # Create network and actor
    net = atom_models.atom_resnet18(backbone_pretrained=True)
    objective = klreg_losses.MLRegression()
    actor = bbreg_actors.AtomBBKLActor(net=net, objective=objective)

    # Optimizer: only the bounding-box regressor's parameters are optimized
    optimizer = optim.Adam(actor.net.bb_regressor.parameters(), lr=1e-3)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer,
                                             step_size=15,
                                             gamma=0.2)

    # Create trainer
    trainer = LTRTrainer(actor, [loader_train, loader_val], optimizer,
                         settings, lr_scheduler)

    # Run training (set fail_safe=False if you are debugging)
    trainer.train(50, load_latest=True, fail_safe=True)