def train_model(sym_net, model_prefix, dataset, input_conf,
                clip_length=16, train_frame_interval=2, resume_epoch=-1,
                batch_size=4, save_frequency=1,
                lr_base=0.01, lr_factor=0.1, lr_steps=[400000, 800000],
                end_epoch=1000, distributed=False, fine_tune=False, **kwargs):
    assert torch.cuda.is_available(), "Currently, we only support CUDA version"

    # data iterator
    iter_seed = torch.initial_seed() + 100 + max(0, resume_epoch) * 100
    train_iter = iter_fac.creat(name=dataset,
                                batch_size=batch_size,
                                clip_length=clip_length,
                                train_interval=train_frame_interval,
                                mean=input_conf['mean'],
                                std=input_conf['std'],
                                seed=iter_seed)

    # wrapper (dynamic model)
    net = model(net=sym_net,
                criterion=nn.CrossEntropyLoss().cuda(),
                model_prefix=model_prefix,
                step_callback_freq=50,
                save_checkpoint_freq=save_frequency,
                opt_batch_size=batch_size)
    net.net.cuda()

    # config optimization: fine-tuning lowers the LR of the pretrained (base) layers
    param_base_layers = []
    param_new_layers = []
    name_base_layers = []
    for name, param in net.net.named_parameters():
        if fine_tune:
            if ('classifier' in name) or ('fc' in name):
                param_new_layers.append(param)
            else:
                param_base_layers.append(param)
                name_base_layers.append(name)
        else:
            param_new_layers.append(param)

    if name_base_layers:
        out = "['" + "', '".join(name_base_layers) + "']"
        logging.info("Optimizer:: >> reducing the learning rate of {} params: {}".format(
            len(name_base_layers),
            out if len(out) < 300 else out[0:150] + " ... " + out[-150:]))

    net.net = torch.nn.DataParallel(net.net).cuda()

    optimizer = torch.optim.SGD([{'params': param_base_layers, 'lr_mult': 0.2},
                                 {'params': param_new_layers, 'lr_mult': 1.0}],
                                lr=lr_base,
                                momentum=0.9,
                                weight_decay=0.0001,
                                nesterov=True)

    # load params from pretrained 3d network
    if resume_epoch > 0:
        logging.info("Initializer:: resuming model from previous training")

    # resume training: model and optimizer
    if resume_epoch < 0:
        epoch_start = 0
        step_counter = 0
    else:
        net.load_checkpoint(epoch=resume_epoch, optimizer=optimizer)
        epoch_start = resume_epoch
        step_counter = epoch_start * len(train_iter)

    # set learning rate scheduler
    num_worker = dist.get_world_size() if torch.distributed.is_initialized() else 1
    lr_scheduler = MultiFactorScheduler(base_lr=lr_base,
                                        steps=[int(x / (batch_size * num_worker)) for x in lr_steps],
                                        factor=lr_factor,
                                        step_counter=step_counter)

    # define evaluation metric
    metrics = metric.MetricList(metric.Loss(name="loss-ce"),
                                metric.Accuracy(name="top1", topk=1),
                                metric.Accuracy(name="top5", topk=5))

    net.fit(train_iter=train_iter,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
            metrics=metrics,
            epoch_start=epoch_start,
            epoch_end=end_epoch)
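# Note: plain torch.optim.SGD ignores the extra 'lr_mult' key in the parameter groups
# above; it only has an effect if the training loop (presumably MultiFactorScheduler
# together with model.fit) folds it into each group's learning rate on every update.
# A minimal sketch of that assumed mechanism, using a hypothetical `apply_base_lr` helper:
import torch

def apply_base_lr(optimizer, base_lr):
    # Hypothetical helper: scale the scheduler's base LR by each group's 'lr_mult'.
    for group in optimizer.param_groups:
        group['lr'] = base_lr * group.get('lr_mult', 1.0)

# usage sketch
params = [torch.nn.Parameter(torch.zeros(1))]
opt = torch.optim.SGD([{'params': params, 'lr_mult': 0.2}], lr=0.01)
apply_base_lr(opt, base_lr=0.01)   # this group's lr becomes 0.002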
def train_model(net_name, sym_net, model_prefix, dataset, input_conf,
                modality='rgb', split=1,
                clip_length=16, train_frame_interval=2, val_frame_interval=2,
                resume_epoch=-1, batch_size=4, save_frequency=1,
                lr_base=0.01, lr_base2=0.01, lr_d=None,
                lr_factor=0.1, lr_steps=[400000, 800000],
                end_epoch=1000, distributed=False,
                pretrained_3d=None, fine_tune=False,
                iter_size=1, optim='sgd', accumulate=True, ds_factor=16,
                epoch_thre=1, score_dir=None,
                mv_minmaxnorm=False, mv_loadimg=False, detach=False,
                adv=0, new_classifier=False, **kwargs):
    assert torch.cuda.is_available(), "Currently, we only support CUDA version"

    torch.multiprocessing.set_sharing_strategy('file_system')
    import resource
    rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
    resource.setrlimit(resource.RLIMIT_NOFILE, (2048, rlimit[1]))

    # data iterator
    iter_seed = torch.initial_seed() \
                + (torch.distributed.get_rank() * 10 if distributed else 100) \
                + max(0, resume_epoch) * 100
    train_iter, eval_iter = iterator_factory.creat(name=dataset,
                                                   batch_size=batch_size,
                                                   clip_length=clip_length,
                                                   train_interval=train_frame_interval,
                                                   val_interval=val_frame_interval,
                                                   mean=input_conf['mean'],
                                                   std=input_conf['std'],
                                                   seed=iter_seed,
                                                   modality=modality,
                                                   split=split,
                                                   net_name=net_name,
                                                   accumulate=accumulate,
                                                   ds_factor=ds_factor,
                                                   mv_minmaxnorm=mv_minmaxnorm,
                                                   mv_loadimg=mv_loadimg)

    # define an instance of class model
    net = model(net=sym_net,
                criterion=torch.nn.CrossEntropyLoss().cuda(),
                model_prefix=model_prefix,
                step_callback_freq=50,
                save_checkpoint_freq=save_frequency,
                opt_batch_size=batch_size,  # optional
                criterion2=torch.nn.MSELoss().cuda() if modality == 'flow+mp4' else None,
                criterion3=torch.nn.CrossEntropyLoss().cuda() if adv > 0. else None,
                adv=adv)
    net.net.cuda()
    print(torch.cuda.current_device(), torch.cuda.device_count())

    # config optimization
    param_base_layers = []
    param_new_layers = []
    name_base_layers = []
    params_gf = []
    params_d = []
    for name, param in net.net.named_parameters():
        if modality == 'flow+mp4':
            if name.startswith('gen_flow_model'):
                params_gf.append(param)
            elif name.startswith('discriminator'):
                params_d.append(param)
            else:
                if name.startswith('conv3d_0c_1x1') or name.startswith('classifier'):
                    param_new_layers.append(param)
                else:
                    param_base_layers.append(param)
                    name_base_layers.append(name)
        else:
            if fine_tune:
                if name.startswith('classifier') or name.startswith('conv3d_0c_1x1'):
                    param_new_layers.append(param)
                else:
                    param_base_layers.append(param)
                    name_base_layers.append(name)
            else:
                param_new_layers.append(param)

    if modality == 'flow+mp4':
        lr_mul = 0.2 if fine_tune else 0.5
    else:
        lr_mul = 0.2

    if name_base_layers:
        out = "['" + "', '".join(name_base_layers) + "']"
        logging.info("Optimizer:: >> reducing the learning rate of {} params: {} by factor {}".format(
            len(name_base_layers),
            out if len(out) < 300 else out[0:150] + " ... " + out[-150:],
            lr_mul))

    if net_name == 'I3D':
        weight_decay = 0.0001
    else:
        raise ValueError('UNKNOWN net_name', net_name)
    logging.info("Train_Model:: weight_decay: `{}'".format(weight_decay))

    if distributed:
        net.net = torch.nn.parallel.DistributedDataParallel(net.net).cuda()
    else:
        net.net = torch.nn.DataParallel(net.net).cuda()

    if optim == 'adam':
        optimizer = torch.optim.Adam([{'params': param_base_layers, 'lr_mult': lr_mul},
                                      {'params': param_new_layers, 'lr_mult': 1.0}],
                                     lr=lr_base, weight_decay=weight_decay)
        optimizer_2 = torch.optim.Adam([{'params': param_base_layers, 'lr_mult': lr_mul},
                                        {'params': param_new_layers, 'lr_mult': 1.0}],
                                       lr=lr_base2, weight_decay=weight_decay)
    else:
        optimizer = torch.optim.SGD([{'params': param_base_layers, 'lr_mult': lr_mul},
                                     {'params': param_new_layers, 'lr_mult': 1.0}],
                                    lr=lr_base, momentum=0.9,
                                    weight_decay=weight_decay, nesterov=True)
        optimizer_2 = torch.optim.SGD([{'params': param_base_layers, 'lr_mult': lr_mul},
                                       {'params': param_new_layers, 'lr_mult': 1.0}],
                                      lr=lr_base2, momentum=0.9,
                                      weight_decay=weight_decay, nesterov=True)

    if adv > 0.:
        optimizer_3 = torch.optim.Adam(params_d, lr=lr_base,
                                       weight_decay=weight_decay, eps=0.001)
    else:
        optimizer_3 = None

    if modality == 'flow+mp4':
        if optim == 'adam':
            optimizer_mse = torch.optim.Adam(params_gf, lr=lr_base,
                                             weight_decay=weight_decay, eps=1e-08)
            optimizer_mse_2 = torch.optim.Adam(params_gf, lr=lr_base2,
                                               weight_decay=weight_decay, eps=0.001)
        else:
            optimizer_mse = torch.optim.SGD(params_gf, lr=lr_base, momentum=0.9,
                                            weight_decay=weight_decay, nesterov=True)
            optimizer_mse_2 = torch.optim.SGD(params_gf, lr=lr_base2, momentum=0.9,
                                              weight_decay=weight_decay, nesterov=True)
    else:
        optimizer_mse = None
        optimizer_mse_2 = None

    # load params from pretrained 3d network
    if pretrained_3d and not pretrained_3d == 'False':
        if resume_epoch < 0:
            assert os.path.exists(pretrained_3d), "cannot locate: `{}'".format(pretrained_3d)
            logging.info("Initializer:: loading model states from: `{}'".format(pretrained_3d))
            if net_name == 'I3D':
                checkpoint = torch.load(pretrained_3d)
                # checkpoint was saved without the DataParallel wrapper, so prefix the keys
                state_dict = {'module.' + name: value for name, value in checkpoint.items()}
                del checkpoint
                net.load_state(state_dict, strict=False)
                if new_classifier:
                    checkpoint = torch.load('./network/pretrained/model_flow.pth')
                    state_dict = {'module.' + name: value for name, value in checkpoint.items()}
                    del checkpoint
                    net.load_state(state_dict, strict=False)
            else:
                checkpoint = torch.load(pretrained_3d)
                net.load_state(checkpoint['state_dict'], strict=False)
        else:
            logging.info("Initializer:: skip loading model states from: `{}', "
                         "since it's going to be overwritten by the resumed model".format(pretrained_3d))

    # resume training: model and optimizer
    if resume_epoch < 0:
        epoch_start = 0
        step_counter = 0
    else:
        net.load_checkpoint(epoch=resume_epoch, optimizer=optimizer, optimizer_mse=optimizer_mse)
        epoch_start = resume_epoch
        step_counter = epoch_start * len(train_iter)

    # set learning rate scheduler
    num_worker = dist.get_world_size() if torch.distributed.is_initialized() else 1
    lr_scheduler = MultiFactorScheduler(base_lr=lr_base,
                                        steps=[int(x / (batch_size * num_worker)) for x in lr_steps],
                                        factor=lr_factor,
                                        step_counter=step_counter)
    if modality == 'flow+mp4':
        lr_scheduler2 = MultiFactorScheduler(base_lr=lr_base2,
                                             steps=[int(x / (batch_size * num_worker)) for x in lr_steps],
                                             factor=lr_factor,
                                             step_counter=step_counter)
        if lr_d is None:
            # no dedicated discriminator LR given: fall back to the base LR
            lr_d = lr_base
        else:
            print("_____________", lr_d)
        lr_scheduler3 = MultiFactorScheduler(base_lr=lr_d,
                                             steps=[int(x / (batch_size * num_worker)) for x in lr_steps],
                                             factor=lr_factor,
                                             step_counter=step_counter)
    else:
        lr_scheduler2 = None
        lr_scheduler3 = None

    # define evaluation metric
    metrics_D = None
    if modality == 'flow+mp4':
        metrics = metric.MetricList(metric.Loss(name="loss-ce"),
                                    metric.Loss(name="loss-mse"),
                                    metric.Accuracy(name="top1", topk=1),
                                    metric.Accuracy(name="top5", topk=5))
        if adv > 0:
            metrics_D = metric.MetricList(metric.Loss(name="classi_D"),
                                          metric.Loss(name="adv_D"))
    else:
        metrics = metric.MetricList(metric.Loss(name="loss-ce"),
                                    metric.Accuracy(name="top1", topk=1),
                                    metric.Accuracy(name="top5", topk=5))

    # enable cudnn tune
    cudnn.benchmark = True

    net.fit(train_iter=train_iter,
            eval_iter=eval_iter,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
            metrics=metrics,
            epoch_start=epoch_start,
            epoch_end=end_epoch,
            iter_size=iter_size,
            optimizer_mse=optimizer_mse,
            optimizer_2=optimizer_2,
            optimizer_3=optimizer_3,
            optimizer_mse_2=optimizer_mse_2,
            lr_scheduler2=lr_scheduler2,
            lr_scheduler3=lr_scheduler3,
            metrics_D=metrics_D,
            epoch_thre=epoch_thre,
            score_dir=score_dir,
            detach=detach)
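# The variant above remaps checkpoint keys with a 'module.' prefix because the
# checkpoint was saved from a bare model while the live model is wrapped in
# torch.nn.DataParallel. A standalone sketch of that remapping (file name hypothetical):
import torch

def add_module_prefix(state_dict):
    # DataParallel registers every parameter under 'module.<original name>'.
    return {'module.' + k: v for k, v in state_dict.items()}

# usage sketch (hypothetical checkpoint path):
# checkpoint = torch.load('pretrained_i3d.pth', map_location='cpu')
# wrapped_model.load_state_dict(add_module_prefix(checkpoint), strict=False)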
def train_model(sym_net, model_prefix, dataset, input_conf,
                clip_length=16, train_frame_interval=2, val_frame_interval=2,
                resume_epoch=-1, batch_size=4, save_frequency=1,
                lr_base=0.01, lr_factor=0.1, lr_steps=[400000, 800000],
                end_epoch=1000, distributed=False,
                pretrained_3d=None, fine_tune=False,
                load_from_frames=False, use_flow=False, triplet_loss=False, **kwargs):
    assert torch.cuda.is_available(), "Currently, we only support CUDA version"

    # data iterator
    iter_seed = torch.initial_seed() \
                + (torch.distributed.get_rank() * 10 if distributed else 100) \
                + max(0, resume_epoch) * 100
    train_iter, eval_iter = iterator_factory.creat(name=dataset,
                                                   batch_size=batch_size,
                                                   clip_length=clip_length,
                                                   train_interval=train_frame_interval,
                                                   val_interval=val_frame_interval,
                                                   mean=input_conf['mean'],
                                                   std=input_conf['std'],
                                                   seed=iter_seed,
                                                   load_from_frames=load_from_frames,
                                                   use_flow=use_flow)

    # wrapper (dynamic model)
    if use_flow:
        class LogNLLLoss(torch.nn.Module):
            """NLL loss on log-probabilities (for heads that already output probabilities)."""
            def __init__(self):
                super(LogNLLLoss, self).__init__()
                self.loss = torch.nn.NLLLoss()

            def forward(self, output, target):
                return self.loss(torch.log(output), target)

        # criterion = LogNLLLoss().cuda()
        criterion = torch.nn.CrossEntropyLoss().cuda()
    elif triplet_loss:
        logging.info("Using triplet loss")
        criterion = torch.nn.MarginRankingLoss().cuda()
    else:
        criterion = torch.nn.CrossEntropyLoss().cuda()

    net = model(net=sym_net,
                criterion=criterion,
                triplet_loss=triplet_loss,
                model_prefix=model_prefix,
                step_callback_freq=50,
                save_checkpoint_freq=save_frequency,
                opt_batch_size=batch_size)  # optional
    net.net.cuda()

    # config optimization
    param_base_layers = []
    param_new_layers = []
    name_base_layers = []
    for name, param in net.net.named_parameters():
        if fine_tune:
            if 'classifier' in name or 'fc' in name:
                param_new_layers.append(param)
            else:
                param_base_layers.append(param)
                name_base_layers.append(name)
        else:
            param_new_layers.append(param)

    if name_base_layers:
        out = "['" + "', '".join(name_base_layers) + "']"
        logging.info("Optimizer:: >> reducing the learning rate of {} params: {}".format(
            len(name_base_layers),
            out if len(out) < 300 else out[0:150] + " ... " + out[-150:]))

    if distributed:
        net.net = torch.nn.parallel.DistributedDataParallel(net.net).cuda()
    else:
        net.net = torch.nn.DataParallel(net.net).cuda()

    optimizer = torch.optim.SGD([{'params': param_base_layers, 'lr_mult': 0.2},
                                 {'params': param_new_layers, 'lr_mult': 1.0}],
                                lr=lr_base,
                                momentum=0.9,
                                weight_decay=0.0001,
                                nesterov=True)

    # load params from pretrained 3d network
    if pretrained_3d:
        if resume_epoch < 0:
            if os.path.exists(pretrained_3d):
                logging.info("Initializer:: loading model states from: `{}'".format(pretrained_3d))
                checkpoint = torch.load(pretrained_3d)
                net.load_state(checkpoint['state_dict'], mode='ada')
            else:
                logging.warning("cannot locate: '{}'".format(pretrained_3d))
        else:
            logging.info("Initializer:: skip loading model states from: `{}', "
                         "since it's going to be overwritten by the resumed model".format(pretrained_3d))

    # resume training: model and optimizer
    if resume_epoch < 0:
        epoch_start = 0
        step_counter = 0
    else:
        net.load_checkpoint(epoch=resume_epoch, optimizer=optimizer)
        epoch_start = resume_epoch
        step_counter = epoch_start * len(train_iter)

    # set learning rate scheduler
    num_worker = 1
    lr_scheduler = MultiFactorScheduler(base_lr=lr_base,
                                        steps=[int(x / (batch_size * num_worker)) for x in lr_steps],
                                        factor=lr_factor,
                                        step_counter=step_counter)

    # define evaluation metric
    if triplet_loss:
        metrics = metric.MetricList(metric.Loss(name="loss-triplet"),
                                    metric.TripletAccuracy(name="acc"))
    else:
        metrics = metric.MetricList(metric.Loss(name="loss-ce"),
                                    metric.Accuracy(name="top1", topk=1),
                                    metric.Accuracy(name="top5", topk=5))

    # enable cudnn tune
    cudnn.benchmark = True

    net.fit(train_iter=train_iter,
            eval_iter=eval_iter,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
            metrics=metrics,
            epoch_start=epoch_start,
            epoch_end=end_epoch)
def train_model(sym_net, name, model_prefix, input_conf,
                clip_length=32, clip_size=224,
                train_frame_interval=2, val_frame_interval=2,
                resume_epoch=-1, batch_size=16, save_frequency=1,
                lr_base=0.01, lr_factor=0.1, lr_steps=[50, 100, 150],
                enable_long_cycles=True, enable_short_cycles=True,
                end_epoch=300, pretrained_3d=None, fine_tune=False,
                dataset_location='Kinetics', net_name='r3d_50', gpus=4, **kwargs):
    assert torch.cuda.is_available(), "Only support CUDA devices."

    # Make results directory for .csv files if it does not exist
    results_path = str('./results/' + str(name) + '/' + str(net_name))
    if not os.path.exists(results_path):
        os.makedirs(results_path)

    # data iterator - randomisation based on date and time values
    iter_seed = torch.initial_seed() + 100 + max(0, resume_epoch) * 100
    now = datetime.datetime.now()
    iter_seed += now.year + now.month + now.day + now.hour + now.minute + now.second

    # Get parent location
    # - `data` folder should include all the dataset examples.
    # - `labels` folder should include all labels in .csv format.
    # We use a global label formatting - see the link in the `README.md` to download the files.
    data_location = dataset_location.split('/data/')[0]

    clip_length = int(clip_length)
    clip_size = int(clip_size)

    train_loaders = {}

    # Create custom loaders for train and validation
    train_data, eval_loader, train_length = iterator_factory.create(
        name=name,
        batch_size=batch_size,
        return_len=True,
        clip_length=clip_length,
        clip_size=clip_size,
        val_clip_length=clip_length,
        val_clip_size=clip_size,
        train_interval=train_frame_interval,
        val_interval=val_frame_interval,
        mean=input_conf['mean'],
        std=input_conf['std'],
        seed=iter_seed,
        data_root=data_location)

    # Create model
    net = model(net=sym_net,
                criterion=torch.nn.CrossEntropyLoss().cuda(),
                model_prefix=model_prefix,
                step_callback_freq=1,
                save_checkpoint_freq=save_frequency,
                opt_batch_size=batch_size)  # optional
    net.net.cuda()

    # Parameter LR configuration for optimiser
    # Base layers are based on the layers as loaded to the model
    param_base_layers = []
    base_layers_mult = 1.0
    # New layers are based on fine-tuning
    param_new_layers = []
    new_layers_mult = 1.0
    name_base_layers = []
    param_transpose_layers = []
    transpose_layers_mult = 1.0
    param_rec_layers = []
    rec_layers_mult = 1.0

    # Iterate over all parameters
    for name, param in net.net.named_parameters():
        if fine_tune:
            if 'transpose' in name.lower():
                param_transpose_layers.append(param)
                transpose_layers_mult = .2
            elif name.lower().startswith('classifier'):
                new_layers_mult = .1
                param_new_layers.append(param)
            elif 'lstm' in name.lower() or 'gru' in name.lower():
                param_rec_layers.append(param)
                rec_layers_mult = .5
            else:
                param_base_layers.append(param)
                base_layers_mult = .6
                name_base_layers.append(name)
        else:
            if 'transpose' in name.lower():
                param_transpose_layers.append(param)
                transpose_layers_mult = .8
            elif 'lstm' in name.lower() or 'gru' in name.lower():
                param_rec_layers.append(param)
                rec_layers_mult = 1.
            else:
                param_new_layers.append(param)

    # User feedback
    if name_base_layers:
        out = "['" + "', '".join(name_base_layers) + "']"
        logging.info("Optimiser:: >> reducing the learning rate of {} params: {}".format(
            len(name_base_layers),
            out if len(out) < 300 else out[0:150] + " ... " + out[-150:]))

    optimiser = torch.optim.SGD(
        [{'params': param_base_layers, 'lr_mult': base_layers_mult},
         {'params': param_new_layers, 'lr_mult': new_layers_mult},
         {'params': param_rec_layers, 'lr_mult': rec_layers_mult},
         {'params': param_transpose_layers, 'lr_mult': transpose_layers_mult}],
        lr=lr_base,
        momentum=0.9,
        weight_decay=0.0001,
        nesterov=True)

    # Use Apex for mixed precision - Note: Please comment out any apex code in `train_model.py`
    # and `model.py` in case you wish to switch back to standard float32.
    # The "O0" opt_level still has some bugs when also using DataParallel.
    net.net, optimiser = amp.initialize(net.net, optimiser, opt_level="O1")

    # Create DataParallel wrapper
    net.net = torch.nn.DataParallel(net.net, device_ids=list(range(int(gpus))))

    # load params from pretrained 3d network
    if pretrained_3d:
        assert os.path.exists(pretrained_3d), "cannot locate: `{}'".format(pretrained_3d)
        logging.info("Initialiser:: loading model states from: `{}'".format(pretrained_3d))
        checkpoint = torch.load(pretrained_3d)
        net.load_state(checkpoint['state_dict'], strict=False)

    num_steps = train_length // batch_size
    initial_num = num_steps

    # Long Cycle steps
    if enable_long_cycles:
        count = 0
        index = 0
        iter_sizes = [8, 4, 2, 1]
        # Expect to find the number of batches that fit exactly in the number of iterations,
        # so the sum of the following batch sizes should be less than or equal to the batches left.
        while sum(iter_sizes[index:]) <= num_steps:
            # Case 1: 8 x B
            if iter_sizes[index] == 8:
                count += 1
                index = 1
                num_steps -= 8
            # Case 2: 4 x B
            elif iter_sizes[index] == 4:
                count += 1
                index = 2
                num_steps -= 4
            # Case 3: 2 x B
            elif iter_sizes[index] == 2:
                count += 1
                index = 3
                num_steps -= 2
            # Base case
            elif iter_sizes[index] == 1:
                count += 1
                index = 0
                num_steps -= 1

        print("New number of batches per epoch is {:d} being equivalent to {:1.3f} of original number of batches with Long cycles".format(
            count, float(count) / float(initial_num)))
        num_steps = count

    # Short Cycle steps
    if enable_short_cycles:
        # Iterate for *every* batch
        i = 0
        while i <= num_steps:
            m = i % 3
            # Case 1: Base case
            if m == 0:
                num_steps -= 1
            # Case 2: b = 2 x B
            elif m == 1:
                num_steps -= 2
            # Case 3: b = 4 x B
            else:
                num_steps -= 4
            i += 1

        # Update new number of batches
        print("New number of batches per epoch is {:d} being equivalent to {:1.3f} of original number of batches with Short cycles".format(
            i, float(i) / float(initial_num)))
        num_steps = i

    # Split the batch number to four for every change in the long cycles
    long_steps = None
    if enable_long_cycles:
        step = num_steps // 4
        long_steps = list(range(num_steps))[0::step]
        num_steps = long_steps[-1]

        # Create full list of long steps (for all batches)
        for epoch in range(1, end_epoch):
            end = long_steps[-1]
            long_steps = long_steps + [x + end for x in long_steps[-4:]]

        # Fool-proofing
        if long_steps[0] == 0:
            long_steps[0] = 1

    # resume training: model and optimiser - (account for various batch sizes)
    if resume_epoch < 0:
        epoch_start = 0
        step_counter = 0
    else:
        # Try to load previous state dict in case `pretrained_3d` is None
        if not pretrained_3d:
            try:
                net.load_checkpoint(epoch=resume_epoch, optimizer=optimiser)
            except Exception:
                logging.warning('Initialiser:: No previous checkpoint found in the directory! '
                                'You can specify the path explicitly with the `pretrained_3d` argument.')
        epoch_start = resume_epoch
        step_counter = epoch_start * num_steps

    # Step dictionary creation
    iteration_steps = {'long_0': [], 'long_1': [], 'long_2': [], 'long_3': [],
                       'short_0': [], 'short_1': [], 'short_2': []}

    # Populate dictionary
    for batch_i in range(0, num_steps):
        # Long cycle cases
        if batch_i >= 0 and batch_i < num_steps // 4:
            iteration_steps['long_0'].append(batch_i)
        elif batch_i >= num_steps // 4 and batch_i < num_steps // 2:
            iteration_steps['long_1'].append(batch_i)
        elif batch_i >= num_steps // 2 and batch_i < (3 * num_steps) // 4:
            iteration_steps['long_2'].append(batch_i)
        else:
            iteration_steps['long_3'].append(batch_i)

        # Short cases
        if batch_i % 3 == 0:
            iteration_steps['short_0'].append(batch_i)
        elif batch_i % 3 == 1:
            iteration_steps['short_1'].append(batch_i)
        else:
            iteration_steps['short_2'].append(batch_i)

    # set learning rate scheduler
    lr_scheduler = MultiFactorScheduler(base_lr=lr_base,
                                        steps=[x * num_steps for x in lr_steps],
                                        iterations_per_epoch=num_steps,
                                        iteration_steps=iteration_steps,
                                        factor=lr_factor,
                                        step_counter=step_counter)

    # define evaluation metric
    metrics = metric.MetricList(metric.Loss(name="loss-ce"),
                                metric.Accuracy(name="top1", topk=1),
                                metric.Accuracy(name="top5", topk=5),
                                metric.BatchSize(name="batch_size"),
                                metric.LearningRate(name="lr"))

    # enable cudnn tune
    cudnn.benchmark = True

    # Main training happens here
    net.fit(train_iter=train_data,
            eval_iter=eval_loader,
            batch_shape=(int(batch_size), int(clip_length), int(clip_size), int(clip_size)),
            workers=8,
            no_cycles=(not enable_long_cycles and not enable_short_cycles),
            optimiser=optimiser,
            long_short_steps_dir=iteration_steps,
            lr_scheduler=lr_scheduler,
            metrics=metrics,
            iter_per_epoch=num_steps,
            epoch_start=epoch_start,
            epoch_end=end_epoch,
            directory=results_path)
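# Since the optimiser above is wrapped with amp.initialize(..., opt_level="O1"), the
# backward pass inside model.fit presumably has to go through apex's loss scaling.
# A minimal self-contained sketch of that standard apex pattern (toy model and data,
# assumes NVIDIA apex and a CUDA device are available):
import torch
from apex import amp

toy_model = torch.nn.Linear(8, 2).cuda()
toy_optimiser = torch.optim.SGD(toy_model.parameters(), lr=0.01)
toy_model, toy_optimiser = amp.initialize(toy_model, toy_optimiser, opt_level="O1")

x = torch.randn(4, 8).cuda()
y = torch.randint(0, 2, (4,)).cuda()
loss = torch.nn.functional.cross_entropy(toy_model(x), y)

toy_optimiser.zero_grad()
with amp.scale_loss(loss, toy_optimiser) as scaled_loss:
    scaled_loss.backward()   # backward on the scaled loss so fp16 gradients do not underflow
toy_optimiser.step()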
def train_model(Hash_center, sym_net, model_prefix, dataset, input_conf, hash_bit,
                clip_length=16, train_frame_interval=2, val_frame_interval=2,
                resume_epoch=-1, batch_size=4, save_frequency=1,
                lr_base=0.01, lr_factor=0.1, lr_steps=[400000, 800000],
                end_epoch=1000, distributed=False,
                pretrained_3d=None, fine_tune=False, **kwargs):
    assert torch.cuda.is_available(), "Currently, we only support CUDA version"

    # data iterator
    iter_seed = torch.initial_seed() \
                + (torch.distributed.get_rank() * 10 if distributed else 100) \
                + max(0, resume_epoch) * 100
    train_iter, eval_iter = iterator_factory.creat(name=dataset,
                                                   batch_size=batch_size,
                                                   clip_length=clip_length,
                                                   train_interval=train_frame_interval,
                                                   val_interval=val_frame_interval,
                                                   mean=input_conf['mean'],
                                                   std=input_conf['std'],
                                                   seed=iter_seed)
    print(len(train_iter))
    print(len(eval_iter))

    # wrapper (dynamic model)
    net = model(net=sym_net,
                criterion=torch.nn.BCELoss().cuda(),
                model_prefix=model_prefix,
                step_callback_freq=50,
                save_checkpoint_freq=save_frequency,
                opt_batch_size=batch_size,  # optional
                dataset=dataset,            # dataset name
                hash_bit=hash_bit)
    net.net.cuda()

    # config optimization
    param_base_layers = []
    param_new_layers = []
    name_base_layers = []
    for name, param in net.net.named_parameters():
        if fine_tune:
            if name.startswith('hash'):
                param_new_layers.append(param)
            else:
                param_base_layers.append(param)
                name_base_layers.append(name)
        else:
            param_new_layers.append(param)

    if name_base_layers:
        out = "['" + "', '".join(name_base_layers) + "']"
        logging.info("Optimizer:: >> reducing the learning rate of {} params: {}".format(
            len(name_base_layers),
            out if len(out) < 300 else out[0:150] + " ... " + out[-150:]))

    if distributed:
        net.net = torch.nn.parallel.DistributedDataParallel(net.net).cuda()
    else:
        net.net = torch.nn.DataParallel(net.net).cuda()

    optimizer = torch.optim.SGD([{'params': param_base_layers, 'lr_mult': 0.2},
                                 {'params': param_new_layers, 'lr_mult': 1.0}],
                                lr=lr_base,
                                momentum=0.9,
                                weight_decay=0.0001,
                                nesterov=True)

    # load params from pretrained 3d network
    if pretrained_3d:
        if resume_epoch < 0:
            assert os.path.exists(pretrained_3d), "cannot locate: `{}'".format(pretrained_3d)
            logging.info("Initializer:: loading model states from: `{}'".format(pretrained_3d))
            checkpoint = torch.load(pretrained_3d)
            net.load_state(checkpoint['state_dict'], strict=False)
        else:
            logging.info("Initializer:: skip loading model states from: `{}', "
                         "since it's going to be overwritten by the resumed model".format(pretrained_3d))

    # resume training: model and optimizer
    if resume_epoch < 0:
        epoch_start = 0
        step_counter = 0
    else:
        net.load_checkpoint(epoch=resume_epoch, optimizer=optimizer)
        epoch_start = resume_epoch
        step_counter = epoch_start * len(train_iter)

    # set learning rate scheduler
    num_worker = dist.get_world_size() if torch.distributed.is_initialized() else 1
    lr_scheduler = MultiFactorScheduler(base_lr=lr_base,
                                        steps=[int(x / (batch_size * num_worker)) for x in lr_steps],
                                        factor=lr_factor,
                                        step_counter=step_counter)

    # define evaluation metric
    metrics = metric.MetricList(metric.Loss(name="loss-ce"))

    # enable cudnn tune
    cudnn.benchmark = True

    net.fit(train_iter=train_iter,
            eval_iter=eval_iter,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
            metrics=metrics,
            epoch_start=epoch_start,
            epoch_end=end_epoch,
            Hash_center=Hash_center)
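# The metric.Accuracy(topk=k) objects used throughout these variants come from the
# repo's own metric module; a generic top-k accuracy in plain PyTorch looks roughly
# like the sketch below (an illustration, not the module's actual implementation):
import torch

def topk_accuracy(logits, targets, k=1):
    # Fraction of samples whose true label is among the k highest-scoring classes.
    _, pred = logits.topk(k, dim=1)                    # (N, k) predicted class indices
    correct = pred.eq(targets.unsqueeze(1)).any(dim=1)
    return correct.float().mean().item()

logits = torch.tensor([[0.1, 0.7, 0.2], [0.8, 0.1, 0.1]])
targets = torch.tensor([2, 0])
print(topk_accuracy(logits, targets, k=1))   # 0.5
print(topk_accuracy(logits, targets, k=2))   # 1.0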
def train_model(sym_net, model_prefix, dataset, fold,
                clip_length=8, train_frame_interval=2, val_frame_interval=2,
                resume_epoch=-1, batch_size=4, save_frequency=1,
                lr_base=0.01, lr_factor=0.1, lr_steps=[400000, 800000],
                end_epoch=1000, distributed=False, fine_tune=False,
                epoch_div_factor=4, precise_bn=False, **kwargs):
    assert torch.cuda.is_available(), "Currently, we only support CUDA version"

    # data iterator
    iter_seed = torch.initial_seed() \
                + (torch.distributed.get_rank() * 10 if distributed else 100) \
                + max(0, resume_epoch) * 100
    train_iter, eval_iter = iterator_factory_brats.create(
        name=dataset,
        batch_size=batch_size,
        fold=fold,
        # clip_length=clip_length,
        # train_interval=train_frame_interval,
        # val_interval=val_frame_interval,
        # mean=input_conf['mean'],
        # std=input_conf['std'],
        seed=iter_seed)

    # model (dynamic)
    net = model(net=sym_net,
                criterion=torch.nn.CrossEntropyLoss().cuda(),
                model_prefix=model_prefix,
                step_callback_freq=50,
                save_checkpoint_freq=save_frequency,
                opt_batch_size=batch_size,  # optional
                single_checkpoint=precise_bn)  # TODO: use shared filesystem to rsync running mean/var
    net.net.cuda()

    # config optimization: group params so that biases / BN weights can skip weight decay
    param_base_layers = [[[], []], [[], []]]
    param_new_layers = [[[], []], [[], []]]
    name_freeze_layers, name_base_layers = [], []
    for name, param in net.net.named_parameters():
        idx_wd = 0 if name.endswith('.bias') else 1
        idx_bn = 0 if name.endswith(('.bias', 'bn.weight')) else 1
        if fine_tune:
            if not name.startswith('classifier'):
                param_base_layers[idx_bn][idx_wd].append(param)
                name_base_layers.append(name)
            else:
                param_new_layers[idx_bn][idx_wd].append(param)
        else:
            if "conv_m2" in name:
                param_base_layers[idx_bn][idx_wd].append(param)
                name_base_layers.append(name)
            else:
                param_new_layers[idx_bn][idx_wd].append(param)

    if name_freeze_layers:
        out = "['" + "', '".join(name_freeze_layers) + "']"
        logging.info("Optimizer:: >> freezing {} params: {}".format(
            len(name_freeze_layers),
            out if len(out) < 300 else out[0:150] + " ... " + out[-150:]))

    if name_base_layers:
        out = "['" + "', '".join(name_base_layers) + "']"
        logging.info("Optimizer:: >> reducing the learning rate of {} params: {}".format(
            len(name_base_layers),
            out if len(out) < 300 else out[0:150] + " ... " + out[-150:]))

    if distributed:
        net.net = torch.nn.parallel.DistributedDataParallel(net.net).cuda()
    else:
        net.net = torch.nn.DataParallel(net.net).cuda()

    wd = 0.0001
    optimizer = custom_optim.SGD(
        [
            {'params': param_base_layers[0][0], 'lr_mult': 0.5, 'weight_decay': 0.},
            {'params': param_base_layers[0][1], 'lr_mult': 0.5, 'weight_decay': wd},
            {'params': param_base_layers[1][0], 'lr_mult': 0.5, 'weight_decay': 0.,
             'name': 'precise.bn'},  # *.bias
            {'params': param_base_layers[1][1], 'lr_mult': 0.5, 'weight_decay': wd,
             'name': 'precise.bn'},  # bn.weight
            {'params': param_new_layers[0][0], 'lr_mult': 1.0, 'weight_decay': 0.},
            {'params': param_new_layers[0][1], 'lr_mult': 1.0, 'weight_decay': wd},
            {'params': param_new_layers[1][0], 'lr_mult': 1.0, 'weight_decay': 0.,
             'name': 'precise.bn'},  # *.bias
            {'params': param_new_layers[1][1], 'lr_mult': 1.0, 'weight_decay': wd,
             'name': 'precise.bn'},  # bn.weight
        ],
        lr=lr_base,
        momentum=0.9,
        nesterov=True)

    # resume: model and optimizer
    if resume_epoch < 0:
        epoch_start = 0
        step_counter = 0
    else:
        net.load_checkpoint(epoch=resume_epoch, optimizer=optimizer)
        epoch_start = resume_epoch
        step_counter = epoch_start * int(len(train_iter) / epoch_div_factor)

    num_worker = torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
    lr_scheduler = MultiFactorScheduler(base_lr=lr_base,
                                        steps=[int(x / (batch_size * num_worker)) for x in lr_steps],
                                        factor=lr_factor,
                                        step_counter=step_counter)

    metrics = metric.MetricList(metric.Loss(name="loss-ce"),
                                metric.Accuracy(name="top1", topk=1),
                                metric.Accuracy(name="top2", topk=2))

    cudnn.benchmark = True

    net.fit(train_iter=train_iter,
            eval_iter=eval_iter,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
            metrics=metrics,
            epoch_start=epoch_start,
            epoch_end=end_epoch,
            epoch_div_factor=epoch_div_factor,
            precise_bn=precise_bn)
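# The bucket layout above exists so that biases and BatchNorm parameters can be
# optimised without weight decay. A generic sketch of the same idea with a plain
# torch.optim.SGD (hypothetical split_decay_groups helper, not the repo's custom_optim):
import torch

def split_decay_groups(model, wd=1e-4):
    # Give biases and BatchNorm parameters zero weight decay; decay everything else.
    bn_param_ids = set()
    for module in model.modules():
        if isinstance(module, (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d)):
            bn_param_ids.update(id(p) for p in module.parameters(recurse=False))
    decay, no_decay = [], []
    for name, param in model.named_parameters():
        if name.endswith('.bias') or id(param) in bn_param_ids:
            no_decay.append(param)
        else:
            decay.append(param)
    return [{'params': decay, 'weight_decay': wd},
            {'params': no_decay, 'weight_decay': 0.0}]

# usage sketch
toy = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.BatchNorm2d(8))
opt = torch.optim.SGD(split_decay_groups(toy), lr=0.01, momentum=0.9)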