def __init__(self, max_lr: Union[float, List[float]], total_steps: Optional[int] = None,
             epochs: Optional[int] = None, steps_per_epoch: Optional[int] = None,
             pct_start: float = 0.3, anneal_strategy: str = 'cos', cycle_momentum: bool = True,
             base_momentum: Union[float, List[float]] = 0.85,
             max_momentum: Union[float, List[float]] = 0.95,
             div_factor: float = 25., final_div_factor: float = 1e4, last_epoch: int = -1):
    from distutils.version import LooseVersion
    if LooseVersion(torch.__version__) >= LooseVersion("1.3.0"):
        super().__init__(
            lambda opt: _scheduler.OneCycleLR(
                opt, max_lr, total_steps=total_steps, epochs=epochs,
                steps_per_epoch=steps_per_epoch, pct_start=pct_start,
                anneal_strategy=anneal_strategy, cycle_momentum=cycle_momentum,
                base_momentum=base_momentum, max_momentum=max_momentum,
                div_factor=div_factor, final_div_factor=final_div_factor,
                last_epoch=last_epoch),
            step_on_iteration=True)
    else:
        raise ImportError("Update torch to >=1.3.0 to use 'OneCycleLR'")
def __init__(
    self,
    optimizer: Optimizer,
    max_lr: Union[float, List[float]],
    total_steps: Optional[int] = None,
    epochs: Optional[int] = None,
    steps_per_epoch: Optional[int] = None,
    pct_start: float = 0.3,
    anneal_strategy: str = "cos",
    cycle_momentum: bool = True,
    base_momentum: float = 0.85,
    max_momentum: float = 0.95,
    div_factor: float = 25.0,
    final_div_factor: float = 10000.0,
    last_epoch: int = -1,
    step_duration: int = 1,
):
    # Pass the scheduler arguments by keyword: newer torch versions insert
    # extra parameters (e.g. three_phase) before last_epoch, so positional
    # passing would silently bind the wrong values.
    scheduler = lr_scheduler.OneCycleLR(
        optimizer,
        max_lr,
        total_steps=total_steps,
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        pct_start=pct_start,
        anneal_strategy=anneal_strategy,
        cycle_momentum=cycle_momentum,
        base_momentum=base_momentum,
        max_momentum=max_momentum,
        div_factor=div_factor,
        final_div_factor=final_div_factor,
        last_epoch=last_epoch,
    )
    super().__init__(scheduler, step_duration)
def _train_model(self) -> nn.Module:
    """Train the model."""
    phenotype = self._phenotype

    loader = data_loader.DescribedDataLoader(
        self._tickers, self._end, phenotype["data"], data_params.TrainParams
    )

    model = self._make_untrained_model(loader)
    optimizer = optim.AdamW(model.parameters(), **phenotype["optimizer"])

    steps_per_epoch = len(loader)
    scheduler_params = dict(phenotype["scheduler"])
    epochs = scheduler_params.pop("epochs")
    total_steps = 1 + int(steps_per_epoch * epochs)
    scheduler_params["total_steps"] = total_steps
    scheduler = lr_scheduler.OneCycleLR(optimizer, **scheduler_params)

    print(f"Epochs - {epochs:.2f}")
    print(f"Train size - {len(loader.dataset)}")

    len_deque = int(total_steps ** 0.5)
    llh_sum = 0.0
    llh_deque = collections.deque([0], maxlen=len_deque)
    weight_sum = 0.0
    weight_deque = collections.deque([0], maxlen=len_deque)
    loss_fn = normal_llh

    loader = itertools.repeat(loader)
    loader = itertools.chain.from_iterable(loader)
    loader = itertools.islice(loader, total_steps)

    model.train()
    bar = tqdm.tqdm(loader, file=sys.stdout, total=total_steps, desc="~~> Train")
    for batch in bar:
        optimizer.zero_grad()

        output = model(batch)
        loss, weight = loss_fn(output, batch)

        llh_sum += -loss.item() - llh_deque[0]
        llh_deque.append(-loss.item())

        weight_sum += weight - weight_deque[0]
        weight_deque.append(weight)

        loss.backward()
        optimizer.step()
        scheduler.step()

        llh = llh_sum / weight_sum
        bar.set_postfix_str(f"{llh:.5f}")

    # This comparison also filters out NaN values
    if not (llh > LOW_LLH):
        raise GradientsError(llh)

    self._validate(model)

    return model
def get_optimization(cfg, model):
    if cfg.optimization.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), **cfg.optimization.adam_param)
        cfg.optimization.onecycle_scheduler.max_lr = cfg.optimization.adam_param.lr
    elif cfg.optimization.optimizer == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), **cfg.optimization.adam_param)
        cfg.optimization.onecycle_scheduler.max_lr = cfg.optimization.adam_param.lr
    elif cfg.optimization.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), **cfg.optimization.sgd_param)
        cfg.optimization.onecycle_scheduler.max_lr = cfg.optimization.sgd_param.lr

    if cfg.optimization.scheduler == 'exp':
        scheduler = lr_scheduler.ExponentialLR(optimizer, **cfg.optimization.exp_scheduler)
    elif cfg.optimization.scheduler == 'step':
        scheduler = lr_scheduler.MultiStepLR(optimizer, **cfg.optimization.step_scheduler)
    elif cfg.optimization.scheduler == 'onecycle':
        scheduler = lr_scheduler.OneCycleLR(optimizer, **cfg.optimization.onecycle_scheduler)
    elif cfg.optimization.scheduler == 'cosine':
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer, **cfg.optimization.cosine_scheduler)

    return optimizer, scheduler
def __init__(
    self,
    max_lr: Union[float, List[float]],
    total_steps: Optional[int] = None,
    epochs: Optional[int] = None,
    steps_per_epoch: Optional[int] = None,
    pct_start: float = 0.3,
    anneal_strategy: str = "cos",
    cycle_momentum: bool = True,
    base_momentum: Union[float, List[float]] = 0.85,
    max_momentum: Union[float, List[float]] = 0.95,
    div_factor: float = 25.0,
    final_div_factor: float = 1e4,
    last_epoch: int = -1,
):
    """Constructor for OneCycleLR."""
    super().__init__(
        lambda opt: _schedulers.OneCycleLR(
            opt,
            max_lr,
            total_steps=total_steps,
            epochs=epochs,
            steps_per_epoch=steps_per_epoch,
            pct_start=pct_start,
            anneal_strategy=anneal_strategy,
            cycle_momentum=cycle_momentum,
            base_momentum=base_momentum,
            max_momentum=max_momentum,
            div_factor=div_factor,
            final_div_factor=final_div_factor,
            last_epoch=last_epoch,
        ),
        step_on_batch=True,
    )
def init_scheduler(args, optimizer):
    lr_init, lr_final = args.lr_init, args.lr_final
    lr_decay = min(args.lr_decay, args.num_epoch)

    minibatch_per_epoch = ceil(args.num_train / args.batch_size)
    if args.lr_minibatch:
        lr_decay = lr_decay * minibatch_per_epoch

    lr_ratio = lr_final / lr_init

    lr_bounds = lambda lr, lr_min: min(1, max(lr_min, lr))

    if args.sgd_restart > 0:
        restart_epochs = [(2**k - 1) for k in range(1, ceil(log2(args.num_epoch)) + 1)]
        lr_hold = restart_epochs[0]
        if args.lr_minibatch:
            lr_hold *= minibatch_per_epoch
        logger.info('SGD Restart epochs: {}'.format(restart_epochs))
    else:
        restart_epochs = []
        lr_hold = args.num_epoch
        if args.lr_minibatch:
            lr_hold *= minibatch_per_epoch

    if args.lr_decay_type.startswith('cos'):
        scheduler = sched.CosineAnnealingLR(optimizer, lr_hold, eta_min=lr_final)
    elif args.lr_decay_type.startswith('exp'):
        lr_lambda = lambda epoch: lr_bounds(exp(epoch / lr_decay * log(lr_ratio)), lr_ratio)
        scheduler = sched.LambdaLR(optimizer, lr_lambda)
    elif args.lr_decay_type.startswith('one'):
        # Assign the scheduler itself (not lr_lambda) and read the epoch count from args.
        scheduler = sched.OneCycleLR(optimizer, 10 * lr_init, epochs=args.num_epoch,
                                     steps_per_epoch=100)
    else:
        raise ValueError('Incorrect choice for lr_decay_type!')

    return scheduler, restart_epochs
def build(optimizer, cfg, **kwargs):
    return lr_scheduler.OneCycleLR(
        optimizer,
        cfg.SOLVER.LR_SCHEDULER.MAX_LR,
        total_steps=cfg.SOLVER.LR_SCHEDULER.MAX_ITER,
        pct_start=cfg.SOLVER.LR_SCHEDULER.PCT_START,
        base_momentum=cfg.SOLVER.LR_SCHEDULER.BASE_MOM,
        max_momentum=cfg.SOLVER.LR_SCHEDULER.MAX_MOM,
        div_factor=cfg.SOLVER.LR_SCHEDULER.DIV_FACTOR)
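# Illustration (not from the original project): a minimal, hypothetical config object
# with the same attribute paths the builder above reads. Any yacs/detectron2-style cfg
# node would work the same way; the values below are placeholders, not project defaults.
from types import SimpleNamespace

from torch import nn, optim

cfg = SimpleNamespace(SOLVER=SimpleNamespace(LR_SCHEDULER=SimpleNamespace(
    MAX_LR=0.01, MAX_ITER=1000, PCT_START=0.3,
    BASE_MOM=0.85, MAX_MOM=0.95, DIV_FACTOR=25.0)))

model = nn.Linear(8, 2)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
scheduler = build(optimizer, cfg)  # then call scheduler.step() once per training iteration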
def _init_optimizers_schedulers(self, max_lr, epochs, div_factor=1.5):
    """
    Create the dictionary of optimizers for the different branches and the common part.
    The optimizers use differential learning rates determined by div_factor.
    The OneCycleLR schedulers are also defined at the end.
    """
    len_dynamic_layers = len(self.half_dynamic)
    len_half_second_layers = len(self.half_second)

    # Optimizer for the common part of the model
    self.opt_static = optim.AdamW([
        {'params': self.half_second[:len_half_second_layers // 2].parameters(),
         'lr': max_lr / div_factor**2},
        {'params': self.half_second[len_half_second_layers // 2:].parameters(),
         'lr': max_lr / div_factor},
        {'params': self.fc1.parameters(), 'lr': max_lr / div_factor},
        {'params': self.bn1.parameters(), 'lr': max_lr},
        {'params': self.fc2.parameters(), 'lr': max_lr},
    ], lr=max_lr)

    list_lrs = [
        max_lr / div_factor**2,
        max_lr / div_factor,
        max_lr / div_factor,
        max_lr,
        max_lr,
    ]

    # Scheduler for the common part of the model
    self.sched_static = lr_scheduler.OneCycleLR(
        self.opt_static, max_lr=list_lrs, epochs=epochs,
        steps_per_epoch=len(self.data["train"]), div_factor=9)

    # Creating dictionary of optimizers and schedulers for the different branches
    for domain in self.list_domains:
        self.dict_opt_dynamic[domain] = optim.AdamW([
            {'params': self.dynamic_extractors[domain][:len_dynamic_layers // 2].parameters(),
             'lr': max_lr / div_factor**4},
            {'params': self.dynamic_extractors[domain][len_dynamic_layers // 2:].parameters(),
             'lr': max_lr / div_factor**3},
        ], lr=div_factor**3)

        list_lrs = [
            max_lr / div_factor**4,
            max_lr / div_factor**3,
        ]

        self.dict_sched_dynamic[domain] = lr_scheduler.OneCycleLR(
            self.dict_opt_dynamic[domain], max_lr=list_lrs, epochs=epochs,
            steps_per_epoch=len(self.data["train"]), div_factor=9)
def create_lr_scheduler(conf_lrs: Config, epochs: int, optimizer: Optimizer,
                        steps_per_epoch: Optional[int]) -> Tuple[Optional[_LRScheduler], bool]:
    # epoch_or_step - apply every epoch or every step
    scheduler, epoch_or_step = None, True
    if conf_lrs is not None:
        lr_scheduler_type = conf_lrs['type']  # TODO: default should be none?

        if lr_scheduler_type == 'cosine':
            # adjust max epochs for warmup
            # TODO: shouldn't we be increasing epochs or schedule lr only after warmup?
            if conf_lrs.get('warmup', None):
                epochs -= conf_lrs['warmup']['epochs']
            scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs,
                                                       eta_min=conf_lrs['min_lr'])
        elif lr_scheduler_type == 'resnet':
            scheduler = _adjust_learning_rate_resnet(optimizer, epochs)
        elif lr_scheduler_type == 'pyramid':
            scheduler = _adjust_learning_rate_pyramid(optimizer, epochs, get_optim_lr(optimizer))
        elif lr_scheduler_type == 'step':
            decay_period = conf_lrs['decay_period']
            gamma = conf_lrs['gamma']
            scheduler = lr_scheduler.StepLR(optimizer, decay_period, gamma=gamma)
        elif lr_scheduler_type == 'one_cycle':
            assert steps_per_epoch is not None
            ensure_pytorch_ver('1.3.0', 'LR scheduler OneCycleLR is not available.')
            max_lr = conf_lrs['max_lr']
            epoch_or_step = False
            scheduler = lr_scheduler.OneCycleLR(
                optimizer, max_lr=max_lr, epochs=epochs, steps_per_epoch=steps_per_epoch,
            )  # TODO: other params
        elif not lr_scheduler_type:
            scheduler = None  # TODO: check support for this or use StepLR
        else:
            raise ValueError('invalid lr_scheduler=%s' % lr_scheduler_type)

        # select warmup for LR schedule
        if conf_lrs.get('warmup', None):
            scheduler = GradualWarmupScheduler(
                optimizer,
                multiplier=conf_lrs['warmup']['multiplier'],
                total_epoch=conf_lrs['warmup']['epochs'],
                after_scheduler=scheduler)

    return scheduler, epoch_or_step
def run_model(model, train_loader, test_loader, epochs, device, learning_rate, **regularization):
    # model = Net().to(device)
    criterion = nn.CrossEntropyLoss()
    # optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    l2_factor = regularization['l2_factor']
    l1_factor = regularization['l1_factor']
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=l2_factor)
    # scheduler = StepLR(optimizer, step_size=5, gamma=0.15)
    # scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')
    # (factor=0.1, patience=10, threshold=0.0001, threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-08, verbose=False)

    # Derive the cycle length from the actual run instead of hard-coded values.
    # NOTE: OneCycleLR expects one step per batch (epochs * steps_per_epoch steps in total);
    # the per-epoch scheduler.step() below will not traverse the full cycle.
    scheduler = lr_scheduler.OneCycleLR(optimizer, learning_rate, epochs=epochs,
                                        steps_per_epoch=len(train_loader), pct_start=0.2)

    ## TRACKERS
    train_losses = []
    train_acc = []
    train_trackers = {'train_acc': train_acc, 'train_losses': train_losses}
    test_acc = []
    test_losses = []
    test_trackers = {'test_acc': test_acc, 'test_losses': test_losses}
    incorrect_samples = []

    ## Model RUN!
    for epoch in range(1, epochs + 1):
        print(f'\nEpoch {epoch}:')
        train(model, train_loader, criterion, optimizer, device, l1_factor=l1_factor, **train_trackers)
        scheduler.step()
        test(model, test_loader, criterion, device, incorrect_samples, **test_trackers)
        # scheduler.step(test_trackers['test_losses'][-1])

    return model, train_trackers, test_trackers, incorrect_samples
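# For contrast with the per-epoch scheduler.step() above, a self-contained sketch of the
# stepping pattern OneCycleLR expects: one scheduler.step() per batch, so that exactly
# epochs * steps_per_epoch steps are taken. Toy data and model, not from the original code.
import torch
from torch import nn, optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, TensorDataset

data = TensorDataset(torch.randn(256, 10), torch.randint(0, 2, (256,)))
loader = DataLoader(data, batch_size=32, shuffle=True)
model = nn.Linear(10, 2)
criterion = nn.CrossEntropyLoss()

epochs, max_lr = 3, 0.1
optimizer = optim.SGD(model.parameters(), lr=max_lr, momentum=0.9)
scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=max_lr,
                                    epochs=epochs, steps_per_epoch=len(loader))

for epoch in range(epochs):
    for x, y in loader:
        optimizer.zero_grad()
        loss = criterion(model(x), y)
        loss.backward()
        optimizer.step()
        scheduler.step()  # advance the one-cycle schedule every batch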
def get_scheduler(optimizer, opt):
    print('opt.lr_policy = [{}]'.format(opt.lr_policy))
    if opt.lr_policy == 'lambda':
        def lambda_rule(epoch):
            lr_l = 1.0 - max(0, epoch + 1 + opt.epoch_count - opt.niter) / float(opt.niter_decay + 1)
            return lr_l
        scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
    elif opt.lr_policy == 'step':
        scheduler = lr_scheduler.StepLR(optimizer, step_size=opt.lr_decay_iters, gamma=0.5)
    elif opt.lr_policy == 'step2':
        scheduler = lr_scheduler.StepLR(optimizer, step_size=opt.lr_decay_iters, gamma=0.1)
    elif opt.lr_policy == 'onecyclelr':
        # TODO: Need to set automatically!
        scheduler = lr_scheduler.OneCycleLR(optimizer=optimizer, max_lr=1e-4,
                                            steps_per_epoch=192, epochs=opt.n_epochs)
    elif opt.lr_policy == 'plateau':
        print('scheduler=plateau')
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1,
                                                   threshold=0.01, patience=5)
    elif opt.lr_policy == 'plateau2':
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2,
                                                   threshold=0.01, patience=5)
    elif opt.lr_policy == 'step_warmstart':
        def lambda_rule(epoch):
            # print(epoch)
            if epoch < 5:
                lr_l = 0.1
            elif 5 <= epoch < 100:
                lr_l = 1
            elif 100 <= epoch < 200:
                lr_l = 0.1
            elif 200 <= epoch:
                lr_l = 0.01
            return lr_l
        scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
    elif opt.lr_policy == 'step_warmstart2':
        def lambda_rule(epoch):
            # print(epoch)
            if epoch < 5:
                lr_l = 0.1
            elif 5 <= epoch < 50:
                lr_l = 1
            elif 50 <= epoch < 100:
                lr_l = 0.1
            elif 100 <= epoch:
                lr_l = 0.01
            return lr_l
        scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
    else:
        # raise instead of returning the exception instance
        raise NotImplementedError('learning rate policy [%s] is not implemented' % opt.lr_policy)
    return scheduler
def one_cycle_lr(optimizer, last_epoch, max_lr, pct_start, epochs, steps_per_epoch,
                 anneal_strategy='cos', **_):
    return lr_scheduler.OneCycleLR(optimizer,
                                   max_lr=max_lr,
                                   epochs=epochs,
                                   steps_per_epoch=steps_per_epoch,
                                   pct_start=pct_start,
                                   anneal_strategy=anneal_strategy,
                                   last_epoch=last_epoch)
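# Usage sketch for the factory above (illustrative values; the registry that normally
# calls it is not shown). The trailing **_ lets a larger hyper-parameter dict be splatted
# in without filtering out unrelated keys first.
from torch import nn, optim

model = nn.Linear(4, 1)
optimizer = optim.SGD(model.parameters(), lr=0.05, momentum=0.9)

sched_cfg = dict(last_epoch=-1, max_lr=0.05, pct_start=0.3,
                 epochs=10, steps_per_epoch=100, weight_decay=1e-4)
scheduler = one_cycle_lr(optimizer, **sched_cfg)  # 'weight_decay' is absorbed by **_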
def configure_optimizers(self):
    optimizer = optim.AdamW(self.parameters(), lr=self.lr, weight_decay=0.01)
    scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=1e-5,
                                        epochs=self.max_epochs, steps_per_epoch=1338)
    # 'interval': 'step' belongs inside the lr_scheduler dict, where Lightning reads it.
    return {
        'optimizer': optimizer,
        'lr_scheduler': {
            'scheduler': scheduler,
            'interval': 'step'
        }
    }
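# A possible variant of the snippet above that avoids hard-coding steps_per_epoch=1338.
# It assumes a recent PyTorch Lightning release, where the trainer exposes
# trainer.estimated_stepping_batches inside configure_optimizers; treat that attribute
# as an assumption about the Lightning version, not part of the original code.
def configure_optimizers(self):
    optimizer = optim.AdamW(self.parameters(), lr=self.lr, weight_decay=0.01)
    total_steps = self.trainer.estimated_stepping_batches  # total optimizer steps for the run
    scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=self.lr, total_steps=total_steps)
    return {
        'optimizer': optimizer,
        'lr_scheduler': {'scheduler': scheduler, 'interval': 'step'},
    }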
def choose_scheduler(self, optimizer):
    if optimizer is None:
        return None
    from torch.optim import lr_scheduler
    if self.hparams['lr_scheduler'] == 'ExpLR':
        scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.97)
    elif self.hparams['lr_scheduler'] == 'CosLR':
        scheduler = lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=20 * self.steps_per_epoch + 1, eta_min=0)
        scheduler = {'scheduler': scheduler, 'interval': 'step'}
    elif self.hparams['lr_scheduler'] == 'StepLR':
        scheduler = lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)
    elif self.hparams['lr_scheduler'] == 'OneCycLR':
        # +1 avoids an overflow error in step() when e.g. 800 total steps are
        # specified but step() ends up being called 801 times.
        scheduler = lr_scheduler.OneCycleLR(
            optimizer, max_lr=self.hparams["max_lr"],
            steps_per_epoch=self.steps_per_epoch + 1,
            epochs=self.hparams["num_epochs"])
        scheduler = {'scheduler': scheduler, 'interval': 'step'}
    elif self.hparams['lr_scheduler'] == 'MultiStepLR':
        scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[70, 140, 190], gamma=0.1)
    elif self.hparams['lr_scheduler'] == 'MultiStepLR_CRD':
        scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[150, 180, 210], gamma=0.1)
    elif self.hparams['lr_scheduler'] == 'MultiStepLR_NN':
        scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[100, 140, 150], gamma=0.1)
    elif self.hparams['lr_scheduler'] == 'MultiStepLR_NN_50':
        scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[25, 40], gamma=0.1)
    elif self.hparams['lr_scheduler'] == 'MultiStepLR_NN_70_Adam':
        scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[50, 65], gamma=0.1)
    else:
        return None
    return scheduler
def train_21k(model, train_loader, val_loader, optimizer, args):
    # set loss
    loss_fn = CrossEntropyLS(args.label_smooth)

    # set scheduler
    scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=args.lr,
                                        steps_per_epoch=len(train_loader),
                                        epochs=args.epochs, pct_start=0.1,
                                        cycle_momentum=False, div_factor=20)

    # set scaler
    scaler = GradScaler()

    # training loop
    for epoch in range(args.epochs):
        if num_distrib() > 1:
            train_loader.sampler.set_epoch(epoch)

        # train epoch
        print_at_master("\nEpoch {}".format(epoch))
        epoch_start_time = time.time()
        for i, (input, target) in enumerate(train_loader):
            with autocast():  # mixed precision
                output = model(input)
                loss = loss_fn(output, target)  # note - loss also in fp16
            model.zero_grad()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

        epoch_time = time.time() - epoch_start_time
        print_at_master(
            "\nFinished Epoch, Training Rate: {:.1f} [img/sec]".format(
                len(train_loader) * args.batch_size / epoch_time * max(num_distrib(), 1)))

        # validation epoch
        validate_21k(val_loader, model)
def configure_optimizers(self):
    """
    This is required as part of pytorch-lightning
    :return:
    """
    optimizer_type = self.hparams["optimizer_type"]
    if optimizer_type == "SGD":
        optimizer = optim.SGD(
            self.parameters(),
            lr=self.hparams["lr"],
            weight_decay=self.hparams["weight_decay"],
        )
    if optimizer_type == "ADAM":
        optimizer = optim.Adam(
            self.parameters(),
            lr=self.hparams["lr"],
            weight_decay=self.hparams["weight_decay"],
        )

    if self.hparams["scheduler_type"] is None:
        return [optimizer]
    else:
        if self.hparams["scheduler_type"] == "plateu":
            scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", patience=5)
        elif self.hparams["scheduler_type"] == "one_cycle":
            # Use the locally created optimizer; self.optimizer is never assigned here.
            scheduler = lr_scheduler.OneCycleLR(
                optimizer,
                max_lr=self.hparams["lr"],
                epochs=self.hparams["max_epochs"],
                steps_per_epoch=self.hparams["steps_per_epoch"],
            )
        else:
            raise ValueError("Unspecified scheduler type: {}".format(
                self.hparams["scheduler_type"]))
        return [optimizer], [scheduler]
def get_scheduler(optimizer, lr_policy, args):
    if lr_policy == 'step':
        # NOTE: torch's StepLR also requires a step_size argument, which is not supplied in args here.
        scheduler = lr_scheduler.StepLR(optimizer, gamma=args['gamma'])  # 0.1
    elif lr_policy == 'plateau':
        scheduler = lr_scheduler.ReduceLROnPlateau(
            optimizer, mode=args['mode'], factor=args['factor'],
            threshold=args['threshold'], patience=args['patience'])
        # optimizer, mode='min', factor=0.2, threshold=0.01, patience=5)
    elif lr_policy == 'cosine':
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=args['T_max'])  # 200
    elif lr_policy == 'one_cylce':
        scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=args['max_lr'],
                                            steps_per_epoch=len(args['data_loader']),
                                            epochs=args['epochs'])
    else:
        raise NotImplementedError('learning rate policy [%s] is not implemented' % lr_policy)
    return scheduler
def lr_scheduler_factory(optimizer, hparams, data_loader):
    if hparams.sched == "plateau":
        return lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode="max",
            patience=2,
            threshold=0.01,
            factor=0.1,
            verbose=True,
        )
    if hparams.sched == "onecycle":
        return lr_scheduler.OneCycleLR(
            optimizer=optimizer,
            max_lr=hparams.lr,
            cycle_momentum=True,
            pct_start=0.25,
            div_factor=25.0,
            final_div_factor=100000.0,
            steps_per_epoch=len(data_loader),
            epochs=hparams.epochs,
        )
    else:
        raise ValueError("Learning rate scheduler not supported yet.")
def get_optimizer(policy, args):
    if args.optimizer == "adam":
        optimizer = optim.Adam(policy.parameters(), lr=args.lr)
    elif args.optimizer == "sgd":
        optimizer = optim.SGD(policy.parameters(), lr=args.lr)
    elif args.optimizer == "rmsprop":
        optimizer = optim.RMSprop(policy.parameters(), lr=args.lr)

    scheduler = args.opt_schedule
    if scheduler == "cyclic":
        scheduler = lr_scheduler.OneCycleLR(
            optimizer=optimizer,
            max_lr=args.div_factor * args.lr,
            total_steps=args.num_episodes_train)
    elif scheduler == "cyclic_multi":
        scheduler = lr_scheduler.CyclicLR(optimizer=optimizer, base_lr=args.lr,
                                          max_lr=args.div_factor * args.lr)
    elif scheduler == "WR":
        T_0 = max(1, int(args.num_episodes_train / 1000))
        scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer=optimizer, T_0=T_0)
    return optimizer, scheduler
def fit(self, dataloader, lr, epochs, weight_decay=0, print_steps=200):
    self.model.train()
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.SGD(self.model.parameters(), lr, momentum=0.9,
                          weight_decay=weight_decay, nesterov=False)
    scheduler = lr_scheduler.OneCycleLR(optimizer, lr, epochs=epochs,
                                        steps_per_epoch=len(dataloader))
    history_loss = []
    history_steps = []
    for epoch in range(epochs):
        for step, (imgs, labels) in enumerate(dataloader):
            if torch.cuda.is_available():
                imgs, labels = imgs.to('cuda'), labels.to('cuda')
            outputs = self.model(imgs)
            loss = loss_fn(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            if step % print_steps == print_steps - 1:
                history_loss.append(loss.item())
                history_steps.append(epoch * len(dataloader) + step + 1)
                print(f"epoch: {epoch + 1} \tstep: {step + 1} \tloss: {loss:.4f}")
    return history_steps, history_loss
def main():
    args = parser.parse_args()
    args.batch_size = args.batch_size

    # setup model
    print('creating model...')
    # state = torch.load(args.model_path, map_location='cpu')
    # args.num_classes = state['num_classes']
    args.do_bottleneck_head = True
    model = create_model(args).cuda()
    ema = EMA(model, 0.999)
    ema.register()
    # model.load_state_dict(state['model'], strict=True)
    # model.train()
    classes_list = np.array(list(idx_to_class.values()))
    print('done\n')

    # Data loading code
    normalize = transforms.Normalize(mean=[0, 0, 0], std=[1, 1, 1])
    instances_path_val = os.path.join(args.data, 'annotations/instances_val2017.json')
    # instances_path_train = os.path.join(args.data, 'annotations/instances_val2017.json')  # temporarily use val as train
    instances_path_train = os.path.join(args.data, 'annotations/instances_train2017.json')
    data_path_val = os.path.join(args.data, 'val2017')
    # data_path_train = os.path.join(args.data, 'val2017')  # temporarily use val as train
    data_path_train = os.path.join(args.data, 'train2017')
    val_dataset = CocoDetection(
        data_path_val, instances_path_val,
        transforms.Compose([
            transforms.Resize((args.image_size, args.image_size)),
            transforms.ToTensor(),
            normalize,
        ]))
    train_dataset = CocoDetection(
        data_path_train, instances_path_train,
        transforms.Compose([
            transforms.Resize((args.image_size, args.image_size)),
            transforms.ToTensor(),
            normalize,
        ]))
    print("len(val_dataset)): ", len(val_dataset))
    print("len(train_dataset)): ", len(train_dataset))

    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size,
                                             shuffle=False, num_workers=args.workers,
                                             pin_memory=False)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size,
                                               shuffle=True, num_workers=args.workers,
                                               pin_memory=False)

    criterion = AsymmetricLoss()
    params = model.parameters()
    optimizer = torch.optim.Adam(params, lr=0.0002, weight_decay=0.0001)  # try a new optimizer
    total_step = len(train_loader)
    scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=0.0002,
                                        total_steps=total_step, epochs=25)
    # total_step = len(train_loader)

    highest_mAP = 0
    trainInfoList = []
    Sig = torch.nn.Sigmoid()
    # f = open('info_train.txt', 'a')
    for epoch in range(5):
        for i, (inputData, target) in enumerate(train_loader):
            f = open('info_train.txt', 'a')
            # model.train()
            inputData = inputData.cuda()
            target = target.cuda()
            target = target.max(dim=1)[0]
            # Sig = torch.nn.Sigmoid()
            output = Sig(model(inputData))
            # output[output < args.thre] = 0
            # output[output >= args.thre] = 1
            # print(output.shape)     # (batchsize, channel, imgsize, imgsize)
            # print(inputData.shape)  # (batchsize, numclasses)
            # print(output[0])
            # print(target[0])
            loss = criterion(output, target)
            model.zero_grad()
            loss.backward()
            optimizer.step()
            ema.update()

            # store information
            if i % 10 == 0:
                trainInfoList.append([epoch, i, loss.item()])
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch, 5, i, total_step, loss.item()))
                f.write('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}\n'.format(
                    epoch, 5, i, total_step, loss.item()))

            if (i + 1) % 400 == 0:
                # save a checkpoint for this iteration
                torch.save(
                    model.state_dict(),
                    os.path.join('models/', 'model-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                # modelName = 'models/' + 'decoder-{}-{}.ckpt'.format(epoch+1, i+1)

        mAP_score = validate_multi(val_loader, model, args, ema)
        # model.train()
        if mAP_score > highest_mAP:
            highest_mAP = mAP_score
            print('current highest_mAP = ', highest_mAP)
            f.write('current highest_mAP = {}\n'.format(highest_mAP))
            torch.save(model.state_dict(), os.path.join('models/', 'model-highest.ckpt'))
        f.close()
        scheduler.step()  # update the learning rate
def onecycle(optimizer, n_examples, cfg):
    lr = cfg.learning_rate
    n_steps = cfg.n_steps(n_examples)
    return lr_scheduler.OneCycleLR(optimizer, lr, total_steps=n_steps)
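# Context sketch for the helper above: a hypothetical config object with a learning_rate
# field and an n_steps(n_examples) method. The dataclass and its formula are stand-ins,
# not the original project's config.
import math
from dataclasses import dataclass

from torch import nn, optim


@dataclass
class TrainConfig:
    learning_rate: float = 0.01
    batch_size: int = 32
    epochs: int = 5

    def n_steps(self, n_examples: int) -> int:
        # total optimizer steps = batches per epoch * epochs
        return math.ceil(n_examples / self.batch_size) * self.epochs


cfg = TrainConfig()
model = nn.Linear(16, 4)
optimizer = optim.SGD(model.parameters(), lr=cfg.learning_rate, momentum=0.9)
scheduler = onecycle(optimizer, n_examples=10_000, cfg=cfg)  # step once per batch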
def train_multi_label_coco(model, train_loader, val_loader, lr):
    ema = ModelEma(model, 0.9997)  # 0.9997^641 = 0.82

    # set optimizer
    Epochs = 80
    Stop_epoch = 40
    weight_decay = 1e-4
    criterion = AsymmetricLoss(gamma_neg=4, gamma_pos=0, clip=0.05,
                               disable_torch_grad_focal_loss=True)
    parameters = add_weight_decay(model, weight_decay)
    optimizer = torch.optim.Adam(params=parameters, lr=lr, weight_decay=0)  # true wd, filter_bias_and_bn
    steps_per_epoch = len(train_loader)
    scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=lr, steps_per_epoch=steps_per_epoch,
                                        epochs=Epochs, pct_start=0.2)

    highest_mAP = 0
    trainInfoList = []
    scaler = GradScaler()
    for epoch in range(Epochs):
        if epoch > Stop_epoch:
            break
        for i, (inputData, target) in enumerate(train_loader):
            inputData = inputData.cuda()
            target = target.cuda()  # (batch,3,num_classes)
            target = target.max(dim=1)[0]
            with autocast():  # mixed precision
                output = model(inputData).float()  # sigmoid will be done in loss !
                loss = criterion(output, target)
            model.zero_grad()

            scaler.scale(loss).backward()
            # loss.backward()

            scaler.step(optimizer)
            scaler.update()
            # optimizer.step()

            scheduler.step()

            ema.update(model)

            # store information
            if i % 100 == 0:
                trainInfoList.append([epoch, i, loss.item()])
                print('Epoch [{}/{}], Step [{}/{}], LR {:.1e}, Loss: {:.1f}'
                      .format(epoch, Epochs, str(i).zfill(3), str(steps_per_epoch).zfill(3),
                              scheduler.get_last_lr()[0], loss.item()))

        try:
            torch.save(model.state_dict(),
                       os.path.join('models/', 'model-{}-{}.ckpt'.format(epoch + 1, i + 1)))
        except:
            pass

        model.eval()
        mAP_score = validate_multi(val_loader, model, ema)
        model.train()
        if mAP_score > highest_mAP:
            highest_mAP = mAP_score
            try:
                torch.save(model.state_dict(),
                           os.path.join('models/', 'model-highest.ckpt'))
            except:
                pass
        print('current_mAP = {:.2f}, highest_mAP = {:.2f}\n'.format(mAP_score, highest_mAP))
def main():
    args = parse_arguments()
    with open('./config/train.yml') as f:
        conf = yaml.load(f, Loader=yaml.FullLoader)
    epochs = args.epochs

    if not os.path.exists(conf['train']['saved_model']) and args.saved:
        raise FileNotFoundError('No such saved model {}'.format(conf['train']['saved_model']))
    if not os.path.exists(conf['train']['saved_model']):
        os.makedirs(conf['train']['saved_model'])

    # model = Model(conf['model'])  # init model
    # model = resnet152(num_classes=conf['train']['num_classes'])
    model = resnet50(num_classes=conf['train']['num_classes'])
    model.to(device)
    ema = ModelEma(model, 0.9997)

    # get parameter index with grad
    filtered_parameters = []
    params_num = []
    for p in filter(lambda p: p.requires_grad, model.parameters()):
        filtered_parameters.append(p)
        params_num.append(np.prod(p.size()))
    print('Trainable params num : ', sum(params_num))

    # training parameter assign
    learning_rate = conf['train']['learning_rate']
    batch_size = conf['train']['batch_size']
    epochs = conf['train']['epochs']

    # prepare datasets
    dataset = BirdCallDataset(conf['data_folder'])
    dataloader = DataLoader(dataset, batch_size=conf['train']['batch_size'], shuffle=True)

    # init utility classes
    loss_avg = Averager()
    criterion = AsymmetricLoss(gamma_neg=4, gamma_pos=0, clip=0.05,
                               disable_torch_grad_focal_loss=True)
    scaler = GradScaler()
    optimizer = optim.Adam(filtered_parameters, lr=learning_rate, betas=(0.9, 0.999))
    # steps_per_epoch is the number of batches per epoch, i.e. len(dataloader), not len(dataset)
    scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=learning_rate,
                                        steps_per_epoch=len(dataloader),
                                        epochs=epochs, pct_start=0.2)

    best_loss = 100
    cur_time = time.time()

    # Run Training Session
    print(len(dataset))
    with tqdm(range(epochs), unit="epoch") as tepoch:
        for epoch in tepoch:
            model.train()
            for batch, data in enumerate(dataloader):
                tepoch.set_description(f" Epoch {epoch+1}/{batch} ")
                wav, bird = data
                wav = wav.to(torch.float32)
                wav = wav.to(device)
                bird_smooth = np.where(bird == 1, 0.995, 0.0025)
                bird_smooth = torch.from_numpy(bird_smooth).to(device)

                with autocast():  # mixed precision
                    output = model(wav).float()  # sigmoid will be done in loss !
                    loss = criterion(output, bird_smooth)
                    loss_avg.add(loss)

                model.zero_grad()
                scaler.scale(loss).backward()
                # loss.backward()
                scaler.step(optimizer)
                scaler.update()
                # optimizer.step()
                scheduler.step()
                ema.update(model)

                pred_score = torch.where(
                    F.softmax(output, dim=1) > conf['train']['threshold'], 1, 0)
                t1 = pred_score.cpu().detach().numpy()[0]
                t2 = bird.cpu().detach().numpy()[0]

                # gradient clipping with 5 (Default)
                # https://kh-kim.gitbook.io/natural-language-processing-with-pytorch/00-cover-6/05-gradient-clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), 5)

                tepoch.set_postfix(loss=loss_avg.val().item(), f1_score=f1_score(t1, t2))
                del wav, bird, loss, output, t1, t2, pred_score

            if loss_avg.val().item() < best_loss:
                best_loss = loss_avg.val().item()
                torch.save(model.state_dict(),
                           os.path.join(conf['train']['save_folder'], f'{args.file_name}.pth'))

    # validation section, ToDo.
def get_scheduler(optimizer, opt, **kwargs):
    print('opt.lr_policy = [{}]'.format(opt.lr_policy))
    if opt.lr_policy == 'lambda':
        def lambda_rule(epoch):
            lr_l = 1.0 - max(0, epoch + 1 + opt.epoch_count - opt.niter) / float(opt.niter_decay + 1)
            return lr_l
        scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
    elif opt.lr_policy == 'step':
        scheduler = lr_scheduler.StepLR(optimizer, step_size=opt.lr_decay_iters, gamma=0.5)
    elif opt.lr_policy == 'step2':
        scheduler = lr_scheduler.StepLR(optimizer, step_size=opt.lr_decay_iters, gamma=0.1)
    elif opt.lr_policy == 'plateau':
        print('scheduler=plateau')
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1,
                                                   threshold=0.01, patience=5)
    elif opt.lr_policy == 'plateau2':
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2,
                                                   threshold=0.01, patience=5)
    elif opt.lr_policy == 'step_warmstart':
        def lambda_rule(epoch):
            # print(epoch)
            if epoch < 5:
                lr_l = 0.1
            elif 5 <= epoch < 100:
                lr_l = 1
            elif 100 <= epoch < 200:
                lr_l = 0.1
            elif 200 <= epoch:
                lr_l = 0.01
            return lr_l
        scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
    elif opt.lr_policy == 'step_warmstart2':
        def lambda_rule(epoch):
            # print(epoch)
            if epoch < 5:
                lr_l = 0.1
            elif 5 <= epoch < 50:
                lr_l = 1
            elif 50 <= epoch < 100:
                lr_l = 0.1
            elif 100 <= epoch:
                lr_l = 0.01
            return lr_l
        scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
    elif opt.lr_policy == 'one_cycle':
        print("Using one-cycle scheduler")
        scheduler = lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=kwargs['max_lr'],
            steps_per_epoch=kwargs['len_train'],
            epochs=opt.n_epochs,
            cycle_momentum=True,
            div_factor=kwargs['division_factor'])
        print(f"Scheduler: last epoch: {kwargs['last_epoch']}")
        scheduler.last_epoch = kwargs['last_epoch'] if kwargs['last_epoch'] > 0 else -1
    else:
        # raise instead of returning the exception instance
        raise NotImplementedError('learning rate policy [%s] is not implemented' % opt.lr_policy)
    return scheduler
        # sanity check
        for param in filter(lambda p: p.requires_grad, modelVars['model'].parameters()):
            print(param.name, param.shape)
    else:
        modelVars['optimizer'] = optim.AdamW([
            {'params': filter(lambda p: not p.is_cnn_param, modelVars['model'].parameters()),
             'lr': params['learning_rate_meta']},
            {'params': filter(lambda p: p.is_cnn_param, modelVars['model'].parameters()),
             'lr': params['learning_rate']}
        ], lr=params['learning_rate'])
else:
    modelVars['optimizer'] = optim.AdamW(modelVars['model'].parameters(), lr=params['learning_rate'])

# Decay LR by a factor of 0.1 every 7 epochs
# modelVars['scheduler'] = lr_scheduler.StepLR(modelVars['optimizer'], step_size=params['lowerLRAfter'], gamma=1/np.float32(params['LRstep']))
modelVars['scheduler'] = lr_scheduler.OneCycleLR(modelVars['optimizer'],
                                                 max_lr=params['learning_rate'],
                                                 epochs=params['training_steps'],
                                                 steps_per_epoch=len(dataset_train) // params['batchSize'])

# Define softmax
modelVars['softmax'] = nn.Softmax(dim=1)

# Set up training
# loading from checkpoint
if load_old:
    # Find last, not last best checkpoint
    files = glob(params['saveDir'] + '/*')
    global_steps = np.zeros([len(files)])
    for i in range(len(files)):
        # Use meta files to find the highest index
        if 'best' in files[i]:
def test_OneCycleLR(self, debug=True):
    """
    Usage:
        python template_lib/modelarts/scripts/copy_tool.py \
          -s s3://bucket-7001/ZhouPeng/pypi/torch1_7_0 -d /cache/pypi -t copytree
        for filename in /cache/pypi/*.whl; do
            pip install $filename
        done
        proj_root=moco-exp
        python template_lib/modelarts/scripts/copy_tool.py \
          -s s3://bucket-7001/ZhouPeng/codes/$proj_root -d /cache/$proj_root -t copytree -b /cache/$proj_root/code.zip
        cd /cache/$proj_root
        pip install -r requirements.txt

        export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        export TIME_STR=1
        export PYTHONPATH=./exp:./stylegan2-pytorch:./
        python -c "from exp.tests.test_styleganv2 import Testing_stylegan2;\
          Testing_stylegan2().test_train_ffhq_128()"
    :return:
    """
    if 'CUDA_VISIBLE_DEVICES' not in os.environ:
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    if 'TIME_STR' not in os.environ:
        os.environ['TIME_STR'] = '0' if utils.is_debugging() else '0'
    from template_lib.v2.config_cfgnode.argparser import \
        (get_command_and_outdir, setup_outdir_and_yaml, get_append_cmd_str, start_cmd_run)

    tl_opts = ' '.join(sys.argv[sys.argv.index('--tl_opts') + 1:]) if '--tl_opts' in sys.argv else ''
    print(f'tl_opts:\n {tl_opts}')

    command, outdir = get_command_and_outdir(self, func_name=sys._getframe().f_code.co_name, file=__file__)
    argv_str = f"""
                --tl_config_file none
                --tl_command none
                --tl_outdir {outdir}
                """
    args = setup_outdir_and_yaml(argv_str, return_cfg=True)

    import torch.nn as nn
    from torch.optim import lr_scheduler
    from matplotlib import pyplot as plt

    model = nn.Linear(3, 64)

    def create_optimizer():
        return SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4)

    def plot_lr(scheduler, title='', labels=['base'], nrof_epoch=100):
        lr_li = [[] for _ in range(len(labels))]
        epoch_li = list(range(nrof_epoch))
        for epoch in epoch_li:
            scheduler.step()  # update the learning rate managed by the optimizer for this epoch
            lr = scheduler.get_last_lr()  # get the learning rate for the current epoch
            for i in range(len(labels)):
                lr_li[i].append(lr[i])
        for lr, label in zip(lr_li, labels):
            plt.plot(epoch_li, lr, label=label)
        plt.grid()
        plt.xlabel('epoch')
        plt.ylabel('lr')
        plt.title(title)
        plt.legend()
        plt.show()

    optimizer = create_optimizer()
    scheduler = lr_scheduler.OneCycleLR(optimizer, 0.1, total_steps=100)
    plot_lr(scheduler, title='OneCycleLR')
    pass
def train_ensemble(models, num_epochs, train_loader, test_loader, train_func, test_func,
                   torch_device, loss_pos_weight, pos_label, lr, clip,
                   save_model_path_func=None, start_idx=0):
    # Iterate through models and train each one.
    for idx, model in enumerate(models):
        print()
        print("Training model " + str(idx) + " of " + str(len(models) - 1))

        # Get optimizer and learning rate scheduler.
        optimizer = AdamW(model.parameters(), lr=lr)
        scheduler = lr_scheduler.OneCycleLR(optimizer=optimizer, max_lr=lr,
                                            epochs=num_epochs,
                                            steps_per_epoch=len(train_loader))

        # Scaler for mixed precision training.
        scaler = GradScaler()

        # Loss function to use.
        loss_func = nn.BCEWithLogitsLoss(pos_weight=loss_pos_weight)

        best_model_state_dict = deepcopy(model.state_dict())
        best_aps, best_roc_auc = test_func(model, test_loader, torch_device, pos_label)
        print("Initial results for model: APS=" + str(best_aps) + " ROC AUC=" + str(best_roc_auc))
        print()
        best_epoch = 0

        # Train.
        for epoch in range(num_epochs):
            # Call the training function for this epoch.
            print(str(epoch) + " of " + str(num_epochs - 1))
            train_func(model, train_loader, loss_func, torch_device, optimizer, scheduler, scaler, clip)
            aps, roc_auc = test_func(model, test_loader, torch_device, pos_label)
            print("APS=" + str(aps) + " ROC AUC=" + str(roc_auc))

            # Save model state if it's the best we have seen so far.
            if (round(roc_auc, 2) > round(best_roc_auc, 2)
                    or (round(roc_auc, 2) == round(best_roc_auc, 2)
                        and round(aps, 2) > round(best_aps, 2))):
                best_roc_auc = roc_auc
                best_aps = aps
                best_epoch = epoch
                best_model_state_dict = deepcopy(model.state_dict())

        # Set model to its best version and save.
        model.load_state_dict(best_model_state_dict)
        if save_model_path_func is not None:
            torch.save(best_model_state_dict, save_model_path_func(idx + start_idx))
        print("Best epoch for model: " + str(best_epoch))

    # Return best results for cross-validation using just one model.
    if len(models) == 1:
        return best_aps, best_roc_auc
def create_lr_scheduler(conf_lrs: Config, epochs: int, optimizer: Optimizer,
                        steps_per_epoch: Optional[int]) -> Tuple[Optional[_LRScheduler], bool]:
    # epoch_or_step - apply every epoch or every step
    scheduler, epoch_or_step = None, True  # by default sched steps on epoch

    conf_warmup = conf_lrs.get_val('warmup', None)
    warmup_epochs = 0
    if conf_warmup is not None and 'epochs' in conf_warmup:
        warmup_epochs = conf_warmup['epochs']

    if conf_lrs is not None:
        lr_scheduler_type = conf_lrs['type']  # TODO: default should be none?

        if lr_scheduler_type == 'cosine':
            scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                                       T_max=epochs - warmup_epochs,
                                                       eta_min=conf_lrs['min_lr'])
        elif lr_scheduler_type == 'multi_step':
            scheduler = lr_scheduler.MultiStepLR(optimizer,
                                                 milestones=conf_lrs['milestones'],
                                                 gamma=conf_lrs['gamma'])
        elif lr_scheduler_type == 'pyramid':
            scheduler = _adjust_learning_rate_pyramid(optimizer,
                                                      epochs - warmup_epochs,
                                                      get_optim_lr(optimizer))
        elif lr_scheduler_type == 'step':
            decay_period = conf_lrs['decay_period']
            gamma = conf_lrs['gamma']
            scheduler = lr_scheduler.StepLR(optimizer, decay_period, gamma=gamma)
        elif lr_scheduler_type == 'one_cycle':
            assert steps_per_epoch is not None
            ensure_pytorch_ver('1.3.0', 'LR scheduler OneCycleLR is not available.')
            max_lr = conf_lrs['max_lr']
            epoch_or_step = False
            scheduler = lr_scheduler.OneCycleLR(
                optimizer, max_lr=max_lr,
                epochs=epochs - warmup_epochs,
                steps_per_epoch=steps_per_epoch,
            )  # TODO: other params
        elif not lr_scheduler_type:
            scheduler = None
        else:
            raise ValueError('invalid lr_scheduler=%s' % lr_scheduler_type)

        # select warmup for LR schedule
        if warmup_epochs:
            scheduler = GradualWarmupScheduler(
                optimizer,
                multiplier=conf_lrs['warmup'].get_val('multiplier', 1.0),
                total_epoch=warmup_epochs,
                after_scheduler=scheduler)

    return scheduler, epoch_or_step
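# Sketch of how a caller might consume the (scheduler, epoch_or_step) pair returned above.
# Self-contained toy loop with OneCycleLR standing in for the factory output; the data,
# model and epoch count are illustrative only, not the surrounding project's trainer.
import torch
from torch import nn, optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, TensorDataset

loader = DataLoader(TensorDataset(torch.randn(64, 8), torch.randn(64, 1)), batch_size=16)
model = nn.Linear(8, 1)
epochs = 4
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

# OneCycleLR is stepped per batch, so the factory would return epoch_or_step=False for it.
scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=0.1,
                                    epochs=epochs, steps_per_epoch=len(loader))
epoch_or_step = False

for epoch in range(epochs):
    for x, y in loader:
        loss = nn.functional.mse_loss(model(x), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if scheduler is not None and not epoch_or_step:
            scheduler.step()  # per-iteration schedules (e.g. OneCycleLR)
    if scheduler is not None and epoch_or_step:
        scheduler.step()      # per-epoch schedules (cosine, step decay, ...)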
def __init__(
    self,
    max_lr: Union[float, List[float]],
    total_steps: Optional[int] = None,
    epochs: Optional[int] = None,
    steps_per_epoch: Optional[int] = None,
    pct_start: float = 0.3,
    anneal_strategy: str = "cos",
    cycle_momentum: bool = True,
    base_momentum: Union[float, List[float]] = 0.85,
    max_momentum: Union[float, List[float]] = 0.95,
    div_factor: float = 25.0,
    final_div_factor: float = 1e4,
    last_epoch: int = -1,
):
    """Constructor for OneCycleLR.

    Args:
        max_lr (float or list of float): Upper learning rate boundaries in the cycle
            for each parameter group.
        total_steps (int): The total number of steps in the cycle. Note that if a value
            is not provided here, then it must be inferred by providing a value for
            epochs and steps_per_epoch. Defaults to None.
        epochs (int): The number of epochs to train for. This is used along with
            steps_per_epoch in order to infer the total number of steps in the cycle
            if a value for total_steps is not provided. Defaults to None.
        steps_per_epoch (int): The number of steps per epoch to train for. This is used
            along with epochs in order to infer the total number of steps in the cycle
            if a value for total_steps is not provided. Defaults to None.
        pct_start (float): The percentage of the cycle (in number of steps) spent
            increasing the learning rate. Defaults to 0.3.
        anneal_strategy (str): {'cos', 'linear'} Specifies the annealing strategy:
            "cos" for cosine annealing, "linear" for linear annealing. Defaults to 'cos'.
        cycle_momentum (bool): If ``True``, momentum is cycled inversely to learning
            rate between 'base_momentum' and 'max_momentum'. Defaults to True.
        base_momentum (float or list of float): Lower momentum boundaries in the cycle
            for each parameter group. Note that momentum is cycled inversely to learning
            rate; at the peak of a cycle, momentum is 'base_momentum' and learning rate
            is 'max_lr'. Defaults to 0.85.
        max_momentum (float or list of float): Upper momentum boundaries in the cycle
            for each parameter group. Functionally, it defines the cycle amplitude
            (max_momentum - base_momentum). Note that momentum is cycled inversely to
            learning rate; at the start of a cycle, momentum is 'max_momentum' and
            learning rate is 'base_lr'. Defaults to 0.95.
        div_factor (float): Determines the initial learning rate via
            initial_lr = max_lr / div_factor. Defaults to 25.
        final_div_factor (float): Determines the minimum learning rate via
            min_lr = initial_lr / final_div_factor. Defaults to 1e4.
        last_epoch (int): The index of last epoch. Default: -1.
    """
    super().__init__(
        lambda opt: _schedulers.OneCycleLR(
            opt,
            max_lr,
            total_steps=total_steps,
            epochs=epochs,
            steps_per_epoch=steps_per_epoch,
            pct_start=pct_start,
            anneal_strategy=anneal_strategy,
            cycle_momentum=cycle_momentum,
            base_momentum=base_momentum,
            max_momentum=max_momentum,
            div_factor=div_factor,
            final_div_factor=final_div_factor,
            last_epoch=last_epoch,
        ),
        step_on_batch=True,
    )