def test_update_function():
    target_value_net = recnn.nn.Critic(1290, 128, 256)
    target_policy_net = recnn.nn.Actor(1290, 128, 256)

    target_policy_net.eval()
    target_value_net.eval()

    # soft update
    recnn.utils.soft_update(value_net, target_value_net, soft_tau=1.0)
    recnn.utils.soft_update(policy_net, target_policy_net, soft_tau=1.0)

    # define optimizers
    value_optimizer = optim.RAdam(value_net.parameters(), lr=1e-5, weight_decay=1e-2)
    policy_optimizer = optim.RAdam(policy_net.parameters(), lr=1e-5, weight_decay=1e-2)

    nets = {
        "value_net": value_net,
        "target_value_net": target_value_net,
        "policy_net": policy_net,
        "target_policy_net": target_policy_net,
    }

    optimizer = {
        "policy_optimizer": policy_optimizer,
        "value_optimizer": value_optimizer,
    }

    debug = {}
    writer = recnn.utils.misc.DummyWriter()

    step = 0
    params = {
        "gamma": 0.99,
        "min_value": -10,
        "max_value": 10,
        "policy_step": 10,
        "soft_tau": 0.001,
    }

    loss = recnn.nn.update.ddpg_update(
        batch, params, nets, optimizer, torch.device("cpu"), debug, writer, step=step
    )
    check_loss_and_networks(loss, nets)
def get_optimizer(net_conf, model):
    if net_conf["optimizer"] == "adam":
        Gopt = optim.Adam(model["G"].parameters(), lr=net_conf["lr"])
    elif net_conf["optimizer"] == "radam":
        Gopt = toptim.RAdam(model["G"].parameters(), lr=net_conf["lr"])
    elif net_conf["optimizer"] == "lamb":
        Gopt = Lamb(
            model["G"].parameters(),
            lr=net_conf["lr"],
            weight_decay=0.01,
            betas=(0.9, 0.999),
            adam=False,
        )
    optimizer = {"G": Gopt}

    if "D" in model:
        if net_conf["optimizer"] == "adam":
            Dopt = optim.Adam(model["D"].parameters(), lr=net_conf["discriminator_lr"])
        elif net_conf["optimizer"] == "radam":
            Dopt = toptim.RAdam(model["D"].parameters(), lr=net_conf["discriminator_lr"])
        elif net_conf["optimizer"] == "lamb":
            Dopt = Lamb(
                model["D"].parameters(),
                lr=net_conf["lr"],
                weight_decay=0.01,
                betas=(0.9, 0.999),
                adam=False,
            )
        optimizer.update({"D": Dopt})

    if "SPKRADV" in model:
        if net_conf["optimizer"] == "adam":
            SPKRADVopt = optim.Adam(model["SPKRADV"].parameters(), lr=net_conf["spkradv_lr"])
        elif net_conf["optimizer"] == "radam":
            SPKRADVopt = toptim.RAdam(model["SPKRADV"].parameters(), lr=net_conf["spkradv_lr"])
        elif net_conf["optimizer"] == "lamb":
            SPKRADVopt = Lamb(
                model["SPKRADV"].parameters(),
                lr=net_conf["spkradv_lr"],
                weight_decay=0.01,
                betas=(0.9, 0.999),
                adam=False,
            )
        optimizer.update({"SPKRADV": SPKRADVopt})
    return optimizer
def main(args):
    train_cfg = config_from_json(args.train_cfg)
    model_cfg = config_from_json(args.model_cfg)
    model_cfg.block_size = model_cfg.max_len // model_cfg.n_blocks
    set_seeds(train_cfg.seed)

    print("Loading dataset")
    loader = PreTrainDataset(args.data_file, train_cfg, model_cfg)
    model = BertInnerPreTrain(model_cfg)

    if train_cfg.optimizer == "lamb":
        optimizer = torch_optimizer.Lamb(
            model.parameters(), lr=train_cfg.lr, weight_decay=train_cfg.weigth_decay
        )
    elif train_cfg.optimizer == "radam":
        optimizer = torch_optimizer.RAdam(
            model.parameters(), lr=train_cfg.lr, weight_decay=train_cfg.weigth_decay
        )
    else:
        optimizer = optim4GPU(train_cfg, model)

    trainer = Trainer(
        loader, model, optimizer, args.save_dir, get_device(), train_cfg.parallel
    )

    if args.load_dir != "":
        print("Loading checkpoint")
        trainer.load_model(args.load_dir, args.load_dataset_state)

    trainer.train(train_cfg)
def get_optimizer(hparams, models):
    eps = 1e-8
    parameters = get_parameters(models)
    if hparams.optimizer == 'sgd':
        optimizer = SGD(parameters, lr=hparams.lr, momentum=hparams.momentum,
                        weight_decay=hparams.weight_decay)
    elif hparams.optimizer == 'adam':
        optimizer = Adam(parameters, lr=hparams.lr, eps=eps,
                         weight_decay=hparams.weight_decay)
    elif hparams.optimizer == 'radam':
        optimizer = torch_optimizer.RAdam(parameters, lr=hparams.lr, eps=eps,
                                          weight_decay=hparams.weight_decay)
    elif hparams.optimizer == 'ranger':
        optimizer = torch_optimizer.Ranger(parameters, lr=hparams.lr, eps=eps,
                                           weight_decay=hparams.weight_decay)
    else:
        raise ValueError('optimizer not recognized!')

    return optimizer
def configure_optimizers(self):
    param_sets = [
        {'params': self.encoder.parameters()},
        {'params': self.decoder.parameters(),
         'lr': self.lr * self.args.pcae.decoder.lr_coeff}
    ]
    if self.args.pcae.optimizer == 'sgd':
        opt = torch.optim.SGD(param_sets, lr=self.lr, weight_decay=self.weight_decay)
    elif self.args.pcae.optimizer == 'radam':
        opt = optim.RAdam(param_sets, lr=self.lr, weight_decay=self.weight_decay)
    else:
        raise NotImplementedError()

    if self.args.pcae.lr_scheduler == 'exp':
        scheduler_step = 'epoch'
        lr_sched = torch.optim.lr_scheduler.ExponentialLR(opt, gamma=self.lr_decay)
    elif self.args.pcae.lr_scheduler == 'cosrestarts':
        scheduler_step = 'step'
        lr_sched = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            opt, 469 * 8)  # TODO scale by batch num
    else:
        raise NotImplementedError

    return [opt], [{
        'scheduler': lr_sched,
        'interval': scheduler_step,
        'name': 'pcae'
    }]
def retrain(self, dataset, max_epoch=10000, stopping_criterion=1e-3, lr=1e-3):
    logger.info("[+ +] Re-training starts...")
    loss_fun = nn.MSELoss()
    # optimizer = optim.SGD(self.model.parameters(), lr=lr, momentum=0.9)
    optimizer = optim.RAdam(self._model.parameters(), lr=lr, weight_decay=1e-4)
    X_train, Y_train = self._process_data(dataset)

    for epoch in range(max_epoch):
        #### Training ###
        self._model.train()
        optimizer.zero_grad()
        Y_prediction = self._model(X_train)
        obj_train = loss_fun(Y_prediction, Y_train)
        obj_train.backward()
        optimizer.step()

        # Check stopping criterion
        if obj_train.item() < stopping_criterion or epoch % 100 == 0:
            logger.info("[+ +] Epoch: %5d Train Obj: %.5e" % (epoch + 1, obj_train.item()))
        if obj_train.item() < stopping_criterion:
            logger.info("[+ +] Re-training finished!")
            self._model.eval()
            return

    raise Exception("Maximum epoch in the retraining is reached!")
def __init__(self):
    super(Beta, self).__init__()
    self.net = nn.Sequential(
        nn.Linear(1024, num_items),
        nn.Softmax()
    )
    self.optim = optim.RAdam(self.net.parameters(), lr=1e-5, weight_decay=1e-5)
    self.criterion = nn.CrossEntropyLoss()
def main(args):
    train_cfg = config_from_json(args.train_cfg)
    model_cfg = config_from_json(args.model_cfg)
    model_cfg.block_size = model_cfg.max_len // model_cfg.n_blocks
    set_seeds(train_cfg.seed)

    print("Loading dataset")
    loader = PreTrainDataset(args.data_file, train_cfg, model_cfg)
    model = BertInnerForMaskedLM(model_cfg)

    if train_cfg.optimizer == "lamb":
        if train_cfg.opt_level != "" and train_cfg.opt_level is not None:
            optimizer = apex.optimizers.FusedLAMB(
                model.parameters(), **train_cfg.optimizer_parameters
            )
        else:
            optimizer = torch_optimizer.Lamb(
                model.parameters(), **train_cfg.optimizer_parameters
            )
    elif train_cfg.optimizer == "radam":
        optimizer = torch_optimizer.RAdam(
            model.parameters(), **train_cfg.optimizer_parameters
        )
    else:
        optimizer = optim4GPU(train_cfg, model)

    trainer = Trainer(
        loader, model, optimizer, args.save_dir, get_device(),
        train_cfg.parallel, train_cfg.opt_level
    )

    if args.load_model != "":
        print("Loading checkpoint")
        trainer.load_model(args.load_model, args.load_dataset_state)

    trainer.train(train_cfg)
def select_optimizer(opt_name, lr, model, sched_name="cos"):
    if opt_name == "adam":
        opt = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-6)
    elif opt_name == "radam":
        opt = torch_optimizer.RAdam(model.parameters(), lr=lr, weight_decay=0.00001)
    elif opt_name == "sgd":
        opt = optim.SGD(
            model.parameters(), lr=lr, momentum=0.9, nesterov=True, weight_decay=1e-4
        )
    else:
        raise NotImplementedError("Please select the opt_name [adam, radam, sgd]")

    if sched_name == "cos":
        scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
            opt, T_0=1, T_mult=2, eta_min=lr * 0.01
        )
    elif sched_name == "anneal":
        scheduler = optim.lr_scheduler.ExponentialLR(opt, 1 / 1.1, last_epoch=-1)
    elif sched_name == "multistep":
        scheduler = optim.lr_scheduler.MultiStepLR(
            opt, milestones=[30, 60, 80, 90], gamma=0.1
        )
    else:
        raise NotImplementedError(
            "Please select the sched_name [cos, anneal, multistep]"
        )

    return opt, scheduler
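A minimal usage sketch for the selector above, stepping the scheduler once per epoch; the toy model and the epoch loop are illustrative and not part of the original code:

import torch.nn as nn
import torch.optim as optim
import torch_optimizer

# Any nn.Module works here; a tiny network keeps the sketch self-contained.
model = nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 10))
opt, scheduler = select_optimizer("radam", lr=3e-4, model=model, sched_name="cos")

for epoch in range(10):
    # ... run one training epoch with `opt` here ...
    scheduler.step()  # advance the cosine-with-restarts schedule once per epoch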
def get_optimizer(model, config):
    if config.use_adam:
        if config.use_transformer:
            no_decay = ['bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {
                    'params': [
                        p for n, p in model.named_parameters()
                        if not any(nd in n for nd in no_decay)
                    ],
                    'weight_decay': 0.01
                },
                {
                    'params': [
                        p for n, p in model.named_parameters()
                        if any(nd in n for nd in no_decay)
                    ],
                    'weight_decay': 0.0
                }
            ]
            optimizer = optim.AdamW(
                optimizer_grouped_parameters,
                lr=config.lr,
            )
        else:  # case of rnn based seq2seq.
            optimizer = optim.Adam(model.parameters(), lr=config.lr)
    elif config.use_radam:
        assert not config.use_noam_decay, \
            "You need to turn off noam decay when you use RAdam."
        optimizer = custom_optim.RAdam(model.parameters(), lr=config.lr)
    else:
        optimizer = optim.SGD(model.parameters(), lr=config.lr)

    return optimizer
def get_optimizer(optimizer: str, model, optimizer_args):
    if optimizer == "sgd":
        return torch.optim.SGD(model.parameters(), **optimizer_args)
    elif optimizer == "adam":
        return torch.optim.Adam(model.parameters(), **optimizer_args)
    elif optimizer == "yogi":
        return optim.Yogi(model.parameters(), **optimizer_args)
    elif optimizer == "shampoo":
        return optim.Shampoo(model.parameters(), **optimizer_args)
    elif optimizer == "swats":
        return optim.SWATS(model.parameters(), **optimizer_args)
    elif optimizer == "sgdw":
        return optim.SGDW(model.parameters(), **optimizer_args)
    elif optimizer == "sgdp":
        return optim.SGDP(model.parameters(), **optimizer_args)
    elif optimizer == "rangerva":
        return optim.RangerVA(model.parameters(), **optimizer_args)
    elif optimizer == "rangerqh":
        return optim.RangerQH(model.parameters(), **optimizer_args)
    elif optimizer == "ranger":
        return optim.Ranger(model.parameters(), **optimizer_args)
    elif optimizer == "radam":
        return optim.RAdam(model.parameters(), **optimizer_args)
    elif optimizer == "qhm":
        return optim.QHM(model.parameters(), **optimizer_args)
    elif optimizer == "qhadam":
        return optim.QHAdam(model.parameters(), **optimizer_args)
    elif optimizer == "pid":
        return optim.PID(model.parameters(), **optimizer_args)
    elif optimizer == "novograd":
        return optim.NovoGrad(model.parameters(), **optimizer_args)
    elif optimizer == "lamb":
        return optim.Lamb(model.parameters(), **optimizer_args)
    elif optimizer == "diffgrad":
        return optim.DiffGrad(model.parameters(), **optimizer_args)
    elif optimizer == "apollo":
        return optim.Apollo(model.parameters(), **optimizer_args)
    elif optimizer == "aggmo":
        return optim.AggMo(model.parameters(), **optimizer_args)
    elif optimizer == "adamp":
        return optim.AdamP(model.parameters(), **optimizer_args)
    elif optimizer == "adafactor":
        return optim.Adafactor(model.parameters(), **optimizer_args)
    elif optimizer == "adamod":
        return optim.AdaMod(model.parameters(), **optimizer_args)
    elif optimizer == "adabound":
        return optim.AdaBound(model.parameters(), **optimizer_args)
    elif optimizer == "adabelief":
        return optim.AdaBelief(model.parameters(), **optimizer_args)
    elif optimizer == "accsgd":
        return optim.AccSGD(model.parameters(), **optimizer_args)
    elif optimizer == "a2graduni":
        return optim.A2GradUni(model.parameters(), **optimizer_args)
    elif optimizer == "a2gradinc":
        return optim.A2GradInc(model.parameters(), **optimizer_args)
    elif optimizer == "a2gradexp":
        return optim.A2GradExp(model.parameters(), **optimizer_args)
    else:
        raise Exception(f"Optimizer '{optimizer}' does not exist!")
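A short usage sketch for the dispatcher above, assuming the module's `optim` alias refers to the torch_optimizer package; the model and the argument values are illustrative:

import torch.nn as nn

# optimizer_args is unpacked as keyword arguments into the chosen optimizer class.
model = nn.Linear(128, 10)
opt = get_optimizer("radam", model, {"lr": 1e-3, "weight_decay": 1e-2})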
def radam(parameters, lr=1e-3, betas=(0.9, 0.999), eps=1e-3, weight_decay=0):
    if isinstance(betas, str):
        betas = eval(betas)
    return torch_optimizer.RAdam(
        parameters, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay
    )
def return_optim(model, optim_type, lr):
    if optim_type == "adam":
        return optim.Adam(model.parameters(), lr=lr)
    elif optim_type == "radam":
        return toptim.RAdam(model.parameters(), lr=lr)
    elif optim_type == "lamb":
        return Lamb(model.parameters(), lr=lr)
    else:
        raise ValueError("Invalid optimizer type")
def LookaheadRAdam(params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                   weight_decay=0, lalpha=0.5, k=6):
    return Lookahead(
        torch_optimizer.RAdam(params, lr, betas, eps, weight_decay), lalpha, k
    )
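A brief usage sketch for the wrapper above, assuming `Lookahead` is the project's own wrapper class with the positional argument order used in the call above; the model is illustrative:

import torch.nn as nn
import torch_optimizer

# Build a Lookahead-wrapped RAdam with the default hyperparameters of the factory.
model = nn.Linear(64, 2)
opt = LookaheadRAdam(model.parameters(), lr=1e-3, lalpha=0.5, k=6)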
def main(args):
    train_cfg = config_from_json(args.train_cfg)
    model_cfg = config_from_json(args.model_cfg)
    model_cfg.block_size = model_cfg.max_len // model_cfg.n_blocks
    set_seeds(train_cfg.seed)

    if model_cfg.projection not in ["dense", "cnn"]:
        if args.max_len == 0:
            model_cfg.reduced_max_len = model_cfg.max_len
        else:
            model_cfg.reduced_max_len = args.max_len

        if args.reduce_block_size:
            assert model_cfg.reduced_max_len % model_cfg.n_blocks == 0, \
                "Reduced len cannot be divided by n_blocks"
            model_cfg.block_size = model_cfg.reduced_max_len // model_cfg.n_blocks
        else:
            assert model_cfg.reduced_max_len % model_cfg.block_size == 0, \
                "Reduced len cannot be divided by initial block_size"
            model_cfg.n_blocks = model_cfg.reduced_max_len // model_cfg.block_size

        print("max_len:", model_cfg.reduced_max_len,
              "block_size:", model_cfg.block_size,
              "n_blocks:", model_cfg.n_blocks)
    else:
        if args.max_len != 0:
            warnings.warn("Projection is incompatible with a reduced max_len, using default max_len")

    print("Loading dataset")
    (data, labels), criterion = get_data_and_optimizer_from_dataset(
        args.data_file, train_cfg.task)
    loader = GlueDataset(data, labels, train_cfg, model_cfg)
    model = BertInnerForSequenceClassification(model_cfg, loader.get_n_labels(), criterion)

    if train_cfg.optimizer == "lamb":
        if train_cfg.opt_level != "" and train_cfg.opt_level is not None:
            optimizer = apex.optimizers.FusedLAMB(
                model.parameters(), **train_cfg.optimizer_parameters)
        else:
            optimizer = torch_optimizer.Lamb(
                model.parameters(), **train_cfg.optimizer_parameters)
    elif train_cfg.optimizer == "radam":
        optimizer = torch_optimizer.RAdam(
            model.parameters(), **train_cfg.optimizer_parameters)
    elif train_cfg.optimizer == "sgd":
        optimizer = optim.SGD(model.parameters(), **train_cfg.optimizer_parameters)
    else:
        optimizer = optim4GPU(train_cfg, model)

    trainer = GlueTrainer(
        loader, model, optimizer, args.save_dir, get_device(), train_cfg.parallel)

    if args.load_model != "":
        print("Loading checkpoint")
        trainer.load_model(args.load_model, args.load_dataset_state)

    if not args.eval:
        trainer.train(train_cfg)
    else:
        trainer.eval(train_cfg)
def get_optimizer(net_conf, model): if net_conf["optimizer"] == "adam": optimizer = { "generator": optim.Adam(model["G"].parameters(), lr=net_conf["lr"]), "discriminator": optim.Adam(model["D"].parameters(), lr=net_conf["discriminator_lr"]), } elif net_conf["optimizer"] == "radam": optimizer = { "generator": toptim.RAdam(model["G"].parameters(), lr=net_conf["lr"]), "discriminator": toptim.RAdam(model["D"].parameters(), lr=net_conf["discriminator_lr"]), } elif net_conf["optimizer"] == "lamb": optimizer = { "generator": Lamb( model["G"].parameters(), lr=net_conf["lr"], weight_decay=0.01, betas=(0.9, 0.999), adam=False, ), "discriminator": Lamb( model["D"].parameters(), lr=net_conf["lr"], weight_decay=0.01, betas=(0.9, 0.999), adam=False, ), } else: raise ValueError("optimizer must be [adam, radam, lamb]") return optimizer
def get_optimizer(model, config):
    if config.use_adam:
        if config.use_transformer:
            optimizer = optim.Adam(model.parameters(), lr=config.lr, betas=(.9, .98))
        else:  # case of rnn based seq2seq.
            optimizer = optim.Adam(model.parameters(), lr=config.lr)
    elif config.use_radam:
        optimizer = custom_optim.RAdam(model.parameters(), lr=config.lr)
    else:
        optimizer = optim.SGD(model.parameters(), lr=config.lr)

    return optimizer
def set_model(self):
    self.asr_model = Transformer(self.id2ch, self.config['asr_model']).cuda()
    self.asr_opt = optim.RAdam(
        self.asr_model.parameters(), betas=(0.9, 0.98), eps=1e-9)
    # self.asr_opt = TransformerOptimizer(
    #     torch.optim.Adam(self.asr_model.parameters(), betas=(0.9, 0.98), eps=1e-09),
    #     optim.RAdam(self.asr_model.parameters())
    #     self.config['asr_model']['optimizer_opt']['k'],
    #     self.config['asr_model']['encoder']['d_model'],
    #     self.config['asr_model']['optimizer_opt']['warmup_steps']
    # )
    self.label_smoothing = self.config['solver']['label_smoothing']
    self.sos_id = self.asr_model.sos_id
    self.eos_id = self.asr_model.eos_id
    super().load_model()
def make_optimizer(config_dict: Dict[str, Any], model: nn.Module):
    cp: Dict[str, Any] = deepcopy(config_dict)
    n = cp.pop("name").lower()
    optimizer: Optimizer
    if n == "adam":
        optimizer = optim.Adam(model.parameters(), **cp)
    elif n == "radam":
        optimizer = torch_optimizer.RAdam(model.parameters(), **cp)
    elif n == "ranger":
        optimizer = torch_optimizer.Ranger(model.parameters(), **cp)
    elif n == "sgd":
        optimizer = optim.SGD(model.parameters(), **cp)
    else:
        raise ValueError(n)
    return optimizer
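A possible invocation of make_optimizer above; the config values and the toy model are illustrative, and the remaining keys after "name" is popped must be valid keyword arguments for the selected optimizer:

import torch.nn as nn

model = nn.Linear(16, 1)
# "name" selects the branch (case-insensitive); lr and weight_decay are passed through.
opt = make_optimizer({"name": "RAdam", "lr": 1e-3, "weight_decay": 1e-2}, model)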
def configure_optimizers(self):
    optimizer = {
        "sgd": torch.optim.SGD(
            self.parameters(), lr=self.learning_rate, momentum=self.args.momentum),
        "adam": torch.optim.Adam(
            self.parameters(), lr=self.learning_rate, weight_decay=self.args.weight_decay),
        "adamw": torch.optim.AdamW(
            self.parameters(), lr=self.learning_rate, weight_decay=self.args.weight_decay),
        "radam": optim.RAdam(
            self.parameters(), lr=self.learning_rate, weight_decay=self.args.weight_decay),
        "fused_adam": apex.optimizers.FusedAdam(
            self.parameters(), lr=self.learning_rate, weight_decay=self.args.weight_decay),
    }[self.args.optimizer.lower()]

    scheduler = {
        "none": None,
        "multistep": torch.optim.lr_scheduler.MultiStepLR(
            optimizer, self.args.steps, gamma=self.args.factor),
        "cosine": torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, self.args.max_epochs),
        "plateau": torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, factor=self.args.factor, patience=self.args.lr_patience),
    }[self.args.scheduler.lower()]

    opt_dict = {"optimizer": optimizer, "monitor": "val_loss"}
    if scheduler is not None:
        opt_dict.update({"lr_scheduler": scheduler})
    return opt_dict
def radam(parameters, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
    """
    The chosen optimizer - RAdam
    Info: https://pytorch-optimizer.readthedocs.io/en/latest/api.html#radam
    Paper: https://arxiv.org/abs/1908.03265

    Param:
        parameters: the chosen model parameters
        lr (float), betas (tuple), eps (float), weight_decay (float): learning parameters

    Return:
        RAdam optimizer instance with the given parameters
    """
    if isinstance(betas, str):
        betas = eval(betas)
    return optim.RAdam(parameters, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
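A small usage sketch for the factory above, assuming `optim` here refers to the torch_optimizer package; the model is illustrative:

import torch.nn as nn
import torch_optimizer as optim

model = nn.Linear(32, 4)
# betas may be a tuple or, per the isinstance check above, a string such as "(0.9, 0.999)".
opt = radam(model.parameters(), lr=1e-3, betas=(0.9, 0.999), weight_decay=1e-4)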
def configure_optimizers(self): print(f"Initial Learning Rate: {self.hparams.learning_rate:.6f}") # optimizer = optim.Adam(self.parameters(), # lr=self.hparams.learning_rate, # weight_decay=weight_decay) # optimizer = torch.optim.SGD(self.parameters(), # lr=self.hparams.learning_rate, # momentum=0.9, # dampening=0, # weight_decay=weight_decay, # nesterov=False) optimizer = torch_optimizer.RAdam( self.parameters(), lr=self.hparams.learning_rate, betas=(0.9, 0.999), eps=1e-8, weight_decay=weight_decay, ) scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=T_max, eta_min=0, last_epoch=-1) # scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts( # optimizer, # T_0=T_0, # T_mult=1, # eta_min=0, # last_epoch=-1) # scheduler = optim.lr_scheduler.OneCycleLR( # optimizer=optimizer, # pct_start=0.1, # div_factor=1e3, # max_lr=1e-1, # # max_lr=1e-2, # epochs=epochs, # steps_per_epoch=len(self.train_images) // batch_size) return [optimizer], [scheduler]
def get_optimizer(hparams, optimizer_grouped_parameters):
    if hparams.optimizer_type == "ranger":
        optimizer = torch_optimizer.Ranger(
            optimizer_grouped_parameters,
            lr=hparams.learning_rate,
            k=hparams.ranger_k,
            eps=hparams.adam_epsilon,
        )
    elif hparams.optimizer_type == "qhadam":
        optimizer = torch_optimizer.QHAdam(
            optimizer_grouped_parameters,
            lr=hparams.learning_rate,
            nus=(0.1, 1.0),
            betas=(0.9, 0.999),
            eps=hparams.adam_epsilon,
        )
    elif hparams.optimizer_type == "radam":
        optimizer = torch_optimizer.RAdam(
            optimizer_grouped_parameters,
            lr=hparams.learning_rate,
            betas=(0.9, 0.999),
            eps=hparams.adam_epsilon,
        )
    elif hparams.optimizer_type == "adabound":
        optimizer = torch_optimizer.AdaBound(
            optimizer_grouped_parameters,
            lr=hparams.learning_rate,
            betas=(0.9, 0.999),
            final_lr=0.1,
            gamma=1e-3,
            eps=hparams.adam_epsilon,
            amsbound=False,
        )
    else:
        optimizer = torch.optim.AdamW(
            optimizer_grouped_parameters,
            lr=hparams.learning_rate,
            eps=hparams.adam_epsilon,
        )
    return optimizer
def select_optimizer(optimizer, net, learning_rate):
    global adam_beta1, adam_beta2, weight_decay, rmsprop_alpha, momentum, rmsprop_centered, \
        adam_amsgrad, nesterov, dampening

    if optimizer == 'adamax':
        opt = optim.Adamax(
            filter(lambda p: p.requires_grad, net.parameters()),
            lr=learning_rate, betas=(adam_beta1, adam_beta2),
            weight_decay=weight_decay)
    elif optimizer == 'rmsprop':
        opt = optim.RMSprop(
            filter(lambda p: p.requires_grad, net.parameters()),
            lr=learning_rate, alpha=rmsprop_alpha, weight_decay=weight_decay,
            momentum=momentum, centered=rmsprop_centered)
    elif optimizer == 'adam':
        opt = optim.Adam(
            filter(lambda p: p.requires_grad, net.parameters()),
            lr=learning_rate, betas=(adam_beta1, adam_beta2),
            weight_decay=weight_decay, amsgrad=adam_amsgrad, eps=eps)
    elif optimizer == 'radam':
        opt = new_optim.RAdam(
            filter(lambda p: p.requires_grad, net.parameters()),
            lr=learning_rate, betas=(adam_beta1, adam_beta2),
            eps=eps, weight_decay=weight_decay)
    else:  # sgd
        opt = optim.SGD(
            filter(lambda p: p.requires_grad, net.parameters()),
            lr=learning_rate, momentum=momentum, dampening=dampening,
            weight_decay=weight_decay, nesterov=nesterov)
    return opt
def create_optimizer(optimizer_config: Dict[str, Any], model: nn.Module):
    cp: Dict[str, Any] = copy(optimizer_config)
    n = cp.pop("name").lower()
    if n == "adam":
        optimizer: Optimizer = optim.Adam(model.parameters(), **cp)
    elif n == "sgd":
        optimizer = optim.SGD(model.parameters(), **cp)
    elif n == "adabound":
        optimizer = torch_optimizer.AdaBound(model.parameters(), **cp)
    elif n == "diffgrad":
        optimizer = torch_optimizer.DiffGrad(model.parameters(), **cp)
    elif n == "qhadam":
        optimizer = torch_optimizer.QHAdam(model.parameters(), **cp)
    elif n == "radam":
        optimizer = torch_optimizer.RAdam(model.parameters(), **cp)
    elif n == "yogi":
        optimizer = torch_optimizer.Yogi(model.parameters(), **cp)
    else:
        raise ValueError(n)
    return optimizer
def optimizer_chosen(self, model_param):
    try:
        optimizer_dict = {
            'sgd': optim.SGD(params=model_param, lr=self.config.LEARNING_RATE,
                             momentum=self.config.LEARNING_MOMENTUM, nesterov=True),
            'adam': optim.Adam(params=model_param, lr=self.config.LEARNING_RATE),
            'adadelta': optim.Adadelta(params=model_param, lr=self.config.LEARNING_RATE),
            'adagrad': optim.Adagrad(params=model_param, lr=self.config.LEARNING_RATE),
            'adamax': optim.Adamax(params=model_param, lr=self.config.LEARNING_RATE),
            'adamw': optim.AdamW(params=model_param, lr=self.config.LEARNING_RATE),
            'asgd': optim.ASGD(params=model_param, lr=self.config.LEARNING_RATE),
            'rmsprop': optim.RMSprop(params=model_param, lr=self.config.LEARNING_RATE),
            'radam': torch_optimizer.RAdam(params=model_param, lr=self.config.LEARNING_RATE),
            'ranger': torch_optimizer.Ranger(params=model_param, lr=self.config.LEARNING_RATE)
        }[self.config.OPTIMIZER.lower()]
        return optimizer_dict
    except Exception as e:
        message = f"Invalid optimizers {e}"
        raise Exception(message)
def __create_optimizer(self, model):
    opt_parameters = []
    named_parameters = list(model.named_parameters())

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    set_2 = ["layer.4", "layer.5", "layer.6", "layer.7"]
    set_3 = ["layer.8", "layer.9", "layer.10", "layer.11"]
    init_lr = self.config.lr

    for i, (name, params) in enumerate(named_parameters):
        weight_decay = 0.0 if any(p in name for p in no_decay) else 0.01

        if name.startswith("roberta.embeddings") or name.startswith("roberta.encoder"):
            lr = init_lr
            lr = init_lr * 1.75 if any(p in name for p in set_2) else lr
            lr = init_lr * 3.5 if any(p in name for p in set_3) else lr
            opt_parameters.append({
                "params": params,
                "weight_decay": weight_decay,
                "lr": lr
            })

        if name.startswith("classifier"):
            lr = init_lr * 3.6
            opt_parameters.append({
                "params": params,
                "weight_decay": weight_decay,
                "lr": lr
            })

    if self.config.optimizer_type != OptimizerType.RADAM:
        return AdamW(opt_parameters, lr=init_lr)
    return torch_optimizer.RAdam(opt_parameters, lr=init_lr)
def optimizer_chosen(self, model_param):
    try:
        optimizer_dict = {
            'sgd': optim.SGD(params=model_param, lr=self.config.LEARNING_RATE,
                             momentum=0.9, nesterov=True),
            'adam': optim.Adam(params=model_param, lr=self.config.LEARNING_RATE),
            'adadelta': optim.Adadelta(params=model_param, lr=self.config.LEARNING_RATE),
            'adagrad': optim.Adagrad(params=model_param, lr=self.config.LEARNING_RATE),
            'adamax': optim.Adamax(params=model_param, lr=self.config.LEARNING_RATE),
            'adamw': optim.AdamW(params=model_param, lr=self.config.LEARNING_RATE),
            'asgd': optim.ASGD(params=model_param, lr=self.config.LEARNING_RATE),
            'rmsprop': optim.RMSprop(params=model_param, lr=self.config.LEARNING_RATE),
            'radam': torch_optimizer.RAdam(params=model_param, lr=self.config.LEARNING_RATE),
            'ranger': torch_optimizer.Ranger(params=model_param, lr=self.config.LEARNING_RATE)
        }[self.config.OPTIMIZER.lower()]
        return optimizer_dict
    except KeyError:
        print("Invalid optimizers")
def main(opt):
    # Load the data: split into train/validation sets.
    train_data, valid_data = get_train_valid_split_data_names(
        opt.img_folder, opt.ano_folder, valid_size=1 / 8)
    print("load data")
    train_dataset = Phase1Dataset(train_data, load_size=(640, 640), augment=True, limit=opt.limit)
    print("train data length : %d" % (len(train_dataset)))
    valid_dataset = Phase1Dataset(valid_data, load_size=(640, 640), augment=False, limit=opt.limit)
    print("valid data length : %d" % (len(valid_dataset)))

    # Create the DataLoaders.
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=opt.batch_size,
        shuffle=True,
        num_workers=opt.num_workers,
        pin_memory=True,
        drop_last=True
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=opt.num_workers,
        pin_memory=True,
        drop_last=True
    )

    # Set up the device (PyTorch requires it to be specified explicitly).
    device = torch.device('cuda' if opt.gpus > 0 else 'cpu')

    # Create the model.
    heads = {'hm': 1}
    model = get_pose_net(18, heads, 256).to(device)

    # Define the optimization method.
    if opt.optimizer == "SGD":
        optimizer = torch.optim.SGD(model.parameters(), lr=opt.lr)  # , momentum=m, dampening=d, weight_decay=w, nesterov=n)
    elif opt.optimizer == "Adam":
        optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    elif opt.optimizer == "RAdam":
        optimizer = optim.RAdam(model.parameters(), lr=opt.lr)

    # Resume from a checkpoint if one is given (moved after the optimizer
    # definition so that `optimizer` exists when it is passed in).
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model, optimizer)

    # Define the loss function.
    criterion = HMLoss()

    # Define the learning-rate schedule.
    scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=1, eta_min=0.00001)

    best_validation_loss = 1e10

    # Create the folder for saving.
    os.makedirs(os.path.join(opt.save_dir, opt.task, 'visualized'), exist_ok=True)

    # Training. TODO: evaluate on the test data and save the model at the end of every epoch.
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        print("learning rate : %f" % scheduler.get_last_lr()[0])
        train(train_loader, model, optimizer, criterion, device, opt.num_epochs, epoch)
        if opt.optimizer == "SGD":
            scheduler.step()

        # Save the latest model.
        save_model(os.path.join(opt.save_dir, opt.task, 'model_last.pth'),
                   epoch, model, optimizer, scheduler)

        # Evaluate on the test data.
        validation_loss, accumulate_datas = valid(valid_loader, model, criterion, device)

        # Save the model whenever the best score improves.
        if validation_loss < best_validation_loss:
            best_validation_loss = validation_loss
            save_model(os.path.join(opt.save_dir, opt.task, 'model_best.pth'),
                       epoch, model, optimizer, scheduler)
            print("saved best model")
            visualization(os.path.join(opt.save_dir, opt.task, 'visualized'), accumulate_datas)
x, x_noisy, y = datasets['train'][0]
args.input_size = x.size()

dataloaders = OrderedDict({
    'train': DataLoader(datasets['train'], shuffle=True, batch_size=args.batch_size),
    'test': DataLoader(datasets['test'], shuffle=False, batch_size=args.batch_size)
})

model = Model(args).to(args.device)

# https://pypi.org/project/torch-optimizer/#radam
if args.optimizer == 'radam':
    optimizer = optim.RAdam(
        model.parameters(),
        lr=args.learning_rate,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=0,
    )


def dict_list_append(dict, key, value):
    if key not in dict:
        dict[key] = []
    dict[key].append(value)


metrics_best = {
    'best_test_loss': float('Inf'),
    'best_test_loss_dir': -1
}
count_batches = 0