def get_optimizer(args, model):
    regularized_params = []
    non_regularized_params = []

    # Filter biases and norm parameters.
    for param in model.parameters():
        if param.requires_grad:
            if len(param.shape) == 1:
                non_regularized_params.append(param)
            else:
                regularized_params.append(param)

    params = [{
        'params': regularized_params,
        'weight_decay': args.weight_decay
    }, {
        'params': non_regularized_params,
        'weight_decay': 0
    }]

    optimizer = None
    if args.optimizer == 'sgd':
        optimizer = SGD(params,
                        lr=args.lr,
                        momentum=args.momentum,
                        loss_scaling=args.initial_loss_scaling,
                        use_combined_accum=False)
    elif args.optimizer == 'sgd_combined':
        optimizer = SGD(params,
                        lr=args.lr,
                        momentum=args.momentum,
                        loss_scaling=args.initial_loss_scaling,
                        velocity_scaling=args.initial_loss_scaling,
                        use_combined_accum=True)
    elif args.optimizer == 'adamw':
        optimizer = AdamW(params,
                          lr=args.lr,
                          loss_scaling=args.initial_loss_scaling,
                          eps=args.optimizer_eps)
    elif args.optimizer == 'rmsprop':
        optimizer = RMSprop(params,
                            lr=args.lr,
                            alpha=args.rmsprop_decay,
                            momentum=args.momentum,
                            loss_scaling=args.initial_loss_scaling,
                            eps=args.optimizer_eps)
    elif args.optimizer == 'rmsprop_tf':
        optimizer = RMSprop(params,
                            lr=args.lr,
                            alpha=args.rmsprop_decay,
                            momentum=args.momentum,
                            loss_scaling=args.initial_loss_scaling,
                            eps=args.optimizer_eps,
                            use_tf_variant=True)

    return optimizer
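# Hedged usage sketch (not from the source): shows how a helper like get_optimizer
# above might be driven. SGD/AdamW/RMSprop are assumed to come from poptorch.optim,
# and the flag names in `args` are hypothetical stand-ins for whatever the real
# training script defines via argparse.
from types import SimpleNamespace

import torch
from poptorch.optim import SGD, AdamW, RMSprop

args = SimpleNamespace(optimizer='sgd', lr=0.1, momentum=0.9, weight_decay=1e-4,
                       initial_loss_scaling=128.0, optimizer_eps=1e-8,
                       rmsprop_decay=0.9)
model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.BatchNorm2d(8))
optimizer = get_optimizer(args, model)
# The resulting optimizer would then be handed to poptorch.trainingModel(...) together
# with a loss-wrapped version of the model.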
def get_optimizer(config, model):
    # Do not apply weight_decay to one-dimensional parameters (biases, norm scales).
    regularized_params = []
    non_regularized_params = []
    for param in model.parameters():
        if param.requires_grad:
            if len(param.shape) == 1:
                non_regularized_params.append(param)
            else:
                regularized_params.append(param)

    params = [{
        "params": regularized_params,
        "weight_decay": config.weight_decay
    }, {
        "params": non_regularized_params,
        "weight_decay": 0
    }]

    first_order_type = float16 if config.enable_half_first_order_momentum else float32

    if config.optimizer == "SGD":
        optimizer = SGD(params,
                        lr=config.learning_rate,
                        momentum=config.momentum,
                        weight_decay=config.weight_decay,
                        loss_scaling=config.loss_scaling,
                        accum_type=float16,
                        velocity_accum_type=first_order_type,
                        use_combined_accum=False)
    else:
        raise ValueError(f"Unknown optimizer: {config.optimizer}")
    return optimizer
def run_model(opts):
    input_data = torch.ones(4, 1)
    labels_data = torch.ones(4).long()
    model = torch.nn.Linear(1, 2, bias=False)
    model_with_loss = TrainingModelWithLoss(model, 0.1)
    optimizer = SGD(model_with_loss.parameters(),
                    lr=0.1,
                    momentum=0.,
                    use_combined_accum=True)
    training_model = poptorch.trainingModel(model_with_loss,
                                            opts,
                                            optimizer=optimizer)
    for _ in range(3):
        preds, loss, _ = training_model(input_data, labels_data)
    # Return the model's weight tensor and the loss from the final step.
    return list(model_with_loss.model.named_parameters())[0][1], loss
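# Hedged sketch (assumption): the TrainingModelWithLoss wrapper used by the tests in
# this excerpt is not defined here. A minimal PopTorch-style version pairs the model
# with its loss so both compile into one graph; the second constructor argument is
# assumed to be a label-smoothing factor, and the three-value return layout is inferred
# from how run_model unpacks its outputs (the second train() variant further below
# unpacks only two values, so the real wrapper appears to have changed across versions).
import torch
import torch.nn as nn

class TrainingModelWithLoss(nn.Module):
    def __init__(self, model, label_smoothing=0.0):
        super().__init__()
        self.model = model
        self.loss = nn.CrossEntropyLoss(label_smoothing=label_smoothing)

    def forward(self, data, labels):
        preds = self.model(data)
        loss = self.loss(preds, labels)
        # Return logits, the loss, and the predicted classes so callers can
        # unpack three values (run_model ignores the third).
        return preds, loss, torch.argmax(preds, dim=1)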
def get_optimizer(opts, model):
    if opts.optimizer == 'sgd':
        optimizer = SGD(model.parameters(),
                        lr=opts.lr,
                        momentum=opts.momentum,
                        loss_scaling=opts.loss_scaling,
                        velocity_scaling=opts.loss_scaling)
    else:
        optimizer = Adam(model.parameters(), lr=opts.lr)

    lr_scheduler = None
    if opts.lr_schedule == "step":
        if opts.warmup_epoch > 0:
            lr_scheduler = WarmupMultiStepLR(optimizer=optimizer,
                                             milestones=opts.lr_epoch_decay,
                                             lr=opts.lr,
                                             warmup_epoch=opts.warmup_epoch,
                                             gamma=opts.lr_decay)
        else:
            lr_scheduler = MultiStepLR(optimizer=optimizer,
                                       milestones=opts.lr_epoch_decay,
                                       gamma=opts.lr_decay)
    return optimizer, lr_scheduler
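# Hedged sketch (assumption): WarmupMultiStepLR above is a project-specific scheduler
# that is not defined in this excerpt. The behaviour its arguments suggest -- a linear
# warmup for `warmup_epoch` epochs followed by the usual MultiStepLR decay at the given
# milestones -- can be approximated with a stock LambdaLR as below; the real class's
# signature and details may differ.
from torch.optim.lr_scheduler import LambdaLR

def make_warmup_multistep(optimizer, milestones, warmup_epoch, gamma):
    def lr_factor(epoch):
        if warmup_epoch > 0 and epoch < warmup_epoch:
            # Linear ramp from a small fraction up to the base learning rate.
            return (epoch + 1) / warmup_epoch
        # After warmup, multiply by gamma once per milestone already passed.
        passed = sum(1 for m in milestones if epoch >= m)
        return gamma ** passed
    return LambdaLR(optimizer, lr_lambda=lr_factor)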
def train(model, recompute):
    input_data = torch.ones(1, 3, 224, 224)
    labels_data = torch.ones(1).long()
    opts = poptorch.Options()
    if recompute:
        opts._Popart.set("autoRecomputation",
                         int(popart.RecomputationType.Standard))
    opts.outputMode(poptorch.OutputMode.All)
    opts.randomSeed(0)
    opts.Training.gradientAccumulation(1)
    opts.Precision.enableStochasticRounding(False)
    model_with_loss = TrainingModelWithLoss(model)
    optimizer = SGD(model_with_loss.parameters(),
                    lr=0.01,
                    momentum=0.,
                    use_combined_accum=True)
    training_model = poptorch.trainingModel(model_with_loss,
                                            opts,
                                            optimizer=optimizer)
    predictions = []
    for _ in range(3):
        preds, _, _ = training_model(input_data, labels_data)
        predictions.append(preds)
    training_model.destroy()
    return predictions
def train(model, recompute):
    input_data = torch.ones(1, 3, 224, 224)
    labels_data = torch.ones(1).long()
    model_opts = poptorch.Options()
    if recompute:
        model_opts.Popart.set("autoRecomputation",
                              int(popart.RecomputationType.Standard))
    model_opts.anchorMode(poptorch.AnchorMode.All)
    model_opts.randomSeed(0)
    model_opts.Training.gradientAccumulation(1)
    model_with_loss = TrainingModelWithLoss(model)
    optimizer = SGD(model_with_loss.parameters(), lr=0.01, momentum=0.)
    training_model = poptorch.trainingModel(model_with_loss,
                                            model_opts,
                                            optimizer=optimizer)
    predictions = []
    for _ in range(3):
        preds, loss = training_model(input_data, labels_data)
        predictions.append(preds)
    return predictions
def get_optimizer(opts, model):
    regularized_params = []
    non_regularized_params = []
    for param in model.parameters():
        if param.requires_grad:
            if len(param.shape) == 1:
                non_regularized_params.append(param)
            else:
                regularized_params.append(param)

    params = [{
        'params': regularized_params,
        'weight_decay': opts.weight_decay
    }, {
        'params': non_regularized_params,
        'weight_decay': 0
    }]

    if opts.optimizer == 'sgd':
        optimizer = SGD(params,
                        lr=opts.lr,
                        momentum=opts.momentum,
                        loss_scaling=opts.initial_loss_scaling,
                        velocity_scaling=opts.initial_loss_scaling / opts.loss_velocity_scaling_ratio)
    elif opts.optimizer == 'adamw':
        optimizer = AdamW(params,
                          lr=opts.lr,
                          loss_scaling=opts.initial_loss_scaling)
    elif opts.optimizer == 'rmsprop':
        optimizer = RMSprop(params,
                            lr=opts.lr,
                            alpha=opts.rmsprop_decay,
                            momentum=opts.momentum,
                            loss_scaling=opts.initial_loss_scaling)

    # When running distributed with popdist, broadcast the initial parameters
    # from the root rank so every instance starts from identical weights.
    if opts.use_popdist:
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    return optimizer
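# Hedged note (assumption): `hvd` above is taken to be Horovod's PyTorch binding, which
# must be initialised once per process before broadcast_parameters can be called; the
# surrounding popdist setup is project-specific and not shown in this excerpt.
import horovod.torch as hvd

hvd.init()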
def get_optimizer(opts, model):
    regularized_params = []
    non_regularized_params = []
    for param in model.parameters():
        if param.requires_grad:
            if len(param.shape) == 1:
                non_regularized_params.append(param)
            else:
                regularized_params.append(param)

    params = [{
        'params': regularized_params,
        'weight_decay': opts.weight_decay
    }, {
        'params': non_regularized_params,
        'weight_decay': 0
    }]

    if opts.optimizer == 'sgd':
        optimizer = SGD(params,
                        lr=opts.lr,
                        momentum=opts.momentum,
                        weight_decay=0,
                        loss_scaling=opts.loss_scaling,
                        velocity_scaling=opts.loss_scaling)
    elif opts.optimizer == 'adamw':
        optimizer = AdamW(params,
                          lr=opts.lr,
                          weight_decay=0,
                          loss_scaling=opts.loss_scaling)
    elif opts.optimizer == 'rmsprop':
        optimizer = RMSprop(params,
                            lr=opts.lr,
                            alpha=opts.rmsprop_decay,
                            momentum=opts.momentum,
                            weight_decay=0,
                            loss_scaling=opts.loss_scaling)

    return optimizer