Example #1
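The snippets below appear to target Graphcore's PopTorch rather than stock PyTorch: SGD, AdamW and RMSprop accept IPU-specific arguments such as loss_scaling, velocity_scaling and use_combined_accum. A plausible set of shared imports, an assumption since the originals omit theirs:

import torch
import popart
import poptorch
from poptorch.optim import SGD, Adam, AdamW, RMSprop
from torch.optim.lr_scheduler import MultiStepLR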
def get_optimizer(args, model):
    regularized_params = []
    non_regularized_params = []

    # Exclude one-dimensional parameters (biases, norm scales/offsets) from weight decay.
    for param in model.parameters():
        if param.requires_grad:
            if len(param.shape) == 1:
                non_regularized_params.append(param)
            else:
                regularized_params.append(param)

    params = [{
        'params': regularized_params,
        'weight_decay': args.weight_decay
    }, {
        'params': non_regularized_params,
        'weight_decay': 0
    }]

    if args.optimizer == 'sgd':
        optimizer = SGD(params,
                        lr=args.lr,
                        momentum=args.momentum,
                        loss_scaling=args.initial_loss_scaling,
                        use_combined_accum=False)
    elif args.optimizer == 'sgd_combined':
        optimizer = SGD(params,
                        lr=args.lr,
                        momentum=args.momentum,
                        loss_scaling=args.initial_loss_scaling,
                        velocity_scaling=args.initial_loss_scaling,
                        use_combined_accum=True)
    elif args.optimizer == 'adamw':
        optimizer = AdamW(params,
                          lr=args.lr,
                          loss_scaling=args.initial_loss_scaling,
                          eps=args.optimizer_eps)
    elif args.optimizer == 'rmsprop':
        optimizer = RMSprop(params,
                            lr=args.lr,
                            alpha=args.rmsprop_decay,
                            momentum=args.momentum,
                            loss_scaling=args.initial_loss_scaling,
                            eps=args.optimizer_eps)
    elif args.optimizer == 'rmsprop_tf':
        optimizer = RMSprop(params,
                            lr=args.lr,
                            alpha=args.rmsprop_decay,
                            momentum=args.momentum,
                            loss_scaling=args.initial_loss_scaling,
                            eps=args.optimizer_eps,
                            use_tf_variant=True)
    else:
        raise ValueError(f"Unknown optimizer: {args.optimizer}")
    return optimizer
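As a sanity check, one plausible way to call get_optimizer from example #1; the Namespace fields are hypothetical stand-ins for the command-line flags the surrounding training script would define:

from argparse import Namespace

args = Namespace(optimizer='sgd', lr=0.1, momentum=0.9, weight_decay=1e-4,
                 initial_loss_scaling=128, optimizer_eps=1e-8,
                 rmsprop_decay=0.99)
optimizer = get_optimizer(args, torch.nn.Linear(16, 4))  # poptorch.optim.SGD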
Example #2
def get_optimizer(config, model):
    # Do not apply weight_decay for one-dimensional parameters
    regularized_params = []
    non_regularized_params = []
    for param in model.parameters():
        if param.requires_grad:
            if len(param.shape) == 1:
                non_regularized_params.append(param)
            else:
                regularized_params.append(param)

    params = [{
        "params": regularized_params,
        "weight_decay": config.weight_decay
    }, {
        "params": non_regularized_params,
        "weight_decay": 0
    }]

    # Keep the SGD velocity (first-order momentum) in half precision if requested.
    first_order_type = float16 if config.enable_half_first_order_momentum else float32

    if config.optimizer == "SGD":
        optimizer = SGD(params,
                        lr=config.learning_rate,
                        momentum=config.momentum,
                        loss_scaling=config.loss_scaling,
                        accum_type=float16,
                        velocity_accum_type=first_order_type,
                        use_combined_accum=False)
    else:
        raise ValueError(f"Unknown optimizer: {config.optimizer}")
    return optimizer
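The bare float16/float32 names in example #2 are presumably the torch dtypes; accum_type=float16 keeps gradient accumulation in half precision, while velocity_accum_type sets the precision of the velocity state. A sketch of the assumed imports plus a hypothetical config object with the fields the function reads:

from types import SimpleNamespace
from torch import float16, float32

config = SimpleNamespace(optimizer="SGD", learning_rate=0.1, momentum=0.9,
                         weight_decay=1e-4, loss_scaling=1024,
                         enable_half_first_order_momentum=True)
optimizer = get_optimizer(config, torch.nn.Linear(8, 2))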
Example #3
def run_model(opts):
    input_data = torch.ones(4, 1)
    labels_data = torch.ones(4).long()
    model = torch.nn.Linear(1, 2, bias=False)
    model_with_loss = TrainingModelWithLoss(model, 0.1)
    optimizer = SGD(model_with_loss.parameters(), lr=0.1, momentum=0., use_combined_accum=True)
    training_model = poptorch.trainingModel(model_with_loss, opts, optimizer=optimizer)
    for _ in range(3):
        preds, loss, _ = training_model(input_data, labels_data)
    # Return the first weight tensor of the wrapped model and the final loss.
    return list(model_with_loss.model.named_parameters())[0][1], loss
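TrainingModelWithLoss is not defined in any of these snippets. A minimal sketch of the wrapper they assume, following the usual PopTorch pattern of computing the loss inside the module so it runs on the IPU; the (preds, loss, accuracy) return shape and the label_smoothing argument are guesses based on how examples #3 and #5 construct and unpack it:

class TrainingModelWithLoss(torch.nn.Module):
    def __init__(self, model, label_smoothing=0.0):
        super().__init__()
        self.model = model
        self.loss = torch.nn.CrossEntropyLoss(label_smoothing=label_smoothing)

    def forward(self, inputs, labels):
        preds = self.model(inputs)
        loss = self.loss(preds, labels)
        # Third output mirrors the `preds, loss, _` unpacking used above.
        accuracy = (preds.argmax(dim=-1) == labels).float().mean()
        return preds, loss, accuracy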
Example #4
def get_optimizer(opts, model):
    if opts.optimizer == 'sgd':
        optimizer = SGD(model.parameters(),
                        lr=opts.lr,
                        momentum=opts.momentum,
                        loss_scaling=opts.loss_scaling,
                        velocity_scaling=opts.loss_scaling)
    else:
        optimizer = Adam(model.parameters(), lr=opts.lr)

    lr_scheduler = None
    if opts.lr_schedule == "step":
        if opts.warmup_epoch > 0:
            lr_scheduler = WarmupMultiStepLR(optimizer=optimizer,
                                             milestones=opts.lr_epoch_decay,
                                             lr=opts.lr,
                                             warmup_epoch=opts.warmup_epoch,
                                             gamma=opts.lr_decay)
        else:
            lr_scheduler = MultiStepLR(optimizer=optimizer,
                                       milestones=opts.lr_epoch_decay,
                                       gamma=opts.lr_decay)
    return optimizer, lr_scheduler
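WarmupMultiStepLR is also undefined here. A hypothetical reconstruction matching the call in example #4: linear warmup for warmup_epoch scheduler steps, then MultiStepLR-style decay by gamma at each milestone. The lr argument is accepted only for signature compatibility; the base rates actually come from the optimizer's param groups, and _LRScheduler is the semi-private base class torch exposes for custom schedulers:

from bisect import bisect_right
from torch.optim.lr_scheduler import _LRScheduler

class WarmupMultiStepLR(_LRScheduler):
    def __init__(self, optimizer, milestones, lr, warmup_epoch, gamma=0.1,
                 last_epoch=-1):
        self.milestones = sorted(milestones)
        self.warmup_epoch = warmup_epoch
        self.gamma = gamma
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        if self.last_epoch < self.warmup_epoch:
            # Linear ramp towards the base learning rate.
            scale = (self.last_epoch + 1) / self.warmup_epoch
        else:
            scale = self.gamma ** bisect_right(self.milestones, self.last_epoch)
        return [base_lr * scale for base_lr in self.base_lrs]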
Example #5
def train(model, recompute):
    input_data = torch.ones(1, 3, 224, 224)
    labels_data = torch.ones(1).long()
    opts = poptorch.Options()
    if recompute:
        opts._Popart.set("autoRecomputation", int(popart.RecomputationType.Standard))
    opts.outputMode(poptorch.OutputMode.All)
    opts.randomSeed(0)
    opts.Training.gradientAccumulation(1)
    opts.Precision.enableStochasticRounding(False)
    model_with_loss = TrainingModelWithLoss(model)
    optimizer = SGD(model_with_loss.parameters(), lr=0.01, momentum=0., use_combined_accum=True)
    training_model = poptorch.trainingModel(model_with_loss, opts, optimizer=optimizer)
    predictions = []
    for _ in range(3):
        preds, _, _ = training_model(input_data, labels_data)
        predictions.append(preds)
    training_model.destroy()
    return predictions
Example #6
def train(model, recompute):
    input_data = torch.ones(1, 3, 224, 224)
    labels_data = torch.ones(1).long()
    model_opts = poptorch.Options()
    if recompute:
        model_opts._Popart.set("autoRecomputation",
                               int(popart.RecomputationType.Standard))
    # anchorMode/AnchorMode is the older spelling of the outputMode/OutputMode
    # API used in example #5.
    model_opts.anchorMode(poptorch.AnchorMode.All)
    model_opts.randomSeed(0)
    model_opts.Training.gradientAccumulation(1)
    model_with_loss = TrainingModelWithLoss(model)
    optimizer = SGD(model_with_loss.parameters(), lr=0.01, momentum=0.)
    training_model = poptorch.trainingModel(model_with_loss,
                                            model_opts,
                                            optimizer=optimizer)
    predictions = []
    for _ in range(3):
        preds, loss = training_model(input_data, labels_data)
        predictions.append(preds)
    return predictions
Example #7
def get_optimizer(opts, model):
    # Exclude one-dimensional parameters (biases, norm scales) from weight decay.
    regularized_params = []
    non_regularized_params = []
    for param in model.parameters():
        if param.requires_grad:
            if len(param.shape) == 1:
                non_regularized_params.append(param)
            else:
                regularized_params.append(param)

    params = [{
        'params': regularized_params,
        'weight_decay': opts.weight_decay
    }, {
        'params': non_regularized_params,
        'weight_decay': 0
    }]

    if opts.optimizer == 'sgd':
        optimizer = SGD(params,
                        lr=opts.lr,
                        momentum=opts.momentum,
                        loss_scaling=opts.initial_loss_scaling,
                        velocity_scaling=(opts.initial_loss_scaling /
                                          opts.loss_velocity_scaling_ratio))
    elif opts.optimizer == 'adamw':
        optimizer = AdamW(params,
                          lr=opts.lr,
                          loss_scaling=opts.initial_loss_scaling)
    elif opts.optimizer == 'rmsprop':
        optimizer = RMSprop(params,
                            lr=opts.lr,
                            alpha=opts.rmsprop_decay,
                            momentum=opts.momentum,
                            loss_scaling=opts.initial_loss_scaling)
    else:
        raise ValueError(f"Unknown optimizer: {opts.optimizer}")

    # When running with PopDist, broadcast the initial parameters from rank 0
    # so every replica starts from identical weights.
    if opts.use_popdist:
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    return optimizer
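hvd in example #7 is presumably horovod.torch, which Graphcore's PopDist integrates with; the broadcast only works after the library has been initialised. A sketch of the setup the snippet assumes:

import horovod.torch as hvd

if opts.use_popdist:
    hvd.init()  # must run before any broadcast
optimizer = get_optimizer(opts, model)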
Example #8
def get_optimizer(opts, model):
    regularized_params = []
    non_regularized_params = []
    for param in model.parameters():
        if param.requires_grad:
            if len(param.shape) == 1:
                non_regularized_params.append(param)
            else:
                regularized_params.append(param)

    params = [{
        'params': regularized_params,
        'weight_decay': opts.weight_decay
    }, {
        'params': non_regularized_params,
        'weight_decay': 0
    }]

    if opts.optimizer == 'sgd':
        optimizer = SGD(params,
                        lr=opts.lr,
                        momentum=opts.momentum,
                        weight_decay=0,
                        loss_scaling=opts.loss_scaling,
                        velocity_scaling=opts.loss_scaling)
    elif opts.optimizer == 'adamw':
        optimizer = AdamW(params,
                          lr=opts.lr,
                          weight_decay=0,
                          loss_scaling=opts.loss_scaling)
    elif opts.optimizer == 'rmsprop':
        optimizer = RMSprop(params,
                            lr=opts.lr,
                            alpha=opts.rmsprop_decay,
                            momentum=opts.momentum,
                            weight_decay=0,
                            loss_scaling=opts.loss_scaling)
    else:
        raise ValueError(f"Unknown optimizer: {opts.optimizer}")

    return optimizer
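PopTorch captures the optimizer when the model is compiled, so hyperparameter changes (for instance from the schedulers in example #4) must be pushed to the device explicitly via setOptimizer. A minimal training-loop sketch; num_epochs and loader are placeholders:

training_model = poptorch.trainingModel(model, poptorch.Options(),
                                        optimizer=optimizer)
for epoch in range(num_epochs):
    for data, labels in loader:
        training_model(data, labels)
    lr_scheduler.step()
    # Propagate the updated learning rate to the compiled executable.
    training_model.setOptimizer(optimizer)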