import optax


def sgd_momentum(learning_rate_fn: optax.Schedule,
                 momentum: float = 0.,
                 nesterov: bool = False) -> optax.GradientTransformation:
  return optax.chain(
      optax.trace(decay=momentum, nesterov=nesterov),
      optax.scale_by_schedule(learning_rate_fn),
      optax.scale(-1.))
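# Illustrative usage sketch for sgd_momentum (not from the source): the cosine
# schedule, toy loss, and parameter values below are assumptions chosen only to
# show how the transformation is driven through init/update/apply_updates.
import jax
import jax.numpy as jnp

schedule = optax.cosine_decay_schedule(init_value=0.1, decay_steps=100)
opt = sgd_momentum(schedule, momentum=0.9, nesterov=True)

params = jnp.array([1.0, -2.0])
opt_state = opt.init(params)

grads = jax.grad(lambda p: jnp.sum(p ** 2))(params)
updates, opt_state = opt.update(grads, opt_state)
params = optax.apply_updates(params, updates)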
def make_optimizer():
  """SGD with Nesterov momentum and a custom LR schedule."""
  return optax.chain(
      optax.trace(
          decay=FLAGS.optimizer_momentum,
          nesterov=FLAGS.optimizer_use_nesterov),
      optax.scale_by_schedule(lr_schedule),
      optax.scale(-1))
def update(
    self,
    gradient: Weights,
    state: GenericGradientState,
    parameters: Optional[Weights]
) -> Tuple[Weights, GenericGradientState]:
  return GenericGradientState.wrap(
      *trace(self.decay, self.nesterov, self.accumulator_dtype).update(
          gradient, state.data, parameters))
from typing import Any, Optional

import optax


def polyak_hb(
    decay: float = 0.9,
    accumulator_dtype: Optional[Any] = None,
) -> optax.GradientTransformation:
  """Polyak heavy-ball momentum, i.e. classical momentum without Nesterov."""
  return optax.trace(
      decay=decay, nesterov=False, accumulator_dtype=accumulator_dtype)
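# Sanity-check sketch (an assumption, not from the source): polyak_hb composed
# with a fixed negative step should produce the same updates as optax.sgd with
# classical momentum, since optax.sgd is itself a trace + scale(-lr) chain.
import jax.numpy as jnp

hb = optax.chain(polyak_hb(decay=0.9), optax.scale(-0.01))
reference = optax.sgd(learning_rate=0.01, momentum=0.9, nesterov=False)

params = jnp.ones(3)
grads = jnp.full(3, 0.5)
u_hb, _ = hb.update(grads, hb.init(params))
u_ref, _ = reference.update(grads, reference.init(params))
assert jnp.allclose(u_hb, u_ref)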
import optax


def make_optimizer(momentum=True, schedule_fn=lambda x: -1e-3):
  """SGD with momentum and a fixed lr."""
  # The schedule already returns a negative value, so the descent sign is
  # baked in and no trailing optax.scale(-1) is needed.
  if momentum:
    return optax.chain(
        optax.trace(decay=0.9, nesterov=False),  # momentum
        optax.scale_by_schedule(schedule_fn))
  else:
    return optax.chain(optax.scale_by_schedule(schedule_fn))
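# Sign-convention check (illustrative, not from the source): because
# schedule_fn returns a negative value, apply_updates descends the loss. The
# toy quadratic below is an assumption.
import jax
import jax.numpy as jnp

opt = make_optimizer(momentum=True)
params = jnp.array(3.0)
state = opt.init(params)

grad = jax.grad(lambda p: 0.5 * p ** 2)(params)  # grad = 3.0
updates, state = opt.update(grad, state)
params = optax.apply_updates(params, updates)  # steps toward the minimum at 0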
def test_regularized_training(self):
  """Test that adding a regularization penalty to the training loss works."""
  np.random.seed(0)
  # Set up the problem of recovering w given x and
  #   y = x . w + noise
  # with the a priori assumption that w is sparse. There are fewer examples
  # than dimensions (x is a wide matrix), so the problem is underdetermined
  # without the sparsity assumption.
  num_examples, num_dim = 8, 10
  x = np.random.randn(num_examples, num_dim).astype(np.float32)
  true_w = np.zeros((num_dim, 2), np.float32)
  true_w[[2, 4, 6], 0] = [1.0, 2.0, 3.0]
  true_w[[3, 5], 1] = [4.0, 5.0]
  y = np.dot(x, true_w) + 1e-3 * np.random.randn(num_examples, 2)

  # Get the least squares estimate for w. It isn't very accurate.
  least_squares_w = np.linalg.lstsq(x, y, rcond=None)[0]
  least_squares_w_error = hk_util.l2_loss(least_squares_w - true_w)

  # Get a better estimate by solving the L1 regularized problem
  #   argmin_w ||x . w - y||_2^2 + c ||w||_1.
  w_regularizer = lambda w: 4.0 * hk_util.l1_loss(w)

  def model_fun(batch):
    x = batch['x']
    return hk_util.Linear(2, use_bias=False, w_regularizer=w_regularizer)(x)

  model = hk_util.transform(model_fun)

  def loss_fun(params, batch):
    """Training loss with L1 regularization penalty term."""
    y_predicted, penalties = model.apply(params, None, batch)
    return hk_util.l2_loss(y_predicted - batch['y']) + penalties

  batch = {'x': x, 'y': y}
  params = model.init(jax.random.PRNGKey(0), batch)
  optimizer = optax.chain(  # Gradient descent with a decreasing learning rate.
      optax.trace(decay=0.0, nesterov=False),
      optax.scale_by_schedule(lambda i: -0.05 / jnp.sqrt(1 + i)))
  opt_state = optimizer.init(params)

  @jax.jit
  def train_step(params, opt_state, batch):
    grads = jax.grad(loss_fun)(params, batch)
    updates, opt_state = optimizer.update(grads, opt_state)
    new_params = optax.apply_updates(params, updates)
    return new_params, opt_state

  for _ in range(1000):
    params, opt_state = train_step(params, opt_state, batch)

  l1_w = params['linear']['w']
  l1_w_error = hk_util.l2_loss(l1_w - true_w).item()

  # The L1-regularized estimate is much more accurate.
  self.assertGreater(least_squares_w_error, 4.0)
  self.assertLess(l1_w_error, 1.0)
def get(self) -> optax.GradientTransformation:
  if "adam" in self.optimizer:
    opt = optax.adam(self.base_learning_rate)
  elif self.optimizer == "sgd" and self.lr_schedule == "linear":
    lr_schedule = warm_up_polynomial_schedule(
        base_learning_rate=self.base_learning_rate,
        end_learning_rate=self.final_decay_factor * self.base_learning_rate,
        decay_steps=(self.n_batches * (self.epochs - self.lr_warmup_epochs)),
        warmup_steps=self.n_batches * self.lr_warmup_epochs,
        decay_power=1.0,
    )
    momentum = 1 - self.one_minus_momentum
    opt = optax.chain(
        optax.trace(decay=momentum, nesterov=True),
        optax.scale_by_schedule(lr_schedule),
        optax.scale(-1),
    )
  elif "sgd" in self.optimizer and self.lr_schedule == "step":
    lr_decay_epochs = [
        (int(start_epoch_str) * self.epochs) // DEFAULT_NUM_EPOCHS
        for start_epoch_str in self.lr_decay_epochs
    ]
    lr_schedule = warm_up_piecewise_constant_schedule(
        steps_per_epoch=self.n_batches,
        base_learning_rate=self.base_learning_rate,
        decay_ratio=self.lr_decay_ratio,
        decay_epochs=lr_decay_epochs,
        warmup_epochs=self.lr_warmup_epochs,
    )
    momentum = 1 - self.one_minus_momentum
    opt = optax.chain(
        optax.trace(decay=momentum, nesterov=True),
        optax.scale_by_schedule(lr_schedule),
        optax.scale(-1),
    )
  else:
    raise ValueError("Unsupported optimizer/lr_schedule combination.")
  return opt
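# Hypothetical stand-in (an assumption, not from the source):
# warm_up_piecewise_constant_schedule and warm_up_polynomial_schedule are
# project-specific helpers; a rough equivalent of the warmup + step-decay
# behaviour can be sketched from optax primitives alone.
import optax


def warmup_then_piecewise(base_lr, warmup_steps, boundaries_and_scales):
  warmup = optax.linear_schedule(0.0, base_lr, warmup_steps)
  # join_schedules passes (step - warmup_steps) to the second schedule, so the
  # decay boundaries here are counted from the end of warmup.
  decay = optax.piecewise_constant_schedule(base_lr, boundaries_and_scales)
  return optax.join_schedules([warmup, decay], [warmup_steps])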
def _create_jax_optimizer(self):
  import optax
  process = []
  if isinstance(self.learning_rate, LearningRateSchedule):
    scheduler = self.learning_rate._create_jax_schedule()
    process.append(optax.scale_by_schedule(scheduler))
    last_process = optax.scale(-1.0)
  else:
    lr = self.learning_rate
    last_process = optax.scale(-1.0 * lr)
  process.append(
      optax.scale_by_rms(decay=self.decay, eps=self.epsilon, initial_scale=0.0))
  # Apply classical momentum only when it is set and non-zero.
  if self.momentum is not None and self.momentum != 0.0:
    process.append(optax.trace(decay=self.momentum, nesterov=False))
  process.append(last_process)
  return optax.chain(*process)
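# Comparison sketch (an assumption, not from the source): with a constant float
# learning rate, the rms + trace + scale chain built above computes the same
# updates as optax's built-in rmsprop alias, because a constant scale commutes
# with the linear trace accumulator. The hyperparameter values are placeholders.
import optax

decay, eps, momentum, lr = 0.9, 1e-7, 0.9, 1e-3
manual = optax.chain(
    optax.scale_by_rms(decay=decay, eps=eps, initial_scale=0.0),
    optax.trace(decay=momentum, nesterov=False),
    optax.scale(-lr))
builtin = optax.rmsprop(lr, decay=decay, eps=eps, momentum=momentum)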
def make_optimizer(lr_schedule, momentum_decay):
  return optax.chain(
      optax.trace(decay=momentum_decay, nesterov=False),
      optax.scale_by_schedule(lr_schedule),
      optax.scale(-1))
def make_sgd_optimizer(lr_schedule, momentum_decay):
  """Make SGD optimizer with momentum."""
  # Maximize log-prob instead of minimizing loss: there is deliberately no
  # trailing optax.scale(-1), so a positive schedule ascends the objective.
  return optax.chain(
      optax.trace(decay=momentum_decay, nesterov=False),
      optax.scale_by_schedule(lr_schedule))
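# Direction check (illustrative, not from the source): with a positive schedule
# this performs gradient ascent; the stand-in log-probability below is an
# assumption, maximized at zero.
import jax
import jax.numpy as jnp
import optax

opt = make_sgd_optimizer(lambda step: 1e-2, momentum_decay=0.9)
log_prob = lambda p: -0.5 * jnp.sum(p ** 2)

params = jnp.array([2.0, -1.0])
state = opt.init(params)
grads = jax.grad(log_prob)(params)
updates, state = opt.update(grads, state)
params = optax.apply_updates(params, updates)  # moves toward the mode at 0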
def init(self, parameters: Weights) -> GenericGradientState:
  return GenericGradientState(
      trace(self.decay, self.nesterov, self.accumulator_dtype).init(parameters))