def test_exp():
    from eight_mile.tf import optz

    tf.compat.v1.reset_default_graph()
    sess = tf.compat.v1.Session()
    lr_sched = create_lr_scheduler(**EXP_LR_CONFIG)
    bl_exp = ExponentialDecayScheduler(**EXP_LR_CONFIG)
    decay_rate = EXP_LR_CONFIG["decay_rate"]
    lr_var = tf.compat.v1.placeholder(tf.float32, shape=(), name="lr")
    step_var = tf.compat.v1.placeholder(tf.int32, shape=(), name="step")
    gph = lr_sched(lr_var, step_var)
    sess.run(tf.compat.v1.global_variables_initializer())
    lrs = []
    lrs_bl = []
    for step in range(NUM_STEPS):
        lr = sess.run(gph, feed_dict={lr_var: INIT_LR, step_var: step})
        lrs += [lr]
        lr_bl = bl_exp(step)
        lrs_bl += [lr_bl]
    inv_times = [(INIT_LR * decay_rate ** (t / 100.0)) for t in range(NUM_STEPS)]
    assert np.allclose(inv_times, lrs)
    assert np.allclose(inv_times, lrs_bl)
def test_composite_error():
    pytest.importorskip("torch")
    from eight_mile.pytorch.optz import CompositeLRSchedulerPyTorch

    with pytest.raises(AssertionError):
        _ = create_lr_scheduler(**{"lr_scheduler_type": ["exponential", "zaremba"]})
def get_lr_decay(sched_type, lr, steps_per_epoch, n_epochs, logger, decay_steps=None, decay_rate=None, alpha=None):
    if sched_type == 'cosine':
        decay_steps = decay_steps if decay_steps else steps_per_epoch * n_epochs
        alpha = alpha if alpha else 0.
        params = {'decay_steps': decay_steps, 'alpha': alpha}
    else:
        decay_steps = decay_steps if decay_steps else steps_per_epoch
        if not decay_rate:
            if sched_type == 'exponential':
                decay_rate = 0.5
            elif sched_type == 'invtime':
                decay_rate = 1.0
        params = {'decay_steps': decay_steps, 'decay_rate': decay_rate}
    lr_decay = create_lr_scheduler(lr_scheduler_type=sched_type, lr=lr, **params)
    logger.info(f"Using {sched_type} decay learning rate with params {params}.")
    return lr_decay
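# A minimal usage sketch for get_lr_decay, assuming the scheduler registrations have been
# imported (e.g. the eight_mile TF optz module) and using a stdlib logger; the learning
# rate and epoch sizing below are illustrative values, not ones taken from this code.
import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

# Hypothetical training setup: 1000 steps per epoch, 10 epochs, base lr of 0.001.
cosine_decay = get_lr_decay('cosine', lr=0.001, steps_per_epoch=1000, n_epochs=10, logger=log)
invtime_decay = get_lr_decay('invtime', lr=0.001, steps_per_epoch=1000, n_epochs=10, logger=log)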
def test_zaremba():
    from eight_mile.tf import optz

    tf.compat.v1.reset_default_graph()
    sess = tf.compat.v1.Session()
    lr_sched = create_lr_scheduler(**ZAREMBA_LR_CONFIG)
    bl_zaremba = ZarembaDecayScheduler(**ZAREMBA_LR_CONFIG)
    lr_var = tf.compat.v1.placeholder(tf.float32, shape=(), name="lr")
    step_var = tf.compat.v1.placeholder(tf.int32, shape=(), name="step")
    gph = lr_sched(lr_var, step_var)
    sess.run(tf.compat.v1.global_variables_initializer())
    lrs = []
    lrs_bl = []
    expect_lrs = []
    current_lr = INIT_LR
    for step in range(NUM_STEPS):
        lr = sess.run(gph, feed_dict={lr_var: INIT_LR, step_var: step})
        lr_bl = bl_zaremba(step)
        lrs += [lr]
        lrs_bl += [lr_bl]
        if step in BOUNDS:
            b = BOUNDS.index(step)
            current_lr = ZAREMBA_DECAY_VALUES[b]
        expect_lrs += [current_lr]
    assert np.allclose(expect_lrs, lrs)
    assert np.allclose(expect_lrs, lrs_bl)
def test_composite_warmup():
    from eight_mile.tf import optz

    tf.compat.v1.reset_default_graph()
    warmup_steps = COMPOSITE_LR_CONFIG["warmup_steps"]
    decay_rate = EXP_LR_CONFIG["decay_rate"]
    with tf.compat.v1.Session() as sess:
        lr_sched = create_lr_scheduler(**COMPOSITE_LR_CONFIG)
        lr_var = tf.compat.v1.placeholder(tf.float32, name="lr")
        step_var = tf.compat.v1.placeholder(tf.int32, name="step")
        out = lr_sched(lr_var, step_var)
        sess.run(tf.compat.v1.global_variables_initializer())
        lrs = [sess.run(out, {lr_var: INIT_LR, step_var: step}) for step in range(NUM_STEPS)]
        warmup_expected = [INIT_LR * min(1.0, step / warmup_steps) for step in range(NUM_STEPS)]
        exp_expected = [(INIT_LR * decay_rate ** (t / 100.0)) for t in range(NUM_STEPS)]
        for step in range(NUM_STEPS):
            if step < warmup_steps:
                assert np.allclose(lrs[step], warmup_expected[step])
            else:
                assert np.allclose(lrs[step], exp_expected[step - warmup_steps])
def optimizer(loss_fn, **kwargs):
    global_step = tf.train.get_or_create_global_step()
    clip = kwargs.get("clip", None)
    optim = kwargs.get("optim", "sgd")
    eta = kwargs.get("lr", kwargs.get("eta", 0.01))
    lr_scheduler = create_lr_scheduler(**kwargs)
    decay_fn = None
    colocate_gradients_with_ops = bool(kwargs.get("colocate_gradients_with_ops", False))
    sgd_mom = float(kwargs.get("mom", 0.9))
    if optim == "adadelta":
        rho = float(kwargs.get("rho", 0.95))
        eps = float(kwargs.get("epsilon", 1e-6))
        logger.info("adadelta(eta=%f, rho=%f, epsilon=%f)", eta, rho, eps)
        optz = lambda lr: tf.train.AdadeltaOptimizer(lr, rho, eps)
    elif optim == "adam":
        beta1 = float(kwargs.get("beta1", 0.9))
        beta2 = float(kwargs.get("beta2", 0.999))
        eps = float(kwargs.get("epsilon", 1e-8))
        logger.info("adam(eta=%f beta1=%f, beta2=%f, eps=%f)", eta, beta1, beta2, eps)
        optz = lambda lr: tf.train.AdamOptimizer(lr, beta1, beta2, eps)
    elif optim == "adamw":
        wd = float(kwargs.get("weight_decay", 0))
        beta1 = float(kwargs.get("beta1", 0.9))
        beta2 = float(kwargs.get("beta2", 0.999))
        eps = float(kwargs.get("epsilon", 1e-8))
        logger.info("adamw(eta=%f beta1=%f, beta2=%f, eps=%f)", eta, beta1, beta2, eps)
        optz = lambda lr: AdamWOptimizer(lr, wd, beta1, beta2, eps)
    elif optim == "rmsprop":
        # Get mom again with a different default
        mom = float(kwargs.get("mom", 0.0))
        logger.info("rmsprop(eta=%f, mom=%f)", eta, mom)
        optz = lambda lr: tf.train.RMSPropOptimizer(lr, momentum=mom)
    elif sgd_mom > 0:
        logger.info("sgd-mom(eta=%f, mom=%f)", eta, sgd_mom)
        optz = lambda lr: tf.train.MomentumOptimizer(lr, sgd_mom)
    else:
        logger.info("sgd(eta=%f)", eta)
        optz = lambda lr: tf.train.GradientDescentOptimizer(lr)
    logger.info("clip gradients at %s", clip)
    return (
        global_step,
        tf.contrib.layers.optimize_loss(
            loss_fn,
            global_step,
            eta,
            optz,
            colocate_gradients_with_ops=colocate_gradients_with_ops,
            clip_gradients=clip,
            learning_rate_decay_fn=lr_scheduler,
            increment_global_step=True,
        ),
    )
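# A quick TF1 graph-mode sketch of calling the optimizer() helper above; it assumes the same
# module-level tf/np imports used elsewhere in this code and a TensorFlow build that still
# ships tf.contrib. The toy loss and scheduler kwargs are illustrative, not prescribed values.
tf.compat.v1.reset_default_graph()
x = tf.compat.v1.placeholder(tf.float32, shape=(None, 4), name="x")
w = tf.compat.v1.get_variable("w", shape=(4, 1))
loss = tf.reduce_mean(tf.square(tf.matmul(x, w)))

global_step, train_op = optimizer(
    loss, optim="adam", lr=0.001, clip=5.0,
    lr_scheduler_type="exponential", decay_steps=1000, decay_rate=0.5,
)

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    sess.run(train_op, feed_dict={x: np.zeros((2, 4), dtype=np.float32)})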
def __init__(self, model, global_step=0, **kwargs):
    self.global_step = global_step
    if "lr_function" in kwargs:
        self.lr_function = kwargs["lr_function"]
    else:
        if "lr_scheduler_type" not in kwargs:
            kwargs["lr_scheduler_type"] = "default"
        self.lr_function = create_lr_scheduler(**kwargs)
    self._init_optimizer(model, **kwargs)
def test_constant():
    from eight_mile.tf import optz

    lr_sched = create_lr_scheduler(lr=INIT_LR, lr_scheduler_type="default")
    bl_const = ConstantScheduler(lr=INIT_LR)
    for step in range(NUM_STEPS):
        lr = lr_sched(step)
        assert np.isclose(INIT_LR, lr)
        assert np.isclose(INIT_LR, bl_const(step))
def __init__(self, loss, optimizer=None, **kwargs):
    self.loss = loss
    self.global_step = kwargs.get('global_step', 0)
    if "lr_function" in kwargs:
        lr_function = kwargs["lr_function"]
    else:
        if "lr_scheduler_type" not in kwargs:
            kwargs["lr_scheduler_type"] = "default"
        lr_function = create_lr_scheduler(**kwargs)
    # decay_fn = None
    # Right now this option is pointless since sparse updates don't work on GPU. We just turn it off
    sgd_mom = float(kwargs.get("mom", 0.9))
    self.clip = kwargs.get("clip", 100)
    if optimizer:
        self.optimizer = optimizer
    else:
        optim = kwargs.get("optim", "sgd")
        lr = kwargs.get("lr", kwargs.get("eta", 0.01))
        if optim == "adadelta":
            rho = float(kwargs.get("rho", 0.95))
            eps = float(kwargs.get("epsilon", 1e-6))
            logger.info("adadelta(eta=%f, rho=%f, epsilon=%f)", lr, rho, eps)
            self.optimizer = tf.optimizers.Adadelta(lr, rho, eps)
        elif optim == "adam":
            beta1 = float(kwargs.get("beta1", 0.9))
            beta2 = float(kwargs.get("beta2", 0.999))
            eps = float(kwargs.get("epsilon", 1e-8))
            logger.info("adam(eta=%f beta1=%f, beta2=%f, eps=%f)", lr, beta1, beta2, eps)
            self.optimizer = tf.optimizers.Adam(lr_function, beta1, beta2, eps)
        elif optim == "adamw":
            import tensorflow_addons as tfa

            wd = float(kwargs.get("weight_decay", 0))
            beta1 = float(kwargs.get("beta1", 0.9))
            beta2 = float(kwargs.get("beta2", 0.999))
            eps = float(kwargs.get("epsilon", 1e-8))
            logger.info("adamw(eta=%f beta1=%f, beta2=%f, eps=%f)", lr, beta1, beta2, eps)
            self.optimizer = tfa.optimizers.AdamW(
                weight_decay=wd, learning_rate=lr_function, beta_1=beta1, beta_2=beta2, epsilon=eps
            )
        elif optim == "rmsprop":
            # Get mom again with a different default
            mom = float(kwargs.get("mom", 0.0))
            logger.info("rmsprop(eta=%f, mom=%f)", lr, mom)
            self.optimizer = tf.optimizers.RMSprop(lr_function, momentum=mom)
        elif sgd_mom > 0:
            logger.info("sgd-mom(eta=%f, mom=%f)", lr, sgd_mom)
            self.optimizer = tf.optimizers.SGD(lr_function, sgd_mom)
        else:
            logger.info("sgd(eta=%f)", lr)
            self.optimizer = tf.optimizers.SGD(lr_function)
    logger.info("clip gradients at %s", self.clip)
def test_cyclic():
    from eight_mile.tf import optz

    tf.compat.v1.reset_default_graph()
    sess = tf.compat.v1.Session()
    lr_sched = create_lr_scheduler(**CYCLIC_LR_CONFIG)
    bl_const = CyclicLRScheduler(**CYCLIC_LR_CONFIG)
    for step in range(NUM_STEPS):
        lr = lr_sched(step)
        lr_bl = bl_const(step)
        assert np.isclose(lr, lr_bl)
def test_linear_warmup():
    from eight_mile.tf import optz

    lr_sched = create_lr_scheduler(**LINEAR_WARMUP_LR_CONFIG)
    warmup_steps = LINEAR_WARMUP_LR_CONFIG["warmup_steps"]
    lrs = []
    for step in range(NUM_STEPS):
        lr = lr_sched(step)
        lrs += [lr]
    expected_lrs = [INIT_LR * min(1.0, step / warmup_steps) for step in range(NUM_STEPS)]
    assert np.allclose(expected_lrs, lrs)
def __init__(self, model_or_params, global_step=0, **kwargs):
    if isinstance(model_or_params, torch.nn.Module):
        parameters = model_or_params.parameters()
    else:
        parameters = model_or_params
    self.global_step = global_step
    if "lr_function" in kwargs:
        self.lr_function = kwargs["lr_function"]
    else:
        if "lr_scheduler_type" not in kwargs:
            kwargs["lr_scheduler_type"] = "default"
        self.lr_function = create_lr_scheduler(**kwargs)
    self._init_optimizer(parameters, **kwargs)
    self.current_lr = 0
def test_composite_warmup():
    from eight_mile.tf import optz

    warmup_steps = COMPOSITE_LR_CONFIG["warmup_steps"]
    decay_rate = EXP_LR_CONFIG["decay_rate"]
    lr_sched = create_lr_scheduler(**COMPOSITE_LR_CONFIG)
    lrs = [lr_sched(step) for step in range(NUM_STEPS)]
    warmup_expected = [INIT_LR * min(1.0, step / warmup_steps) for step in range(NUM_STEPS)]
    exp_expected = [(INIT_LR * decay_rate ** (t / 100.0)) for t in range(NUM_STEPS)]
    for step in range(NUM_STEPS):
        if step < warmup_steps:
            assert np.allclose(lrs[step], warmup_expected[step])
        else:
            assert np.allclose(lrs[step], exp_expected[step - warmup_steps])
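# For reference, a composite warmup-then-decay config consistent with the two
# test_composite_warmup variants might look like the sketch below. The scheduler type names
# and every number here are illustrative assumptions; the real COMPOSITE_LR_CONFIG is
# defined elsewhere in the test module.
COMPOSITE_LR_CONFIG_EXAMPLE = {
    "lr_scheduler_type": ["warmup_linear", "exponential"],  # linear warmup, then exponential decay
    "lr": 0.5,           # stands in for INIT_LR
    "warmup_steps": 50,  # steps spent ramping up to the full learning rate
    "decay_steps": 100,  # matches the t / 100.0 horizon in exp_expected
    "decay_rate": 0.5,
}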
def test_invtime():
    from eight_mile.tf import optz

    lr_sched = create_lr_scheduler(**INVTIME_LR_CONFIG)
    bl_invtime = InverseTimeDecayScheduler(**INVTIME_LR_CONFIG)
    decay_rate = INVTIME_LR_CONFIG["decay_rate"]
    lrs = []
    lrs_bl = []
    for step in range(NUM_STEPS):
        lr = lr_sched(step)
        lrs += [lr]
        lr_bl = bl_invtime(step)
        lrs_bl += [lr_bl]
    inv_times = [INIT_LR / (1.0 + decay_rate * t) for t in range(NUM_STEPS)]
    assert np.allclose(inv_times, lrs)
    assert np.allclose(inv_times, lrs_bl)
def test_linear():
    from eight_mile.tf import optz

    lr_sched = create_lr_scheduler(**LINEAR_LR_CONFIG)
    bl_sched = LinearDecayScheduler(**LINEAR_LR_CONFIG)
    linear = [INIT_LR * (1.0 - step / NUM_STEPS) for step in range(NUM_STEPS)]
    lrs = []
    lrs_bl = []
    for step in range(NUM_STEPS):
        lr = lr_sched(step)
        lrs += [lr]
        lr_bl = bl_sched(step)
        lrs_bl += [lr_bl]
    assert np.allclose(lrs_bl, lrs)
    assert np.allclose(linear, lrs_bl)
def test_exp():
    from eight_mile.tf import optz

    lr_sched = create_lr_scheduler(**EXP_LR_CONFIG)
    bl_exp = ExponentialDecayScheduler(**EXP_LR_CONFIG)
    decay_rate = EXP_LR_CONFIG["decay_rate"]
    lrs = []
    lrs_bl = []
    for step in range(NUM_STEPS):
        lr = lr_sched(step)
        lrs += [lr]
        lr_bl = bl_exp(step)
        lrs_bl += [lr_bl]
    inv_times = [(INIT_LR * decay_rate ** (t / 100.0)) for t in range(NUM_STEPS)]
    assert np.allclose(inv_times, lrs)
    assert np.allclose(inv_times, lrs_bl)
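# As a point of reference for the exponential tests, an exponential scheduler can also be
# built directly; this assumes the eager-mode TF schedulers have been registered (via
# `from eight_mile.tf import optz`, as in the tests) and uses illustrative numbers.
exp_sched = create_lr_scheduler(lr_scheduler_type="exponential", lr=0.5, decay_steps=100, decay_rate=0.5)

# In eager mode the scheduler is called directly with the step, as in the tests above:
lr_at_0 = exp_sched(0)      # 0.5
lr_at_100 = exp_sched(100)  # 0.5 * 0.5 ** (100 / 100) == 0.25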
def test_cyclic():
    from eight_mile.tf import optz

    tf.compat.v1.reset_default_graph()
    sess = tf.compat.v1.Session()
    lr_sched = create_lr_scheduler(**CYCLIC_LR_CONFIG)
    bl_const = CyclicLRScheduler(**CYCLIC_LR_CONFIG)
    lr_var = tf.compat.v1.placeholder(tf.float32, shape=(), name="lr")
    step_var = tf.compat.v1.placeholder(tf.int32, shape=(), name="step")
    gph = lr_sched(lr_var, step_var)
    sess.run(tf.compat.v1.global_variables_initializer())
    for step in range(NUM_STEPS):
        lr = sess.run(gph, feed_dict={lr_var: INIT_LR, step_var: step})
        lr_bl = bl_const(step)
        assert np.isclose(lr, lr_bl)
def test_constant():
    from eight_mile.tf import optz

    tf.compat.v1.reset_default_graph()
    sess = tf.compat.v1.Session()
    lr_sched = create_lr_scheduler(lr=INIT_LR, lr_scheduler_type="default")
    bl_const = ConstantScheduler(lr=INIT_LR)
    lr_var = tf.compat.v1.placeholder(tf.float32, shape=(), name="lr")
    step_var = tf.compat.v1.placeholder(tf.int32, shape=(), name="step")
    gph = lr_sched(lr_var, step_var)
    sess.run(tf.compat.v1.global_variables_initializer())
    for step in range(NUM_STEPS):
        lr = sess.run(gph, feed_dict={lr_var: INIT_LR, step_var: step})
        assert np.isclose(INIT_LR, lr)
        assert np.isclose(INIT_LR, bl_const(step))
def test_zaremba():
    from eight_mile.tf import optz

    lr_sched = create_lr_scheduler(**ZAREMBA_LR_CONFIG)
    bl_zaremba = ZarembaDecayScheduler(**ZAREMBA_LR_CONFIG)
    lrs = []
    lrs_bl = []
    expect_lrs = []
    current_lr = INIT_LR
    for step in range(NUM_STEPS):
        lr = lr_sched(step)
        lr_bl = bl_zaremba(step)
        lrs += [lr]
        lrs_bl += [lr_bl]
        if step in BOUNDS:
            b = BOUNDS.index(step)
            current_lr = ZAREMBA_DECAY_VALUES[b]
        expect_lrs += [current_lr]
    assert np.allclose(expect_lrs, lrs)
    assert np.allclose(expect_lrs, lrs_bl)
def test_linear_warmup():
    from eight_mile.tf import optz

    tf.compat.v1.reset_default_graph()
    sess = tf.compat.v1.Session()
    lr_sched = create_lr_scheduler(**LINEAR_WARMUP_LR_CONFIG)
    warmup_steps = LINEAR_WARMUP_LR_CONFIG["warmup_steps"]
    lr_var = tf.compat.v1.placeholder(tf.float32, shape=(), name="lr")
    step_var = tf.compat.v1.placeholder(tf.int32, shape=(), name="step")
    gph = lr_sched(lr_var, step_var)
    sess.run(tf.compat.v1.global_variables_initializer())
    lrs = []
    for step in range(NUM_STEPS):
        lr = sess.run(gph, feed_dict={lr_var: INIT_LR, step_var: step})
        lrs += [lr]
    expected_lrs = [INIT_LR * min(1.0, step / warmup_steps) for step in range(NUM_STEPS)]
    assert np.allclose(expected_lrs, lrs)
def __init__(self, model_or_params, global_step=0, weight_decay=0.0, **kwargs):
    DONT_DECAY = ['ln.weight', 'bias']
    if isinstance(model_or_params, torch.nn.Module):
        if weight_decay == 0.0:
            parameters = model_or_params.parameters()
        else:
            params_w_wd = [p for n, p in model_or_params.named_parameters() if not any(nd in n for nd in DONT_DECAY)]
            params_wo_wd = [p for n, p in model_or_params.named_parameters() if any(nd in n for nd in DONT_DECAY)]
            parameters = [
                {'params': params_w_wd, 'weight_decay': weight_decay},
                {'params': params_wo_wd, 'weight_decay': 0.0},
            ]
    else:
        parameters = model_or_params
    self.global_step = global_step
    if "lr_function" in kwargs:
        self.lr_function = kwargs["lr_function"]
    else:
        if "lr_scheduler_type" not in kwargs:
            kwargs["lr_scheduler_type"] = "default"
        self.lr_function = create_lr_scheduler(**kwargs)
    self._init_optimizer(parameters, **kwargs)
    self.current_lr = 0
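# A minimal sketch of the same parameter-grouping pattern in isolation, using a small
# hypothetical torch model and the stock torch.optim.AdamW optimizer (not part of the
# class above).
import torch


class TinyModel(torch.nn.Module):
    """Hypothetical model whose LayerNorm attribute is named 'ln' so it matches DONT_DECAY."""

    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(8, 8)
        self.ln = torch.nn.LayerNorm(8)

    def forward(self, x):
        return self.ln(self.proj(x))


model = TinyModel()
DONT_DECAY = ['ln.weight', 'bias']
decay = [p for n, p in model.named_parameters() if not any(nd in n for nd in DONT_DECAY)]
no_decay = [p for n, p in model.named_parameters() if any(nd in n for nd in DONT_DECAY)]

# proj.weight decays; proj.bias, ln.weight and ln.bias are excluded from weight decay.
opt = torch.optim.AdamW(
    [{'params': decay, 'weight_decay': 0.01}, {'params': no_decay, 'weight_decay': 0.0}],
    lr=1e-3,
)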