def run_with_adam_and_nat(model, lr, iterations, callback=None, gamma=0.001):
    if gamma == 0:
        # No natural-gradient step: plain Adam on all trainable parameters.
        adam = AdamOptimizer(lr).make_optimize_action(model)
        actions = [adam]
        actions = actions if callback is None else actions + [callback]
        Loop(actions, stop=iterations)()
        model.anchor(model.enquire_session())
        return

    var_list = [(model.f_latent.q_mu, model.f_latent.q_sqrt)]

    # we don't want Adam optimizing these
    model.f_latent.q_mu.set_trainable(False)
    model.f_latent.q_sqrt.set_trainable(False)

    adam = AdamOptimizer(lr).make_optimize_action(model)
    natgrad = NatGradOptimizer(gamma).make_optimize_action(model, var_list=var_list)

    actions = [adam, natgrad]
    actions = actions if callback is None else actions + [callback]

    Loop(actions, stop=iterations)()
    model.anchor(model.enquire_session())
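# A minimal usage sketch (illustrative, not part of the original code): it
# assumes a model that exposes the variational parameters of its latent GP as
# model.f_latent.q_mu / model.f_latent.q_sqrt, as the function above expects.
#
#   run_with_adam_and_nat(model, lr=0.01, iterations=1000, gamma=0.0)  # Adam only
#   run_with_adam_and_nat(model, lr=0.01, iterations=1000, gamma=0.1)  # Adam + nat grads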
def train_with_adam(model, iterations, callback=None, **kwargs):
    # Fixed learning-rate schedule; see train_with_nat_and_adam below for a
    # configurable version.
    with tf.variable_scope("learning_rate"):
        global_step = tf.Variable(0, trainable=False)
        starter_learning_rate = 0.03
        decay_steps = int(iterations / 2.)
        decay_rate = 1. / 1.5
        # global_step is incremented each time the learning-rate tensor is
        # evaluated, i.e. once per Adam step
        learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                                   tf.assign_add(global_step, 1),
                                                   decay_steps,
                                                   decay_rate,
                                                   staircase=True)
        tf.summary.scalar("optimisation/learning_rate", learning_rate)

    sess = model.enquire_session()
    tf_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='learning_rate')
    sess.run(tf.variables_initializer(var_list=tf_vars))

    adam = AdamOptimizer(learning_rate).make_optimize_action(model)
    actions = [adam]
    if callback is not None:
        assert isinstance(callback, (tuple, list))
        actions = actions + list(callback)
        for c in callback:
            try:
                c.init()
            except AttributeError:
                # not every callback provides an init() hook
                pass

    Loop(actions, stop=iterations)()
    model.anchor(model.enquire_session())
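# With iterations=1000, for example, the schedule above uses a learning rate
# of 0.03 for the first 500 steps and 0.03 / 1.5 = 0.02 thereafter.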
def run_with_adam(model, lr, iterations, callback=None):
    adam = AdamOptimizer(lr).make_optimize_action(model)
    actions = [adam]
    actions = actions if callback is None else actions + [callback]
    Loop(actions, stop=iterations)()
    model.anchor(model.enquire_session())
def train_model(self, dgp_model):
    # Natural gradients for the variational parameters of the final layer;
    # Adam for everything else.
    ng_vars = [[dgp_model.layers[-1].q_mu, dgp_model.layers[-1].q_sqrt]]
    for v in ng_vars[0]:
        v.set_trainable(False)
    ng_action = NatGradOptimizer(gamma=0.1).make_optimize_action(dgp_model, var_list=ng_vars)
    adam_action = AdamOptimizer(0.01).make_optimize_action(dgp_model)

    iterations = 10000
    try:
        Loop([ng_action, adam_action], stop=iterations)()
    except tf.errors.InvalidArgumentError:
        print('Failure of Cholesky in Nat Gradient')

    # Alternative implementation with an incrementing gamma schedule and a
    # fallback on Cholesky failure, kept for reference:
    #
    # sess = dgp_model.enquire_session()
    #
    # gamma_start = 1e-2
    # gamma_max = 1e-1
    # gamma_step = 1e-2
    #
    # gamma = tf.Variable(gamma_start, dtype=tf.float64)
    # gamma_incremented = tf.where(tf.less(gamma, gamma_max), gamma + gamma_step, gamma_max)
    #
    # op_ng = NatGradOptimizer(gamma).make_optimize_tensor(
    #     dgp_model, var_list=[[dgp_model.layers[-1].q_mu, dgp_model.layers[-1].q_sqrt]])
    # op_adam = AdamOptimizer(0.001).make_optimize_tensor(dgp_model)
    # op_increment_gamma = tf.assign(gamma, gamma_incremented)
    #
    # gamma_fallback = 1e-1  # we'll reduce by this factor if there's a cholesky failure
    # op_fallback_gamma = tf.assign(gamma, gamma * gamma_fallback)
    #
    # sess.run(tf.variables_initializer([gamma]))
    #
    # iterations = 10000
    # for it in range(iterations):
    #     try:
    #         sess.run(op_ng)
    #         sess.run(op_increment_gamma)
    #     except tf.errors.InvalidArgumentError:
    #         g = sess.run(gamma)
    #         print('gamma = {} on iteration {} is too big! Falling back to {}'.format(
    #             g, it, g * gamma_fallback))
    #         sess.run(op_fallback_gamma)
    #
    #     sess.run(op_adam)
    #
    #     if it % 1000 == 0:
    #         print('{} gamma={:.4f} ELBO={:.4f}'.format(it, *sess.run([gamma, dgp_model.likelihood_tensor])))
    #
    # dgp_model.anchor(sess)
    # print(len(tf.all_variables()))
    # print(len(tf.get_default_graph().get_operations()))

    sess = dgp_model.enquire_session()
    dgp_model.anchor(sess)
    print('ELBO={:.4f}'.format(sess.run(dgp_model.likelihood_tensor)))
    return dgp_model
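# Note on the commented-out variant above: make_optimize_tensor returns a raw
# TensorFlow op to be run manually with sess.run(), which is what allows a
# per-iteration try/except around the natural-gradient step, whereas
# make_optimize_action wraps that op as an Action for use inside Loop.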
def _optimize(self, retry=0, error=None):
    numiter = self.flags.test_every
    max_retries = 5
    if retry > max_retries:
        raise error
    try:
        Loop(self.loop, stop=numiter)()
    except tf.errors.InvalidArgumentError as exception:
        # A Cholesky failure is only expected from the natural-gradient
        # optimizer; otherwise re-raise. On failure, shrink gamma and retry.
        if self.flags.optimizer != "NatGrad":
            raise exception
        self.step_back_gamma()
        self._optimize(retry=retry + 1, error=exception)
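# step_back_gamma is defined elsewhere on this class. A hypothetical sketch of
# what it might look like, mirroring the gamma-fallback ops built in
# train_with_nat below (the attribute names here are assumptions):
#
#   def step_back_gamma(self):
#       # shrink gamma after a Cholesky failure, e.g. by a factor of 10
#       self.model.enquire_session().run(self.op_fallback_gamma)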
def train_with_bfgs(model, learning_rate, iterations, callback=None):
    # learning_rate is unused; it is kept so the signature matches the other
    # trainers. ScipyOptimizer runs L-BFGS-B internally.
    bfgs = ScipyOptimizer().make_optimize_action(model)
    actions = [bfgs]
    if callback is not None:
        assert isinstance(callback, (tuple, list))
        actions = actions + list(callback)
        for c in callback:
            c.init()
    Loop(actions, stop=iterations)()
    model.anchor(model.enquire_session())
def test_hypers_SVGP_vs_SGPR(session_tf, svgp, sgpr):
    """
    Test SVGP vs SGPR. Combined optimization.

    The logic is as follows: SVGP is given one nat grad step with gamma=1,
    after which it is identical to SGPR (which has the analytically optimal
    variational distribution). We then take an ordinary gradient step on the
    hyperparameters (and inducing locations Z). Finally we update the
    variational parameters to their optimal values with another nat grad step
    with gamma=1. These three steps are equivalent to an ordinary gradient
    step on the parameters of SGPR.

    In this test we simply make the variational parameters trainable=False,
    so they are not updated by the ordinary gradient step.
    """
    anchor = False
    variationals = [(svgp.q_mu, svgp.q_sqrt)]
    svgp.q_mu.trainable = False
    svgp.q_sqrt.trainable = False

    opt = NatGradOptimizer(Datum.gamma)
    opt.minimize(svgp, var_list=variationals, maxiter=1, anchor=anchor)

    sgpr_likelihood = sgpr.compute_log_likelihood()
    svgp_likelihood = svgp.compute_log_likelihood()
    assert_allclose(sgpr_likelihood, svgp_likelihood, atol=1e-5)

    # combination (doing GD first, as we've already done the nat grad step)
    a1 = GradientDescentOptimizer(Datum.learning_rate).make_optimize_action(svgp)
    a2 = NatGradOptimizer(Datum.gamma).make_optimize_action(svgp, var_list=variationals)
    Loop([a1, a2]).with_settings(stop=1)()

    GradientDescentOptimizer(Datum.learning_rate).minimize(sgpr, maxiter=1, anchor=anchor)

    sgpr_likelihood = sgpr.compute_log_likelihood()
    svgp_likelihood = svgp.compute_log_likelihood()
    assert_allclose(sgpr_likelihood, svgp_likelihood, atol=1e-5)
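# Why a single step with gamma=1 suffices: for a Gaussian likelihood the
# optimal variational distribution is available in closed form, and one
# natural-gradient step of length 1 in the natural parameterization lands
# exactly on it, which is why the SVGP and SGPR log-likelihoods above agree.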
def run_adam(self, lr, iterations):
    adam = AdamOptimizer(lr).make_optimize_action(self)
    actions = [adam, PrintAction(self, "MF-DGP with Adam")]
    Loop(actions, stop=iterations)()
    self.anchor(self.enquire_session())
def train_with_nat(model,
                   gamma_start=1e-5,
                   gamma_add=1e-3,
                   gamma_mul=1.04,
                   gamma_max=0.1,
                   gamma_fallback=1e-1,
                   iterations=500,
                   var_list=None,
                   callback=None,
                   **kwargs):
    # we'll make use of this later when we use a XiTransform
    if var_list is None:
        var_list = [[model.q_mu, model.q_sqrt]]

    with tf.variable_scope("gamma"):
        gamma_start = tf.cast(gamma_start, tf.float64)
        gamma_max = tf.cast(gamma_max, tf.float64)
        mul_step = tf.cast(gamma_mul, tf.float64)
        add_step = tf.cast(gamma_add, tf.float64)
        gamma = tf.Variable(gamma_start, dtype=tf.float64, trainable=False)
        gamma_ref = tf.identity(gamma)
        # we'll reduce by this factor if there's a Cholesky failure
        gamma_fallback = tf.cast(gamma_fallback, tf.float64)
        op_fallback_gamma = tf.assign(gamma, gamma_ref * gamma_fallback)
        # grow gamma by min(gamma * gamma_mul, gamma_add), capped at gamma_max
        diff = tf.where(gamma_ref * mul_step < add_step, gamma_ref * mul_step, add_step)
        op_gamma_inc = tf.assign(
            gamma, tf.where(gamma_ref + diff > gamma_max, gamma_max, gamma_ref + diff))
        tf.summary.scalar("optimisation/gamma", gamma)

    sess = model.enquire_session()
    tf_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='gamma')
    sess.run(tf.variables_initializer(var_list=tf_vars))

    natgrad = NatGradOptimizer(gamma_ref).make_optimize_action(model, var_list=var_list)
    actions = [natgrad, GammaSchedule(op_gamma_inc)]
    if callback is not None:
        actions = actions + list(callback)
        for c in callback:
            try:
                c.init()
            except AttributeError:
                # not every callback provides an init() hook
                pass

    it = 0
    while it < iterations:
        try:
            looper = Loop(actions, start=it, stop=iterations)
            looper()
            it = looper.iteration
        except tf.errors.InvalidArgumentError:
            # Cholesky failure: shrink gamma and resume from the current iteration
            it = looper.iteration
            g, gf = sess.run([gamma_ref, op_fallback_gamma])
            logging.info('gamma = {} on iteration {} is too big! '
                         'Falling back to {}'.format(g, it, gf))

    model.anchor(model.enquire_session())
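# GammaSchedule is not defined in this module. A minimal sketch of it,
# assuming GPflow 1.x's Action interface (class and attribute names inferred
# from its use above):
#
#   from gpflow.actions import Action
#
#   class GammaSchedule(Action):
#       def __init__(self, op_increment_gamma):
#           self.op_increment_gamma = op_increment_gamma
#
#       def run(self, ctx):
#           # bump gamma once per optimization step
#           ctx.session.run(self.op_increment_gamma)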
def train_with_nat_and_adam(model,
                            initial_learning_rate=0.03,
                            learning_rate_steps=2,
                            learning_rate_decay=1.5,
                            gamma_start=1e-5,
                            gamma_add=1e-3,
                            gamma_mul=1.1,
                            gamma_max=0.1,
                            gamma_fallback=1e-1,
                            iterations=500,
                            var_list=None,
                            callback=None,
                            **kwargs):
    # we'll make use of this later when we use a XiTransform
    if var_list is None:
        var_list = [[model.q_mu, model.q_sqrt]]

    # we don't want Adam optimizing these
    model.q_mu.set_trainable(False)
    model.q_sqrt.set_trainable(False)

    with tf.variable_scope("learning_rate"):
        global_step = tf.Variable(0, trainable=False)
        starter_learning_rate = initial_learning_rate
        decay_steps = int(iterations / learning_rate_steps)
        decay_rate = 1. / learning_rate_decay
        # global_step is incremented each time the learning-rate tensor is
        # evaluated, i.e. once per Adam step
        learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                                   tf.assign_add(global_step, 1),
                                                   decay_steps,
                                                   decay_rate,
                                                   staircase=True)
        tf.summary.scalar("optimisation/learning_rate", learning_rate)

    sess = model.enquire_session()
    tf_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='learning_rate')
    sess.run(tf.variables_initializer(var_list=tf_vars))

    with tf.variable_scope("gamma"):
        gamma_start = tf.cast(gamma_start, tf.float64)
        gamma_max = tf.cast(gamma_max, tf.float64)
        mul_step = tf.cast(gamma_mul, tf.float64)
        add_step = tf.cast(gamma_add, tf.float64)
        gamma = tf.Variable(gamma_start, dtype=tf.float64, trainable=False)
        gamma_ref = tf.identity(gamma)
        # we'll reduce by this factor if there's a Cholesky failure
        gamma_fallback = tf.cast(gamma_fallback, tf.float64)
        op_fallback_gamma = tf.assign(gamma, gamma_ref * gamma_fallback)
        # grow gamma by min(gamma * gamma_mul, gamma_add), capped at gamma_max
        diff = tf.where(gamma_ref * mul_step < add_step, gamma_ref * mul_step, add_step)
        op_gamma_inc = tf.assign(
            gamma, tf.where(gamma_ref + diff > gamma_max, gamma_max, gamma_ref + diff))
        tf.summary.scalar("optimisation/gamma", gamma)

    tf_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='gamma')
    sess.run(tf.variables_initializer(var_list=tf_vars))

    natgrad = NatGradOptimizer(gamma_ref).make_optimize_action(model, var_list=var_list)
    adam = AdamOptimizer(learning_rate).make_optimize_action(model)

    actions = [adam, natgrad, GammaSchedule(op_gamma_inc)]
    if callback is not None:
        actions = actions + list(callback)
        for c in callback:
            try:
                c.init()
            except AttributeError:
                # not every callback provides an init() hook
                pass

    it = 0
    while it < iterations:
        try:
            looper = Loop(actions, start=it, stop=iterations)
            looper()
            it = looper.iteration
        except tf.errors.InvalidArgumentError:
            # Cholesky failure: shrink gamma and resume from the current iteration
            it = looper.iteration
            g, gf = sess.run([gamma_ref, op_fallback_gamma])
            logging.info('gamma = {} on iteration {} is too big! '
                         'Falling back to {}'.format(g, it, gf))

    model.anchor(model.enquire_session())
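# A minimal usage sketch (illustrative, not part of the original code): it
# assumes GPflow 1.x and a plain SVGP model, whose q_mu / q_sqrt are exactly
# the default var_list used by the trainers above.
def _example_train_svgp():
    import numpy as np
    import gpflow

    X = np.random.rand(200, 1)
    Y = np.sin(10 * X) + 0.1 * np.random.randn(200, 1)
    model = gpflow.models.SVGP(X, Y, gpflow.kernels.RBF(1),
                               gpflow.likelihoods.Gaussian(), Z=X[::10].copy())

    # Natural gradients (with a ramped gamma) on the variational parameters,
    # Adam with a decaying learning rate on the hyperparameters and Z:
    train_with_nat_and_adam(model, iterations=1000)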