# Common imports assumed by these snippets (GPflow 1.x API):
#   from gpflow.training import AdamOptimizer, NatGradOptimizer
#   from gpflow.actions import Loop

def run_with_adam_and_nat(model, lr, iterations, callback=None, gamma=0.001):
    if gamma == 0:
        # gamma == 0 disables natural gradients: plain Adam on all trainable parameters.
        adam = AdamOptimizer(lr).make_optimize_action(model)

        actions = [adam]
        actions = actions if callback is None else actions + [callback]

        Loop(actions, stop=iterations)()
        model.anchor(model.enquire_session())
        return

    # variational parameters to be handled by natural gradients
    var_list = [(model.f_latent.q_mu, model.f_latent.q_sqrt)]

    # we don't want adam optimizing these
    model.f_latent.q_mu.set_trainable(False)
    model.f_latent.q_sqrt.set_trainable(False)

    adam = AdamOptimizer(lr).make_optimize_action(model)
    natgrad = NatGradOptimizer(gamma).make_optimize_action(model, var_list=var_list)

    actions = [adam, natgrad]
    actions = actions if callback is None else actions + [callback]

    Loop(actions, stop=iterations)()
    model.anchor(model.enquire_session())
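# Usage sketch for run_with_adam_and_nat (illustrative values; `model` stands for any
# GPflow 1.x model exposing f_latent.q_mu / f_latent.q_sqrt, as the function assumes):
#
#     run_with_adam_and_nat(model, lr=0.01, iterations=2000, gamma=0.05)  # Adam + nat grads
#     run_with_adam_and_nat(model, lr=0.01, iterations=2000, gamma=0.0)   # Adam only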
def train_model(self, dgp_model):
    # Natural gradients for the final-layer variational parameters; Adam for everything else.
    ng_vars = [[dgp_model.layers[-1].q_mu, dgp_model.layers[-1].q_sqrt]]
    for v in ng_vars[0]:
        v.set_trainable(False)
    ng_action = NatGradOptimizer(gamma=0.1).make_optimize_action(dgp_model, var_list=ng_vars)
    adam_action = AdamOptimizer(0.01).make_optimize_action(dgp_model)

    iterations = 10000
    try:
        Loop([ng_action, adam_action], stop=iterations)()
    except tf.errors.InvalidArgumentError:
        print('Failure of Cholesky in Nat Gradient')

    # Alternative: adaptive gamma with fallback on Cholesky failure (kept for reference).
    # sess = dgp_model.enquire_session()
    #
    # gamma_start = 1e-2
    # gamma_max = 1e-1
    # gamma_step = 1e-2
    #
    # gamma = tf.Variable(gamma_start, dtype=tf.float64)
    # gamma_incremented = tf.where(tf.less(gamma, gamma_max), gamma + gamma_step, gamma_max)
    #
    # op_ng = NatGradOptimizer(gamma).make_optimize_tensor(
    #     dgp_model, var_list=[[dgp_model.layers[-1].q_mu, dgp_model.layers[-1].q_sqrt]])
    # op_adam = AdamOptimizer(0.001).make_optimize_tensor(dgp_model)
    # op_increment_gamma = tf.assign(gamma, gamma_incremented)
    #
    # gamma_fallback = 1e-1  # we'll reduce by this factor if there's a cholesky failure
    # op_fallback_gamma = tf.assign(gamma, gamma * gamma_fallback)
    #
    # sess.run(tf.variables_initializer([gamma]))
    #
    # iterations = 10000
    # for it in range(iterations):
    #     try:
    #         sess.run(op_ng)
    #         sess.run(op_increment_gamma)
    #     except tf.errors.InvalidArgumentError:
    #         g = sess.run(gamma)
    #         print('gamma = {} on iteration {} is too big! Falling back to {}'.format(g, it, g * gamma_fallback))
    #         sess.run(op_fallback_gamma)
    #
    #     sess.run(op_adam)
    #
    #     if it % 1000 == 0:
    #         print('{} gamma={:.4f} ELBO={:.4f}'.format(it, *sess.run([gamma, dgp_model.likelihood_tensor])))
    #
    # dgp_model.anchor(sess)
    #
    # print(len(tf.all_variables()))
    # print(len(tf.get_default_graph().get_operations()))

    sess = dgp_model.enquire_session()
    dgp_model.anchor(sess)
    print('ELBO={:.4f}'.format(*sess.run([dgp_model.likelihood_tensor])))
    return dgp_model
def test_2layer_vs_nat_grad(self):
    Ns, N, M = 5, 1, 50
    D_X, D_Y = 1, 1
    lik_var = 0.1

    X = np.random.uniform(size=(N, D_X))
    Y = np.random.uniform(size=(N, D_Y))
    Z = np.random.uniform(size=(M, D_X))  # inducing inputs live in X-space
    Xs = np.random.uniform(size=(Ns, D_X))
    Z[:N, :] = X[:M, :]

    def kerns():
        return [RBF(D_X, lengthscales=0.1), RBF(D_X, lengthscales=0.5)]

    layers_col = init_layers_linear(X, Y, Z, kerns())
    layers_ng = init_layers_linear(X, Y, Z, kerns())

    def lik():
        l = Gaussian()
        l.variance = lik_var
        return l

    last_layer = SGPR_Layer(layers_col[-1].kern,
                            layers_col[-1].feature.Z.read_value(),
                            D_Y,
                            layers_col[-1].mean_function)
    layers_col = layers_col[:-1] + [last_layer]

    m_col = DGP_Collapsed(X, Y, lik(), layers_col)
    m_ng = DGP_Quad(X, Y, lik(), layers_ng, H=200)

    q_mu1 = np.random.randn(M, D_X)
    q_sqrt1 = np.random.randn(M, M)
    q_sqrt1 = np.tril(q_sqrt1)[None, :, :]
    for m in m_col, m_ng:
        m.layers[0].q_mu = q_mu1
        m.layers[0].q_sqrt = q_sqrt1

    # A single natural-gradient step with gamma=1 should make the final layer of the
    # quadrature model match the collapsed (analytic) solution.
    p = [[m_ng.layers[-1].q_mu, m_ng.layers[-1].q_sqrt]]
    NatGradOptimizer(gamma=1.).minimize(m_ng, var_list=p, maxiter=1)

    assert_allclose(m_col.compute_log_likelihood(), m_ng.compute_log_likelihood())
def fit(self, X, Y):
    """
    Optimize the model: natural-gradient steps on the basis parameters,
    Adam steps on everything else.
    """
    if not self.model:
        self.init_model(ODVGP, X, Y)

    var_list = [[self.model.basis.a_beta, self.model.basis.L]]
    self.model.basis.a_beta.set_trainable(False)

    op_ng = NatGradOptimizer(SETTINGS.ng_stepsize).make_optimize_tensor(self.model, var_list=var_list)
    op_adam = AdamOptimizer(SETTINGS.adam_stepsize).make_optimize_tensor(self.model)

    for it in range(SETTINGS.iterations):
        self.sess.run(op_ng)
        self.sess.run(op_adam)

        if it % 50 == 0:
            print('Iter: {}, Loss:{:.4f}'.format(it, self.sess.run(self.model.likelihood_tensor)))

    self.model.anchor(self.sess)
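# The SETTINGS object read by fit() above is not defined in this snippet; a minimal,
# purely illustrative stand-in with the attributes it uses could look like this:
from types import SimpleNamespace

SETTINGS = SimpleNamespace(
    ng_stepsize=0.1,     # natural-gradient step size (gamma), assumed value
    adam_stepsize=0.01,  # Adam learning rate, assumed value
    iterations=5000,     # number of optimisation steps, assumed value
)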
def build_model(ARGS, X, Y, apply_name=True):
    if ARGS.mode == 'CVAE':
        layers = []
        for l in ARGS.configuration.split('_'):
            try:
                layers.append(int(l))
            except ValueError:
                pass

        with defer_build():
            name = 'CVAE' if apply_name else None
            model = CVAE(X, Y, 1, layers, batch_size=ARGS.minibatch_size, name=name)

        model.compile()

        global_step = tf.Variable(0, dtype=tf.int32)
        op_increment = tf.assign_add(global_step, 1)

        lr = tf.cast(tf.train.exponential_decay(ARGS.lr, global_step, 1000, 0.98, staircase=True),
                     dtype=tf.float64)

        op_adam = AdamOptimizer(lr).make_optimize_tensor(model)

        model.train_op = lambda s: s.run([op_adam, op_increment])
        model.init_op = lambda s: s.run(tf.variables_initializer([global_step]))
        model.global_step = global_step

        model.compile()

    else:
        N, D = X.shape

        # first layer inducing points
        if N > ARGS.M:
            Z = kmeans2(X, ARGS.M, minit='points')[0]
        else:
            M_pad = ARGS.M - N
            Z = np.concatenate([X.copy(), np.random.randn(M_pad, D)], 0)

        #################################### layers
        P = np.linalg.svd(X, full_matrices=False)[2]
        # PX = P.copy()

        layers = []
        # quad_layers = []

        DX = D
        DY = 1

        D_in = D
        D_out = D
        with defer_build():
            lik = Gaussian()
            lik.variance = ARGS.likelihood_variance

            if len(ARGS.configuration) > 0:
                for c, d in ARGS.configuration.split('_'):
                    if c == 'G':
                        num_gps = int(d)
                        A = np.zeros((D_in, D_out))
                        D_min = min(D_in, D_out)
                        A[:D_min, :D_min] = np.eye(D_min)
                        mf = Linear(A=A)
                        mf.b.set_trainable(False)

                        def make_kern():
                            k = RBF(D_in, lengthscales=float(D_in) ** 0.5, variance=1., ARD=True)
                            k.variance.set_trainable(False)
                            return k

                        PP = np.zeros((D_out, num_gps))
                        PP[:, :min(num_gps, DX)] = P[:, :min(num_gps, DX)]
                        ZZ = np.random.randn(ARGS.M, D_in)
                        ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]

                        kern = SharedMixedMok(make_kern(), W=PP)
                        inducing = MixedKernelSharedMof(InducingPoints(ZZ))

                        l = GPLayer(kern, inducing, num_gps, mean_function=mf)
                        if ARGS.fix_linear is True:
                            kern.W.set_trainable(False)
                            mf.set_trainable(False)

                        layers.append(l)

                        D_in = D_out

                    elif c == 'L':
                        d = int(d)
                        D_in += d
                        layers.append(LatentVariableLayer(d, XY_dim=DX + 1))

            kern = RBF(D_in, lengthscales=float(D_in) ** 0.5, variance=1., ARD=True)
            ZZ = np.random.randn(ARGS.M, D_in)
            ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]
            layers.append(GPLayer(kern, InducingPoints(ZZ), DY))

            #################################### model
            name = 'Model' if apply_name else None

            if ARGS.mode == 'VI':
                model = DGP_VI(X, Y, layers, lik, minibatch_size=ARGS.minibatch_size, name=name)

            elif ARGS.mode == 'SGHMC':
                for layer in layers:
                    if hasattr(layer, 'q_sqrt'):
                        del layer.q_sqrt
                        layer.q_sqrt = None
                        layer.q_mu.set_trainable(False)

                model = DGP_VI(X, Y, layers, lik, minibatch_size=ARGS.minibatch_size, name=name)

            elif ARGS.mode == 'IWAE':
                model = DGP_IWVI(X, Y, layers, lik,
                                 minibatch_size=ARGS.minibatch_size,
                                 num_samples=ARGS.num_IW_samples,
                                 name=name)

        global_step = tf.Variable(0, dtype=tf.int32)
        op_increment = tf.assign_add(global_step, 1)

        if ARGS.mode != 'SGHMC':
            for layer in model.layers[:-1]:
                if isinstance(layer, GPLayer):
                    layer.q_sqrt = layer.q_sqrt.read_value() * 1e-5

            model.compile()

            #################################### optimization

            # final-layer variational parameters get natural gradients; exclude them from Adam
            var_list = [[model.layers[-1].q_mu, model.layers[-1].q_sqrt]]
            model.layers[-1].q_mu.set_trainable(False)
            model.layers[-1].q_sqrt.set_trainable(False)

            gamma = tf.cast(tf.train.exponential_decay(ARGS.gamma, global_step, 1000, ARGS.gamma_decay,
                                                       staircase=True),
                            dtype=tf.float64)
            lr = tf.cast(tf.train.exponential_decay(ARGS.lr, global_step, 1000, ARGS.lr_decay,
                                                    staircase=True),
                         dtype=tf.float64)

            op_ng = NatGradOptimizer(gamma=gamma).make_optimize_tensor(model, var_list=var_list)
            op_adam = AdamOptimizer(lr).make_optimize_tensor(model)

            def train(s):
                s.run(op_increment)
                s.run(op_ng)
                s.run(op_adam)

            model.train_op = train
            model.init_op = lambda s: s.run(tf.variables_initializer([global_step]))
            model.global_step = global_step

        else:
            model.compile()

            hmc_vars = []
            for layer in layers:
                if hasattr(layer, 'q_mu'):
                    hmc_vars.append(layer.q_mu.unconstrained_tensor)

            hyper_train_op = AdamOptimizer(ARGS.lr).make_optimize_tensor(model)

            sghmc_optimizer = SGHMC(model, hmc_vars, hyper_train_op, 100)

            def train_op(s):
                s.run(op_increment)
                sghmc_optimizer.sghmc_step(s)
                sghmc_optimizer.train_hypers(s)

            model.train_op = train_op
            model.sghmc_optimizer = sghmc_optimizer

            def init_op(s):
                epsilon = 0.01
                mdecay = 0.05
                with tf.variable_scope('hmc'):
                    sghmc_optimizer.generate_update_step(epsilon, mdecay)
                v = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='hmc')
                s.run(tf.variables_initializer(v))
                s.run(tf.variables_initializer([global_step]))

            model.init_op = init_op
            model.global_step = global_step

    return model
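# Usage sketch for the build_model above (hypothetical driver code; ARGS, X, Y come from
# the calling script). The returned model carries its own init_op / train_op hooks:
#
#     model = build_model(ARGS, X, Y)
#     sess = model.enquire_session()
#     model.init_op(sess)
#     for _ in range(ARGS.iterations):
#         model.train_op(sess)
#     model.anchor(sess)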
def build_model(ARGS, X, Y, apply_name=True):
    N, D = X.shape

    # first layer inducing points
    if N > ARGS.M:
        Z = kmeans2(X, ARGS.M, minit="points")[0]
    else:
        M_pad = ARGS.M - N
        Z = np.concatenate([X.copy(), np.random.randn(M_pad, D)], 0)

    #################################### layers
    P = np.linalg.svd(X, full_matrices=False)[2]

    layers = []

    DX = D
    DY = 1

    D_in = D
    D_out = D
    with defer_build():
        lik = Gaussian()
        lik.variance = ARGS.likelihood_variance

        if len(ARGS.configuration) > 0:
            for c, d in ARGS.configuration.split("_"):
                if c == "G":
                    num_gps = int(d)
                    A = np.zeros((D_in, D_out))
                    D_min = min(D_in, D_out)
                    A[:D_min, :D_min] = np.eye(D_min)
                    mf = Linear(A=A)
                    mf.b.set_trainable(False)

                    def make_kern():
                        k = RBF(D_in, lengthscales=float(D_in) ** 0.5, variance=1.0, ARD=True)
                        k.variance.set_trainable(False)
                        return k

                    PP = np.zeros((D_out, num_gps))
                    PP[:, :min(num_gps, DX)] = P[:, :min(num_gps, DX)]
                    ZZ = np.random.randn(ARGS.M, D_in)
                    ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]

                    kern = SharedMixedMok(make_kern(), W=PP)
                    inducing = MixedKernelSharedMof(InducingPoints(ZZ))

                    l = GPLayer(kern, inducing, num_gps, layer_num=len(layers), mean_function=mf)
                    if ARGS.fix_linear is True:
                        kern.W.set_trainable(False)
                        mf.set_trainable(False)

                    layers.append(l)

                    D_in = D_out

                elif c == "L":
                    d = int(d)
                    D_in += d
                    encoder_dims = [int(dim.strip()) for dim in ARGS.encoder_dims.split(",")]
                    layers.append(
                        LatentVariableLayer(d,
                                            XY_dim=DX + 1,
                                            encoder_dims=encoder_dims,
                                            qz_mode=ARGS.qz_mode))

        kern = RBF(D_in, lengthscales=float(D_in) ** 0.5, variance=1.0, ARD=True)
        ZZ = np.random.randn(ARGS.M, D_in)
        ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]
        layers.append(GPLayer(kern, InducingPoints(ZZ), DY))

        #################################### model
        name = "Model" if apply_name else None

        if ARGS.mode == "VI":
            model = DGP_VI(X, Y, layers, lik, minibatch_size=ARGS.minibatch_size, name=name)

        elif ARGS.mode == "IWAE":
            model = DGP_IWVI(
                X=X,
                Y=Y,
                layers=layers,
                likelihood=lik,
                minibatch_size=ARGS.minibatch_size,
                num_samples=ARGS.num_IW_samples,
                name=name,
                encoder_minibatch_size=ARGS.encoder_minibatch_size,
            )

        elif ARGS.mode == "CIWAE":
            model = DGP_CIWAE(
                X,
                Y,
                layers,
                lik,
                minibatch_size=ARGS.minibatch_size,
                num_samples=ARGS.num_IW_samples,
                name=name,
                beta=ARGS.beta,
            )

        else:
            raise ValueError(f"Unknown mode {ARGS.mode}.")

    global_step = tf.Variable(0, dtype=tf.int32)
    op_increment = tf.assign_add(global_step, 1)

    for layer in model.layers[:-1]:
        if isinstance(layer, GPLayer):
            layer.q_sqrt = layer.q_sqrt.read_value() * 1e-5

    model.compile()

    #################################### optimization

    # Whether to train the final layer with the other parameters, using Adam, or by itself,
    # using natural gradients.
    if ARGS.use_nat_grad_for_final_layer:
        # Turn off training so the parameters are not optimised by Adam. We pass them directly
        # to the natgrad optimiser, which bypasses this flag.
        model.layers[-1].q_mu.set_trainable(False)
        model.layers[-1].q_sqrt.set_trainable(False)

        gamma = tf.cast(
            tf.train.exponential_decay(ARGS.gamma, global_step, 1000, ARGS.gamma_decay, staircase=True),
            dtype=tf.float64,
        )
        final_layer_vars = [[model.layers[-1].q_mu, model.layers[-1].q_sqrt]]
        final_layer_opt_op = NatGradOptimizer(gamma=gamma).make_optimize_tensor(model, var_list=final_layer_vars)
    else:
        final_layer_opt_op = NoOp()

    lr = tf.cast(
        tf.train.exponential_decay(ARGS.lr, global_step, decay_steps=1000, decay_rate=ARGS.lr_decay,
                                   staircase=True),
        dtype=tf.float64,
    )
    encoder_lr = tf.cast(
        tf.train.exponential_decay(
            ARGS.encoder_lr,
            global_step,
            decay_steps=1000,
            decay_rate=ARGS.encoder_lr_decay,
            staircase=True,
        ),
        dtype=tf.float64,
    )

    dreg_optimizer = DregOptimizer(
        enable_dreg=ARGS.use_dreg,
        optimizer=ARGS.optimizer,
        encoder_optimizer=ARGS.encoder_optimizer,
        learning_rate=lr,
        encoder_learning_rate=encoder_lr,
        assert_no_nans=ARGS.assert_no_nans,
        encoder_grad_clip_value=ARGS.clip_encoder_grads,
    )
    other_layers_opt_op = dreg_optimizer.make_optimize_tensor(model)

    model.lr = lr
    model.train_op = tf.group(op_increment, final_layer_opt_op, other_layers_opt_op)
    model.init_op = lambda s: s.run(tf.variables_initializer([global_step]))
    model.global_step = global_step

    return model
def build_model(self, ARGS, X, Y, conditioning=False, apply_name=True, noise_var=None, mean_function=None):
    if not conditioning:
        N, D = X.shape

        # first layer inducing points
        if N > ARGS.M:
            Z = kmeans2(X, ARGS.M, minit='points')[0]
        else:
            # This is the old way of initializing Zs
            # M_pad = ARGS.M - N
            # Z = np.concatenate([X.copy(), np.random.randn(M_pad, D)], 0)

            # This is the new way of initializing Zs
            min_x, max_x = self.bounds[0]
            min_x = (min_x - self.x_mean) / self.x_std
            max_x = (max_x - self.x_mean) / self.x_std

            Z = np.linspace(min_x, max_x, num=ARGS.M)  # * X.shape[1])
            Z = Z.reshape((-1, X.shape[1]))
            # print(min_x)
            # print(max_x)
            # print(Z)

        #################################### layers
        P = np.linalg.svd(X, full_matrices=False)[2]
        # PX = P.copy()

        layers = []
        # quad_layers = []

        DX = D
        DY = 1

        D_in = D
        D_out = D

        with defer_build():
            # variance initialization
            lik = Gaussian()
            lik.variance = ARGS.likelihood_variance

            if len(ARGS.configuration) > 0:
                for c, d in ARGS.configuration.split('_'):
                    if c == 'G':
                        num_gps = int(d)
                        A = np.zeros((D_in, D_out))
                        D_min = min(D_in, D_out)
                        A[:D_min, :D_min] = np.eye(D_min)
                        mf = Linear(A=A)
                        mf.b.set_trainable(False)

                        def make_kern():
                            k = RBF(D_in, lengthscales=float(D_in) ** 0.5, variance=1., ARD=True)
                            k.variance.set_trainable(False)
                            return k

                        PP = np.zeros((D_out, num_gps))
                        PP[:, :min(num_gps, DX)] = P[:, :min(num_gps, DX)]
                        ZZ = np.random.randn(ARGS.M, D_in)
                        # print(Z.shape)
                        # print(ZZ.shape)
                        ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]

                        kern = SharedMixedMok(make_kern(), W=PP)
                        inducing = MixedKernelSharedMof(InducingPoints(ZZ))

                        l = GPLayer(kern, inducing, num_gps, mean_function=mf)
                        if ARGS.fix_linear is True:
                            kern.W.set_trainable(False)
                            mf.set_trainable(False)

                        layers.append(l)

                        D_in = D_out

                    elif c == 'L':
                        d = int(d)
                        D_in += d
                        layers.append(LatentVariableLayer(d, XY_dim=DX + 1))

            # kernel initialization
            kern = RBF(D_in, lengthscales=float(D_in) ** 0.5, variance=1., ARD=True)
            ZZ = np.random.randn(ARGS.M, D_in)
            ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]
            layers.append(GPLayer(kern, InducingPoints(ZZ), DY))
            self.layers = layers
            self.lik = lik

        # global_step = tf.Variable(0, dtype=tf.int32)
        # self.global_step = global_step

    else:
        lik = self._gp.likelihood
        layers = self._gp.layers._list
        # val = self.session.run(self.global_step)
        # global_step = tf.Variable(val, dtype=tf.int32)
        # self.global_step = global_step
        self._gp.clear()

    with defer_build():
        #################################### model
        name = 'Model' if apply_name else None

        if ARGS.mode == 'VI':
            model = DGP_VI(X, Y, layers, lik, minibatch_size=ARGS.minibatch_size, name=name)

        elif ARGS.mode == 'SGHMC':
            for layer in layers:
                if hasattr(layer, 'q_sqrt'):
                    del layer.q_sqrt
                    layer.q_sqrt = None
                    layer.q_mu.set_trainable(False)

            model = DGP_VI(X, Y, layers, lik, minibatch_size=ARGS.minibatch_size, name=name)

        elif ARGS.mode == 'IWAE':
            model = DGP_IWVI(X, Y, layers, lik,
                             minibatch_size=ARGS.minibatch_size,
                             num_samples=ARGS.num_IW_samples,
                             name=name)

    global_step = tf.Variable(0, dtype=tf.int32)
    op_increment = tf.assign_add(global_step, 1)

    if ARGS.mode != 'SGHMC':
        for layer in model.layers[:-1]:
            if isinstance(layer, GPLayer):
                layer.q_sqrt = layer.q_sqrt.read_value() * 1e-5

        model.compile()

        #################################### optimization

        # final-layer variational parameters get natural gradients; exclude them from Adam
        var_list = [[model.layers[-1].q_mu, model.layers[-1].q_sqrt]]
        model.layers[-1].q_mu.set_trainable(False)
        model.layers[-1].q_sqrt.set_trainable(False)

        gamma = tf.cast(tf.train.exponential_decay(ARGS.gamma, global_step, 1000, ARGS.gamma_decay,
                                                   staircase=True),
                        dtype=tf.float64)
        lr = tf.cast(tf.train.exponential_decay(ARGS.lr, global_step, 1000, ARGS.lr_decay,
                                                staircase=True),
                     dtype=tf.float64)

        op_ng = NatGradOptimizer(gamma=gamma).make_optimize_tensor(model, var_list=var_list)
        op_adam = AdamOptimizer(lr).make_optimize_tensor(model)

        def train(s):
            s.run(op_increment)
            s.run(op_ng)
            s.run(op_adam)

        model.train_op = train
        model.init_op = lambda s: s.run(tf.variables_initializer([global_step]))
        model.global_step = global_step

    else:
        model.compile()

        sghmc_vars = []
        for layer in layers:
            if hasattr(layer, 'q_mu'):
                sghmc_vars.append(layer.q_mu.unconstrained_tensor)

        hyper_train_op = AdamOptimizer(ARGS.lr).make_optimize_tensor(model)

        self.sghmc_optimizer = SGHMC(model, sghmc_vars, hyper_train_op, 100)

        def train_op(s):
            s.run(op_increment)
            self.sghmc_optimizer.sghmc_step(s)
            self.sghmc_optimizer.train_hypers(s)

        model.train_op = train_op
        model.sghmc_optimizer = self.sghmc_optimizer

        def init_op(s):
            epsilon = 0.01
            mdecay = 0.05
            with tf.variable_scope('sghmc'):
                self.sghmc_optimizer.generate_update_step(epsilon, mdecay)
            v = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='sghmc')
            s.run(tf.variables_initializer(v))
            s.run(tf.variables_initializer([global_step]))

        # Added jitter due to input matrix invertibility problems
        custom_config = gpflow.settings.get_settings()
        custom_config.numerics.jitter_level = 1e-8

        model.init_op = init_op
        model.global_step = global_step

    # build the computation graph for the gradient
    self.X_placeholder = tf.placeholder(tf.float64, shape=[None, X.shape[1]])
    self.Fs, Fmu, Fvar = model._build_predict(self.X_placeholder)
    self.mean_grad = tf.gradients(Fmu, self.X_placeholder)
    self.var_grad = tf.gradients(Fvar, self.X_placeholder)

    # calculate the gradient of the mean for the quantile-filtered distribution
    # print(Fs)
    # q = np.quantile(Fs, self.quantile, axis=0)
    # qFs = [f for f in Fs if f < q]
    # q_mean = np.mean(qFs, axis=0)
    # q_var = np.var(qFs, axis=0)
    # self.qmean_grad = tf.gradients(q_mean, self.X_placeholder)
    # self.qvar_grad = tf.gradients(q_var, self.X_placeholder)

    return model
def test_vs_DGP2(self):
    lik = Gaussian()
    lik_var = 0.1
    lik.variance = lik_var
    N, Ns, D_Y, D_X = self.X.shape[0], self.Xs.shape[0], self.D_Y, self.X.shape[1]
    q_mu = np.random.randn(N, D_X)
    Y = np.random.randn(N, D_Y)
    Ys = np.random.randn(Ns, D_Y)

    kern1 = Matern52(self.X.shape[1], lengthscales=0.5)
    kern2 = Matern52(self.X.shape[1], lengthscales=0.5)
    kerns = [kern1, kern2]
    # mf = Linear(A=np.random.randn(D_X, D_Y), b=np.random.randn(D_Y))
    mf = Zero()

    m_dgp = DGP(self.X, Y, self.X, kerns, lik, mean_function=mf, white=True)
    m_dgp.layers[0].q_mu = q_mu
    m_dgp.layers[0].q_sqrt = m_dgp.layers[0].q_sqrt.read_value() * 1e-24

    Fs, ms, vs = m_dgp.predict_all_layers(self.Xs, 1)
    Z = self.X.copy()
    Z[:len(self.Xs)] = ms[0][0]
    m_dgp.layers[1].feature.Z = Z  # need to put the inducing points in the right place

    var_list = [[m_dgp.layers[1].q_mu, m_dgp.layers[1].q_sqrt]]
    NatGradOptimizer(gamma=1).minimize(m_dgp, var_list=var_list, maxiter=1)

    mean_dgp, var_dgp = m_dgp.predict_y(self.Xs, 1)
    test_lik_dgp = m_dgp.predict_density(self.Xs, Ys, 1)
    pred_m_dgp, pred_v_dgp = m_dgp.predict_f(self.Xs, 1)
    pred_mfull_dgp, pred_vfull_dgp = m_dgp.predict_f_full_cov(self.Xs, 1)

    # mean_functions = [Identity(), mf]
    layer0 = GPMC_Layer(kerns[0], self.X.copy(), D_X, Identity())
    layer1 = GPR_Layer(kerns[1], mf, D_Y)
    m_heinonen = DGP_Heinonen(self.X, Y, lik, [layer0, layer1])

    m_heinonen.layers[0].q_mu = q_mu

    mean_heinonen, var_heinonen = m_heinonen.predict_y(self.Xs, 1)
    test_lik_heinonen = m_heinonen.predict_density(self.Xs, Ys, 1)
    pred_m_heinonen, pred_v_heinonen = m_heinonen.predict_f(self.Xs, 1)
    pred_mfull_heinonen, pred_vfull_heinonen = m_heinonen.predict_f_full_cov(self.Xs, 1)

    tol = 1e-4
    assert_allclose(mean_dgp, mean_heinonen, atol=tol, rtol=tol)
    assert_allclose(test_lik_dgp, test_lik_heinonen, atol=tol, rtol=tol)
    assert_allclose(pred_m_dgp, pred_m_heinonen, atol=tol, rtol=tol)
    assert_allclose(pred_mfull_dgp, pred_mfull_heinonen, atol=tol, rtol=tol)
    assert_allclose(pred_vfull_dgp, pred_vfull_heinonen, atol=tol, rtol=tol)
def train_with_nat(model,
                   gamma_start=1e-5,
                   gamma_add=1e-3,
                   gamma_mul=1.04,
                   gamma_max=0.1,
                   gamma_fallback=1e-1,
                   iterations=500,
                   var_list=None,
                   callback=None,
                   **kwargs):
    # we'll make use of this later when we use a XiTransform
    if var_list is None:
        var_list = [[model.q_mu, model.q_sqrt]]

    with tf.variable_scope("gamma"):
        gamma_start = tf.cast(gamma_start, tf.float64)
        gamma_max = tf.cast(gamma_max, tf.float64)
        mul_step = tf.cast(gamma_mul, tf.float64)
        add_step = tf.cast(gamma_add, tf.float64)

        gamma = tf.Variable(gamma_start, dtype=tf.float64, trainable=False)
        gamma_ref = tf.identity(gamma)

        # we'll reduce by this factor if there's a cholesky failure
        gamma_fallback = tf.cast(gamma_fallback, tf.float64)
        op_fallback_gamma = tf.assign(gamma, gamma_ref * gamma_fallback)

        # grow gamma geometrically at first, then linearly, capped at gamma_max
        diff = tf.where(gamma_ref * mul_step < add_step, gamma_ref * mul_step, add_step)
        op_gamma_inc = tf.assign(gamma, tf.where(gamma_ref + diff > gamma_max, gamma_max, gamma_ref + diff))

    tf.summary.scalar("optimisation/gamma", gamma)

    sess = model.enquire_session()
    tf_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='gamma')
    sess.run(tf.variables_initializer(var_list=tf_vars))

    natgrad = NatGradOptimizer(gamma_ref).make_optimize_action(model, var_list=var_list)

    actions = [natgrad, GammaSchedule(op_gamma_inc)]
    actions = actions if callback is None else actions + callback

    for c in (callback or []):
        try:
            c.init()
        except AttributeError:
            pass  # not every callback defines init()

    it = 0
    while it < iterations:
        try:
            looper = Loop(actions, start=it, stop=iterations)
            looper()
            it = looper.iteration
        except tf.errors.InvalidArgumentError:
            it = looper.iteration
            g, gf = sess.run([gamma_ref, op_fallback_gamma])
            logging.info('gamma = {} on iteration {} is too big! Falling back to {}'.format(g, it, gf))

    model.anchor(model.enquire_session())
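# GammaSchedule is used above but not defined in these snippets. A minimal sketch of what it
# could be, assuming GPflow 1.x's Action interface: an action that runs the gamma-increment
# op once per optimisation step.
from gpflow.actions import Action

class GammaSchedule(Action):
    def __init__(self, op_gamma_inc):
        self.op_gamma_inc = op_gamma_inc

    def run(self, ctx):
        # ctx.session is the session the enclosing Loop executes in
        ctx.session.run(self.op_gamma_inc)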
def train_with_nat_and_adam(model,
                            initial_learning_rate=0.03,
                            learning_rate_steps=2,
                            learning_rate_decay=1.5,
                            gamma_start=1e-5,
                            gamma_add=1e-3,
                            gamma_mul=1.1,
                            gamma_max=0.1,
                            gamma_fallback=1e-1,
                            iterations=500,
                            var_list=None,
                            callback=None,
                            **kwargs):
    # we'll make use of this later when we use a XiTransform
    if var_list is None:
        var_list = [[model.q_mu, model.q_sqrt]]

    # we don't want adam optimizing these
    model.q_mu.set_trainable(False)
    model.q_sqrt.set_trainable(False)

    with tf.variable_scope("learning_rate"):
        global_step = tf.Variable(0, trainable=False)
        starter_learning_rate = initial_learning_rate
        decay_steps = int(iterations / learning_rate_steps)
        decay_rate = 1. / learning_rate_decay
        # evaluating the learning-rate tensor also increments global_step
        learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                                   tf.assign_add(global_step, 1),
                                                   decay_steps,
                                                   decay_rate,
                                                   staircase=True)
    tf.summary.scalar("optimisation/learning_rate", learning_rate)

    sess = model.enquire_session()
    tf_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='learning_rate')
    sess.run(tf.variables_initializer(var_list=tf_vars))

    with tf.variable_scope("gamma"):
        # gamma = tf.Variable(gamma_start, dtype=tf.float64)
        # beta = tf.Variable(1., dtype=tf.float64)
        gamma_start = tf.cast(gamma_start, tf.float64)
        gamma_max = tf.cast(gamma_max, tf.float64)
        mul_step = tf.cast(gamma_mul, tf.float64)
        add_step = tf.cast(gamma_add, tf.float64)

        gamma = tf.Variable(gamma_start, dtype=tf.float64)
        gamma_ref = tf.identity(gamma)

        # we'll reduce by this factor if there's a cholesky failure
        gamma_fallback = tf.cast(gamma_fallback, tf.float64)
        op_fallback_gamma = tf.assign(gamma, gamma * gamma_fallback)

        diff = tf.where(gamma_ref * mul_step < add_step, gamma_ref * mul_step, add_step)
        op_gamma_inc = tf.assign(gamma, tf.where(gamma_ref + diff > gamma_max, gamma_max, gamma_ref + diff))

    tf.summary.scalar("optimisation/gamma", gamma)

    tf_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='gamma')
    sess.run(tf.variables_initializer(var_list=tf_vars))

    natgrad = NatGradOptimizer(gamma_ref).make_optimize_action(model, var_list=var_list)
    adam = AdamOptimizer(learning_rate).make_optimize_action(model)

    actions = [adam, natgrad, GammaSchedule(op_gamma_inc)]
    actions = actions if callback is None else actions + callback

    for c in (callback or []):
        try:
            c.init()
        except AttributeError:
            pass  # not every callback defines init()

    it = 0
    while it < iterations:
        try:
            looper = Loop(actions, start=it, stop=iterations)
            looper()
            it = looper.iteration
        except tf.errors.InvalidArgumentError:
            it = looper.iteration
            g, gf = sess.run([gamma_ref, op_fallback_gamma])
            logging.info('gamma = {} on iteration {} is too big! Falling back to {}'.format(g, it, gf))

    model.anchor(model.enquire_session())