def build_model(ARGS, X, Y, apply_name=True):
    N, D = X.shape

    # first layer inducing points
    if N > ARGS.M:
        Z = kmeans2(X, ARGS.M, minit="points")[0]
    else:
        M_pad = ARGS.M - N
        Z = np.concatenate([X.copy(), np.random.randn(M_pad, D)], 0)

    #################################### layers
    P = np.linalg.svd(X, full_matrices=False)[2]

    layers = []

    DX = D
    DY = 1

    D_in = D
    D_out = D

    with defer_build():
        lik = Gaussian()
        lik.variance = ARGS.likelihood_variance

        if len(ARGS.configuration) > 0:
            for c, d in ARGS.configuration.split("_"):
                if c == "G":
                    num_gps = int(d)
                    A = np.zeros((D_in, D_out))
                    D_min = min(D_in, D_out)
                    A[:D_min, :D_min] = np.eye(D_min)
                    mf = Linear(A=A)
                    mf.b.set_trainable(False)

                    def make_kern():
                        k = RBF(D_in, lengthscales=float(D_in) ** 0.5, variance=1.0, ARD=True)
                        k.variance.set_trainable(False)
                        return k

                    PP = np.zeros((D_out, num_gps))
                    PP[:, :min(num_gps, DX)] = P[:, :min(num_gps, DX)]
                    ZZ = np.random.randn(ARGS.M, D_in)
                    ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]

                    kern = SharedMixedMok(make_kern(), W=PP)
                    inducing = MixedKernelSharedMof(InducingPoints(ZZ))

                    l = GPLayer(kern, inducing, num_gps, layer_num=len(layers), mean_function=mf)
                    if ARGS.fix_linear is True:
                        kern.W.set_trainable(False)
                        mf.set_trainable(False)

                    layers.append(l)

                    D_in = D_out

                elif c == "L":
                    d = int(d)
                    D_in += d
                    encoder_dims = [int(dim.strip()) for dim in ARGS.encoder_dims.split(",")]
                    layers.append(
                        LatentVariableLayer(d, XY_dim=DX + 1, encoder_dims=encoder_dims, qz_mode=ARGS.qz_mode)
                    )

        kern = RBF(D_in, lengthscales=float(D_in) ** 0.5, variance=1.0, ARD=True)
        ZZ = np.random.randn(ARGS.M, D_in)
        ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]
        layers.append(GPLayer(kern, InducingPoints(ZZ), DY))

        #################################### model
        name = "Model" if apply_name else None

        if ARGS.mode == "VI":
            model = DGP_VI(X, Y, layers, lik, minibatch_size=ARGS.minibatch_size, name=name)

        elif ARGS.mode == "IWAE":
            model = DGP_IWVI(
                X=X,
                Y=Y,
                layers=layers,
                likelihood=lik,
                minibatch_size=ARGS.minibatch_size,
                num_samples=ARGS.num_IW_samples,
                name=name,
                encoder_minibatch_size=ARGS.encoder_minibatch_size,
            )

        elif ARGS.mode == "CIWAE":
            model = DGP_CIWAE(
                X,
                Y,
                layers,
                lik,
                minibatch_size=ARGS.minibatch_size,
                num_samples=ARGS.num_IW_samples,
                name=name,
                beta=ARGS.beta,
            )

        else:
            raise ValueError(f"Unknown mode {ARGS.mode}.")

    global_step = tf.Variable(0, dtype=tf.int32)
    op_increment = tf.assign_add(global_step, 1)

    for layer in model.layers[:-1]:
        if isinstance(layer, GPLayer):
            layer.q_sqrt = layer.q_sqrt.read_value() * 1e-5

    model.compile()

    #################################### optimization
    # Whether to train the final layer with the other parameters, using Adam, or by itself,
    # using natural gradients.
    if ARGS.use_nat_grad_for_final_layer:
        # Turn off training so the parameters are not optimised by Adam. We pass them directly
        # to the natgrad optimiser, which bypasses this flag.
        model.layers[-1].q_mu.set_trainable(False)
        model.layers[-1].q_sqrt.set_trainable(False)

        gamma = tf.cast(
            tf.train.exponential_decay(ARGS.gamma, global_step, 1000, ARGS.gamma_decay, staircase=True),
            dtype=tf.float64,
        )
        final_layer_vars = [[model.layers[-1].q_mu, model.layers[-1].q_sqrt]]
        final_layer_opt_op = NatGradOptimizer(gamma=gamma).make_optimize_tensor(model, var_list=final_layer_vars)
    else:
        final_layer_opt_op = NoOp()

    lr = tf.cast(
        tf.train.exponential_decay(ARGS.lr, global_step, decay_steps=1000, decay_rate=ARGS.lr_decay, staircase=True),
        dtype=tf.float64,
    )
    encoder_lr = tf.cast(
        tf.train.exponential_decay(
            ARGS.encoder_lr,
            global_step,
            decay_steps=1000,
            decay_rate=ARGS.encoder_lr_decay,
            staircase=True,
        ),
        dtype=tf.float64,
    )

    dreg_optimizer = DregOptimizer(
        enable_dreg=ARGS.use_dreg,
        optimizer=ARGS.optimizer,
        encoder_optimizer=ARGS.encoder_optimizer,
        learning_rate=lr,
        encoder_learning_rate=encoder_lr,
        assert_no_nans=ARGS.assert_no_nans,
        encoder_grad_clip_value=ARGS.clip_encoder_grads,
    )
    other_layers_opt_op = dreg_optimizer.make_optimize_tensor(model)

    model.lr = lr
    model.train_op = tf.group(op_increment, final_layer_opt_op, other_layers_opt_op)
    model.init_op = lambda s: s.run(tf.variables_initializer([global_step]))
    model.global_step = global_step

    return model
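

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original source). It shows how the
# build_model variant above might be driven with the GPflow 1.x / TF 1.x session
# workflow it assumes. All ARGS values below are placeholder assumptions; only
# the attribute names are taken from the references inside build_model. Depending
# on the GPflow version, optimizer slot variables may need explicit initialisation.
def _example_training_loop_iwae(num_iterations=1000):
    from argparse import Namespace

    ARGS = Namespace(
        # model size / data handling (placeholder values)
        M=100, minibatch_size=256, likelihood_variance=0.1,
        # two 5-GP layers; an 'L<d>' token would also require encoder_dims and qz_mode
        configuration="G5_G5", fix_linear=True,
        # objective
        mode="IWAE", num_IW_samples=5, encoder_minibatch_size=None,
        # optimisation schedule
        use_nat_grad_for_final_layer=True, gamma=0.1, gamma_decay=0.98,
        lr=5e-3, lr_decay=0.98, encoder_lr=5e-3, encoder_lr_decay=0.98,
        use_dreg=False, optimizer="adam", encoder_optimizer="adam",
        assert_no_nans=False, clip_encoder_grads=None,
    )
    X = np.random.randn(1000, 3)
    Y = np.random.randn(1000, 1)

    model = build_model(ARGS, X, Y)
    sess = model.enquire_session()
    model.init_op(sess)              # initialises the global_step counter
    for _ in range(num_iterations):
        # train_op is a tf.group of the step increment, the (optional)
        # natural-gradient update of the final layer, and the Adam/DReG update
        sess.run(model.train_op)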
def build_model(ARGS, X, Y, apply_name=True):

    if ARGS.mode == 'CVAE':
        layers = []
        for l in ARGS.configuration.split('_'):
            try:
                layers.append(int(l))
            except:
                pass

        with defer_build():
            name = 'CVAE' if apply_name else None
            model = CVAE(X, Y, 1, layers, batch_size=ARGS.minibatch_size, name=name)

        model.compile()

        global_step = tf.Variable(0, dtype=tf.int32)
        op_increment = tf.assign_add(global_step, 1)

        lr = tf.cast(tf.train.exponential_decay(ARGS.lr, global_step, 1000, 0.98, staircase=True),
                     dtype=tf.float64)
        op_adam = AdamOptimizer(lr).make_optimize_tensor(model)

        model.train_op = lambda s: s.run([op_adam, op_increment])
        model.init_op = lambda s: s.run(tf.variables_initializer([global_step]))
        model.global_step = global_step

        model.compile()

    else:
        N, D = X.shape

        # first layer inducing points
        if N > ARGS.M:
            Z = kmeans2(X, ARGS.M, minit='points')[0]
        else:
            M_pad = ARGS.M - N
            Z = np.concatenate([X.copy(), np.random.randn(M_pad, D)], 0)

        #################################### layers
        P = np.linalg.svd(X, full_matrices=False)[2]
        # PX = P.copy()

        layers = []
        # quad_layers = []

        DX = D
        DY = 1

        D_in = D
        D_out = D

        with defer_build():
            lik = Gaussian()
            lik.variance = ARGS.likelihood_variance

            if len(ARGS.configuration) > 0:
                for c, d in ARGS.configuration.split('_'):
                    if c == 'G':
                        num_gps = int(d)
                        A = np.zeros((D_in, D_out))
                        D_min = min(D_in, D_out)
                        A[:D_min, :D_min] = np.eye(D_min)
                        mf = Linear(A=A)
                        mf.b.set_trainable(False)

                        def make_kern():
                            k = RBF(D_in, lengthscales=float(D_in) ** 0.5, variance=1., ARD=True)
                            k.variance.set_trainable(False)
                            return k

                        PP = np.zeros((D_out, num_gps))
                        PP[:, :min(num_gps, DX)] = P[:, :min(num_gps, DX)]
                        ZZ = np.random.randn(ARGS.M, D_in)
                        ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]

                        kern = SharedMixedMok(make_kern(), W=PP)
                        inducing = MixedKernelSharedMof(InducingPoints(ZZ))

                        l = GPLayer(kern, inducing, num_gps, mean_function=mf)
                        if ARGS.fix_linear is True:
                            kern.W.set_trainable(False)
                            mf.set_trainable(False)

                        layers.append(l)

                        D_in = D_out

                    elif c == 'L':
                        d = int(d)
                        D_in += d
                        layers.append(LatentVariableLayer(d, XY_dim=DX + 1))

            kern = RBF(D_in, lengthscales=float(D_in) ** 0.5, variance=1., ARD=True)
            ZZ = np.random.randn(ARGS.M, D_in)
            ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]
            layers.append(GPLayer(kern, InducingPoints(ZZ), DY))

            #################################### model
            name = 'Model' if apply_name else None

            if ARGS.mode == 'VI':
                model = DGP_VI(X, Y, layers, lik, minibatch_size=ARGS.minibatch_size, name=name)

            elif ARGS.mode == 'SGHMC':
                for layer in layers:
                    if hasattr(layer, 'q_sqrt'):
                        del layer.q_sqrt
                        layer.q_sqrt = None
                        layer.q_mu.set_trainable(False)

                model = DGP_VI(X, Y, layers, lik, minibatch_size=ARGS.minibatch_size, name=name)

            elif ARGS.mode == 'IWAE':
                model = DGP_IWVI(X, Y, layers, lik,
                                 minibatch_size=ARGS.minibatch_size,
                                 num_samples=ARGS.num_IW_samples,
                                 name=name)

        global_step = tf.Variable(0, dtype=tf.int32)
        op_increment = tf.assign_add(global_step, 1)

        if not ('SGHMC' == ARGS.mode):
            for layer in model.layers[:-1]:
                if isinstance(layer, GPLayer):
                    layer.q_sqrt = layer.q_sqrt.read_value() * 1e-5

            model.compile()

            #################################### optimization
            var_list = [[model.layers[-1].q_mu, model.layers[-1].q_sqrt]]

            model.layers[-1].q_mu.set_trainable(False)
            model.layers[-1].q_sqrt.set_trainable(False)

            gamma = tf.cast(tf.train.exponential_decay(ARGS.gamma, global_step, 1000,
                                                       ARGS.gamma_decay, staircase=True),
                            dtype=tf.float64)
            lr = tf.cast(tf.train.exponential_decay(ARGS.lr, global_step, 1000,
                                                    ARGS.lr_decay, staircase=True),
                         dtype=tf.float64)

            op_ng = NatGradOptimizer(gamma=gamma).make_optimize_tensor(model, var_list=var_list)
            op_adam = AdamOptimizer(lr).make_optimize_tensor(model)

            def train(s):
                s.run(op_increment)
                s.run(op_ng)
                s.run(op_adam)

            model.train_op = train
            model.init_op = lambda s: s.run(tf.variables_initializer([global_step]))
            model.global_step = global_step

        else:
            model.compile()

            hmc_vars = []
            for layer in layers:
                if hasattr(layer, 'q_mu'):
                    hmc_vars.append(layer.q_mu.unconstrained_tensor)

            hyper_train_op = AdamOptimizer(ARGS.lr).make_optimize_tensor(model)
            sghmc_optimizer = SGHMC(model, hmc_vars, hyper_train_op, 100)

            def train_op(s):
                s.run(op_increment)
                sghmc_optimizer.sghmc_step(s)
                sghmc_optimizer.train_hypers(s)

            model.train_op = train_op
            model.sghmc_optimizer = sghmc_optimizer

            def init_op(s):
                epsilon = 0.01
                mdecay = 0.05
                with tf.variable_scope('hmc'):
                    sghmc_optimizer.generate_update_step(epsilon, mdecay)
                v = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='hmc')
                s.run(tf.variables_initializer(v))
                s.run(tf.variables_initializer([global_step]))

            model.init_op = init_op
            model.global_step = global_step

    return model
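

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original source). Unlike the
# variant where train_op is a tf.group tensor, here model.train_op is a Python
# callable that takes a session, so it is invoked rather than run. ARGS values
# are placeholder assumptions; only the attribute names come from build_model.
def _example_training_loop_vi(num_iterations=1000):
    from argparse import Namespace

    ARGS = Namespace(
        mode='VI', M=100, minibatch_size=256, likelihood_variance=0.1,
        configuration='G5_G5', fix_linear=True, num_IW_samples=5,
        lr=5e-3, lr_decay=0.98, gamma=0.1, gamma_decay=0.98,
    )
    X = np.random.randn(1000, 3)
    Y = np.random.randn(1000, 1)

    model = build_model(ARGS, X, Y)
    sess = model.enquire_session()
    model.init_op(sess)          # initialises global_step (and, for SGHMC, the sampler state)
    for _ in range(num_iterations):
        model.train_op(sess)     # one natural-gradient step on the final layer, then one Adam step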
def build_model(self, ARGS, X, Y, conditioning=False, apply_name=True, noise_var=None, mean_function=None):

    if conditioning == False:
        N, D = X.shape

        # first layer inducing points
        if N > ARGS.M:
            Z = kmeans2(X, ARGS.M, minit='points')[0]
        else:
            # This is the old way of initializing Zs
            # M_pad = ARGS.M - N
            # Z = np.concatenate([X.copy(), np.random.randn(M_pad, D)], 0)

            # This is the new way of initializing Zs
            min_x, max_x = self.bounds[0]
            min_x = (min_x - self.x_mean) / self.x_std
            max_x = (max_x - self.x_mean) / self.x_std

            Z = np.linspace(min_x, max_x, num=ARGS.M)  # * X.shape[1])
            Z = Z.reshape((-1, X.shape[1]))

        #################################### layers
        P = np.linalg.svd(X, full_matrices=False)[2]
        # PX = P.copy()

        layers = []
        # quad_layers = []

        DX = D
        DY = 1

        D_in = D
        D_out = D

        with defer_build():
            # likelihood variance initialization
            lik = Gaussian()
            lik.variance = ARGS.likelihood_variance

            if len(ARGS.configuration) > 0:
                for c, d in ARGS.configuration.split('_'):
                    if c == 'G':
                        num_gps = int(d)
                        A = np.zeros((D_in, D_out))
                        D_min = min(D_in, D_out)
                        A[:D_min, :D_min] = np.eye(D_min)
                        mf = Linear(A=A)
                        mf.b.set_trainable(False)

                        def make_kern():
                            k = RBF(D_in, lengthscales=float(D_in) ** 0.5, variance=1., ARD=True)
                            k.variance.set_trainable(False)
                            return k

                        PP = np.zeros((D_out, num_gps))
                        PP[:, :min(num_gps, DX)] = P[:, :min(num_gps, DX)]
                        ZZ = np.random.randn(ARGS.M, D_in)
                        ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]

                        kern = SharedMixedMok(make_kern(), W=PP)
                        inducing = MixedKernelSharedMof(InducingPoints(ZZ))

                        l = GPLayer(kern, inducing, num_gps, mean_function=mf)
                        if ARGS.fix_linear is True:
                            kern.W.set_trainable(False)
                            mf.set_trainable(False)

                        layers.append(l)

                        D_in = D_out

                    elif c == 'L':
                        d = int(d)
                        D_in += d
                        layers.append(LatentVariableLayer(d, XY_dim=DX + 1))

            # kernel initialization
            kern = RBF(D_in, lengthscales=float(D_in) ** 0.5, variance=1., ARD=True)
            ZZ = np.random.randn(ARGS.M, D_in)
            ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]
            layers.append(GPLayer(kern, InducingPoints(ZZ), DY))

            self.layers = layers
            self.lik = lik
            # global_step = tf.Variable(0, dtype=tf.int32)
            # self.global_step = global_step

    else:
        lik = self._gp.likelihood
        layers = self._gp.layers._list
        # val = self.session.run(self.global_step)
        # global_step = tf.Variable(val, dtype=tf.int32)
        # self.global_step = global_step
        self._gp.clear()

    with defer_build():
        #################################### model
        name = 'Model' if apply_name else None

        if ARGS.mode == 'VI':
            model = DGP_VI(X, Y, layers, lik, minibatch_size=ARGS.minibatch_size, name=name)

        elif ARGS.mode == 'SGHMC':
            for layer in layers:
                if hasattr(layer, 'q_sqrt'):
                    del layer.q_sqrt
                    layer.q_sqrt = None
                    layer.q_mu.set_trainable(False)

            model = DGP_VI(X, Y, layers, lik, minibatch_size=ARGS.minibatch_size, name=name)

        elif ARGS.mode == 'IWAE':
            model = DGP_IWVI(X, Y, layers, lik,
                             minibatch_size=ARGS.minibatch_size,
                             num_samples=ARGS.num_IW_samples,
                             name=name)

    global_step = tf.Variable(0, dtype=tf.int32)
    op_increment = tf.assign_add(global_step, 1)

    if not ('SGHMC' == ARGS.mode):
        for layer in model.layers[:-1]:
            if isinstance(layer, GPLayer):
                layer.q_sqrt = layer.q_sqrt.read_value() * 1e-5

        model.compile()

        #################################### optimization
        var_list = [[model.layers[-1].q_mu, model.layers[-1].q_sqrt]]

        model.layers[-1].q_mu.set_trainable(False)
        model.layers[-1].q_sqrt.set_trainable(False)

        gamma = tf.cast(tf.train.exponential_decay(ARGS.gamma, global_step, 1000,
                                                   ARGS.gamma_decay, staircase=True),
                        dtype=tf.float64)
        lr = tf.cast(tf.train.exponential_decay(ARGS.lr, global_step, 1000,
                                                ARGS.lr_decay, staircase=True),
                     dtype=tf.float64)

        op_ng = NatGradOptimizer(gamma=gamma).make_optimize_tensor(model, var_list=var_list)
        op_adam = AdamOptimizer(lr).make_optimize_tensor(model)

        def train(s):
            s.run(op_increment)
            s.run(op_ng)
            s.run(op_adam)

        model.train_op = train
        model.init_op = lambda s: s.run(tf.variables_initializer([global_step]))
        model.global_step = global_step

    else:
        model.compile()

        sghmc_vars = []
        for layer in layers:
            if hasattr(layer, 'q_mu'):
                sghmc_vars.append(layer.q_mu.unconstrained_tensor)

        hyper_train_op = AdamOptimizer(ARGS.lr).make_optimize_tensor(model)
        self.sghmc_optimizer = SGHMC(model, sghmc_vars, hyper_train_op, 100)

        def train_op(s):
            s.run(op_increment)
            self.sghmc_optimizer.sghmc_step(s)
            self.sghmc_optimizer.train_hypers(s)

        model.train_op = train_op
        model.sghmc_optimizer = self.sghmc_optimizer

        def init_op(s):
            epsilon = 0.01
            mdecay = 0.05
            with tf.variable_scope('sghmc'):
                self.sghmc_optimizer.generate_update_step(epsilon, mdecay)
            v = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='sghmc')
            s.run(tf.variables_initializer(v))
            s.run(tf.variables_initializer([global_step]))

        # Added jitter due to input matrix invertibility problems
        custom_config = gpflow.settings.get_settings()
        custom_config.numerics.jitter_level = 1e-8

        model.init_op = init_op
        model.global_step = global_step

    # build the computation graph for the gradient
    self.X_placeholder = tf.placeholder(tf.float64, shape=[None, X.shape[1]])
    self.Fs, Fmu, Fvar = model._build_predict(self.X_placeholder)
    self.mean_grad = tf.gradients(Fmu, self.X_placeholder)
    self.var_grad = tf.gradients(Fvar, self.X_placeholder)

    # calculate the gradient of the mean for the quantile-filtered distribution
    # q = np.quantile(Fs, self.quantile, axis=0)
    # qFs = [f for f in Fs if f < q]
    # q_mean = np.mean(qFs, axis=0)
    # q_var = np.var(qFs, axis=0)
    # self.qmean_grad = tf.gradients(q_mean, self.X_placeholder)
    # self.qvar_grad = tf.gradients(q_var, self.X_placeholder)

    return model
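

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original source). It shows how the
# gradient tensors wired up at the end of this build_model method might be
# queried. `wrapper` stands for whatever object owns the method (it must already
# provide bounds, x_mean and x_std, and a _gp attribute when conditioning=True);
# the name and all ARGS values are assumptions, not part of the original code.
def _example_gradient_query(wrapper, ARGS, X, Y):
    model = wrapper.build_model(ARGS, X, Y, conditioning=False)
    sess = model.enquire_session()
    model.init_op(sess)

    # mean_grad / var_grad were built above as tf.gradients of the predictive
    # mean and variance with respect to X_placeholder
    x_query = np.random.randn(5, X.shape[1])
    dmu_dx, dvar_dx = sess.run(
        [wrapper.mean_grad, wrapper.var_grad],
        feed_dict={wrapper.X_placeholder: x_query},
    )
    return dmu_dx, dvar_dx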