def log_joint(observed):
    model, _, _ = module.p_Y_Xw(observed, X, DROP_RATE, n_basis, net_sizes,
                                n_samples, task)
    log_py_xw = model.local_log_prob('y')
    log_j = zs.log_mean_exp(log_py_xw, 0) * N
    if w_names:
        log_pws = model.local_log_prob(w_names)
        log_j += tf.add_n(log_pws)
    return log_j
def log_likelihood(log_py_xw, std_y_train):
    """Test log likelihood, corrected for target standardization.

    :param log_py_xw: [n_particles, batch_size] or [batch_size]
    :param std_y_train: float
    :return: tensor of shape []. Average log likelihood.
    """
    rank = len(log_py_xw.get_shape())
    if rank == 1:
        log_py_xw = tf.expand_dims(log_py_xw, [0])
    ll = tf.reduce_mean(zs.log_mean_exp(log_py_xw, 0)) - tf.log(std_y_train)
    return ll
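# Reference sketch (not part of the original code): zs.log_mean_exp above is
# assumed to be the usual numerically stable log-mean-exp over particles, and
# the "- log(std_y_train)" term undoes the target standardization. A minimal
# NumPy equivalent for orientation:
import numpy as np
from scipy.special import logsumexp


def log_mean_exp_np(a, axis=0):
    # log(mean_i exp(a_i)), computed as logsumexp minus log(count).
    return logsumexp(a, axis=axis) - np.log(a.shape[axis])


# Toy check: averaging per-particle likelihoods in log space.
toy_log_py_xw = np.log(np.array([[0.2, 0.5],
                                 [0.4, 0.1]]))  # [n_particles, batch]
print(log_mean_exp_np(toy_log_py_xw, axis=0))   # == log([0.3, 0.3])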
def forward(self, observed, reduce_mean=True):
    nodes_q = self.variational(observed).nodes
    _v_inputs = {k: v.tensor for k, v in nodes_q.items()}
    _observed = {**_v_inputs, **observed}
    nodes_p = self.generator(_observed).nodes
    logpxz = self.log_joint(nodes_p)
    logqz = self.log_joint(nodes_q)
    lower_bound = logpxz - logqz
    if self._axis is not None:
        lower_bound = log_mean_exp(lower_bound, self._axis)
    if reduce_mean:
        return fluid.layers.reduce_mean(-lower_bound)
    else:
        return -lower_bound
def main(hps):
    tf.set_random_seed(hps.seed)
    np.random.seed(hps.seed)

    # Load data
    data_path = os.path.join(hps.data_dir, hps.dataset + '.data')
    data_func = dataset.data_dict()[hps.dataset]
    x_train, y_train, x_valid, y_valid, x_test, y_test = data_func(data_path)
    x_train = np.vstack([x_train, x_valid])
    y_train = np.hstack([y_train, y_valid])
    n_train, x_dim = x_train.shape
    x_train, x_test, mean_x_train, std_x_train = dataset.standardize(
        x_train, x_test)
    y_train, y_test, mean_y_train, std_y_train = dataset.standardize(
        y_train, y_test)

    # Define model parameters
    n_hiddens = hps.layers

    # Build the computation graph
    x = tf.placeholder(tf.float32, shape=[None, x_dim])
    y = tf.placeholder(tf.float32, shape=[None])
    layer_sizes = [x_dim] + n_hiddens + [1]
    w_names = ["w" + str(i) for i in range(len(layer_sizes) - 1)]

    meta_model = build_model(x, layer_sizes, hps.n_particles, hps.fix_variance)

    def log_joint(bn):
        log_pws = bn.cond_log_prob(w_names)
        log_py_xw = bn.cond_log_prob('y')
        return tf.add_n(log_pws) + tf.reduce_mean(log_py_xw, 1) * n_train

    meta_model.log_joint = log_joint

    latent = {}
    for i, (n_in, n_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
        buf = tf.get_variable(
            'buf_' + str(i),
            initializer=init_bnn_weight(hps.n_particles, n_in, n_out))
        latent['w' + str(i)] = buf

    hmc = zs.HMC(step_size=hps.lr, n_leapfrogs=10, adapt_step_size=True)
    sample_op, hmc_info = hmc.sample(meta_model, observed={'y': y},
                                     latent=latent)

    var_bn = meta_model.observe(**latent)
    log_joint = var_bn.log_joint()
    optimizer = tf.train.AdamOptimizer(learning_rate=hps.lr)
    global_step = tf.get_variable(
        'global_step', initializer=0, trainable=False)
    opt_op = optimizer.minimize(
        -log_joint, var_list=[var_bn.y_logstd], global_step=global_step)

    # prediction: rmse & log likelihood
    y_mean = var_bn["y_mean"]
    y_pred = tf.reduce_mean(y_mean, 0)
    rmse = tf.sqrt(tf.reduce_mean((y_pred - y) ** 2)) * std_y_train
    log_py_xw = var_bn.cond_log_prob("y")
    log_likelihood = tf.reduce_mean(zs.log_mean_exp(log_py_xw, 0)) - \
        tf.log(std_y_train)
    ystd_avg = var_bn.y_logstd

    # Define training/evaluation parameters
    epochs = hps.n_epoch
    batch_size = hps.batch_size
    iters = int(np.ceil(x_train.shape[0] / float(batch_size)))
    test_freq = hps.test_freq

    # Run the inference
    dump_buf = []
    with wrapped_supervisor.create_sv(hps, global_step=global_step) as sv:
        sess = sv.sess_
        for epoch in range(1, epochs + 1):
            lbs = []
            perm = np.arange(x_train.shape[0])
            np.random.shuffle(perm)
            x_train = x_train[perm]
            y_train = y_train[perm]
            for t in range(iters):
                x_batch = x_train[t * batch_size:(t + 1) * batch_size]
                y_batch = y_train[t * batch_size:(t + 1) * batch_size]
                _, _, accr = sess.run(
                    [sample_op, opt_op, hmc_info.acceptance_rate],
                    feed_dict={x: x_batch, y: y_batch})
                lbs.append(accr)

            if epoch % 10 == 0:
                print('Epoch {}: Acceptance rate = {}'.format(
                    epoch, np.mean(lbs)))

            if epoch % test_freq == 0:
                test_rmse, test_ll = sess.run(
                    [rmse, log_likelihood],
                    feed_dict={x: x_test, y: y_test})
                print('>> TEST')
                print('>> Test rmse = {}, log_likelihood = {}'.format(
                    test_rmse, test_ll))

            if epoch > epochs // 3 and epoch % hps.dump_freq == 0:
                dump_buf.append(
                    sess.run(var_bn['y_mean'], {x: x_test, y: y_test}))

        if len(hps.dump_pred_dir) > 0:
            pred_out = sess.run([var_bn['y_mean'], var_bn.y_logstd],
                                {x: x_test, y: y_test})
            pred_out[0] = np.concatenate(dump_buf, axis=0)
            pred_out[0] = pred_out[0] * std_y_train + mean_y_train
            pred_out[1] = np.exp(pred_out[1])
            f = lambda a, b: [a * std_x_train + mean_x_train,
                              b * std_y_train + mean_y_train]
            todump = pred_out + f(x_test, y_test) + f(x_train, y_train)
            with open(hps.dump_pred_dir, 'wb') as fout:
                import pickle
                pickle.dump(todump, fout)
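# Assumption-labelled sketch: dataset.standardize is used above as if it
# centers and scales with training-set statistics and returns those statistics
# so that RMSE and log likelihood can be reported in the original units
# (hence the "* std_y_train" and "- log(std_y_train)" corrections). A
# hypothetical stand-in with that interface:
import numpy as np


def standardize(data_train, data_test):
    mean = np.mean(data_train, axis=0)
    std = np.std(data_train, axis=0) + 1e-10  # guard against zero variance
    return ((data_train - mean) / std,
            (data_test - mean) / std,
            mean, std)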
    lower_bound = zs.variational.elbo(
        log_joint, observed={'y': y_obs}, latent=latent, axis=0)
    cost = tf.reduce_mean(lower_bound.sgvb())
    lower_bound = tf.reduce_mean(lower_bound)

    learning_rate_ph = tf.placeholder(tf.float32, shape=[])
    optimizer = tf.train.AdamOptimizer(learning_rate_ph)
    infer_op = optimizer.minimize(cost)

    # prediction: rmse & log likelihood
    observed = dict((w_name, latent[w_name][0]) for w_name in w_names)
    observed.update({'y': y_obs})
    model, y_mean = bayesianNN(observed, x, n_x, layer_sizes, n_particles)
    y_pred = tf.reduce_mean(y_mean, 0)
    rmse = tf.sqrt(tf.reduce_mean((y_pred - y) ** 2)) * std_y_train
    log_py_xw = model.local_log_prob('y')
    log_likelihood = tf.reduce_mean(zs.log_mean_exp(log_py_xw, 0)) - \
        tf.log(std_y_train)

    params = tf.trainable_variables()
    for i in params:
        print(i.name, i.get_shape())

    # Run the inference
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(1, epochs + 1):
            time_epoch = -time.time()
            if epoch % anneal_lr_freq == 0:
                learning_rate *= anneal_lr_rate
            lbs = []
            for t in range(iters):
    log_qWs = [log_qW / x_train.shape[0] for log_qW in log_qWs]
    # W_dict maps each weight name to (qW_sample, log_qW); this is the latent
    # input to the ELBO.
    W_dict = dict(zip(W_names, zip(qW_samples, log_qWs)))
    lower_bound = zs.variational.elbo(
        log_joint, {'y': y_obs}, W_dict, axis=0)
    cost = tf.reduce_mean(lower_bound.sgvb())
    lower_bound = tf.reduce_mean(lower_bound)

    # Predictions
    model, h_pred = var_dropout(dict(zip(W_names, qW_samples)), x_obs, n,
                                net_size, n_particles, is_training)
    h_pred = tf.reduce_mean(tf.nn.softmax(h_pred), 0)
    y_pred = tf.argmax(h_pred, 1, output_type=tf.int32)
    acc = tf.reduce_mean(tf.cast(tf.equal(y_pred, y), tf.float32))
    log_py_xw = model.local_log_prob('y')
    log_likelihood = zs.log_mean_exp(log_py_xw, 0)

    optimizer = tf.train.AdamOptimizer(learning_rate_ph, epsilon=1e-4)
    infer = optimizer.minimize(cost)

    params = tf.trainable_variables()
    for i in params:
        print('variable name = {}, shape = {}'.format(i.name, i.get_shape()))

    # Run the inference
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(1, epochs + 1):
            if epoch % anneal_lr_freq == 0:
                learning_rate *= anneal_lr_rate
            time_epoch = -time.time()
def main():
    np.random.seed(1234)
    tf.set_random_seed(1237)

    # Load UCI Boston housing data
    data_path = os.path.join(conf.data_dir, 'housing.data')
    x_train, y_train, x_valid, y_valid, x_test, y_test = \
        dataset.load_uci_boston_housing(data_path)
    N, n_x = x_train.shape

    # Standardize data
    x_train, x_test, _, _ = dataset.standardize(x_train, x_test)
    y_train, y_test, mean_y_train, std_y_train = dataset.standardize(
        y_train, y_test)

    # Define model parameters
    n_hiddens = [50]

    @zs.reuse('model')
    def bayesianNN(observed, x, n_x, layer_sizes, n_particles):
        with zs.BayesianNet(observed=observed) as model:
            ws = []
            for i, (n_in, n_out) in enumerate(zip(layer_sizes[:-1],
                                                  layer_sizes[1:])):
                w_mu = tf.zeros([1, n_out, n_in + 1])
                ws.append(
                    zs.Normal('w' + str(i), w_mu, std=1.,
                              n_samples=n_particles, group_event_ndims=2))

            # forward
            ly_x = tf.expand_dims(
                tf.tile(tf.expand_dims(x, 0), [n_particles, 1, 1]), 3)
            for i in range(len(ws)):
                w = tf.tile(ws[i], [1, tf.shape(x)[0], 1, 1])
                ly_x = tf.concat(
                    [ly_x, tf.ones([n_particles, tf.shape(x)[0], 1, 1])], 2)
                ly_x = tf.matmul(w, ly_x) / \
                    tf.sqrt(tf.to_float(tf.shape(ly_x)[2]))
                if i < len(ws) - 1:
                    ly_x = tf.nn.relu(ly_x)

            y_mean = tf.squeeze(ly_x, [2, 3])
            y_logstd = tf.get_variable(
                'y_logstd', shape=[],
                initializer=tf.constant_initializer(0.))
            y = zs.Normal('y', y_mean, logstd=y_logstd)

        return model, y_mean

    def mean_field_variational(layer_sizes, n_particles):
        with zs.BayesianNet() as variational:
            ws = []
            for i, (n_in, n_out) in enumerate(zip(layer_sizes[:-1],
                                                  layer_sizes[1:])):
                w_mean = tf.get_variable(
                    'w_mean_' + str(i), shape=[1, n_out, n_in + 1],
                    initializer=tf.constant_initializer(0.))
                w_logstd = tf.get_variable(
                    'w_logstd_' + str(i), shape=[1, n_out, n_in + 1],
                    initializer=tf.constant_initializer(0.))
                ws.append(
                    zs.Normal('w' + str(i), w_mean, logstd=w_logstd,
                              n_samples=n_particles, group_event_ndims=2))
        return variational

    # Build the computation graph
    n_particles = tf.placeholder(tf.int32, shape=[], name='n_particles')
    x = tf.placeholder(tf.float32, shape=[None, n_x])
    y = tf.placeholder(tf.float32, shape=[None])
    layer_sizes = [n_x] + n_hiddens + [1]
    w_names = ['w' + str(i) for i in range(len(layer_sizes) - 1)]

    def log_joint(observed):
        model, _ = bayesianNN(observed, x, n_x, layer_sizes, n_particles)
        log_pws = model.local_log_prob(w_names)
        log_py_xw = model.local_log_prob('y')
        return tf.add_n(log_pws) + log_py_xw * N

    variational = mean_field_variational(layer_sizes, n_particles)
    qw_outputs = variational.query(w_names, outputs=True,
                                   local_log_prob=True)
    latent = dict(zip(w_names, qw_outputs))
    y_obs = tf.tile(tf.expand_dims(y, 0), [n_particles, 1])
    lower_bound = tf.reduce_mean(
        zs.sgvb(log_joint, {'y': y_obs}, latent, axis=0))

    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
    grads = optimizer.compute_gradients(-lower_bound)
    infer = optimizer.apply_gradients(grads)

    # prediction: rmse & log likelihood
    observed = dict((w_name, latent[w_name][0]) for w_name in w_names)
    observed.update({'y': y_obs})
    model, y_mean = bayesianNN(observed, x, n_x, layer_sizes, n_particles)
    y_pred = tf.reduce_mean(y_mean, 0)
    rmse = tf.sqrt(tf.reduce_mean((y_pred - y) ** 2)) * std_y_train
    log_py_xw = model.local_log_prob('y')
    log_likelihood = tf.reduce_mean(zs.log_mean_exp(log_py_xw, 0)) - \
        tf.log(std_y_train)

    # Define training/evaluation parameters
    lb_samples = 10
    ll_samples = 5000
    epochs = 500
    batch_size = 10
    iters = int(np.floor(x_train.shape[0] / float(batch_size)))
    test_freq = 10

    # Run the inference
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(1, epochs + 1):
            lbs = []
            for t in range(iters):
                x_batch = x_train[t * batch_size:(t + 1) * batch_size]
                y_batch = y_train[t * batch_size:(t + 1) * batch_size]
                _, lb = sess.run([infer, lower_bound],
                                 feed_dict={n_particles: lb_samples,
                                            x: x_batch, y: y_batch})
                lbs.append(lb)
            print('Epoch {}: Lower bound = {}'.format(epoch, np.mean(lbs)))

            if epoch % test_freq == 0:
                test_lb, test_rmse, test_ll = sess.run(
                    [lower_bound, rmse, log_likelihood],
                    feed_dict={n_particles: ll_samples,
                               x: x_test, y: y_test})
                print('>> TEST')
                print('>> lower bound = {}, rmse = {}, log_likelihood = {}'
                      .format(test_lb, test_rmse, test_ll))
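# Side note (not from the original code): the log_joint functions in these
# scripts scale the minibatch likelihood by the dataset size so that a random
# batch gives an unbiased estimate of the full-data term in the ELBO. A toy
# NumPy check of that scaling, with hypothetical per-example log likelihoods:
import numpy as np

rng = np.random.RandomState(0)
log_lik_full = rng.randn(506)                # per-example log p(y_i | x_i, w)
batch = rng.choice(log_lik_full, size=10, replace=False)
full_term = log_lik_full.sum()
estimate = batch.mean() * log_lik_full.size  # mean over batch, scaled by N
print(full_term, estimate)                   # equal in expectation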
def main():
    # tf.set_random_seed(1237)
    # np.random.seed(1234)
    hps = parser.parse_args()

    # Load data
    data_path = os.path.join(conf.data_dir, hps.dataset + '.data')
    data_func = getattr(dataset, 'load_uci_' + hps.dataset)
    x_train, y_train, x_valid, y_valid, x_test, y_test = data_func(data_path)
    x_train = np.vstack([x_train, x_valid])
    y_train = np.hstack([y_train, y_valid])
    n_train, n_covariates = x_train.shape
    hps.dtype = getattr(tf, hps.dtype)

    # Standardize data
    x_train, x_test, _, _ = dataset.standardize(x_train, x_test)
    y_train, y_test, mean_y_train, std_y_train = dataset.standardize(
        y_train, y_test)

    # Build model
    kernel = RBFKernel(n_covariates)
    x_ph = tf.placeholder(hps.dtype, [None, n_covariates], 'x')
    y_ph = tf.placeholder(hps.dtype, [None], 'y')
    z_pos = tf.get_variable('z/pos', [hps.n_z, n_covariates], hps.dtype,
                            initializer=tf.random_uniform_initializer(-1, 1))
    n_particles_ph = tf.placeholder(tf.int32, [], 'n_particles')
    batch_size = tf.cast(tf.shape(x_ph)[0], hps.dtype)

    model = build_model(hps, kernel, z_pos, x_ph, n_particles_ph)
    variational = build_variational(hps, kernel, z_pos, x_ph, n_particles_ph)

    # ELBO = E_q log [p(y|fx) p(fx|fz) p(fz) / (p(fx|fz) q(fz))],
    # so p(fx|fz) cancels and is removed from both log_joint and latent.
    def log_joint(bn):
        prior, log_py_given_fx = bn.cond_log_prob(['fz', 'y'])
        return prior + log_py_given_fx / batch_size * n_train

    model.log_joint = log_joint

    [var_fz, var_fx] = variational.query(['fz', 'fx'], outputs=True,
                                         local_log_prob=True)
    var_fx = (var_fx[0], tf.zeros_like(var_fx[1]))
    lower_bound = zs.variational.elbo(model, observed={'y': y_ph},
                                      latent={'fz': var_fz, 'fx': var_fx},
                                      axis=0)
    cost = lower_bound.sgvb()
    optimizer = tf.train.AdamOptimizer(learning_rate=hps.lr)
    infer_op = optimizer.minimize(cost)

    # Prediction ops
    model = model.observe(fx=var_fx[0], y=y_ph)
    log_likelihood = model.cond_log_prob('y')
    std_y_train = tf.cast(std_y_train, hps.dtype)
    log_likelihood = zs.log_mean_exp(log_likelihood, 0) / batch_size - \
        tf.log(std_y_train)
    y_pred_mean = tf.reduce_mean(model['y'].distribution.mean, axis=0)
    pred_mse = tf.reduce_mean((y_pred_mean - y_ph) ** 2) * std_y_train ** 2

    def infer_step(sess, x_batch, y_batch):
        fd = {x_ph: x_batch, y_ph: y_batch, n_particles_ph: hps.n_particles}
        return sess.run([infer_op, lower_bound], fd)[1]

    def predict_step(sess, x_batch, y_batch):
        fd = {x_ph: x_batch, y_ph: y_batch,
              n_particles_ph: hps.n_particles_test}
        return sess.run([log_likelihood, pred_mse], fd)

    iters = int(np.ceil(x_train.shape[0] / float(hps.batch_size)))
    test_freq = 100

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(1, hps.n_epoch + 1):
            lbs = []
            indices = np.arange(x_train.shape[0])
            np.random.shuffle(indices)
            x_train = x_train[indices]
            y_train = y_train[indices]
            for t in range(iters):
                lb = infer_step(
                    sess,
                    x_train[t * hps.batch_size:(t + 1) * hps.batch_size],
                    y_train[t * hps.batch_size:(t + 1) * hps.batch_size])
                lbs.append(lb)
            if 10 * epoch % test_freq == 0:
                print('Epoch {}: Lower bound = {}'.format(epoch, np.mean(lbs)))
            if epoch % test_freq == 0:
                test_lls = []
                test_mses = []
                for t in range(0, x_test.shape[0], hps.batch_size):
                    ll, mse = predict_step(sess,
                                           x_test[t:t + hps.batch_size],
                                           y_test[t:t + hps.batch_size])
                    test_lls.append(ll)
                    test_mses.append(mse)
                print('>> TEST')
                print('>> Test log likelihood = {}, rmse = {}'.format(
                    np.mean(test_lls), np.sqrt(np.mean(test_mses))))
def main():
    tf.set_random_seed(1237)
    np.random.seed(2345)

    # Load UCI Boston housing data
    data_path = os.path.join(conf.data_dir, "housing.data")
    x_train, y_train, x_valid, y_valid, x_test, y_test = \
        dataset.load_uci_boston_housing(data_path)
    x_train = np.vstack([x_train, x_valid])
    y_train = np.hstack([y_train, y_valid])
    n_train, x_dim = x_train.shape

    # Standardize data
    x_train, x_test, _, _ = dataset.standardize(x_train, x_test)
    y_train, y_test, mean_y_train, std_y_train = dataset.standardize(
        y_train, y_test)

    # Define model parameters
    n_hiddens = [50]

    # Build the computation graph
    n_particles = tf.placeholder(tf.int32, shape=[], name="n_particles")
    x = tf.placeholder(tf.float32, shape=[None, x_dim])
    y = tf.placeholder(tf.float32, shape=[None])
    layer_sizes = [x_dim] + n_hiddens + [1]
    w_names = ["w" + str(i) for i in range(len(layer_sizes) - 1)]

    model = build_bnn(x, layer_sizes, n_particles)
    variational = build_mean_field_variational(layer_sizes, n_particles)

    def log_joint(bn):
        log_pws = bn.cond_log_prob(w_names)
        log_py_xw = bn.cond_log_prob('y')
        return tf.add_n(log_pws) + tf.reduce_mean(log_py_xw, 1) * n_train

    model.log_joint = log_joint

    lower_bound = zs.variational.elbo(model, {'y': y},
                                      variational=variational, axis=0)
    cost = lower_bound.sgvb()
    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
    infer_op = optimizer.minimize(cost)

    # prediction: rmse & log likelihood
    y_mean = lower_bound.bn["y_mean"]
    y_pred = tf.reduce_mean(y_mean, 0)
    rmse = tf.sqrt(tf.reduce_mean((y_pred - y) ** 2)) * std_y_train
    log_py_xw = lower_bound.bn.cond_log_prob("y")
    log_likelihood = tf.reduce_mean(zs.log_mean_exp(log_py_xw, 0)) - \
        tf.log(std_y_train)

    # Define training/evaluation parameters
    lb_samples = 10
    ll_samples = 5000
    epochs = 500
    batch_size = 10
    iters = (n_train - 1) // batch_size + 1
    test_freq = 10

    # Run the inference
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(1, epochs + 1):
            perm = np.random.permutation(x_train.shape[0])
            x_train = x_train[perm, :]
            y_train = y_train[perm]
            lbs = []
            for t in range(iters):
                x_batch = x_train[t * batch_size:(t + 1) * batch_size]
                y_batch = y_train[t * batch_size:(t + 1) * batch_size]
                _, lb = sess.run([infer_op, lower_bound],
                                 feed_dict={n_particles: lb_samples,
                                            x: x_batch, y: y_batch})
                lbs.append(lb)
            print('Epoch {}: Lower bound = {}'.format(epoch, np.mean(lbs)))

            if epoch % test_freq == 0:
                test_rmse, test_ll = sess.run(
                    [rmse, log_likelihood],
                    feed_dict={n_particles: ll_samples,
                               x: x_test, y: y_test})
                print('>> TEST')
                print('>> Test rmse = {}, log_likelihood = {}'.format(
                    test_rmse, test_ll))
def main():
    tf.set_random_seed(1237)
    np.random.seed(1234)

    # Load UCI Boston housing data
    data_path = os.path.join(conf.data_dir, 'housing.data')
    x_train, y_train, x_valid, y_valid, x_test, y_test = \
        dataset.load_uci_boston_housing(data_path)
    x_train = np.vstack([x_train, x_valid])
    y_train = np.hstack([y_train, y_valid])
    N, n_x = x_train.shape

    # Standardize data
    x_train, x_test, _, _ = dataset.standardize(x_train, x_test)
    y_train, y_test, mean_y_train, std_y_train = dataset.standardize(
        y_train, y_test)

    # Define model parameters
    n_hiddens = [50]

    # Build the computation graph
    n_particles = tf.placeholder(tf.int32, shape=[], name='n_particles')
    x = tf.placeholder(tf.float32, shape=[None, n_x])
    y = tf.placeholder(tf.float32, shape=[None])
    layer_sizes = [n_x] + n_hiddens + [1]
    w_names = ['w' + str(i) for i in range(len(layer_sizes) - 1)]

    def log_joint(observed):
        model, _ = bayesianNN(observed, x, n_x, layer_sizes, n_particles)
        log_pws = model.local_log_prob(w_names)
        log_py_xw = model.local_log_prob('y')
        return tf.add_n(log_pws) + log_py_xw * N

    variational = mean_field_variational(layer_sizes, n_particles)
    qw_outputs = variational.query(w_names, outputs=True,
                                   local_log_prob=True)
    latent = dict(zip(w_names, qw_outputs))
    lower_bound = zs.variational.elbo(log_joint, observed={'y': y},
                                      latent=latent, axis=0)
    cost = tf.reduce_mean(lower_bound.sgvb())
    lower_bound = tf.reduce_mean(lower_bound)

    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
    infer_op = optimizer.minimize(cost)

    # prediction: rmse & log likelihood
    observed = dict((w_name, latent[w_name][0]) for w_name in w_names)
    observed.update({'y': y})
    model, y_mean = bayesianNN(observed, x, n_x, layer_sizes, n_particles)
    y_pred = tf.reduce_mean(y_mean, 0)
    rmse = tf.sqrt(tf.reduce_mean((y_pred - y) ** 2)) * std_y_train
    log_py_xw = model.local_log_prob('y')
    log_likelihood = tf.reduce_mean(zs.log_mean_exp(log_py_xw, 0)) - \
        tf.log(std_y_train)

    # Define training/evaluation parameters
    lb_samples = 10
    ll_samples = 5000
    epochs = 500
    batch_size = 10
    iters = int(np.floor(x_train.shape[0] / float(batch_size)))
    test_freq = 10

    # Run the inference
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(1, epochs + 1):
            lbs = []
            for t in range(iters):
                x_batch = x_train[t * batch_size:(t + 1) * batch_size]
                y_batch = y_train[t * batch_size:(t + 1) * batch_size]
                _, lb = sess.run([infer_op, lower_bound],
                                 feed_dict={n_particles: lb_samples,
                                            x: x_batch, y: y_batch})
                lbs.append(lb)
            print('Epoch {}: Lower bound = {}'.format(epoch, np.mean(lbs)))

            if epoch % test_freq == 0:
                test_lb, test_rmse, test_ll = sess.run(
                    [lower_bound, rmse, log_likelihood],
                    feed_dict={n_particles: ll_samples,
                               x: x_test, y: y_test})
                print('>> TEST')
                print('>> Test lower bound = {}, rmse = {}, '
                      'log_likelihood = {}'.format(test_lb, test_rmse,
                                                   test_ll))
def __init__(self, hps, S):
    # Build the computation graph
    x = tf.placeholder(tf.float32, shape=[None, S.x_dim])
    if hps.regression:
        y = tf.placeholder(tf.float32, shape=[None])
        layer_sizes = [S.x_dim] + hps.layers + [1]
    else:
        y = tf.placeholder(tf.int32, shape=[None])
        layer_sizes = [S.x_dim] + hps.layers + [S.y_dim]

    # ===== MODEL & VARIATIONAL =====
    svgd_latent = dict()
    svgd_variables = dict()

    if hps.regression:
        # observation noise
        std_raw = tf.get_variable(
            'std_raw', shape=[hps.n_particles, 1],
            initializer=tf.constant_initializer(inv_softplus(0.5)))
        svgd_variables['y_std'] = std_raw
        y_std_sym = tf.nn.softplus(std_raw)
        if hps.fix_variance > 0:
            y_std_sym = tf.clip_by_value(y_std_sym, hps.fix_variance,
                                         hps.fix_variance + 1e-5)
        svgd_latent['y_std'] = y_std_sym

    # weight variance
    if hps.model_spec == 'lq':
        w_std_raw = tf.get_variable(
            'w_std_raw', shape=[hps.n_particles],
            initializer=tf.constant_initializer(inv_softplus(0.5)))
        svgd_variables['w_scale'] = w_std_raw
        w_std_sym = tf.nn.softplus(w_std_raw)
        svgd_latent['w_scale'] = w_std_sym

    meta_model = build_model(x, layer_sizes, hps.n_particles, hps.model_spec,
                             hps.regression, hps.logits_w_sd)

    def log_joint(bn):
        rv_names = ["w" + str(i) for i in range(len(layer_sizes) - 1)]
        if hps.regression:
            rv_names += ['y_std']
        if hps.model_spec == 'lq':
            rv_names.append('w_scale')
        log_pws = bn.cond_log_prob(rv_names)
        log_py_xw = bn.cond_log_prob('y')
        return tf.add_n(log_pws) + tf.reduce_mean(log_py_xw, 1) * S.n_train

    meta_model.log_joint = log_joint

    # variational: w
    for i, (n_in, n_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
        buf = tf.get_variable(
            'buf_' + str(i),
            initializer=init_bnn_weight(hps.n_particles, n_in, n_out))
        svgd_latent['w' + str(i)] = svgd_variables['w' + str(i)] = buf

    grad_and_var_w, var_bn = stein_variational_gradient_stationary(
        meta_model, {'y': y}, svgd_latent, variables=svgd_variables,
        method=hps.psvi_method)

    optimizer_class = {
        'adam': tf.train.AdamOptimizer,
        'adagrad': tf.train.AdagradOptimizer
    }[hps.optimizer]
    optimizer = optimizer_class(learning_rate=hps.lr)
    global_step = tf.get_variable('global_step', initializer=0,
                                  trainable=False)
    infer_op = optimizer.apply_gradients(
        [(-g, v) for g, v in grad_and_var_w], global_step=global_step)

    # prediction: rmse & log likelihood
    log_py_xw = var_bn.cond_log_prob("y")
    log_likelihood = tf.reduce_mean(zs.log_mean_exp(log_py_xw, 0))
    if hps.regression:
        y_mean = var_bn["y_mean"]
        y_pred = tf.reduce_mean(y_mean, 0)
        rmse = tf.sqrt(tf.reduce_mean((y_pred - y) ** 2)) * S.std_y_train
        log_likelihood -= tf.log(S.std_y_train)
        ystd_avg = tf.reduce_mean(y_std_sym)
    else:
        y_pred = tf.reduce_mean(tf.exp(var_bn['y_mean']), axis=0)
        rmse = 1 - tf.reduce_mean(tf.to_float(tf.nn.in_top_k(y_pred, y, 1)))
        ystd_avg = tf.constant(-1.)

    self.__dict__.update(locals())
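# Reference sketch (not the repo's implementation): the updates built by
# stein_variational_gradient_stationary / svgd._svgd_stationary follow
# Liu & Wang's Stein variational gradient descent. A minimal NumPy version of
# the standard RBF-kernel update direction, with grad_log_p as a hypothetical
# stand-in for the model's score at each particle:
import numpy as np


def svgd_direction(particles, grad_log_p, bandwidth=1.0):
    # particles:  [n, d] particle locations
    # grad_log_p: [n, d] gradient of log target density at each particle
    n = particles.shape[0]
    diff = particles[:, None, :] - particles[None, :, :]   # [n, n, d]
    sq_dist = np.sum(diff ** 2, axis=-1)                    # [n, n]
    k = np.exp(-sq_dist / (2.0 * bandwidth ** 2))           # kernel matrix
    drive = k.T @ grad_log_p                                # pull toward density
    repulse = np.sum(k[:, :, None] * diff, axis=1) / bandwidth ** 2
    return (drive + repulse) / n                            # [n, d]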
def __init__(self, hps, S):
    # Build the computation graph
    x = tf.placeholder(tf.float32, shape=[None, S.x_dim])
    if hps.regression:
        y = tf.placeholder(tf.float32, shape=[None])
        layer_sizes = [S.x_dim] + hps.layers + [1]
    else:
        y = tf.placeholder(tf.int32, shape=[None])
        layer_sizes = [S.x_dim] + hps.layers + [S.y_dim]

    # ===== MODEL & VARIATIONAL =====
    svgd_latent = dict()
    svgd_variables = dict()

    if hps.regression:
        std_raw = tf.get_variable(
            'std_raw', shape=[hps.n_particles, 1],
            initializer=tf.constant_initializer(inv_softplus(0.5)))
        svgd_variables['y_std'] = std_raw
        y_std_sym = tf.nn.softplus(std_raw)
        if hps.fix_variance > 0:
            y_std_sym = tf.clip_by_value(y_std_sym, hps.fix_variance,
                                         hps.fix_variance + 1e-5)
        svgd_latent['y_std'] = y_std_sym

    real_batch_size = tf.shape(x)[0]
    inp, x_extra = add_perturb_input(x, hps.extra_batch_size, S.n_train,
                                     ptb_type=hps.ptb_type,
                                     ptb_scale=hps.ptb_scale)
    meta_model = build_model(inp, layer_sizes, hps.n_particles,
                             real_batch_size, hps.regression,
                             hps.logits_w_sd)

    def log_likelihood_fn(bn):
        log_py_xw = bn.cond_log_prob('y')
        return tf.reduce_mean(log_py_xw, 1) * S.n_train

    meta_model.log_joint = log_likelihood_fn

    # variational: w
    for i, (n_in, n_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
        buf = tf.get_variable(
            'buf_' + str(i),
            initializer=init_bnn_weight(hps.n_particles, n_in, n_out))
        svgd_latent['w' + str(i)] = svgd_variables['w' + str(i)] = buf

    # combined
    observed_bn_ = {'y': y}
    observed_bn_.update(svgd_latent)
    var_bn = meta_model.observe(**observed_bn_)
    log_likelihood = var_bn.log_joint()
    fval_all = var_bn['y_mean_all']
    log_joint_svgd = log_likelihood

    # ===== PRIOR GRADIENT =====
    fv_all_prior = tf.concat([
        meta_model.observe()['y_mean_all']
        for i in range(hps.mm_n_particles // hps.n_particles)
    ], axis=0)
    pg_indices = tf.concat([
        tf.range(hps.n_mm_sample // 2),
        tf.range(tf.shape(fval_all)[1] - hps.n_mm_sample // 2,
                 tf.shape(fval_all)[1])
    ], axis=0)
    prior_fval = tf.gather(fv_all_prior, pg_indices, axis=1)
    var_fval = tf.to_double(tf.gather(fval_all, pg_indices, axis=1))
    if not hps.regression:
        prior_fval = merge_last_axes(prior_fval, 1)
        var_fval = merge_last_axes(var_fval, 1)
    hpmean, hpcov = reduce_moments_ax0(prior_fval)
    hpprec = matrix_inverse(hpcov, hps.mm_jitter)
    log_joint_svgd += tf.to_float(mvn_log_prob(var_fval, hpprec, hpmean))

    # ===== SVGD-F GRADIENT =====
    svgd_grad, _ = svgd._svgd_stationary(hps.n_particles, log_joint_svgd,
                                         [fval_all], svgd.rbf_kernel,
                                         additional_grad=None,
                                         method=hps.psvi_method)[0]

    # ===== INFER OP =====
    optimizer_class = {
        'adam': tf.train.AdamOptimizer,
        'adagrad': tf.train.AdagradOptimizer
    }[hps.optimizer]
    global_step = tf.get_variable('global_step', initializer=0,
                                  trainable=False)
    if hps.lr_decay:
        lr_sym = tf.train.exponential_decay(hps.lr, global_step, 10000, 0.2,
                                            staircase=True)
    else:
        lr_sym = hps.lr
    optimizer = optimizer_class(learning_rate=lr_sym)
    targ = tf.stop_gradient(svgd_grad + fval_all)
    infer_op = optimizer.minimize(tf.reduce_mean((targ - fval_all) ** 2),
                                  global_step=global_step)

    if hps.regression:
        # The target above doesn't include std; use MAP for it.
        with tf.control_dependencies([infer_op]):
            infer_op = optimizer_class(learning_rate=lr_sym).minimize(
                -(log_likelihood + var_bn.cond_log_prob('y_std')),
                var_list=[std_raw])

    # prediction: rmse & log likelihood
    log_py_xw = var_bn.cond_log_prob("y")
    log_likelihood = tf.reduce_mean(zs.log_mean_exp(log_py_xw, 0))
    if hps.regression:
        log_likelihood -= tf.log(S.std_y_train)
        y_pred = tf.reduce_mean(var_bn['y_mean_sup'], 0)
        rmse = tf.sqrt(tf.reduce_mean((y_pred - y) ** 2)) * S.std_y_train
        ystd_avg = tf.reduce_mean(y_std_sym)
    else:
        y_pred = tf.reduce_mean(tf.exp(var_bn['y_mean_sup']), axis=0)
        rmse = 1 - tf.reduce_mean(tf.to_float(tf.nn.in_top_k(y_pred, y, 1)))
        ystd_avg = tf.constant(-1.)

    self.__dict__.update(locals())
def build_bnn(x_ph, y_ph, weight_ph, n_train_ph, hps):
    inp, n_supervised = inplace_perturb(x_ph, hps.interp_batch_size,
                                        n_train_ph)
    layer_sizes = [x_ph.get_shape().as_list()[1]] + hps.layer_sizes + \
        [y_ph.get_shape().as_list()[1]]
    out_mask = weight_ph[None, :n_supervised, :]

    # ============== MODEL =======================
    weight_sd = np.sqrt(hps.prior_variance)
    meta_model = bnn_meta_model(inp, layer_sizes, hps.n_particles,
                                n_supervised, weight_sd)

    def log_likelihood_fn(bn):
        log_py_xw = bn.cond_log_prob('y')
        assert len(log_py_xw.get_shape().as_list()) == 3  # [nPar, nBa, nOut]
        log_py_xw = tf.reduce_sum(log_py_xw * out_mask, axis=-1)
        return tf.reduce_mean(log_py_xw, 1) * n_train_ph

    meta_model.log_joint = log_likelihood_fn

    # ============== VARIATIONAL ==================
    svgd_latent = dict()
    svgd_variables = dict()

    if hps.use_sigma_exp_transform:
        sigma_transform = tfd.bijectors.Exp()
    else:
        sigma_transform = tfd.bijectors.Softplus()

    std_raw = tf.get_variable('std_raw',
                              shape=[hps.n_particles, 1, layer_sizes[-1]],
                              initializer=tf.zeros_initializer())
    svgd_variables['y_std'] = std_raw
    y_std_sym = sigma_transform.forward(
        std_raw + sigma_transform.inverse(hps.noise_sigma))
    svgd_latent['y_std'] = y_std_sym

    # w
    for i, (n_in, n_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
        w_init = init_bnn_weight(hps.n_particles, n_in, n_out) * weight_sd
        buf = tf.get_variable('buf_' + str(i), initializer=w_init)
        svgd_latent['w' + str(i)] = svgd_variables['w' + str(i)] = buf

    # combined
    observed_bn_ = {'y': y_ph[:n_supervised]}
    observed_bn_.update(svgd_latent)
    var_bn = meta_model.observe(**observed_bn_)
    log_likelihood = var_bn.log_joint()
    fval_all = var_bn['y_mean_all']
    log_joint_svgd = log_likelihood

    # ===== PRIOR GRADIENT =====
    param_names = [name for name in svgd_variables if name.startswith('w')]
    log_prior = var_bn.cond_log_prob(param_names)
    hpv = []
    for i in range(hps.mm_n_particles // hps.n_particles):
        temp_bn = meta_model.observe()
        hpv.append(temp_bn['y_mean_all'])
    hpv = tf.concat(hpv, axis=0)
    n_mms = hps.n_mm_sample // 2
    hp_val = tf.concat([hpv[:, :n_mms], hpv[:, -n_mms:]], axis=1)
    mm_fval = tf.to_double(
        tf.concat([fval_all[:, :n_mms], fval_all[:, -n_mms:]], axis=1))
    hp_val = merge_last_axes(hp_val, 1)
    mm_fval = merge_last_axes(mm_fval, 1)
    hpmean, hpcov = reduce_moments_ax0(hp_val)
    hpprec = matrix_inverse(hpcov, hps.mm_jitter)
    pd = tf.to_float(mvn_log_prob(mm_fval, hpprec, hpmean)) / \
        tf.to_float(hps.n_mm_sample)
    log_joint_svgd += pd

    # ===== SVGD-F GRADIENT =====
    svgd_grad, _ = svgd._svgd_stationary(hps.n_particles, log_joint_svgd,
                                         [fval_all], svgd.rbf_kernel)[0]

    optimizer = tf.train.AdamOptimizer(learning_rate=hps.lr)
    global_step = tf.get_variable('global_step', initializer=0,
                                  trainable=False)
    targ = tf.stop_gradient(svgd_grad + fval_all)
    infer_op = optimizer.minimize(tf.reduce_mean((targ - fval_all) ** 2),
                                  global_step=global_step)

    if getattr(hps, "infer_noise_sigma", False):
        with tf.control_dependencies([infer_op]):
            infer_op = tf.train.AdamOptimizer(learning_rate=hps.lr).minimize(
                -(log_likelihood + var_bn.cond_log_prob('y_std')),
                var_list=[std_raw])

    log_py_xw = tf.reduce_sum(var_bn.cond_log_prob("y") * out_mask, axis=-1)
    log_likelihood = tf.reduce_mean(zs.log_mean_exp(log_py_xw, 0))
    y_pred = tf.reduce_mean(var_bn['y_mean_sup'], axis=0)
    rmse = tf.sqrt(
        tf.reduce_mean(
            (y_pred - y_ph[:n_supervised]) ** 2 * weight_ph[:n_supervised]))

    logs = {
        'rmse': rmse,
        'log_likelihood': log_likelihood,
        'mean_std': tf.reduce_mean(y_std_sym),
        'std_first': y_std_sym[0, 0, 0],
        'std_last': y_std_sym[0, 0, -1]
    }
    for k in logs:
        tf.summary.scalar(k, logs[k])

    return infer_op, var_bn['y_mean_sup'], locals()