## Draw samples of the Omega matrices from the variational posterior q(Omega)
def sample_from_Omega_to_learn(self):
    Omega_from_q = []
    for i in range(self.n_Omega):
        ## Reparameterization: Omega = mean + exp(log_var / 2) * eps, with eps ~ N(0, I)
        z = utils.get_normal_samples(self.mc, self.d_in[i], self.d_out[i])
        Omega_from_q.append(tf.add(tf.multiply(z, tf.exp(self.log_var_Omega[i] / 2)), self.mean_Omega[i]))

    return Omega_from_q
## Draw samples of the W matrices from the variational posterior q(W)
def sample_from_W(self):
    W_from_q = []
    for i in range(self.n_W):
        ## Reparameterization: W = mean + exp(log_var / 2) * eps, with eps ~ N(0, I)
        z = utils.get_normal_samples(self.mc, self.dhat_in[i], self.dhat_out[i])
        self.z = z
        W_from_q.append(tf.add(tf.multiply(z, tf.exp(self.log_var_W[i] / 2)), self.mean_W[i]))

    return W_from_q
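## A minimal standalone NumPy sketch (illustrative only, not part of the model) of the
## reparameterization trick used by the two sampling functions above: a Gaussian variate
## with mean `m` and log-variance `lv` is built as m + exp(lv / 2) * eps, eps ~ N(0, I).
## All names and shapes below are arbitrary choices for the example.
#
#   import numpy as np
#
#   mc, d_in, d_out = 10, 3, 2                    # Monte Carlo samples and matrix dimensions
#   m = np.zeros([d_in, d_out])                   # variational mean
#   lv = -2.0 * np.ones([d_in, d_out])            # variational log-variance
#   eps = np.random.randn(mc, d_in, d_out)        # standard normal noise
#   samples = m + np.exp(lv / 2.0) * eps          # samples ~ N(m, exp(lv)), broadcast over mc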
def __init__(self, likelihood_fun, num_examples, d_in, d_out, n_layers, n_rff, df, kernel_type,
             kernel_arccosine_degree, is_ard, feed_forward, q_Omega_fixed, theta_fixed, learn_Omega):
    """
    :param likelihood_fun: Likelihood function
    :param num_examples: Total number of input samples
    :param d_in: Dimensionality of the input
    :param d_out: Dimensionality of the output
    :param n_layers: Number of hidden layers
    :param n_rff: Number of random features for each layer
    :param df: Number of GPs for each layer
    :param kernel_type: Kernel type - currently only random Fourier features for RBF and arccosine kernels are implemented
    :param kernel_arccosine_degree: Degree parameter of the arccosine kernel
    :param is_ard: Whether the kernel is ARD or isotropic
    :param feed_forward: Whether the original inputs should be fed forward as input to each layer
    :param q_Omega_fixed: Number of initial iterations for which the posterior over Omega is kept fixed
    :param theta_fixed: Number of initial iterations for which the covariance parameters are kept fixed
    :param learn_Omega: How to treat Omega - fixed (from the prior), optimized, or learned variationally
    """
    self.likelihood = likelihood_fun
    self.kernel_type = kernel_type
    self.is_ard = is_ard
    self.feed_forward = feed_forward
    self.q_Omega_fixed = q_Omega_fixed
    self.theta_fixed = theta_fixed
    self.q_Omega_fixed_flag = q_Omega_fixed > 0
    self.theta_fixed_flag = theta_fixed > 0
    self.learn_Omega = learn_Omega
    self.arccosine_degree = kernel_arccosine_degree

    ## These are all scalars
    self.num_examples = num_examples
    self.nl = n_layers  ## Number of hidden layers
    self.n_Omega = n_layers  ## Number of weight matrices equals the number of hidden layers
    self.n_W = n_layers

    ## These are arrays to allow flexibility in the future
    self.n_rff = n_rff * np.ones(n_layers, dtype=np.int64)
    self.df = df * np.ones(n_layers, dtype=np.int64)

    ## Dimensionality of Omega matrices
    if self.feed_forward:
        self.d_in = np.concatenate([[d_in], self.df[:(n_layers - 1)] + d_in])
    else:
        self.d_in = np.concatenate([[d_in], self.df[:(n_layers - 1)]])
    self.d_out = self.n_rff

    ## Dimensionality of W matrices
    if self.kernel_type == "RBF":
        self.dhat_in = self.n_rff * 2
        self.dhat_out = np.concatenate([self.df[:-1], [d_out]])

    if self.kernel_type == "arccosine":
        self.dhat_in = self.n_rff
        self.dhat_out = np.concatenate([self.df[:-1], [d_out]])

    ## When Omega is learned variationally, define the right KL function and the way Omega are constructed
    if self.learn_Omega == "var":
        self.get_kl = self.get_kl_Omega_to_learn
        self.sample_from_Omega = self.sample_from_Omega_to_learn

    ## When Omega is optimized, fix some standard normals throughout the execution that will be used to construct Omega
    if self.learn_Omega == "optim":
        self.get_kl = self.get_kl_Omega_to_learn
        self.sample_from_Omega = self.sample_from_Omega_optim

        self.z_for_Omega_fixed = []
        for i in range(self.n_Omega):
            tmp = utils.get_normal_samples(1, self.d_in[i], self.d_out[i])
            self.z_for_Omega_fixed.append(tf.Variable(tmp[0, :, :], trainable=False))

    ## When Omega is fixed, fix some standard normals throughout the execution that will be used to construct Omega
    if self.learn_Omega == "no":
        self.get_kl = self.get_kl_Omega_fixed
        self.sample_from_Omega = self.sample_from_Omega_fixed

        self.z_for_Omega_fixed = []
        for i in range(self.n_Omega):
            tmp = utils.get_normal_samples(1, self.d_in[i], self.d_out[i])
            self.z_for_Omega_fixed.append(tf.Variable(tmp[0, :, :], trainable=False))

    ## Parameters defining prior over Omega
    self.log_theta_sigma2 = tf.Variable(tf.zeros([n_layers]), name="log_theta_sigma2")

    if self.is_ard:
        self.llscale0 = []
        for i in range(self.nl):
            self.llscale0.append(tf.constant(0.5 * np.log(self.d_in[i]), tf.float32))
    else:
        self.llscale0 = tf.constant(0.5 * np.log(self.d_in), tf.float32)

    if self.is_ard:
        self.log_theta_lengthscale = []
        for i in range(self.nl):
            self.log_theta_lengthscale.append(
                tf.Variable(tf.multiply(tf.ones([self.d_in[i]]), self.llscale0[i]), name="log_theta_lengthscale"))
    else:
        self.log_theta_lengthscale = tf.Variable(self.llscale0, name="log_theta_lengthscale")

    self.prior_mean_Omega, self.log_prior_var_Omega = self.get_prior_Omega(self.log_theta_lengthscale)

    ## Set the prior over weights
    self.prior_mean_W, self.log_prior_var_W = self.get_prior_W()

    ## Initialize posterior parameters
    if self.learn_Omega == "var":
        self.mean_Omega, self.log_var_Omega = self.init_posterior_Omega()
    if self.learn_Omega == "optim":
        self.mean_Omega, self.log_var_Omega = self.init_posterior_Omega()

    self.mean_W, self.log_var_W = self.init_posterior_W()

    ## Set the number of Monte Carlo samples as a placeholder so that it can be different for training and test
    self.mc = tf.placeholder(tf.int32)

    ## Batch data placeholders
    Din = d_in
    Dout = d_out
    self.X = tf.placeholder(tf.float32, [None, Din])
    self.Y = tf.placeholder(tf.float32, [None, Dout])

    ## Build the whole computational graph with relevant quantities as part of the class
    self.loss, self.kl, self.ell, self.layer_out = self.get_nelbo()

    ## Initialize the session
    self.session = tf.Session()
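## A hedged usage sketch of this constructor. The class name `DgpRff` and the
## `likelihoods.Gaussian()` likelihood object are assumptions about the surrounding
## package and may differ in this repository; the numeric values are arbitrary.
#
#   like = likelihoods.Gaussian()                  # hypothetical Gaussian likelihood object
#   model = DgpRff(likelihood_fun=like, num_examples=1000, d_in=5, d_out=1,
#                  n_layers=2, n_rff=100, df=3, kernel_type="RBF",
#                  kernel_arccosine_degree=1, is_ard=True, feed_forward=True,
#                  q_Omega_fixed=0, theta_fixed=0, learn_Omega="var")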
def get_ell(self):
    Din = self.d_in[0]
    MC = self.mc
    N_L = self.nl
    X = self.X
    Y = self.Y
    batch_size = tf.shape(X)[0]  # The actual batch size when X is fed into the computational graph

    ## The representation of the information is based on 3-dimensional tensors (one for each layer)
    ## Each slice [i,:,:] of these tensors is one Monte Carlo realization of the value of the hidden units
    ## At layer zero we simply replicate the input matrix X self.mc times
    self.layer = []
    self.layer.append(tf.multiply(tf.ones([self.mc, batch_size, Din]), X))

    ## Forward propagate information from the input to the output through hidden layers
    Omega_from_q = self.sample_from_Omega()

    for i in range(N_L):
        layer_times_Omega = tf.matmul(self.layer[i], Omega_from_q[i])  # X * Omega

        ## Apply the activation function corresponding to the chosen kernel - PHI
        if self.kernel_type == "RBF":
            Phi = tf.exp(0.5 * self.log_theta_sigma2[i]) / tf.cast(tf.sqrt(1. * self.n_rff[i]), 'float32') * \
                tf.concat(values=[tf.cos(layer_times_Omega), tf.sin(layer_times_Omega)], axis=2)
        if self.kernel_type == "arccosine":
            if self.arccosine_degree == 0:
                Phi = tf.exp(0.5 * self.log_theta_sigma2[i]) / tf.cast(tf.sqrt(1. * self.n_rff[i]), 'float32') * \
                    tf.concat(values=[tf.sign(tf.maximum(layer_times_Omega, 0.0))], axis=2)
            if self.arccosine_degree == 1:
                Phi = tf.exp(0.5 * self.log_theta_sigma2[i]) / tf.cast(tf.sqrt(1. * self.n_rff[i]), 'float32') * \
                    tf.concat(values=[tf.maximum(layer_times_Omega, 0.0)], axis=2)
            if self.arccosine_degree == 2:
                Phi = tf.exp(0.5 * self.log_theta_sigma2[i]) / tf.cast(tf.sqrt(1. * self.n_rff[i]), 'float32') * \
                    tf.concat(values=[tf.square(tf.maximum(layer_times_Omega, 0.0))], axis=2)

        if self.local_reparam:
            ## Local reparameterization: sample the layer output F directly from its marginal Gaussian
            z_for_F_sample = utils.get_normal_samples(self.mc, tf.shape(Phi)[1], self.dhat_out[i])
            mean_F = tf.tensordot(Phi, self.mean_W[i], [[2], [0]])
            var_F = tf.tensordot(tf.pow(Phi, 2), tf.exp(self.log_var_W[i]), [[2], [0]])
            F = tf.add(tf.multiply(z_for_F_sample, tf.sqrt(var_F)), mean_F)
        else:
            W_from_q = self.sample_from_W()
            F = tf.matmul(Phi, W_from_q[i])

        ## In the feed-forward case, no concatenation in the last layer so that F has the same dimensions as Y
        if self.feed_forward and not (i == (N_L - 1)):
            F = tf.concat(values=[F, self.layer[0]], axis=2)

        self.layer.append(F)

    ## Output layer
    layer_out = self.layer[N_L]

    ## Given the output layer, we compute the conditional likelihood across all samples
    ll = self.likelihood.log_cond_prob(Y, layer_out)

    ## Mini-batch estimation of the expected log-likelihood term
    ell = tf.reduce_sum(tf.reduce_mean(ll, 0)) * self.num_examples / tf.cast(batch_size, "float32")

    return ell, layer_out
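## A small NumPy sketch (illustrative only) of the RBF random-feature map used in get_ell:
## with the columns of Omega drawn from the spectral density of the RBF kernel, the features
## Phi = sqrt(sigma2 / n_rff) * [cos(X Omega), sin(X Omega)] satisfy Phi Phi^T ~= k(X, X).
## Names and shapes below are assumptions chosen for the example.
#
#   import numpy as np
#
#   n, d, n_rff, sigma2, lengthscale = 50, 3, 500, 1.0, 1.0
#   X = np.random.randn(n, d)
#   Omega = np.random.randn(d, n_rff) / lengthscale        # spectral frequencies of the RBF kernel
#   XOmega = X.dot(Omega)
#   Phi = np.sqrt(sigma2 / n_rff) * np.concatenate([np.cos(XOmega), np.sin(XOmega)], axis=1)
#   K_approx = Phi.dot(Phi.T)                              # approximates the RBF kernel matrix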