Example #1
    def sample_from_Omega_to_learn(self):
        ## Draw self.mc Monte Carlo samples of each Omega matrix from the Gaussian
        ## variational posterior q(Omega) using the reparameterization trick
        Omega_from_q = []
        for i in range(self.n_Omega):
            z = utils.get_normal_samples(self.mc, self.d_in[i], self.d_out[i])
            ## Omega = mean + exp(log_var / 2) * z, with z ~ N(0, 1)
            Omega_from_q.append(tf.add(tf.multiply(z, tf.exp(self.log_var_Omega[i] / 2)), self.mean_Omega[i]))

        return Omega_from_q
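
Example #1 draws self.mc Monte Carlo samples of the spectral frequencies Omega from the Gaussian variational posterior q(Omega) via the reparameterization trick: a standard normal draw z is scaled by the posterior standard deviation exp(log_var / 2) and shifted by the posterior mean. The following is a minimal NumPy sketch of that transformation, with illustrative shapes and names (it is not part of the class above):

import numpy as np

def reparam_sample(mean, log_var, n_mc, rng=np.random):
    """Draw n_mc samples from N(mean, exp(log_var)) via the reparameterization trick."""
    z = rng.standard_normal((n_mc,) + mean.shape)   # z ~ N(0, 1)
    return mean + np.exp(0.5 * log_var) * z         # shift and scale: mean + std * z

# Illustrative shapes: one Omega matrix of size d_in x n_rff
mean_Omega = np.zeros((3, 8))
log_var_Omega = np.zeros((3, 8))                    # log-variance 0, i.e. unit variance
samples = reparam_sample(mean_Omega, log_var_Omega, n_mc=10)
print(samples.shape)                                # (10, 3, 8)
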
Example #2
    def sample_from_W(self):
        ## Draw self.mc Monte Carlo samples of each weight matrix W from the Gaussian
        ## variational posterior q(W), again via the reparameterization trick
        W_from_q = []
        for i in range(self.n_W):
            z = utils.get_normal_samples(self.mc, self.dhat_in[i], self.dhat_out[i])
            self.z = z
            W_from_q.append(tf.add(tf.multiply(z, tf.exp(self.log_var_W[i] / 2)), self.mean_W[i]))
        return W_from_q
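
Example #2 samples the weight matrices W of each layer in the same way, this time with shapes dhat_in x dhat_out. Both examples rely on utils.get_normal_samples; judging from how it is called, it returns an [mc, d_in, d_out] tensor of standard normal draws. A plausible TensorFlow 1.x sketch of such a helper is given below (an assumption about its behavior, not the repository's actual implementation):

import tensorflow as tf  # TensorFlow 1.x API, matching the examples above

def get_normal_samples(mc, d_in, d_out):
    """Return an [mc, d_in, d_out] tensor of i.i.d. standard normal samples.
    mc may be a Python int or an int32 placeholder, as in the class above."""
    shape = tf.stack([tf.cast(mc, tf.int32), tf.cast(d_in, tf.int32), tf.cast(d_out, tf.int32)])
    return tf.random_normal(shape)
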
Example #3
    def __init__(self, likelihood_fun, num_examples, d_in, d_out, n_layers,
                 n_rff, df, kernel_type, kernel_arccosine_degree, is_ard,
                 feed_forward, q_Omega_fixed, theta_fixed, learn_Omega):
        """
        :param likelihood_fun: Likelihood function
        :param num_examples: total number of input samples
        :param d_in: Dimensionality of the input
        :param d_out: Dimensionality of the output
        :param n_layers: Number of hidden layers
        :param n_rff: Number of random features for each layer
        :param df: Number of GPs for each layer
        :param kernel_type: Kernel type: currently only random Fourier features for RBF and arccosine kernels are implemented
        :param kernel_arccosine_degree: degree parameter of the arccosine kernel
        :param is_ard: Whether the kernel is ARD or isotropic
        :param feed_forward: Whether the original inputs should be fed forward as input to each layer
        :param q_Omega_fixed: Whether the Omega weights should be fixed throughout the optimization
        :param theta_fixed: Whether covariance parameters should be fixed throughout the optimization
        :param learn_Omega: How to treat Omega - fixed (from the prior), optimized, or learned variationally
        """
        self.likelihood = likelihood_fun
        self.kernel_type = kernel_type
        self.is_ard = is_ard
        self.feed_forward = feed_forward
        self.q_Omega_fixed = q_Omega_fixed
        self.theta_fixed = theta_fixed
        self.q_Omega_fixed_flag = q_Omega_fixed > 0
        self.theta_fixed_flag = theta_fixed > 0
        self.learn_Omega = learn_Omega
        self.arccosine_degree = kernel_arccosine_degree

        ## These are all scalars
        self.num_examples = num_examples
        self.nl = n_layers  ## Number of hidden layers
        self.n_Omega = n_layers  ## Number of Omega weight matrices equals the number of hidden layers
        self.n_W = n_layers

        ## These are arrays to allow flexibility in the future
        self.n_rff = n_rff * np.ones(n_layers, dtype=np.int64)
        self.df = df * np.ones(n_layers, dtype=np.int64)

        ## Dimensionality of Omega matrices
        if self.feed_forward:
            self.d_in = np.concatenate([[d_in],
                                        self.df[:(n_layers - 1)] + d_in])
        else:
            self.d_in = np.concatenate([[d_in], self.df[:(n_layers - 1)]])
        self.d_out = self.n_rff

        ## Dimensionality of W matrices
        if self.kernel_type == "RBF":
            self.dhat_in = self.n_rff * 2
            self.dhat_out = np.concatenate([self.df[:-1], [d_out]])

        if self.kernel_type == "arccosine":
            self.dhat_in = self.n_rff
            self.dhat_out = np.concatenate([self.df[:-1], [d_out]])

        ## When Omega is learned variationally, define the right KL function and the way Omega are constructed
        if self.learn_Omega == "var":
            self.get_kl = self.get_kl_Omega_to_learn
            self.sample_from_Omega = self.sample_from_Omega_to_learn

        ## When Omega is optimized, fix some standard normals throughout the execution that will be used to construct Omega
        if self.learn_Omega == "optim":
            self.get_kl = self.get_kl_Omega_to_learn
            self.sample_from_Omega = self.sample_from_Omega_optim

            self.z_for_Omega_fixed = []
            for i in range(self.n_Omega):
                tmp = utils.get_normal_samples(1, self.d_in[i], self.d_out[i])
                self.z_for_Omega_fixed.append(
                    tf.Variable(tmp[0, :, :], trainable=False))

        ## When Omega is fixed, fix some standard normals throughout the execution that will be used to construct Omega
        if self.learn_Omega == "no":
            self.get_kl = self.get_kl_Omega_fixed
            self.sample_from_Omega = self.sample_from_Omega_fixed

            self.z_for_Omega_fixed = []
            for i in range(self.n_Omega):
                tmp = utils.get_normal_samples(1, self.d_in[i], self.d_out[i])
                self.z_for_Omega_fixed.append(
                    tf.Variable(tmp[0, :, :], trainable=False))

        ## Parameters defining prior over Omega
        self.log_theta_sigma2 = tf.Variable(tf.zeros([n_layers]),
                                            name="log_theta_sigma2")

        if self.is_ard:
            self.llscale0 = []
            for i in range(self.nl):
                self.llscale0.append(
                    tf.constant(0.5 * np.log(self.d_in[i]), tf.float32))
        else:
            self.llscale0 = tf.constant(0.5 * np.log(self.d_in), tf.float32)

        if self.is_ard:
            self.log_theta_lengthscale = []
            for i in range(self.nl):
                self.log_theta_lengthscale.append(
                    tf.Variable(tf.multiply(tf.ones([self.d_in[i]]),
                                            self.llscale0[i]),
                                name="log_theta_lengthscale"))
        else:
            self.log_theta_lengthscale = tf.Variable(
                self.llscale0, name="log_theta_lengthscale")
        self.prior_mean_Omega, self.log_prior_var_Omega = self.get_prior_Omega(
            self.log_theta_lengthscale)

        ## Set the prior over weights
        self.prior_mean_W, self.log_prior_var_W = self.get_prior_W()

        ## Initialize posterior parameters
        if self.learn_Omega == "var":
            self.mean_Omega, self.log_var_Omega = self.init_posterior_Omega()
        if self.learn_Omega == "optim":
            self.mean_Omega, self.log_var_Omega = self.init_posterior_Omega()

        self.mean_W, self.log_var_W = self.init_posterior_W()

        ## Set the number of Monte Carlo samples as a placeholder so that it can be different for training and test
        self.mc = tf.placeholder(tf.int32)

        ## Batch data placeholders
        Din = d_in
        Dout = d_out
        self.X = tf.placeholder(tf.float32, [None, Din])
        self.Y = tf.placeholder(tf.float32, [None, Dout])

        ## Builds whole computational graph with relevant quantities as part of the class
        self.loss, self.kl, self.ell, self.layer_out = self.get_nelbo()

        ## Initialize the session
        self.session = tf.Session()
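
    ## ------------------------------------------------------------------
    ## Hypothetical usage sketch for the constructor documented above.
    ## The class name (DgpRff) and the concrete argument values are
    ## assumptions for illustration only, not part of the original code:
    ##
    ##   model = DgpRff(likelihood_fun=lik, num_examples=1000, d_in=4, d_out=1,
    ##                  n_layers=2, n_rff=100, df=3, kernel_type="RBF",
    ##                  kernel_arccosine_degree=1, is_ard=False, feed_forward=True,
    ##                  q_Omega_fixed=0, theta_fixed=0, learn_Omega="var")
    ## ------------------------------------------------------------------
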
    def get_ell(self):
        Din = self.d_in[0]
        MC = self.mc
        N_L = self.nl
        X = self.X
        Y = self.Y
        ## The actual batch size when X is passed to the graph of computations
        batch_size = tf.shape(X)[0]

        ## The representation of the information is based on 3-dimensional tensors (one for each layer)
        ## Each slice [i,:,:] of these tensors is one Monte Carlo realization of the value of the hidden units
        ## At layer zero we simply replicate the input matrix X self.mc times
        self.layer = []
        self.layer.append(tf.multiply(tf.ones([self.mc, batch_size, Din]), X))

        ## Forward propagate information from the input to the output through hidden layers
        Omega_from_q = self.sample_from_Omega()

        for i in range(N_L):
            layer_times_Omega = tf.matmul(self.layer[i],
                                          Omega_from_q[i])  # X * Omega

            ## Apply the activation function corresponding to the chosen kernel - PHI
            if self.kernel_type == "RBF":
                Phi = tf.exp(0.5 * self.log_theta_sigma2[i]) / tf.cast(
                    tf.sqrt(1. * self.n_rff[i]), 'float32') * tf.concat(
                        values=[
                            tf.cos(layer_times_Omega),
                            tf.sin(layer_times_Omega)
                        ],
                        axis=2)
            if self.kernel_type == "arccosine":
                if self.arccosine_degree == 0:
                    Phi = tf.exp(0.5 * self.log_theta_sigma2[i]) / tf.cast(
                        tf.sqrt(1. * self.n_rff[i]), 'float32') * tf.concat(
                            values=[
                                tf.sign(tf.maximum(layer_times_Omega, 0.0))
                            ],
                            axis=2)
                if self.arccosine_degree == 1:
                    Phi = tf.exp(0.5 * self.log_theta_sigma2[i]) / tf.cast(
                        tf.sqrt(1. * self.n_rff[i]), 'float32') * tf.concat(
                            values=[tf.maximum(layer_times_Omega, 0.0)],
                            axis=2)
                if self.arccosine_degree == 2:
                    Phi = tf.exp(0.5 * self.log_theta_sigma2[i]) / tf.cast(
                        tf.sqrt(1. * self.n_rff[i]), 'float32') * tf.concat(
                            values=[
                                tf.square(tf.maximum(layer_times_Omega, 0.0))
                            ],
                            axis=2)

            if self.local_reparam:
                ## Local reparameterization: sample the layer pre-activations F directly
                ## from their induced Gaussian instead of sampling W explicitly
                z_for_F_sample = utils.get_normal_samples(self.mc, tf.shape(Phi)[1], self.dhat_out[i])
                mean_F = tf.tensordot(Phi, self.mean_W[i], [[2], [0]])
                var_F = tf.tensordot(tf.pow(Phi, 2), tf.exp(self.log_var_W[i]), [[2], [0]])
                F = tf.add(tf.multiply(z_for_F_sample, tf.sqrt(var_F)), mean_F)
            else:
                W_from_q = self.sample_from_W()
                F = tf.matmul(Phi, W_from_q[i])

            ## In the feed-forward case, skip the concatenation at the last layer so that F has the same dimensions as Y
            if self.feed_forward and not (i == (N_L - 1)):
                F = tf.concat(values=[F, self.layer[0]], axis=2)

            self.layer.append(F)

        ## Output layer
        layer_out = self.layer[N_L]

        ## Given the output layer, we compute the conditional likelihood across all samples
        ll = self.likelihood.log_cond_prob(Y, layer_out)

        ## Mini-batch estimate of the expected log-likelihood term, rescaled from the batch to the full dataset
        ell = tf.reduce_sum(tf.reduce_mean(ll, 0)) * self.num_examples / tf.cast(batch_size, "float32")

        return ell, layer_out
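
In the RBF branch of get_ell above, Phi is the standard random Fourier feature map: cosines and sines of X * Omega scaled by sqrt(sigma^2 / n_rff), whose inner products approximate the RBF covariance. The following self-contained NumPy check illustrates that approximation for a single kernel evaluation (it is independent of the class above, with illustrative variable names):

import numpy as np

rng = np.random.RandomState(0)
d, n_rff, sigma2, lengthscale = 3, 2000, 1.0, 1.0

# Spectral frequencies of the RBF kernel: Omega ~ N(0, 1 / lengthscale^2)
Omega = rng.randn(d, n_rff) / lengthscale

def phi(X):
    """Random Fourier feature map: sqrt(sigma2 / n_rff) * [cos(X Omega), sin(X Omega)]."""
    X_times_Omega = X.dot(Omega)
    return np.sqrt(sigma2 / n_rff) * np.concatenate([np.cos(X_times_Omega), np.sin(X_times_Omega)], axis=1)

x, y = rng.randn(1, d), rng.randn(1, d)
approx = phi(x).dot(phi(y).T)[0, 0]
exact = sigma2 * np.exp(-0.5 * np.sum((x - y) ** 2) / lengthscale ** 2)
print(approx, exact)   # the two values agree up to Monte Carlo error in n_rff
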