class DGP_Base(Model):
    """
    The base class for Deep Gaussian process models.

    Implements a Monte-Carlo variational bound and convenience functions.
    """

    def __init__(self, X, Y, likelihood, layers, minibatch_size=None, num_samples=1):
        Model.__init__(self)
        self.num_samples = num_samples
        self.num_data = X.shape[0]

        if minibatch_size:
            self.X = Minibatch(X, minibatch_size, seed=0)
            self.Y = Minibatch(Y, minibatch_size, seed=0)
        else:
            self.X = DataHolder(X)
            self.Y = DataHolder(Y)

        self.likelihood = BroadcastingLikelihood(likelihood)
        self.layers = ParamList(layers)

    @params_as_tensors
    def propagate(self, X, full_cov=False, S=1, zs=None):
        sX = tf.tile(tf.expand_dims(X, 0), [S, 1, 1])

        Fs, Fmeans, Fvars = [], [], []

        F = sX
        zs = zs or [None, ] * len(self.layers)
        for layer, z in zip(self.layers, zs):
            F, Fmean, Fvar = layer.sample_from_conditional(F, z=z, full_cov=full_cov)

            Fs.append(F)
            Fmeans.append(Fmean)
            Fvars.append(Fvar)

        return Fs, Fmeans, Fvars

    @params_as_tensors
    def _build_predict(self, X, full_cov=False, S=1):
        Fs, Fmeans, Fvars = self.propagate(X, full_cov=full_cov, S=S)
        return Fmeans[-1], Fvars[-1]

    def E_log_p_Y(self, X, Y):
        """
        Calculate the expectation of the data log likelihood under the variational
        distribution with MC samples.
        """
        Fmean, Fvar = self._build_predict(X, full_cov=False, S=self.num_samples)
        var_exp = self.likelihood.variational_expectations(Fmean, Fvar, Y)  # S, N, D
        return tf.reduce_mean(var_exp, 0)  # N, D

    @params_as_tensors
    def _build_likelihood(self):
        L = tf.reduce_sum(self.E_log_p_Y(self.X, self.Y))
        KL = tf.reduce_sum([layer.KL() for layer in self.layers])
        scale = tf.cast(self.num_data, float_type)
        scale /= tf.cast(tf.shape(self.X)[0], float_type)  # minibatch size
        return L * scale - KL

    @autoflow((float_type, [None, None]), (tf.int32, []))
    def predict_f(self, Xnew, num_samples):
        return self._build_predict(Xnew, full_cov=False, S=num_samples)

    @autoflow((float_type, [None, None]), (tf.int32, []))
    def predict_f_full_cov(self, Xnew, num_samples):
        return self._build_predict(Xnew, full_cov=True, S=num_samples)

    @autoflow((float_type, [None, None]), (tf.int32, []))
    def predict_all_layers(self, Xnew, num_samples):
        return self.propagate(Xnew, full_cov=False, S=num_samples)

    @autoflow((float_type, [None, None]), (tf.int32, []))
    def predict_all_layers_full_cov(self, Xnew, num_samples):
        return self.propagate(Xnew, full_cov=True, S=num_samples)

    @autoflow((float_type, [None, None]), (tf.int32, []))
    def predict_y(self, Xnew, num_samples):
        Fmean, Fvar = self._build_predict(Xnew, full_cov=False, S=num_samples)
        return self.likelihood.predict_mean_and_var(Fmean, Fvar)

    @autoflow((float_type, [None, None]), (float_type, [None, None]), (tf.int32, []))
    def predict_density(self, Xnew, Ynew, num_samples):
        Fmean, Fvar = self._build_predict(Xnew, full_cov=False, S=num_samples)
        l = self.likelihood.predict_density(Fmean, Fvar, Ynew)
        log_num_samples = tf.log(tf.cast(num_samples, float_type))
        return tf.reduce_logsumexp(l - log_num_samples, axis=0)
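
# Illustrative usage only (not part of the original source): a minimal sketch of
# constructing and training a two-layer DGP with the class above in a gpflow 1.x /
# TensorFlow 1.x environment. The layer-construction helper init_layers_linear from
# the doubly_stochastic_dgp package, and all data/variable names below, are
# assumptions made for this example.
#
# import numpy as np
# from gpflow.kernels import RBF
# from gpflow.likelihoods import Gaussian
# from gpflow.training import AdamOptimizer
# from doubly_stochastic_dgp.layer_initializations import init_layers_linear
#
# X_train = np.random.rand(100, 1)
# Y_train = np.sin(10 * X_train) + 0.05 * np.random.randn(100, 1)
# Z = X_train[::5].copy()                                     # 20 inducing inputs
#
# layers = init_layers_linear(X_train, Y_train, Z, kernels=[RBF(1), RBF(1)])
# model = DGP_Base(X_train, Y_train, Gaussian(), layers, num_samples=5)
#
# AdamOptimizer(0.01).minimize(model, maxiter=1000)
# mean, var = model.predict_y(X_train, 25)                    # each of shape (25, 100, 1)
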
class LikelihoodTester(Model):
    def __init__(self, likelihood):
        Model.__init__(self)
        self.wrapped_likelihood = BroadcastingLikelihood(likelihood)
        self.likelihood = likelihood

    def _build_likelihood(self):
        return tf.cast(0., dtype=settings.float_type)

    @params_as_tensors
    @autoflow((settings.float_type, [None, None, None]), (settings.float_type, [None, None]))
    def logp1(self, F, Y):
        return self.wrapped_likelihood.logp(F, Y)

    @params_as_tensors
    @autoflow((settings.float_type, [None, None, None]), (settings.float_type, [None, None]))
    def logp2(self, F, Y):
        f = lambda a: self.likelihood.logp(a, Y)
        return tf.stack(tf.map_fn(f, F, dtype=settings.float_type))

    @params_as_tensors
    @autoflow((settings.float_type, [None, None, None]))
    def conditional_mean1(self, F):
        return self.wrapped_likelihood.conditional_mean(F)

    @params_as_tensors
    @autoflow((settings.float_type, [None, None, None]))
    def conditional_mean2(self, F):
        f = lambda a: tf.cast(self.likelihood.conditional_mean(a), dtype=settings.float_type)
        return tf.stack(tf.map_fn(f, F, dtype=settings.float_type))

    @params_as_tensors
    @autoflow((settings.float_type, [None, None, None]))
    def conditional_variance1(self, F):
        return self.wrapped_likelihood.conditional_variance(F)

    @params_as_tensors
    @autoflow((settings.float_type, [None, None, None]))
    def conditional_variance2(self, F):
        f = lambda a: tf.cast(self.likelihood.conditional_variance(a), dtype=settings.float_type)
        return tf.stack(tf.map_fn(f, F, dtype=settings.float_type))

    @params_as_tensors
    @autoflow((settings.float_type, [None, None, None]), (settings.float_type, [None, None, None]))
    def predict_mean_and_var1(self, Fmu, Fvar):
        return self.wrapped_likelihood.predict_mean_and_var(Fmu, Fvar)

    @params_as_tensors
    @autoflow((settings.float_type, [None, None, None]), (settings.float_type, [None, None, None]))
    def predict_mean_and_var2(self, Fmu, Fvar):
        f = lambda a: list(self.likelihood.predict_mean_and_var(a[0], a[1]))
        m, v = tf.map_fn(f, [Fmu, Fvar], dtype=[settings.float_type, settings.float_type])
        return tf.stack(m), tf.stack(v)

    @params_as_tensors
    @autoflow((settings.float_type, [None, None, None]), (settings.float_type, [None, None, None]),
              (settings.float_type, [None, None]))
    def predict_density1(self, Fmu, Fvar, Y):
        return self.wrapped_likelihood.predict_density(Fmu, Fvar, Y)

    @params_as_tensors
    @autoflow((settings.float_type, [None, None, None]), (settings.float_type, [None, None, None]),
              (settings.float_type, [None, None]))
    def predict_density2(self, Fmu, Fvar, Y):
        f = lambda a: self.likelihood.predict_density(a[0], a[1], Y)
        return tf.stack(tf.map_fn(f, [Fmu, Fvar], dtype=settings.float_type))

    @params_as_tensors
    @autoflow((settings.float_type, [None, None, None]), (settings.float_type, [None, None, None]),
              (settings.float_type, [None, None]))
    def variational_expectations1(self, Fmu, Fvar, Y):
        return self.wrapped_likelihood.variational_expectations(Fmu, Fvar, Y)

    @params_as_tensors
    @autoflow((settings.float_type, [None, None, None]), (settings.float_type, [None, None, None]),
              (settings.float_type, [None, None]))
    def variational_expectations2(self, Fmu, Fvar, Y):
        f = lambda a: self.likelihood.variational_expectations(a[0], a[1], Y)
        return tf.stack(tf.map_fn(f, [Fmu, Fvar], dtype=settings.float_type))
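
# Illustrative check (not part of the original source): the wrapped, broadcasting
# implementation should agree with a per-sample tf.map_fn over the plain gpflow
# likelihood. Shapes follow the autoflow signatures above (S x N x D for F,
# N x D for Y); the Gaussian likelihood and random data are assumptions for the example.
#
# import numpy as np
# from gpflow.likelihoods import Gaussian
#
# S, N, D = 5, 7, 1
# F = np.random.randn(S, N, D)
# Y = np.random.randn(N, D)
#
# tester = LikelihoodTester(Gaussian())
# assert np.allclose(tester.logp1(F, Y), tester.logp2(F, Y))
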
class DGP_Base(Model):
    """
    The base class for Deep Gaussian process models.

    Implements a Monte-Carlo variational bound and convenience functions.
    """

    def __init__(self, X, Y, likelihood, layers, minibatch_size=None, num_samples=1, **kwargs):
        """
        :param X: List of training inputs, one numpy array per fidelity
        :param Y: List of training targets, one numpy array per fidelity
        :param likelihood: gpflow likelihood object for use at the final layer
        :param layers: List of doubly_stochastic_dgp.layers.Layer objects
        :param minibatch_size: Minibatch size if using minibatch training
        :param num_samples: Number of samples when propagating predictions through layers
        :param kwargs: kwarg inputs to gpflow.models.Model
        """
        Model.__init__(self, **kwargs)

        self.Y_list = Y
        self.X_list = X
        self.minibatch_size = minibatch_size

        self.num_samples = num_samples

        # This allows a training regime where the first layer is trained first by itself,
        # then the subsequent layer, and so on.
        self._train_upto_fidelity = -1

        if minibatch_size:
            for i, (x, y) in enumerate(zip(X, Y)):
                setattr(self, "num_data" + str(i), x.shape[0])
                setattr(self, "X" + str(i), Minibatch(x, minibatch_size, seed=0))
                setattr(self, "Y" + str(i), Minibatch(y, minibatch_size, seed=0))
        else:
            for i, (x, y) in enumerate(zip(X, Y)):
                setattr(self, "num_data" + str(i), x.shape[0])
                setattr(self, "X" + str(i), DataHolder(x))
                setattr(self, "Y" + str(i), DataHolder(y))

        self.num_layers = len(layers)
        self.layers = ParamList(layers)

        self.likelihood = BroadcastingLikelihood(likelihood)

    @params_as_tensors
    def propagate(self, X, full_cov=False, S=1, zs=None):
        """
        Propagate the input through all layers and return the samples, means and variances
        at every intermediate layer.

        :param X: Input(s) at which to predict
        :param full_cov: Whether to predict with the full covariance matrix
        :param S: Number of samples to use for sampling at intermediate layers
        :param zs: Optional list of fixed noise realisations, one per layer, passed to
                   sample_from_conditional; defaults to fresh samples at each layer
        :return: (Fs, Fmeans, Fvars), each a list with one entry per layer
        """
        sX = tf.tile(tf.expand_dims(X, 0), [S, 1, 1])

        Fs, Fmeans, Fvars = [], [], []

        F = sX
        zs = zs or [None, ] * len(self.layers)

        for i, (layer, z) in enumerate(zip(self.layers, zs)):
            if i == 0:
                F, Fmean, Fvar = layer.sample_from_conditional(F, z=z, full_cov=full_cov)
            else:
                # KC - At all layers 1..L, the input to the next layer is the original input
                # augmented with the realisation of the function at the previous layer at
                # that input.
                F_aug = tf.concat([sX, F], 2)
                F, Fmean, Fvar = layer.sample_from_conditional(F_aug, z=z, full_cov=full_cov)

            Fs.append(F)
            Fmeans.append(Fmean)
            Fvars.append(Fvar)

        return Fs, Fmeans, Fvars

    @params_as_tensors
    def _build_predict(self, X, full_cov=False, S=1, fidelity=None):
        """
        Predict at the specified fidelity level. If fidelity is not specified, return the
        prediction at the highest fidelity.

        :param X: Locations at which to predict
        :param full_cov: Whether to predict the full covariance matrix
        :param S: Number of samples to use for MC sampling between layers
        :param fidelity: zero-based fidelity index at which to predict
        :return: (mean, variance), each of shape [S, N, 1] where S is the number of samples
                 and N is the number of predicted points
        """
        if fidelity is None:
            fidelity = -1

        _, Fmeans, Fvars = self.propagate(X, full_cov=full_cov, S=S)

        return Fmeans[fidelity], Fvars[fidelity]

    def _likelihood_at_fidelity(self, Fmu, Fvar, Y, variance):
        """
        Calculate the Gaussian likelihood term for observations corresponding to one fidelity.

        :param Fmu: Posterior mean
        :param Fvar: Posterior variance
        :param Y: Training observations
        :param variance: Likelihood variance
        """
        return -0.5 * np.log(2 * np.pi) - 0.5 * tf.log(variance) \
               - 0.5 * (tf.square(Y - Fmu) + Fvar) / variance

    def E_log_p_Y(self, X_f, Y_f, fidelity=None):
        """
        Calculate the expectation of the data log likelihood under the variational
        distribution with MC samples.

        :param X_f: Training inputs at the given fidelity
        :param Y_f: Training targets at the given fidelity
        :param fidelity: zero-based fidelity index
        :return: Per-datapoint expected log likelihood, shape [N, D]
        """
        Fmean, Fvar = self._build_predict(X_f, full_cov=False, S=self.num_samples, fidelity=fidelity)

        if fidelity == (self.num_layers - 1):
            # KC - The likelihood of the observations at the last layer is computed using
            # the model's 'likelihood' object.
            var_exp = self.likelihood.variational_expectations(Fmean, Fvar, Y_f)  # S, N, D
        else:
            # KC - The Gaussian likelihood of the observations at the intermediate layers is
            # computed using the noise parameter pertaining to the White noise kernel.
            #
            # This assumes that a White kernel has been added to all layers except the last!
            # If no noise is desired, the variance parameter in the White kernel should be
            # set to zero and fixed.
            variance = self.layers[fidelity].kern.kernels[-1].variance
            f = lambda vars_SND, vars_ND, vars_N: self._likelihood_at_fidelity(
                vars_SND[0], vars_SND[1], vars_ND[0], vars_N)
            var_exp = f([Fmean, Fvar], [tf.expand_dims(Y_f, 0)], variance)

        return tf.reduce_mean(var_exp, 0)  # N, D

    @params_as_tensors
    def _build_likelihood(self):
        """
        ELBO calculation.

        :return: MC estimate of the lower bound
        """
        L = 0.0
        KL = 0.0

        for fidelity in range(self.num_layers):
            if (self._train_upto_fidelity != -1) and (fidelity > self._train_upto_fidelity):
                continue

            X_l = getattr(self, "X" + str(fidelity))
            Y_l = getattr(self, "Y" + str(fidelity))
            n_data = getattr(self, "num_data" + str(fidelity))
            scale = tf.cast(n_data, float_type) / tf.cast(tf.shape(X_l)[0], float_type)

            L += tf.reduce_sum(self.E_log_p_Y(X_l, Y_l, fidelity)) * scale
            KL += tf.reduce_sum(self.layers[fidelity].KL())

        self.L = L
        self.KL = KL

        return self.L - self.KL

    @autoflow((float_type, [None, None]), (tf.int32, []))
    def predict_f(self, Xnew, num_samples, fidelity=None):
        return self._build_predict(Xnew, full_cov=False, S=num_samples, fidelity=fidelity)

    @autoflow((float_type, [None, None]), (tf.int32, []))
    def predict_f_full_cov(self, Xnew, num_samples, fidelity=None):
        return self._build_predict(Xnew, full_cov=True, S=num_samples, fidelity=fidelity)

    @autoflow((float_type, [None, None]), (tf.int32, []))
    def predict_all_layers(self, Xnew, num_samples):
        return self.propagate(Xnew, full_cov=False, S=num_samples)

    @autoflow((float_type, [None, None]), (tf.int32, []))
    def predict_all_layers_full_cov(self, Xnew, num_samples):
        return self.propagate(Xnew, full_cov=True, S=num_samples)

    @autoflow((float_type, [None, None]), (tf.int32, []))
    def predict_y(self, Xnew, num_samples):
        Fmean, Fvar = self._build_predict(Xnew, full_cov=False, S=num_samples)
        return self.likelihood.predict_mean_and_var(Fmean, Fvar)

    @autoflow((float_type, [None, None]), (float_type, [None, None]), (tf.int32, []))
    def predict_density(self, Xnew, Ynew, num_samples):
        Fmean, Fvar = self._build_predict(Xnew, full_cov=False, S=num_samples)
        l = self.likelihood.predict_density(Fmean, Fvar, Ynew)
        log_num_samples = tf.log(tf.cast(num_samples, float_type))
        return tf.reduce_logsumexp(l - log_num_samples, axis=0)

    @classmethod
    def make_mf_dgp(cls, X, Y, Z, add_linear=True, minibatch_size=None):
        """
        Convenience constructor. Builds an MF-DGP model from training data and inducing
        point locations.

        :param X: List of training inputs, one numpy array per fidelity
        :param Y: List of training targets, one numpy array per fidelity
        :param Z: List of inducing point locations, one numpy array per fidelity
        :param add_linear: Whether to include a Linear kernel in the correlation term at
                           each intermediate layer
        :param minibatch_size: Minibatch size if using minibatch training
        :return: An initialised DGP_Base model
        """
        n_fidelities = len(X)

        Din = X[0].shape[1]
        Dout = Y[0].shape[1]

        kernels = [RBF(Din, active_dims=list(range(Din)), variance=1.0, lengthscales=1, ARD=True)]
        for l in range(1, n_fidelities):
            D = Din + Dout
            D_range = list(range(D))
            k_corr = RBF(Din, active_dims=D_range[:Din], lengthscales=1, variance=1.0, ARD=True)
            k_prev = RBF(Dout, active_dims=D_range[Din:], variance=1.0, lengthscales=1.0)
            k_in = RBF(Din, active_dims=D_range[:Din], variance=1.0, lengthscales=1, ARD=True)

            if add_linear:
                k_l = k_corr * (k_prev + Linear(Dout, active_dims=D_range[Din:], variance=1.0)) + k_in
            else:
                k_l = k_corr * k_prev + k_in

            kernels.append(k_l)

        # A White noise kernel is currently expected by MF-DGP at all layers except the last.
        # In cases where no noise is desired, its variance should be set to 0 and fixed,
        # as follows:
        #
        #     white = White(1, variance=0.)
        #     white.variance.trainable = False
        #     kernels[i] += white
        for i, kernel in enumerate(kernels[:-1]):
            kernels[i] += White(1, variance=1e-6)

        num_data = 0
        for i in range(len(X)):
            _log.info("\nData at Fidelity {}".format(i + 1))
            _log.info("X - {}".format(X[i].shape))
            _log.info("Y - {}".format(Y[i].shape))
            _log.info("Z - {}".format(Z[i].shape))
            num_data += X[i].shape[0]

        layers = init_layers_mf(Y, Z, kernels, num_outputs=Dout)

        model = DGP_Base(X, Y, Gaussian(), layers, num_samples=10, minibatch_size=minibatch_size)

        return model

    def multi_step_training(self, n_iter=5000, n_iter_2=15000):
        """
        Train with the variational covariance fixed to be small first, then free it up and
        train it alongside the other parameters. Inducing point locations are fixed throughout.
        """
        for layer in self.layers[:-1]:
            layer.q_sqrt = layer.q_sqrt.value * 1e-8
            layer.q_sqrt.trainable = False

        self.layers[-1].q_sqrt = self.layers[-1].q_sqrt.value * self.Y_list[-1].var() * 0.01
        self.layers[-1].q_sqrt.trainable = False

        self.likelihood.likelihood.variance = self.Y_list[-1].var() * 0.01
        self.likelihood.likelihood.variance.trainable = False

        # Run with covariance fixed
        self.run_adam(3e-3, n_iter)

        # Run with covariance free
        self.likelihood.likelihood.variance.trainable = True

        for layer in self.layers:
            layer.q_sqrt.trainable = True

        self.run_adam(1e-3, n_iter_2)

    def fix_inducing_point_locations(self):
        """
        Fix all inducing point locations.
        """
        for layer in self.layers:
            layer.feature.Z.trainable = False

    def run_adam(self, lr, iterations):
        adam = AdamOptimizer(lr).make_optimize_action(self)
        actions = [adam, PrintAction(self, "MF-DGP with Adam")]
        loop = Loop(actions, stop=iterations)()
        self.anchor(self.enquire_session())
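
# Illustrative usage only (not part of the original source): assembling a two-fidelity
# MF-DGP from toy data with the convenience constructor above, assuming the module's own
# dependencies (init_layers_mf, PrintAction, Loop) are importable. The shapes used for
# the inducing inputs are an assumption that follows from the kernel construction in
# make_mf_dgp: Din columns at the first layer, Din + Dout columns (input augmented with
# the previous layer's output) at subsequent layers. All variable names are made up.
#
# import numpy as np
#
# x_lo = np.random.rand(80, 1)
# y_lo = np.sin(8 * x_lo)
# x_hi = x_lo[::4].copy()
# y_hi = np.sin(8 * x_hi) + 0.1 * x_hi
#
# X = [x_lo, x_hi]                                   # one array per fidelity, low to high
# Y = [y_lo, y_hi]
# Z = [x_lo.copy(), np.hstack([x_hi, y_lo[::4]])]    # augmented inducing inputs at layer 2
#
# model = DGP_Base.make_mf_dgp(X, Y, Z)
# model.fix_inducing_point_locations()
# model.multi_step_training(n_iter=500, n_iter_2=1500)
#
# mean, var = model.predict_y(x_hi, 25)              # S x N x 1 predictions at top fidelity
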
class DGP_Base(Model):
    """
    The base class for Deep Gaussian process models.

    Implements a Monte-Carlo variational bound and convenience functions.
    """

    def __init__(self, X, Y, likelihood, layers,
                 minibatch_size=None,
                 num_samples=1,
                 num_data=None,
                 div_weights=None,
                 **kwargs):
        Model.__init__(self, **kwargs)
        self.num_samples = num_samples

        self.num_data = num_data or X.shape[0]
        if minibatch_size:
            self.X = Minibatch(X, minibatch_size, seed=0)
            self.Y = Minibatch(Y, minibatch_size, seed=0)
        else:
            self.X = DataHolder(X)
            self.Y = DataHolder(Y)

        self.likelihood = BroadcastingLikelihood(likelihood)
        self.layers = ParamList(layers)

        # CHANGES START
        # Weights for the uncertainty quantifiers (one per layer).
        if div_weights is None:
            div_weights = [1.0] * len(layers)  # multiply by 1, i.e. leave the layers unchanged
        elif type(div_weights) == list and len(div_weights) != len(layers):
            print("WARNING! You specified a list of weights for the uncertainty "
                  "quantifiers, but its length does not match the number of layers "
                  "in your DGP. Setting all weights to 1.0.")
            div_weights = [1.0] * len(layers)

        # Distribute the weights into the layers.
        for layer, weight in zip(layers, div_weights):
            layer.set_weight(weight)
        # CHANGES END

    @params_as_tensors
    def propagate(self, X, full_cov=False, S=1, zs=None):
        sX = tf.tile(tf.expand_dims(X, 0), [S, 1, 1])

        Fs, Fmeans, Fvars = [], [], []

        F = sX
        zs = zs or [None, ] * len(self.layers)

        for layer, z in zip(self.layers, zs):
            F, Fmean, Fvar = layer.sample_from_conditional(F, z=z, full_cov=full_cov)

            Fs.append(F)
            Fmeans.append(Fmean)
            Fvars.append(Fvar)

        return Fs, Fmeans, Fvars

    @params_as_tensors
    def _build_predict(self, X, full_cov=False, S=1):
        Fs, Fmeans, Fvars = self.propagate(X, full_cov=full_cov, S=S)
        return Fmeans[-1], Fvars[-1]

    def E_log_p_Y(self, X, Y):
        """
        Calculate the expectation of the data log likelihood under the variational
        distribution with MC samples.
        """
        Fmean, Fvar = self._build_predict(X, full_cov=False, S=self.num_samples)

        # The likelihood's variational_expectations computes the expected log density of
        # the data given a Gaussian distribution for the function values: if
        # q(f) = N(Fmu, Fvar) and this object represents p(y|f), it computes
        # \int \log p(y|f) q(f) df. gpflow implements a default Gauss-Hermite quadrature
        # routine, but some likelihoods (Gaussian, Poisson) implement specific cases.

        # CHANGES START
        # Reduce from S, N, D to N, D.
        if isinstance(self.likelihood.likelihood, gammaDivGaussian):
            # For the gamma-divergence, the loss is strictly > 0, so the whole term can be
            # averaged over samples in log space.
            return tf.reduce_logsumexp(
                tf.cast(self.likelihood.variational_expectations(Fmean, Fvar, Y),
                        dtype=tf.float64), 0) - np.log(self.num_samples)
        elif isinstance(self.likelihood.likelihood, betaDivGaussian):
            # For the beta-divergence, the two terms need to be treated separately.
            log_tempered, log_integral = self.likelihood.variational_expectations(Fmean, Fvar, Y)
            log_tempered_avg = tf.reduce_logsumexp(
                tf.cast(log_tempered, dtype=tf.float64), 0) - np.log(self.num_samples)
            return log_tempered_avg, log_integral
        else:
            # Standard procedure of the original code.
            var_exp = self.likelihood.variational_expectations(Fmean, Fvar, Y)  # S, N, D
            return tf.reduce_mean(var_exp, 0)  # N, D
        # CHANGES END

    @params_as_tensors
    def _build_likelihood(self):
        # CHANGES START
        if isinstance(self.likelihood.likelihood, gammaDivGaussian):
            L = tf.exp(tf.reduce_logsumexp(self.E_log_p_Y(self.X, self.Y)))
        elif isinstance(self.likelihood.likelihood, betaDivGaussian):
            L1, L2 = self.E_log_p_Y(self.X, self.Y)
            L = tf.exp(tf.reduce_logsumexp(L1)) + self.num_data * tf.cast(
                tf.shape(self.X)[0], float_type)
        else:
            L = tf.reduce_sum(self.E_log_p_Y(self.X, self.Y))
        # CHANGES END

        KL = tf.reduce_sum([layer.KL() for layer in self.layers])
        scale = tf.cast(self.num_data, float_type)
        scale /= tf.cast(tf.shape(self.X)[0], float_type)  # minibatch size
        return L * scale - KL

    @autoflow((float_type, [None, None]), (tf.int32, []))
    def predict_f(self, Xnew, num_samples):
        return self._build_predict(Xnew, full_cov=False, S=num_samples)

    @autoflow((float_type, [None, None]), (tf.int32, []))
    def predict_f_full_cov(self, Xnew, num_samples):
        return self._build_predict(Xnew, full_cov=True, S=num_samples)

    @autoflow((float_type, [None, None]), (tf.int32, []))
    def predict_all_layers(self, Xnew, num_samples):
        return self.propagate(Xnew, full_cov=False, S=num_samples)

    @autoflow((float_type, [None, None]), (tf.int32, []))
    def predict_all_layers_full_cov(self, Xnew, num_samples):
        return self.propagate(Xnew, full_cov=True, S=num_samples)

    @autoflow((float_type, [None, None]), (tf.int32, []))
    def predict_y(self, Xnew, num_samples):
        Fmean, Fvar = self._build_predict(Xnew, full_cov=False, S=num_samples)
        return self.likelihood.predict_mean_and_var(Fmean, Fvar)

    @autoflow((float_type, [None, None]), (float_type, [None, None]), (tf.int32, []))
    def predict_density(self, Xnew, Ynew, num_samples):
        Fmean, Fvar = self._build_predict(Xnew, full_cov=False, S=num_samples)
        l = self.likelihood.predict_density(Fmean, Fvar, Ynew)
        log_num_samples = tf.log(tf.cast(num_samples, float_type))
        return tf.reduce_logsumexp(l - log_num_samples, axis=0)