def sample_functions(self, X_test, n_funcs=1):
    """
    Samples F function values from the current posterior at the N
    specified test points.

    Parameters
    ----------
    X_test: np.ndarray (N, D)
        Input test points

    n_funcs: int
        Number of function values that are drawn at each test point.

    Returns
    ----------
    np.array(F, N)
        The F function values drawn at the N test points.
    """
    if self.normalize_input:
        X_test_norm, _, _ = zero_mean_unit_var_normalization(X_test, self.x_mean, self.x_std)
    else:
        X_test_norm = X_test

    f = np.zeros([n_funcs, X_test_norm.shape[0]])
    for i in range(n_funcs):
        lasagne.layers.set_all_param_values(self.net, self.samples[i])
        out = self.single_predict(X_test_norm)[:, 0]
        if self.normalize_output:
            f[i, :] = zero_mean_unit_var_unnormalization(out, self.y_mean, self.y_std)
        else:
            f[i, :] = out

    return f
def predict(self, X_test, return_individual_predictions=False, *args, **kwargs):
    """
    Returns the predictive mean and variance of the objective function
    at the given test points.

    Parameters
    ----------
    X_test: np.ndarray (N, D)
        Input test points

    return_individual_predictions: bool
        If set to true, the individual predictions of all samples are returned.

    Returns
    ----------
    np.array(N,)
        predictive mean
    np.array(N,)
        predictive variance
    """
    if not self.is_trained:
        logging.error("Model is not trained!")
        return

    # Normalize input
    if self.normalize_input:
        X_, _, _ = zero_mean_unit_var_normalization(X_test, self.x_mean, self.x_std)
    else:
        X_ = X_test

    f_out = []
    theta_noise = []
    for sample in self.samples:
        lasagne.layers.set_all_param_values(self.net, sample)
        out = self.single_predict(X_)
        f_out.append(out[:, 0])
        theta_noise.append(np.exp(out[:, 1]))

    f_out = np.asarray(f_out)
    theta_noise = np.asarray(theta_noise)

    if return_individual_predictions:
        if self.normalize_output:
            f_out = zero_mean_unit_var_unnormalization(f_out, self.y_mean, self.y_std)
            theta_noise *= self.y_std ** 2
        return f_out, theta_noise

    m = np.mean(f_out, axis=0)
    # Total variance
    # v = np.mean(f_out ** 2 + theta_noise, axis=0) - m ** 2
    v = np.mean((f_out - m) ** 2, axis=0)

    if self.normalize_output:
        m = zero_mean_unit_var_unnormalization(m, self.y_mean, self.y_std)
        v *= self.y_std ** 2

    return m, v
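
# Illustrative sketch (not part of the original class): how predict() turns the
# per-network predictions collected above into a predictive mean and variance.
# Plain numpy only; `f_out` and `theta_noise` stand in for the arrays built in
# predict(), with one row per sampled network. The "total variance" variant
# corresponds to the commented-out line in predict() and additionally accounts
# for the predicted observation noise (law of total variance).
def _combine_network_predictions(f_out, theta_noise):
    import numpy as np

    f_out = np.asarray(f_out)              # (n_nets, N): per-network mean predictions
    theta_noise = np.asarray(theta_noise)  # (n_nets, N): per-network noise predictions

    m = np.mean(f_out, axis=0)                       # predictive mean, shape (N,)
    v = np.mean((f_out - m) ** 2, axis=0)            # spread of the network means
    v_total = np.mean(f_out ** 2 + theta_noise, axis=0) - m ** 2  # incl. noise
    return m, v, v_total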
def train(self, X, y, do_optimize=True):
    """
    Computes the Cholesky decomposition of the covariance of X and
    estimates the GP hyperparameters by optimizing the marginal
    loglikelihood. The prior mean of the GP is set to the empirical
    mean of the observed targets y.

    Parameters
    ----------
    X: np.ndarray (N, D)
        Input data points. The dimensionality of X is (N, D),
        with N as the number of points and D is the number of features.
    y: np.ndarray (N,)
        The corresponding target values.
    do_optimize: boolean
        If set to true the hyperparameters are optimized, otherwise
        the default hyperparameters of the kernel are used.
    """
    if self.normalize_input:
        # Normalize input to be in [0, 1]
        self.X, self.lower, self.upper = normalization.zero_one_normalization(
            X, self.lower, self.upper)
    else:
        self.X = X

    if self.normalize_output:
        # Normalize output to have zero mean and unit standard deviation
        self.y, self.y_mean, self.y_std = normalization.zero_mean_unit_var_normalization(y)
        if self.y_std == 0:
            raise ValueError("Cannot normalize output. All targets have the same value")
    else:
        self.y = y

    # Use the empirical mean of the data as mean for the GP
    self.mean = np.mean(self.y, axis=0)
    self.gp = george.GP(self.kernel, mean=self.mean)

    if do_optimize:
        self.hypers = self.optimize()
        self.gp.kernel[:] = self.hypers[:-1]
        self.noise = np.exp(self.hypers[-1])  # sigma^2
    else:
        self.hypers = self.gp.kernel[:]
        self.hypers = np.append(self.hypers, np.log(self.noise))

    logger.debug("GP Hyperparameters: " + str(self.hypers))

    try:
        self.gp.compute(self.X, yerr=np.sqrt(self.noise))
    except np.linalg.LinAlgError:
        # If the decomposition fails, inflate the noise and retry
        self.noise *= 10
        self.gp.compute(self.X, yerr=np.sqrt(self.noise))

    self.is_trained = True
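
# Illustrative sketch (not part of the original class): the hyperparameter
# vector convention used above and in the MCMC variants below. A sample holds
# the kernel hyperparameters on a log scale with the log observation noise
# appended as the last entry, which is why the code takes exp() of the last
# element and passes sqrt(noise) as yerr to gp.compute().
def _split_hyper_sample(sample):
    import numpy as np

    sample = np.asarray(sample)
    log_kernel_pars = sample[:-1]   # kernel hyperparameters (log scale)
    noise = np.exp(sample[-1])      # observation noise sigma^2
    return log_kernel_pars, noise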
def predict(self, X_test):
    r"""
    Returns the predictive mean and variance of the objective function at
    the given test points.

    Parameters
    ----------
    X_test: np.ndarray (N, D)
        N input test points

    Returns
    ----------
    np.array(N,)
        predictive mean
    np.array(N,)
        predictive variance
    """
    # Normalize inputs
    if self.normalize_input:
        X_, _, _ = zero_mean_unit_var_normalization(X_test, self.X_mean, self.X_std)
    else:
        X_ = X_test

    # Get features from the net
    layers = lasagne.layers.get_all_layers(self.network)
    theta = lasagne.layers.get_output(layers[:-1], X_)[-1].eval()

    # Marginalise predictions over hyperparameters of the BLR
    mu = np.zeros([len(self.models), X_test.shape[0]])
    var = np.zeros([len(self.models), X_test.shape[0]])

    for i, m in enumerate(self.models):
        mu[i], var[i] = m.predict(theta)

    # See the algorithm runtime prediction paper by Hutter et al.
    # for the derivation of the total variance
    m = np.mean(mu, axis=0)
    v = np.mean(mu ** 2 + var, axis=0) - m ** 2

    # Clip negative variances and set them to the smallest
    # positive float value
    if v.shape[0] == 1:
        v = np.clip(v, np.finfo(v.dtype).eps, np.inf)
    else:
        v = np.clip(v, np.finfo(v.dtype).eps, np.inf)
        v[np.where((v < np.finfo(v.dtype).eps) & (v > -np.finfo(v.dtype).eps))] = 0

    if self.normalize_output:
        m = zero_mean_unit_var_unnormalization(m, self.y_mean, self.y_std)
        v *= self.y_std ** 2

    return m, v
def train(self, X, y, do_optimize=True):
    """
    Trains the model on the provided data.

    Parameters
    ----------
    X: np.ndarray (N, D)
        Input data points. The dimensionality of X is (N, D),
        with N as the number of points and D is the number of features.
    y: np.ndarray (N,)
        The corresponding target values.
    do_optimize: boolean
        If set to true the hyperparameters are optimized, otherwise
        the default hyperparameters are used.
    """
    start_time = time.time()

    # Normalize inputs
    if self.normalize_input:
        self.X, self.X_mean, self.X_std = zero_mean_unit_var_normalization(X)
    else:
        self.X = X

    # Normalize outputs
    if self.normalize_output:
        self.y, self.y_mean, self.y_std = zero_mean_unit_var_normalization(y)
    else:
        self.y = y

    self.y = self.y[:, None]

    # Check if we have enough points to create a minibatch,
    # otherwise use all data points
    if self.X.shape[0] <= self.batch_size:
        batch_size = self.X.shape[0]
    else:
        batch_size = self.batch_size

    # Create the neural network
    features = X.shape[1]
    self.network = self._build_net(self.input_var, features)

    prediction = lasagne.layers.get_output(self.network)

    # Define loss function for training
    loss = T.mean(T.square(prediction - self.target_var)) / 0.001
    loss = loss.mean()

    params = lasagne.layers.get_all_params(self.network, trainable=True)
    self.learning_rate = theano.shared(
        np.array(self.init_learning_rate, dtype=theano.config.floatX))
    updates = lasagne.updates.adam(loss, params, learning_rate=self.learning_rate)

    logging.debug("... compiling theano functions")
    self.train_fn = theano.function([self.input_var, self.target_var], loss,
                                    updates=updates, allow_input_downcast=True)

    # Start training
    lc = np.zeros([self.num_epochs])
    for epoch in range(self.num_epochs):

        epoch_start_time = time.time()

        train_err = 0
        train_batches = 0

        for batch in self.iterate_minibatches(self.X, self.y, batch_size, shuffle=True):
            inputs, targets = batch
            train_err += self.train_fn(inputs, targets)
            train_batches += 1

        lc[epoch] = train_err / train_batches

        logging.debug("Epoch {} of {}".format(epoch + 1, self.num_epochs))
        curtime = time.time()
        epoch_time = curtime - epoch_start_time
        total_time = curtime - start_time
        logging.debug("Epoch time {:.3f}s, total time {:.3f}s".format(
            epoch_time, total_time))
        logging.debug("Training loss:\t\t{:.5g}".format(train_err / train_batches))

        # Adapt the learning rate
        if epoch % self.adapt_epoch == 0:
            self.learning_rate.set_value(
                np.float32(self.init_learning_rate * 0.1))

    # Design matrix
    layers = lasagne.layers.get_all_layers(self.network)
    self.Theta = lasagne.layers.get_output(layers[:-1], self.X)[-1].eval()

    if do_optimize:
        if self.do_mcmc:
            self.sampler = emcee.EnsembleSampler(self.n_hypers, 2,
                                                 self.marginal_log_likelihood)

            # Do a burn-in in the first iteration
            if not self.burned:
                # Initialize the walkers by sampling from the prior
                self.p0 = self.prior.sample_from_prior(self.n_hypers)
                # Run MCMC sampling
                self.p0, _, _ = self.sampler.run_mcmc(self.p0,
                                                      self.burnin_steps,
                                                      rstate0=self.rng)
                self.burned = True

            # Start sampling
            pos, _, _ = self.sampler.run_mcmc(self.p0,
                                              self.chain_length,
                                              rstate0=self.rng)

            # Save the current position, it will be the start point in
            # the next iteration
            self.p0 = pos

            # Take the last samples from each walker
            self.hypers = np.exp(self.sampler.chain[:, -1])
        else:
            # Optimize hyperparameters of the Bayesian linear regression
            res = optimize.fmin(self.nll, np.random.rand(2))
            self.hypers = [[np.exp(res[0]), np.exp(res[1])]]
    else:
        self.hypers = [[self.alpha, self.beta]]

    logging.info("Hypers: %s" % self.hypers)

    self.models = []
    for sample in self.hypers:
        # Instantiate a model for each hyperparameter configuration
        model = BayesianLinearRegression(alpha=sample[0],
                                         beta=sample[1],
                                         basis_func=None)
        model.train(self.Theta, self.y[:, 0], do_optimize=False)
        self.models.append(model)
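
# Illustrative sketch (not part of the original class): the quantity each
# BayesianLinearRegression model above fits on the design matrix Theta. This is
# the textbook closed form for Bayesian linear regression with weight precision
# alpha and noise precision beta (see e.g. Bishop, PRML, ch. 3); the actual
# BayesianLinearRegression class may differ in parametrisation and details.
def _blr_posterior_predictive(Theta, y, theta_test, alpha, beta):
    import numpy as np

    n_features = Theta.shape[1]

    # Posterior over the last-layer weights: N(m_w, S)
    S_inv = alpha * np.eye(n_features) + beta * Theta.T @ Theta
    S = np.linalg.inv(S_inv)
    m_w = beta * S @ Theta.T @ y

    # Predictive mean and variance at the test features theta_test (M, n_features)
    mean = theta_test @ m_w
    var = 1.0 / beta + np.einsum("md,dk,mk->m", theta_test, S, theta_test)
    return mean, var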
def train(self, X, y, do_optimize=True, **kwargs):
    """
    Performs MCMC sampling to sample hyperparameter configurations from the
    likelihood and trains one GP per sample on X and y.

    Parameters
    ----------
    X: np.ndarray (N, D)
        Input data points.
    y: np.ndarray (N,)
        The corresponding target values.
    do_optimize: boolean
        If set to true we perform MCMC sampling, otherwise we just use the
        hyperparameters specified in the kernel.
    """
    self.X = normalize(X, self.lower, self.upper)

    if self.normalize_output:
        # Normalize output to have zero mean and unit standard deviation
        self.y, self.y_mean, self.y_std = normalization.zero_mean_unit_var_normalization(y)
    else:
        self.y = y

    # Use the mean of the data as mean for the GP
    mean = np.mean(self.y, axis=0)
    self.gp = george.GP(self.kernel, mean=mean)

    if do_optimize:
        # We have one walker for each hyperparameter configuration
        sampler = emcee.EnsembleSampler(self.n_hypers,
                                        len(self.kernel.pars) + 1,
                                        self.loglikelihood)

        # Do a burn-in in the first iteration
        if not self.burned:
            # Initialize the walkers by sampling from the prior
            if self.prior is None:
                self.p0 = np.random.rand(self.n_hypers, len(self.kernel.pars) + 1)
            else:
                self.p0 = self.prior.sample_from_prior(self.n_hypers)
            # Run MCMC sampling
            self.p0, _, _ = sampler.run_mcmc(self.p0,
                                             self.burnin_steps,
                                             rstate0=self.rng)
            self.burned = True

        # Start sampling
        pos, _, _ = sampler.run_mcmc(self.p0,
                                     self.chain_length,
                                     rstate0=self.rng)

        # Save the current position, it will be the start point in
        # the next iteration
        self.p0 = pos

        # Take the last samples from each walker
        self.hypers = sampler.chain[:, -1]
    else:
        if self.hypers is None:
            self.hypers = self.gp.kernel[:].tolist()
            self.hypers.append(self.noise)
            self.hypers = [self.hypers]

    self.models = []
    for sample in self.hypers:
        # Instantiate a GP for each hyperparameter configuration
        kernel = deepcopy(self.kernel)
        # kernel.pars = np.exp(sample[:-1])
        kernel.vector = sample[:-1]
        noise = np.exp(sample[-1])
        model = MTBOGP(kernel,
                       normalize_output=self.normalize_output,
                       noise=noise,
                       lower=self.lower,
                       upper=self.upper,
                       rng=self.rng)
        model.train(X, y, do_optimize=False)
        self.models.append(model)

    self.is_trained = True
def train(self, X, y, do_optimize=True, **kwargs):
    """
    Performs MCMC sampling to sample hyperparameter configurations from the
    likelihood and trains for each sample a GP on X and y.

    Parameters
    ----------
    X: np.ndarray (N, D)
        Input data points. The dimensionality of X is (N, D),
        with N as the number of points and D is the number of features.
    y: np.ndarray (N,)
        The corresponding target values.
    do_optimize: boolean
        If set to true we perform MCMC sampling, otherwise we just use the
        hyperparameters specified in the kernel.
    """
    if self.normalize_input:
        # Normalize input to be in [0, 1]
        self.X, self.lower, self.upper = normalization.zero_one_normalization(
            X, self.lower, self.upper)
    else:
        self.X = X

    if self.normalize_output:
        # Normalize output to have zero mean and unit standard deviation
        self.y, self.y_mean, self.y_std = normalization.zero_mean_unit_var_normalization(y)
        if self.y_std == 0:
            raise ValueError("Cannot normalize output. All targets have the same value")
    else:
        self.y = y

    # Use the mean of the data as mean for the GP
    self.mean = np.mean(self.y, axis=0)
    self.gp = george.GP(self.kernel, mean=self.mean)

    if do_optimize:
        # We have one walker for each hyperparameter configuration
        sampler = emcee.EnsembleSampler(self.n_hypers,
                                        len(self.kernel.pars) + 1,
                                        self.loglikelihood)
        sampler.random_state = self.rng.get_state()

        # Do a burn-in in the first iteration
        if not self.burned:
            # Initialize the walkers by sampling from the prior
            if self.prior is None:
                self.p0 = self.rng.rand(self.n_hypers, len(self.kernel.pars) + 1)
            else:
                self.p0 = self.prior.sample_from_prior(self.n_hypers)
            # Run MCMC sampling
            self.p0, _, _ = sampler.run_mcmc(self.p0,
                                             self.burnin_steps,
                                             rstate0=self.rng)
            self.burned = True

        # Start sampling
        pos, _, _ = sampler.run_mcmc(self.p0,
                                     self.chain_length,
                                     rstate0=self.rng)

        # Save the current position, it will be the start point in
        # the next iteration
        self.p0 = pos

        # Take the last samples from each walker
        self.hypers = sampler.chain[:, -1]
    else:
        self.hypers = self.gp.kernel[:].tolist()
        self.hypers.append(self.noise)
        self.hypers = [self.hypers]

    self.models = []
    for sample in self.hypers:
        # Instantiate a GP for each hyperparameter configuration
        kernel = deepcopy(self.kernel)
        kernel.pars = np.exp(sample[:-1])
        noise = np.exp(sample[-1])
        model = GaussianProcess(kernel,
                                normalize_output=self.normalize_output,
                                normalize_input=self.normalize_input,
                                noise=noise,
                                lower=self.lower,
                                upper=self.upper,
                                rng=self.rng)
        model.train(X, y, do_optimize=False)
        self.models.append(model)

    self.is_trained = True
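
# Illustrative sketch (not part of the original class): how the emcee chain is
# turned into the list of hyperparameter samples used above. sampler.chain is
# assumed to have shape (n_walkers, n_steps, n_dims), so taking the last step
# of every walker yields one sample per walker; each sample then parametrises
# one GP of the ensemble (exp of the kernel entries and of the trailing log noise).
def _samples_from_chain(chain):
    import numpy as np

    chain = np.asarray(chain)   # (n_walkers, n_steps, n_dims)
    hypers = chain[:, -1]       # last position of each walker, (n_walkers, n_dims)
    return hypers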
def train(self, X, y, *args, **kwargs):
    """
    Trains the model on the provided data.

    Parameters
    ----------
    X: np.ndarray (N, D)
        Input data points. The dimensionality of X is (N, D),
        with N as the number of points and D is the number of features.
    y: np.ndarray (N,)
        The corresponding target values.
    """
    start_time = time.time()

    self.net = self.get_net(n_inputs=X.shape[1])

    nll, mse = self.negativ_log_likelihood(self.net, self.Xt, self.Yt,
                                           X.shape[0],
                                           self.weight_prior,
                                           self.variance_prior)
    params = lasagne.layers.get_all_params(self.net, trainable=True)

    seed = self.rng.randint(1, 100000)
    srng = theano.sandbox.rng_mrg.MRG_RandomStreams(seed)

    if self.sampling_method == "sghmc":
        self.sampler = SGHMCSampler(rng=srng, precondition=self.precondition,
                                    ignore_burn_in=False)
    elif self.sampling_method == "sgld":
        self.sampler = SGLDSampler(rng=srng, precondition=self.precondition)
    else:
        logging.error("Sampling strategy %s does not exist!" % self.sampling_method)

    self.compute_err = theano.function([self.Xt, self.Yt], [mse, nll])
    self.single_predict = theano.function([self.Xt],
                                          lasagne.layers.get_output(self.net, self.Xt))

    # Clear old samples
    self.samples.clear()

    if self.normalize_input:
        self.X, self.x_mean, self.x_std = zero_mean_unit_var_normalization(X)
    else:
        self.X = X

    if self.normalize_output:
        self.y, self.y_mean, self.y_std = zero_mean_unit_var_normalization(y)
    else:
        self.y = y

    self.sampler.prepare_updates(nll, params, self.l_rate,
                                 mdecay=self.mdecay,
                                 inputs=[self.Xt, self.Yt],
                                 scale_grad=X.shape[0])

    logging.info("Starting sampling")

    # Check if we have enough data points to form a minibatch,
    # otherwise set the batch size equal to the number of input points
    if self.X.shape[0] < self.bsize:
        self.bsize = self.X.shape[0]
        logging.error("Not enough data points to form a minibatch. "
                      "Set the batch size to {}".format(self.bsize))

    i = 0
    while i < self.n_iters and len(self.samples) < self.n_nets:
        if self.X.shape[0] == self.bsize:
            start = 0
        else:
            start = np.random.randint(0, self.X.shape[0] - self.bsize)

        xmb = floatX(self.X[start:start + self.bsize])
        ymb = floatX(self.y[start:start + self.bsize, None])

        if i < self.burn_in:
            _, nll_value = self.sampler.step_burn_in(xmb, ymb)
        else:
            _, nll_value = self.sampler.step(xmb, ymb)

        if i % 512 == 0 and i <= self.burn_in:
            total_err, total_nll = self.compute_err(floatX(self.X),
                                                    floatX(self.y).reshape(-1, 1))
            t = time.time() - start_time
            logging.info("Iter {:8d} : NLL = {:11.4e} MSE = {:.4e} "
                         "Time = {:5.2f}".format(i, float(total_nll),
                                                 float(total_err), t))

        if i % self.sample_steps == 0 and i >= self.burn_in:
            total_err, total_nll = self.compute_err(floatX(self.X),
                                                    floatX(self.y).reshape(-1, 1))
            t = time.time() - start_time
            self.samples.append(lasagne.layers.get_all_param_values(self.net))
            logging.info("Iter {:8d} : NLL = {:11.4e} MSE = {:.4e} "
                         "Samples = {} Time = {:5.2f}".format(i, float(total_nll),
                                                              float(total_err),
                                                              len(self.samples), t))
        i += 1

    self.is_trained = True
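
# Hedged usage sketch (not part of the original class): the intended flow of
# the methods shown above. `model` is assumed to be an already constructed
# instance of this class; constructor arguments are omitted here because they
# do not appear in this snippet.
def _bohamiann_usage(model, X, y, X_test):
    model.train(X, y)                                   # run SGHMC/SGLD sampling
    mean, var = model.predict(X_test)                   # predictive mean and variance
    funcs = model.sample_functions(X_test, n_funcs=5)   # individual posterior function draws
    return mean, var, funcs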