def predict_log_marginal_probabilities(self, X: np.ndarray) -> np.ndarray:
    """Compute sampled log probabilities for ``X`` from every fitted model.

    ``X`` is transformed with ``self.scaler``; each entry of ``self.models``
    then predicts the latent mean and variance, which are passed (squeezed,
    with the variance converted to a standard deviation) to
    ``log_probability_via_sampling`` together with ``self.n_draws_predict``.

    Args:
        X: Input array, in the form expected by ``self.scaler.transform``.

    Returns:
        The per-model results stacked along axis 1.
    """
    # TODO: Check against GPFlow.
    # TODO: Is this really worth it? Could just use predict_y.
    assert self.is_fit

    scaled_X = self.scaler.transform(X)
    cached = self.use_cache

    per_model = []
    for entry in self.models:
        # With caching on, the entry is what load_saved_gpflow_model needs
        # to restore the model; otherwise it is the model itself.
        model = load_saved_gpflow_model(entry) if cached else entry

        # Predict f, the latent probability on the probit scale
        latent_mean, latent_var = model.predict_f(scaled_X)
        latent_std = np.sqrt(latent_var)

        if cached:
            # The loaded model is no longer needed once it has predicted.
            gpflow.reset_default_graph_and_session()

        per_model.append(
            log_probability_via_sampling(
                np.squeeze(latent_mean),
                np.squeeze(latent_std),
                self.n_draws_predict,
            )
        )

    return np.stack(per_model, axis=1)
def evalMCMCSamples(X, Y, x):
    """Sample the latent function at ``x`` using thinned MCMC traces.

    The model and its traces come from ``evalMCMC(X, Y)`` (with ``Y`` turned
    into a column via ``np.atleast_2d(Y).T``).  Every 10th trace row is used
    to draw one latent sample at ``x``.

    Returns:
        A tuple ``(x, out)`` where ``out`` has one row per kept trace row and
        one column per entry along axis 2 of the stacked samples.
    """
    gpflow.reset_default_graph_and_session()
    traces, m = evalMCMC(X, np.atleast_2d(Y).T)

    draws_per_trace = 1
    thinned = traces.iloc[::10]  # keep every 10th trace row
    f_samples = np.array([
        m.predict_f_samples(x, draws_per_trace, initialize=False,
                            feed_dict=m.sample_feed_dict(row))
        for _, row in thinned.iterrows()
    ])

    # Keep the single draw (axis 1) and the first output (axis 3).
    first_draw = f_samples[:, 0, :, 0]
    out = first_draw.reshape(f_samples.shape[0], f_samples.shape[2])

    m.clear()
    return x, out
def evalMLESamples(X, Y, x):
    """Draw 10 latent-function samples at ``x`` from the ``evalMLE`` model.

    ``Y`` is turned into a column via ``np.atleast_2d(Y).T`` before being
    handed to ``evalMLE``.

    Returns:
        A tuple ``(x, samples)`` where ``samples`` is index 0 of the last
        axis of the drawn samples.
    """
    gpflow.reset_default_graph_and_session()

    column_Y = np.atleast_2d(Y).T
    _, m = evalMLE(X, column_Y)

    n_draws = 10
    draws = m.predict_f_samples(x, n_draws, initialize=False)

    m.clear()
    return x, draws[:, :, 0]
def calculate_log_likelihood(self, X, y):
    """Estimate a log likelihood for each row of ``X`` by sampling.

    Every model in ``self.models`` predicts a latent mean and variance for
    the scaled inputs.  For each row, ``self.n_draws_predict`` normal draws
    are taken around the per-model means and scored against the matching
    row of ``y`` with ``calculate_log_joint_bernoulli_likelihood``.

    Args:
        X: Inputs, in the form expected by ``self.scaler.transform``.
        y: Targets; ``y.shape[1]`` must equal ``len(self.models)``.

    Returns:
        1-D array with one log likelihood per row of the predicted means.
    """
    assert self.is_fit
    assert y.shape[1] == len(self.models)

    scaled_X = self.scaler.transform(X)

    mean_columns, sd_columns = [], []
    for entry in self.models:
        # Cached entries are restored on demand and released right after
        # predicting.
        model = load_saved_gpflow_model(entry) if self.use_cache else entry

        latent_mean, latent_var = model.predict_f(scaled_X)
        latent_sd = np.sqrt(latent_var)

        if self.use_cache:
            gpflow.reset_default_graph_and_session()

        mean_columns.append(np.squeeze(latent_mean))
        sd_columns.append(np.squeeze(latent_sd))

    means = np.stack(mean_columns, axis=1)
    sds = np.stack(sd_columns, axis=1)

    draw_shape = (self.n_draws_predict, means.shape[1])
    site_log_liks = np.zeros(means.shape[0])

    # Estimate site by site
    for site, (site_y, site_mean, site_sd) in enumerate(zip(y, means, sds)):
        draws = np.random.normal(site_mean, site_sd, size=draw_shape)
        site_log_liks[site] = calculate_log_joint_bernoulli_likelihood(
            draws, site_y)

    return site_log_liks
def main():
    """Build, train and save a ``SplitGPM`` model on the neur data files.

    Loads ``../data/neur.X.txt`` and ``../data/neur.Y.txt``, draws random
    initial values for the two ``W`` matrices, builds the model inside
    ``gpflow.defer_build()``, marks its parts trainable, runs Adam for
    10000 iterations and hands the model to ``save_model``.

    Relies on the module-level names ``C``, ``G``, ``K1``, ``K2`` and ``T``.
    """
    X = np.loadtxt("../data/neur.X.txt")
    Y = np.loadtxt("../data/neur.Y.txt")

    gpflow.reset_default_graph_and_session()

    # Random starting values; W1 is drawn before W2 so the RNG stream is
    # consumed in a fixed order.
    W1_init = normalize(np.random.random(size=(C, K1)))
    W2_init = normalize(np.random.random(size=(G, K2)))
    # Small offset keeps the log finite for zero entries.
    log_W1 = np.log(W1_init + 1e-5)
    log_W2 = np.log(W2_init + 1e-5)

    with gpflow.defer_build():
        base_kernel = gpflow.kernels.RBF(1, active_dims=[0])
        kernel = mk.SharedIndependentMok(base_kernel, K1 * K2)

        Z = np.linspace(0, 1, T)[:, None].astype(np.float64)
        feature = mf.SharedIndependentMof(gpflow.features.InducingPoints(Z))

        model = SplitGPM(
            X, Y, log_W1, log_W2,
            kernel, gpflow.likelihoods.Gaussian(),
            feat=feature,
            minibatch_size=500,
            name='test')
    model.compile()

    trainable_parts = (
        'W1',          # learn cell assignments
        'W2',          # learn gene assignments
        'feature',     # move inducing points
        'kern',        # learn kernel parameters
        'likelihood',  # learn likelihood parameters
    )
    for part in trainable_parts:
        getattr(model, part).set_trainable(True)

    optimiser = gpflow.train.AdamOptimizer(0.005)
    optimiser.minimize(model, maxiter=10000)

    save_model(model)
def fit(self, X, y):
    """Fit one Bernoulli SVGP per column of ``y``.

    A fresh ``StandardScaler`` is fitted on ``X`` and stored on
    ``self.scaler``.  Inducing inputs are chosen once with
    ``find_starting_z`` and shared by all models.  With ``self.use_cache``
    each trained model is saved under ``self.cache_dir`` and only its
    directory is kept in ``self.models``; otherwise the model objects are
    kept directly.  Sets ``self.is_fit`` when done.
    """
    self.scaler = StandardScaler()
    scaled_X = self.scaler.fit_transform(X)

    Z = find_starting_z(
        scaled_X, num_inducing=self.n_inducing, use_minibatching=False)

    self.models = list()

    # We need to fit each species separately
    for column in tqdm(range(y.shape[1])):
        kernel = self.kernel_function()
        likelihood = gpflow.likelihoods.Bernoulli()
        # List indexing keeps the target two-dimensional.
        column_y = y[:, [column]]

        model = gpflow.models.SVGP(
            scaled_X, column_y, kern=kernel, likelihood=likelihood, Z=Z)

        optimiser = gpflow.train.ScipyOptimizer(
            options={'maxfun': self.maxiter})
        optimiser.minimize(
            model, maxiter=self.maxiter, disp=self.verbose_fit)

        if not self.use_cache:
            # Append directly
            self.models.append(model)
            continue

        # Store in cache dir
        model_dir = join(self.cache_dir, f'model_{column}')
        save_gpflow_model(model, model_dir)
        self.models.append(model_dir)
        # Reset graph
        gpflow.reset_default_graph_and_session()

    self.is_fit = True
# Look up, for every (line, time) pair in `data`, its entry in `d`; the
# result indexes the grouped library-size sums below.
# NOTE(review): presumably `d` maps each pair to its row in the groupby
# output -- confirm where `d` is built.
idx = [d[pair] for pair in zip(data.line, data.time)]
library_size = (
    data.groupby(['line', 'time']).value.sum().values[idx]
)
gene_lengths = gene_attributes['size'].values[W2_idx]
# Stored as a percentage; rescale to a fraction.
gc_content = (
    gene_attributes.percentage_gene_gc_content.values[W2_idx] / 100
)

# make feature: 200 quantiles (1% .. 99%) of each covariate, on the log
# scale, serve as the inducing inputs.
Zlibsize = np.quantile(library_size, np.linspace(0.01, 0.99, 200))
Zgenesize = np.quantile(gene_lengths, np.linspace(0.01, 0.99, 200))
Z = np.log(np.vstack((Zlibsize, Zgenesize)).T)

# Model inputs: one row per observation, columns are log library size and
# log gene length.
X_aug = np.log(
    np.stack((library_size.flatten(), gene_lengths.flatten())).T
)

# build model
gpflow.reset_default_graph_and_session()
with gpflow.defer_build():
    # Additive kernel: one RBF per input column.
    kernel = (
        gpflow.kernels.RBF(1, active_dims=[0])
        + gpflow.kernels.RBF(1, active_dims=[1])
    )
    feature = gpflow.features.InducingPoints(Z)
    model = gpflow.models.SVGP(
        X_aug, Y, kernel, NegativeBinomial(),
        feat=feature,
        minibatch_size=500,
        name=name)
model.compile()

# restore/create monitor session
lr = 0.01
monitor_tasks, session, global_step, file_writer = build_monitor(
    model, path)
optimiser = gpflow.train.AdamOptimizer(lr)
def fitGP(method, normalize=True):
    """Fit a GP regressor on every data split and save the test scores.

    For every dataset except 'year-prediction-MSD' a GPy model is fitted: a
    full ``GPRegression`` for method 'GP', otherwise a
    ``SparseGPRegression`` with ``n_hidden`` inducing points (switched to
    FITC inference for method 'FITC').  For 'year-prediction-MSD' a
    minibatched gpflow ``SVGP`` is trained with Adam instead; the
    'FITC' method is skipped entirely for that dataset.

    Each fitted model is saved under ``results/<method>/`` and the score
    array is written with ``np.save`` to
    ``results/<method>/<data_directory>``.  The array has one row per
    split: column 0 is the test RMSE, column 1 the mean of
    ``-logpdf(y_test - m, v)``.  Rows of splits whose GPy optimisation
    failed on all 10 attempts stay NaN.

    Relies on the module-level names ``data_directory``, ``n_splits``,
    ``n_hidden``, ``_split_data`` and ``logpdf``.

    Args:
        method: One of 'VFE', 'FITC' or 'GP'.
        normalize: Forwarded to ``_split_data``; when true, predictions are
            mapped back to the original target scale before scoring.

    Returns:
        None.
    """
    assert method in ('VFE', 'FITC', 'GP')
    if data_directory == 'year-prediction-MSD' and method == 'FITC':
        return  # big data, thus using only SVGP, no FITC

    perf = np.nan * np.zeros((n_splits, 2))
    np.random.seed(1)
    for split in range(n_splits):
        (X_train_normalized, y_train_normalized, y_train,
         X_test_normalized, X_test, y_test,
         mean_X_train, std_X_train,
         mean_y_train, std_y_train) = _split_data(split, normalize)

        if data_directory != 'year-prediction-MSD':
            if method == 'GP':
                gp = GPy.models.GPRegression(
                    X_train_normalized, y_train_normalized[:, None],
                    GPy.kern.RBF(X_train_normalized.shape[1], ARD=True))
            else:
                gp = GPy.models.SparseGPRegression(
                    X_train_normalized, y_train_normalized[:, None],
                    GPy.kern.RBF(X_train_normalized.shape[1], ARD=True),
                    num_inducing=n_hidden)
                if method == 'FITC':
                    gp.inference_method = \
                        GPy.inference.latent_function_inference.FITC()

            # Optimisation can fail numerically; retry up to 10 times.
            success = False
            for _ in range(10):
                try:
                    gp.optimize_restarts(robust=True)
                    success = True
                    break
                except Exception:
                    # Best-effort retry.  Only ``Exception`` is caught so
                    # that KeyboardInterrupt / SystemExit still stop the
                    # run instead of being swallowed.
                    pass
            if success:
                gp.save('results/%s/%s_split%g.hdf5' %
                        (method, data_directory, split))
            else:
                # Leave this split's row as NaN.
                continue
        else:
            gpflow.reset_default_graph_and_session()
            # Inducing inputs: a random subset of the training inputs.
            Z = X_train_normalized[np.random.choice(
                np.arange(len(X_train_normalized)),
                n_hidden, replace=False)].copy()
            gp = gpflow.models.SVGP(
                X_train_normalized, y_train_normalized[:, None],
                gpflow.kernels.RBF(X_train_normalized.shape[1], ARD=True),
                gpflow.likelihoods.Gaussian(), Z, minibatch_size=1000)
            adam = gpflow.train.AdamOptimizer().make_optimize_action(gp)
            gpflow.actions.Loop(adam, stop=30000)()
            gp.anchor(gp.enquire_session())
            saver = gpflow.saver.Saver()
            saver.save(
                'results/%s/%s_split%g' % (method, data_directory, split),
                gp)

        # Predictive mean and variance on the test inputs.
        if data_directory != 'year-prediction-MSD':
            m, v = np.squeeze(gp.predict(X_test_normalized))
        else:
            m, v = np.squeeze(gp.predict_y(X_test_normalized))
        if normalize:
            # Undo the target normalisation.
            v *= std_y_train**2
            m = m * std_y_train + mean_y_train
        perf[split] = np.sqrt(np.mean(
            (y_test - m)**2)), -logpdf(y_test - m, v).mean()

    np.save('results/%s/%s' % (method, data_directory), perf)
def fitANN(normalize=True):
    """Train the variance network on top of saved VFE models and score it.

    For every split the previously saved VFE model is restored (GPy from
    ``results/VFE/*.hdf5``, or gpflow for 'year-prediction-MSD').  A Keras
    model is built whose non-trainable branch outputs the mean and whose
    trainable branch outputs the variance; its learning rate is chosen from
    ``etas`` by 5-fold cross-validation on the training kernel features.
    The result array is saved to ``results/ANN/<data_directory>``.

    Columns of the saved array, one row per split:
        0: test RMSE (mean taken from the restored GP),
        1: mean of ``-logpdf(y_test - m, v)`` (variance from the network),
        2: selected learning rate,
        3-7: wall-clock time of five calls to ``predict(X_test)``.

    Relies on the module-level names ``data_directory``, ``n_splits``,
    ``n_hidden``, ``n_epochs``, ``_split_data`` and ``logpdf``.

    Args:
        normalize: Forwarded to ``_split_data``; when true, predictions are
            mapped back to the original target scale.

    Returns:
        None.
    """
    # Candidate learning rates: fixed multipliers times a per-dataset base.
    etas = np.array([1, 2, 5, 10, 20, 50, 100]) * {
        'bostonHousing': 1e-7,
        'concrete': 1e-7,
        'energy': 1e-10,
        'kin8nm': 1e-5,
        'naval-propulsion-plant': 1e-9,
        'power-plant': 1e-6,
        'protein-tertiary-structure': 1e-5,
        'wine-quality-red': 1e-6,
        'yacht': 1e-9,
        'year-prediction-MSD': 1e-4
    }[data_directory]
    perf = np.nan * np.zeros((n_splits, 8))
    np.random.seed(1)
    for split in range(n_splits):
        (X_train_normalized, y_train_normalized, y_train,
         X_test_normalized, X_test, y_test,
         mean_X_train, std_X_train,
         mean_y_train, std_y_train) = _split_data(split, normalize)
        if data_directory != 'year-prediction-MSD':
            gp = GPy.models.SparseGPRegression(
                X_train_normalized, y_train_normalized[:, None],
                GPy.kern.RBF(X_train_normalized.shape[1], ARD=True),
                num_inducing=n_hidden)
            # Restore the saved parameters; the context manager closes the
            # HDF5 file once they have been copied into the model.
            with h5py.File(
                    'results/VFE/%s_split%g.hdf5' % (data_directory, split),
                    'r') as param_file:
                gp[:] = param_file['param_array']
            var = gp.Gaussian_noise.variance
            varK = gp.kern.variance
            Kfu = gp.kern.K(X_train_normalized, gp.inducing_inputs)
            Kfu_test = gp.kern.K(X_test_normalized, gp.inducing_inputs)
            w = gp.posterior.woodbury_vector.ravel()
            woodbury_inv = gp.posterior.woodbury_inv
        else:
            gpflow.reset_default_graph_and_session()
            saver = gpflow.saver.Saver()
            gp = saver.load('results/VFE/%s_split%g' %
                            (data_directory, split))
            var = gp.likelihood.variance.value
            varK = gp.kern.variance.value
            Kfu = gp.kern.compute_K(X_train_normalized, gp.feature.Z.value)
            Kuu = gp.kern.compute_K(gp.feature.Z.value, gp.feature.Z.value)
            Kfu_test = gp.kern.compute_K(X_test_normalized,
                                         gp.feature.Z.value)
            # Same quantities as the GPy branch, computed from the kernel
            # matrices.
            Sigma = np.linalg.inv(Kfu.T.dot(Kfu) + var * Kuu)
            w = Sigma.dot(Kfu.T.dot(y_train_normalized))
            woodbury_inv = np.linalg.inv(Kuu) - var * Sigma

        def custom_loss():  # neg loglikelihood
            # y_pred[..., 0] is the mean output, y_pred[..., 1] the variance.
            def loss(y_true, y_pred):
                return tf.divide(tf.square(y_pred[..., 0] - y_true[..., 0]), y_pred[..., 1]) + \
                    tf.math.log(y_pred[..., 1])
            return loss

        def build_model(eta):
            # Symmetrised square-root factor of woodbury_inv from its SVD,
            # used to initialise the hidden layer.
            u, s, v = np.linalg.svd(woodbury_inv)
            U = (u + v.T).dot(np.diag(np.sqrt(s))) / 2
            inputs = layers.Input(shape=(n_hidden, ))
            # Mean branch: fixed weights, not trained.
            m = layers.Dense(1,
                             kernel_initializer=tf.constant_initializer(w),
                             trainable=False)(inputs)
            x = layers.Dense(n_hidden,
                             kernel_initializer=tf.constant_initializer(U),
                             activation=tf.square)(inputs)

            def act(a):
                # ``var`` is looked up in the enclosing scope at call time.
                return tf.math.softplus(a / var / 2) * var * 2
            v = layers.Dense(1,
                             kernel_initializer=tf.constant_initializer(
                                 -np.ones((1, n_hidden))),
                             bias_initializer=tf.constant_initializer(
                                 var + varK),
                             activation=act)(x)
            outputs = layers.concatenate([m, v])
            model = tf.keras.Model(inputs=inputs, outputs=outputs)
            model.compile(loss=custom_loss(),
                          optimizer=tf.keras.optimizers.Adam(eta))
            return model

        # find best learning rate using 5-fold cross validation
        best_loss = np.inf
        best_eta = etas[0]
        for eta in etas:
            loss = 0
            for fold in range(5):
                model = build_model(eta)
                # Every 5th sample, offset by ``fold``, is held out.
                train_idx = np.ones(X_train_normalized.shape[0], dtype=bool)
                train_idx[fold::5] = False
                history = model.fit(
                    Kfu[train_idx], y_train_normalized[train_idx],
                    epochs=n_epochs,
                    validation_data=(Kfu[~train_idx],
                                     y_train_normalized[~train_idx]),
                    verbose=0)
                loss += history.history['val_loss'][-1]
            if loss < best_loss:
                best_loss = loss
                best_eta = eta

        # Refit on all training data with the selected learning rate.
        model = build_model(best_eta)
        history = model.fit(Kfu, y_train_normalized,
                            epochs=n_epochs, verbose=0)
        # Mean from the restored GP, variance from the network.
        if data_directory != 'year-prediction-MSD':
            m = np.squeeze(gp.predict(X_test_normalized))[0]
        else:
            m = np.squeeze(gp.predict_y(X_test_normalized))[0]
        v = np.squeeze(model.predict(Kfu_test)).T[1]
        if normalize:
            m = m * std_y_train + mean_y_train
            v = v * std_y_train**2
        perf[split, :2] = np.sqrt(np.mean(
            (y_test - m)**2)), -logpdf(y_test - m, v).mean()
        perf[split, 2] = best_eta

        # measure prediction time
        if data_directory != 'year-prediction-MSD':
            # Pure-numpy forward pass built from the trained weights.  Note
            # that ``m``, ``w`` and ``var`` are rebound here; this happens
            # after all Keras fitting/prediction for the split is done.
            U, Ub, _, _, w, wb = model.get_weights()
            m = gp.posterior.woodbury_vector
            var = 2 * gp.Gaussian_noise.variance

            def act(a):
                return np.log(1 + np.exp(a / var)) * var
            if normalize:
                def predict(X_test):
                    X_test_normalized = (X_test - mean_X_train) / std_X_train
                    K = gp.kern.K(X_test_normalized, gp.inducing_inputs)
                    return np.concatenate([
                        K.dot(m) * std_y_train + mean_y_train,
                        act(((K.dot(U) + Ub)**2).dot(w) + wb) * std_y_train**2
                    ], 1)
            else:
                def predict(X_test):
                    K = gp.kern.K(X_test, gp.inducing_inputs)
                    return np.concatenate(
                        [K.dot(m), act(((K.dot(U) + Ub)**2).dot(w) + wb)], 1)
        else:
            if normalize:
                def predict(X_test):
                    X_test_normalized = (X_test - mean_X_train) / std_X_train
                    K = gp.kern.compute_K(X_test_normalized,
                                          gp.feature.Z.value)
                    m, v = np.squeeze(model.predict(K)).T
                    return np.array(
                        [m * std_y_train + mean_y_train, v * std_y_train**2])
            else:
                def predict(X_test):
                    K = gp.kern.compute_K(X_test, gp.feature.Z.value)
                    m, v = np.squeeze(model.predict(K)).T
                    return np.array([m, v])
        for i in range(5):
            t = -time()
            _ = predict(X_test)
            t += time()
            perf[split, 3 + i] = t

    np.save('results/ANN/' + data_directory, perf)
def fitBioNN(normalize=True):
    """Evaluate ``BioNN`` on every data split and save scores and timings.

    For every split the saved VFE model is restored (GPy from
    ``results/VFE/*.hdf5``, or gpflow for 'year-prediction-MSD') and its
    inducing inputs and kernel lengthscale(s) are passed to ``BioNN``
    together with the training data.  The result array is saved to
    ``results/BioNN/<data_directory>``.

    Columns of the saved array, one row per split:
        0: test RMSE,
        1: mean of ``-logpdf(y_test - m, v)``,
        2: variance of the predicted variances (``v.var()``),
        3-7: wall-clock time of five calls to ``predict(X_test)``.

    Relies on the module-level names ``data_directory``, ``n_splits``,
    ``n_hidden``, ``_split_data``, ``logpdf`` and ``BioNN``.

    Args:
        normalize: Forwarded to ``_split_data``; when true, predictions are
            mapped back to the original target scale.

    Returns:
        None.
    """
    perf = np.nan * np.zeros((n_splits, 8))
    np.random.seed(1)
    for split in range(n_splits):
        (X_train_normalized, y_train_normalized, y_train,
         X_test_normalized, X_test, y_test,
         mean_X_train, std_X_train,
         mean_y_train, std_y_train) = _split_data(split, normalize)

        if data_directory != 'year-prediction-MSD':
            vfe = GPy.models.SparseGPRegression(
                X_train_normalized, y_train_normalized[:, None],
                GPy.kern.RBF(X_train_normalized.shape[1], ARD=True),
                num_inducing=n_hidden)
            # Restore the saved parameters; the context manager closes the
            # HDF5 file once they have been copied into the model.
            with h5py.File(
                    'results/VFE/%s_split%g.hdf5' % (data_directory, split),
                    'r') as param_file:
                vfe[:] = param_file['param_array']
            nn = BioNN(X_train_normalized, y_train_normalized[:, None],
                       vfe.inducing_inputs, vfe.kern.lengthscale)
        else:
            gpflow.reset_default_graph_and_session()
            saver = gpflow.saver.Saver()
            vfe = saver.load('results/VFE/%s_split%g' %
                             (data_directory, split))
            nn = BioNN(X_train_normalized, y_train_normalized[:, None],
                       vfe.feature.Z.value, vfe.kern.lengthscales.value)

        m, v = np.squeeze(nn.predict(X_test_normalized))
        if normalize:
            # Undo the target normalisation.
            m = m * std_y_train + mean_y_train
            v = v * std_y_train**2
        perf[split, :2] = np.sqrt(np.mean(
            (y_test - m)**2)), -logpdf(y_test - m, v).mean()
        perf[split, 2] = v.var()

        # measure prediction time
        if normalize:
            def predict(X_test):
                X_test_normalized = (X_test - mean_X_train) / std_X_train
                K = nn.kern.K(X_test_normalized, nn.inducing_inputs)
                m = K.dot(nn.w_mean)
                # Clipped at zero so the feature is never negative.
                SNRinv = np.maximum(1 - np.sum(K**2, 1), 0)
                v = np.vstack([SNRinv, np.ones(len(m))]).T.dot(nn.wb_var)
                return np.concatenate(
                    [m * std_y_train + mean_y_train, v * std_y_train**2], 1)
        else:
            def predict(X_test):
                K = nn.kern.K(X_test, nn.inducing_inputs)
                m = K.dot(nn.w_mean)
                SNRinv = np.maximum(1 - np.sum(K**2, 1), 0)
                v = np.vstack([SNRinv, np.ones(len(m))]).T.dot(nn.wb_var)
                return np.concatenate([m, v], 1)
        for i in range(5):
            t = -time()
            _ = predict(X_test)
            t += time()
            perf[split, 3 + i] = t

    np.save('results/BioNN/' + data_directory, perf)
def load_saved_gpflow_model(gpflow_model_path: str):
    """Load a saved gpflow model after resetting the default graph/session.

    Args:
        gpflow_model_path: Location passed to ``gpflow.saver.Saver().load``.

    Returns:
        The loaded model.
    """
    # Start from a clean graph so the restored model is the only thing in it.
    gpflow.reset_default_graph_and_session()
    return gpflow.saver.Saver().load(gpflow_model_path)
def fit(self, X, y):
    """Pick a prior variance by cross-validation, then fit the final model.

    Each value in ``self.variances_to_try`` is scored with
    ``MultiOutputGP.cross_val_score``; fold results are written beneath
    ``self.cv_save_dir``.  The winner is chosen with
    ``select_using_standard_error_rule`` over the candidates sorted by
    variance, and a model built with it is fitted on all of ``X``/``y``
    and stored on ``self.model``.  Sets ``self.is_fit``.
    """
    kern_fun = partial(
        self.kernel_fun, n_dims=X.shape[1], n_outputs=y.shape[1])

    def get_model(w_prior, bias_var):
        # We need to make a model creation function.
        return self.model_fun(
            kernel=kern_fun(w_prior=w_prior, bias_var=bias_var))

    records = []
    for cur_variance in self.variances_to_try:
        # Compute the bias variance so that we have a variance of 0.4
        # for that overall
        bias_var = 0.4 / cur_variance
        print(f'Fitting {cur_variance:.2f} with bias var {bias_var:.2f}')

        # Bind the current values now; the factory is called once per fold.
        make_model = partial(get_model, cur_variance, bias_var)
        cur_mean_score, cur_stderr = MultiOutputGP.cross_val_score(
            X, y, make_model,
            save_dir=join(self.cv_save_dir, f'{cur_variance:.4f}'),
            n_folds=self.n_folds)
        gpf.reset_default_graph_and_session()

        print(f'Mean likelihood is {cur_mean_score}')
        records.append({
            'mean': cur_mean_score,
            'stderr': cur_stderr,
            'variance': cur_variance
        })

    # Sort by ascending complexity
    scores = pd.DataFrame(records).sort_values('variance')

    # Find best index; invert mean since error rule expects errors,
    # where smaller is better, rather than likelihoods where higher is
    # better.
    negated_means = -scores['mean'].values
    best_idx = select_using_standard_error_rule(
        negated_means, scores['stderr'].values)
    best_variance = scores.iloc[best_idx]['variance']
    print(f'Selected model using one standard error rule has variance '
          f' {best_variance:.2f}')

    best_model = get_model(best_variance, 0.4 / best_variance)
    best_model.fit(X, y)

    self.is_fit = True
    self.model = best_model
def del_graph(self):
    """Reset gpflow's default graph and session; returns ``None``."""
    gpflow.reset_default_graph_and_session()
def cross_val_score(X, y, model_creation_fun, save_dir, n_folds=4):
    """Run K-fold cross-validation and write per-fold artefacts to disk.

    For each fold a fresh model from ``model_creation_fun()`` is fitted on
    the training part and saved to ``<save_dir>/fold_<k>``, together with
    its marginal predictions, the test targets, per-output and mean log
    loss tables, and an ``.npz`` file holding the fold's data and indices.
    Summary CSVs are written to ``save_dir`` at the end.

    Args:
        X: Inputs, indexable by the integer index arrays from ``KFold``.
        y: Targets, indexable the same way.
        model_creation_fun: Zero-argument callable returning an unfitted
            model.
        save_dir: Directory under which results are written.
        n_folds: Number of folds.

    Returns:
        Tuple of the mean of the per-fold mean log likelihoods and their
        standard error (``std / sqrt(n_folds)``).
    """
    splitter = KFold(n_splits=n_folds)
    fold_liks = np.empty(n_folds)

    for i, (cur_train_ind, cur_test_ind) in tqdm(
            enumerate(splitter.split(X, y))):
        train_X, train_y = X[cur_train_ind], y[cur_train_ind]

        # Each fold starts from a clean graph.
        gpf.reset_default_graph_and_session()
        model = model_creation_fun()
        model.fit(train_X, train_y)

        fold_dir = join(save_dir, f'fold_{i + 1}')
        os.makedirs(fold_dir, exist_ok=True)
        model.save_model(fold_dir)

        test_X, test_y = X[cur_test_ind], y[cur_test_ind]

        log_liks = model.calculate_log_likelihood(test_X, test_y)
        marg_pred = pd.DataFrame(
            model.predict_marginal_probabilities(test_X))
        marg_pred.to_csv(join(fold_dir, 'marginal_probs.csv'))

        y_t_df = pd.DataFrame(test_y)
        y_t_df.to_csv(join(fold_dir, 'y_t.csv'))

        # I am also interested in the log loss.
        neg_log_loss_results = multi_class_eval(
            marg_pred, y_t_df, neg_log_loss_with_labels, 'log_lik')
        neg_log_loss_results.to_csv(
            join(fold_dir, 'marginal_species_log_lik.csv'))
        pd.Series(neg_log_loss_results.mean()).to_csv(
            join(fold_dir, 'neg_log_loss_mean.csv'))

        fold_liks[i] = np.mean(log_liks)

        np.savez(
            join(fold_dir, 'cv_results'),
            site_log_liks=log_liks,
            cur_train_X=train_X,
            cur_train_y=train_y,
            cur_test_X=test_X,
            cur_test_y=test_y,
            train_ind=cur_train_ind,
            test_ind=cur_test_ind)

    mean_lik = np.mean(fold_liks)
    pd.Series({'mean_lik': mean_lik}).to_csv(
        join(save_dir, 'mean_lik.csv'))

    fold_labels = [f'fold_{k + 1}' for k in range(n_folds)]
    pd.Series(fold_liks, index=fold_labels).to_csv(
        join(save_dir, 'fold_liks.csv'))

    std_err = np.std(fold_liks) / np.sqrt(len(fold_liks))
    return np.mean(fold_liks), std_err