def main(path, task, representation, use_pca, n_trials, test_set_size, use_rmse_conf):
    """
    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n']
    :param representation: str specifying the molecular representation.
                           One of ['fingerprints', 'fragments', 'fragprints']
    :param use_pca: bool. If True apply PCA to perform Principal Components Regression.
    :param n_trials: int specifying number of random train/test splits to use
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set
    :param use_rmse_conf: bool specifying whether to compute the rmse confidence-error curves or the mae
                          confidence-error curves. True is the option for rmse.
    """

    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()
    X = featurise_mols(smiles_list, representation)

    # If True we perform Principal Components Regression
    if use_pca:
        n_components = 100
    else:
        n_components = None

    # We define the Gaussian Process Regression Model using the Tanimoto kernel
    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    r2_list = []
    rmse_list = []
    mae_list = []

    # We pre-allocate arrays for plotting confidence-error curves
    _, _, _, y_test = train_test_split(X, y, test_size=test_set_size)  # To get test set size
    n_test = len(y_test)

    rmse_confidence_list = np.zeros((n_trials, n_test))
    mae_confidence_list = np.zeros((n_trials, n_test))

    print('\nBeginning training loop...')

    for i in range(0, n_trials):

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size, random_state=i)

        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)

        # We standardise the outputs but leave the inputs unchanged
        _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test,
                                                         n_components=n_components, use_pca=use_pca)

        X_train = X_train.astype(np.float64)
        X_test = X_test.astype(np.float64)

        k = Tanimoto()
        m = gpflow.models.GPR(data=(X_train, y_train),
                              mean_function=Constant(np.mean(y_train)),
                              kernel=k,
                              noise_variance=1)

        # e_iso_pi best params:
        # {'learner': RandomForestRegressor(max_features=0.9348473830061558, n_estimators=381,
        #                                   n_jobs=1, random_state=2, verbose=False)}
        # e_iso_n best params:
        # {'learner': RandomForestRegressor(bootstrap=False, max_features=0.09944870853556087,
        #                                   min_samples_leaf=3, n_estimators=1295, n_jobs=1,
        #                                   random_state=0, verbose=False)}
        # z_iso_pi best params:
        # {'learner': RandomForestRegressor(max_depth=4, max_features=0.33072121415416944,
        #                                   n_estimators=2755, n_jobs=1, random_state=2,
        #                                   verbose=False)}
        # z_iso_n best params:
        # {'learner': RandomForestRegressor(max_features=None, n_estimators=892, n_jobs=1,
        #                                   random_state=3, verbose=False)}

        regr_rf = RandomForestRegressor(max_features=None, n_estimators=892, n_jobs=1,
                                        random_state=3, verbose=False)
        regr_rf.fit(X_train, y_train)

        # Optimise the kernel variance and noise level by the marginal likelihood
        opt = gpflow.optimizers.Scipy()
        opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=100))
        print_summary(m)

        # mean and variance GP prediction and RF prediction
        y_pred, y_var = m.predict_f(X_test)
        y_pred_rf = regr_rf.predict(X_test)
        y_pred_av = (y_pred + y_pred_rf.reshape(-1, 1)) / 2.0
        y_pred = y_scaler.inverse_transform(y_pred_av)
        y_test = y_scaler.inverse_transform(y_test)

        # Output Standardised RMSE and RMSE on Train Set
        y_pred_train, _ = m.predict_f(X_train)
        y_pred_train_rf = regr_rf.predict(X_train)
        y_pred_train = (y_pred_train + y_pred_train_rf.reshape(-1, 1)) / 2.0
        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
        train_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(y_train),
                                                y_scaler.inverse_transform(y_pred_train)))
        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
        print("Train RMSE: {:.3f}".format(train_rmse))

        score = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)

        print("\nR^2: {:.3f}".format(score))
        print("RMSE: {:.3f}".format(rmse))
        print("MAE: {:.3f}".format(mae))

        r2_list.append(score)
        rmse_list.append(rmse)
        mae_list.append(mae)

    r2_list = np.array(r2_list)
    rmse_list = np.array(rmse_list)
    mae_list = np.array(mae_list)

    print("\nmean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list))))
    print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list))))
    print("mean MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list) / np.sqrt(len(mae_list))))
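
# The scripts in this file import a Tanimoto kernel rather than one of GPflow's built-in kernels. For reference,
# a minimal sketch of such a kernel for fingerprint vectors is given below. This is illustrative only; the
# `Tanimoto` class actually imported by these scripts may differ in detail.

import tensorflow as tf
from gpflow.utilities import positive


class TanimotoKernelSketch(gpflow.kernels.Kernel):
    """Tanimoto (Jaccard) kernel: k(x, x') = sigma^2 * <x, x'> / (||x||^2 + ||x'||^2 - <x, x'>)."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.variance = gpflow.Parameter(1.0, transform=positive())

    def K(self, X, X2=None):
        if X2 is None:
            X2 = X
        cross = tf.matmul(X, X2, transpose_b=True)                    # <x, x'>
        sq_x = tf.reduce_sum(tf.square(X), axis=-1, keepdims=True)    # ||x||^2 as a column vector
        sq_x2 = tf.reduce_sum(tf.square(X2), axis=-1, keepdims=True)  # ||x'||^2 as a column vector
        return self.variance * cross / (sq_x + tf.transpose(sq_x2) - cross)

    def K_diag(self, X):
        # k(x, x) reduces to the kernel variance for any non-zero binary fingerprint
        return self.variance * tf.ones(tf.shape(X)[:-1], dtype=X.dtype)
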
def main(path, representation):
    """
    :param path: str specifying path to dataset.
    :param representation: str specifying the molecular representation.
                           One of ['fingerprints', 'fragments', 'fragprints']
    """

    task = 'e_iso_pi'  # task always e_iso_pi with human performance comparison
    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()
    X = featurise_mols(smiles_list, representation)

    # 5 test molecules
    test_smiles = ['BrC1=CC=C(/N=N/C2=CC=CC=C2)C=C1',
                   'O=[N+]([O-])C1=CC=C(/N=N/C2=CC=CC=C2)C=C1',
                   'CC(C=C1)=CC=C1/N=N/C2=CC=C(N(C)C)C=C2',
                   'BrC1=CC([N+]([O-])=O)=CC([N+]([O-])=O)=C1/N=N/C2=CC([H])=C(C=C2[H])N(CC)CC',
                   'ClC%11=CC([N+]([O-])=O)=CC(C#N)=C%11/N=N/C%12=CC([H])=C(C=C%12OC)N(CC)CC']

    # and their indices in the loaded data
    test_smiles_indices = [116, 131, 168, 221, 229]

    X_train = np.delete(X, np.array(test_smiles_indices), axis=0)
    y_train = np.delete(y, np.array(test_smiles_indices))
    X_test = X[[116, 131, 168, 221, 229]]

    # experimental wavelength values in EtOH. Main csv file has 400nm instead of 407nm because measurement was
    # under a different solvent
    y_test = y[[116, 131, 168, 221, 229]]
    y_test[2] = 407.

    y_train = y_train.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    # # We standardise the outputs but leave the inputs unchanged
    # _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test)

    X_train = X_train.astype(np.float64)
    X_test = X_test.astype(np.float64)

    data_loader_z_iso_pi = TaskDataLoader('z_iso_pi', path)
    data_loader_e_iso_n = TaskDataLoader('e_iso_n', path)
    data_loader_z_iso_n = TaskDataLoader('z_iso_n', path)

    smiles_list_z_iso_pi, y_z_iso_pi = data_loader_z_iso_pi.load_property_data()
    smiles_list_e_iso_n, y_e_iso_n = data_loader_e_iso_n.load_property_data()
    smiles_list_z_iso_n, y_z_iso_n = data_loader_z_iso_n.load_property_data()

    y_z_iso_pi = y_z_iso_pi.reshape(-1, 1)
    y_e_iso_n = y_e_iso_n.reshape(-1, 1)
    y_z_iso_n = y_z_iso_n.reshape(-1, 1)

    X_z_iso_pi = featurise_mols(smiles_list_z_iso_pi, representation)
    X_e_iso_n = featurise_mols(smiles_list_e_iso_n, representation)
    X_z_iso_n = featurise_mols(smiles_list_z_iso_n, representation)

    output_dim = 4  # Number of outputs
    rank = 1  # Rank of W
    feature_dim = len(X_train[0, :])

    tanimoto_active_dims = [i for i in range(feature_dim)]  # active dims for Tanimoto base kernel.

    # We define the Gaussian Process Regression Model using the Tanimoto kernel
    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension
    X_augmented = np.vstack((np.append(X_train, np.zeros((len(X_train), 1)), axis=1),
                             np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1),
                             np.append(X_e_iso_n, np.ones((len(X_e_iso_n), 1)) * 2, axis=1),
                             np.append(X_z_iso_n, np.ones((len(X_z_iso_n), 1)) * 3, axis=1)))

    X_test = np.append(X_test, np.zeros((len(X_test), 1)), axis=1)
    X_train = np.append(X_train, np.zeros((len(X_train), 1)), axis=1)

    # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods
    Y_augmented = np.vstack((np.hstack((y_train, np.zeros_like(y_train))),
                             np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))),
                             np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)),
                             np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3))))

    y_test = np.hstack((y_test, np.zeros_like(y_test)))

    # Base kernel
    k = Tanimoto(active_dims=tanimoto_active_dims)
    # set_trainable(k.variance, False)

    # Coregion kernel
    coreg = gpflow.kernels.Coregion(output_dim=output_dim, rank=rank, active_dims=[feature_dim])

    # Create product kernel
    kern = k * coreg

    # This likelihood switches between Gaussian noise with different variances for each f_i:
    lik = gpflow.likelihoods.SwitchedLikelihood([gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian(),
                                                 gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian()])

    # now build the GP model as normal
    m = gpflow.models.VGP((X_augmented, Y_augmented), mean_function=Constant(np.mean(y_train[:, 0])),
                          kernel=kern, likelihood=lik)

    # fit the covariance function parameters
    maxiter = ci_niter(1000)
    gpflow.optimizers.Scipy().minimize(m.training_loss, m.trainable_variables,
                                       options=dict(maxiter=maxiter), method="L-BFGS-B")
    print_summary(m)

    # mean and variance GP prediction
    y_pred, y_var = m.predict_f(X_test)

    # Output Standardised RMSE and RMSE on Train Set. Outputs are not standardised in the multitask setting,
    # so the two values coincide.
    y_pred_train, _ = m.predict_f(X_train)
    train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
    print("Train RMSE: {:.3f}".format(train_rmse))

    r2 = r2_score(y_test[:, 0], y_pred)
    rmse = np.sqrt(mean_squared_error(y_test[:, 0], y_pred))
    mae = mean_absolute_error(y_test[:, 0], y_pred)
    per_molecule = np.diag(abs(y_pred - y_test[:, 0]))

    print("\n Averaged test statistics are")
    print("\nR^2: {:.3f}".format(r2))
    print("RMSE: {:.3f}".format(rmse))
    print("MAE: {:.3f}".format(mae))
    print("\nAbsolute error per molecule is {} ".format(per_molecule))
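
# The product kernel `k * coreg` above implements the intrinsic coregionalisation model: the covariance between
# an observation on task s and one on task t is k_Tanimoto(x, x') * B[s, t], with B = W W^T + diag(kappa).
# A small sketch (not part of the original script) for inspecting the learned task covariance after training:

def inspect_task_covariance(coreg_kernel):
    # coreg_kernel is the gpflow.kernels.Coregion instance used to build the VGP model above.
    B = coreg_kernel.output_covariance().numpy()            # B = W W^T + diag(kappa)
    corr = B / np.sqrt(np.outer(np.diag(B), np.diag(B)))    # inter-task correlations
    print("Task covariance B:\n", B)
    print("Task correlations:\n", corr)
    return B, corr
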
def main(path, task, representation, use_pca, n_trials, test_set_size, use_rmse_conf):
    """
    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n']
    :param representation: str specifying the molecular representation.
                           One of ['fingerprints', 'fragments', 'fragprints']
    :param use_pca: bool. If True apply PCA to perform Principal Components Regression.
    :param n_trials: int specifying number of random train/test splits to use
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set
    :param use_rmse_conf: bool specifying whether to compute the rmse confidence-error curves or the mae
                          confidence-error curves. True is the option for rmse.
    """

    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()
    X = featurise_mols(smiles_list, representation)

    # If True we perform Principal Components Regression
    if use_pca:
        n_components = 100
    else:
        n_components = None

    # We define the Gaussian Process Regression Model using the Tanimoto kernel
    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    r2_list = []
    rmse_list = []
    mae_list = []

    # We pre-allocate arrays for plotting confidence-error curves
    _, _, _, y_test = train_test_split(X, y, test_size=test_set_size)  # To get test set size
    n_test = len(y_test)

    rmse_confidence_list = np.zeros((n_trials, n_test))
    mae_confidence_list = np.zeros((n_trials, n_test))

    print('\nBeginning training loop...')

    for i in range(0, n_trials):

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size, random_state=i)

        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)

        # We standardise the outputs but leave the inputs unchanged
        _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test,
                                                         n_components=n_components, use_pca=use_pca)

        X_train = X_train.astype(np.float64)
        X_test = X_test.astype(np.float64)

        k = Tanimoto()
        m = gpflow.models.GPR(data=(X_train, y_train),
                              mean_function=Constant(np.mean(y_train)),
                              kernel=k,
                              noise_variance=1)

        # Optimise the kernel variance and noise level by the marginal likelihood
        opt = gpflow.optimizers.Scipy()
        opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=10000))
        print_summary(m)

        # mean and variance GP prediction
        y_pred, y_var = m.predict_f(X_test)
        y_pred = y_scaler.inverse_transform(y_pred)
        y_test = y_scaler.inverse_transform(y_test)

        # Compute scores for confidence curve plotting.
        ranked_confidence_list = np.argsort(y_var, axis=0).flatten()

        # (loop variable renamed to j so that it does not shadow the kernel k above)
        for j in range(len(y_test)):

            # Construct the RMSE error for each level of confidence
            conf = ranked_confidence_list[0:j + 1]
            rmse = np.sqrt(mean_squared_error(y_test[conf], y_pred[conf]))
            rmse_confidence_list[i, j] = rmse

            # Construct the MAE error for each level of confidence
            mae = mean_absolute_error(y_test[conf], y_pred[conf])
            mae_confidence_list[i, j] = mae

        # Output Standardised RMSE and RMSE on Train Set
        y_pred_train, _ = m.predict_f(X_train)
        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
        train_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(y_train),
                                                y_scaler.inverse_transform(y_pred_train)))
        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
        print("Train RMSE: {:.3f}".format(train_rmse))

        score = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)

        print("\nR^2: {:.3f}".format(score))
        print("RMSE: {:.3f}".format(rmse))
        print("MAE: {:.3f}".format(mae))

        r2_list.append(score)
        rmse_list.append(rmse)
        mae_list.append(mae)

    r2_list = np.array(r2_list)
    rmse_list = np.array(rmse_list)
    mae_list = np.array(mae_list)

    print("\nmean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list))))
    print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list))))
    print("mean MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list) / np.sqrt(len(mae_list))))

    # Plot confidence-error curves

    # 1e-14 instead of 0 to stop weirdness with len(y_test) = 29
    confidence_percentiles = np.arange(1e-14, 100, 100 / len(y_test))

    if use_rmse_conf:

        rmse_mean = np.mean(rmse_confidence_list, axis=0)
        rmse_std = np.std(rmse_confidence_list, axis=0)

        # We flip because we want the most confident predictions on the right-hand side of the plot
        rmse_mean = np.flip(rmse_mean)
        rmse_std = np.flip(rmse_std)

        # One-sigma error bars
        lower = rmse_mean - rmse_std
        upper = rmse_mean + rmse_std

        plt.plot(confidence_percentiles, rmse_mean, label='mean')
        plt.fill_between(confidence_percentiles, lower, upper, alpha=0.2)
        plt.xlabel('Confidence Percentile')
        plt.ylabel('RMSE (nm)')
        plt.ylim([0, np.max(upper) + 1])
        plt.xlim([0, 100 * ((len(y_test) - 1) / len(y_test))])
        plt.yticks(np.arange(0, np.max(upper) + 1, 5.0))
        plt.savefig(task + '/results/gpr/{}_confidence_curve_rmse.png'.format(representation))
        plt.show()

    else:

        # We plot the Mean-absolute error confidence-error curves
        mae_mean = np.mean(mae_confidence_list, axis=0)
        mae_std = np.std(mae_confidence_list, axis=0)

        mae_mean = np.flip(mae_mean)
        mae_std = np.flip(mae_std)

        lower = mae_mean - mae_std
        upper = mae_mean + mae_std

        plt.plot(confidence_percentiles, mae_mean, label='mean')
        plt.fill_between(confidence_percentiles, lower, upper, alpha=0.2)
        plt.xlabel('Confidence Percentile')
        plt.ylabel('MAE (nm)')
        plt.ylim([0, np.max(upper) + 1])
        plt.xlim([0, 100 * ((len(y_test) - 1) / len(y_test))])
        plt.yticks(np.arange(0, np.max(upper) + 1, 5.0))
        plt.savefig(task + '/results/gpr/{}_confidence_curve_mae.png'.format(representation))
        plt.show()
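
# Several of the scripts above call transform_data to standardise the outputs and optionally project the inputs
# with PCA before fitting. A sketch of what such a helper can look like (hypothetical; the project's own
# transform_data may differ in detail):

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


def transform_data_sketch(X_train, y_train, X_test, y_test, n_components=None, use_pca=False):
    # Standardise the outputs and keep the fitted scaler so predictions can be mapped back to nm.
    y_scaler = StandardScaler()
    y_train_scaled = y_scaler.fit_transform(y_train)
    y_test_scaled = y_scaler.transform(y_test)
    # Optionally project the inputs onto the leading principal components (Principal Components Regression).
    if use_pca:
        pca = PCA(n_components=n_components)
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)
    return X_train, y_train_scaled, X_test, y_test_scaled, y_scaler
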
def main(path, path_to_dft_dataset, task, representation, theory_level):
    """
    :param path: str specifying path to photoswitches.csv file.
    :param path_to_dft_dataset: str specifying path to dft_comparison.csv file.
    :param task: str specifying the task. e_iso_pi is the only task supported for the TD-DFT comparison.
    :param representation: str specifying the molecular representation.
                           One of ['fingerprints', 'fragments', 'fragprints']
    :param theory_level: str giving the level of theory to compare against - one of ['CAM-B3LYP', 'PBE0']
    """

    data_loader = TaskDataLoader(task, path)
    smiles_list, _, pbe0_vals, cam_vals, experimental_vals = data_loader.load_dft_comparison_data(path_to_dft_dataset)

    X = featurise_mols(smiles_list, representation)

    # Keep only non-duplicate entries because we're not considering effects of solvent
    non_duplicate_indices = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]])
    X = X[non_duplicate_indices, :]
    experimental_vals = experimental_vals[non_duplicate_indices]

    non_dup_pbe0 = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]])
    non_dup_cam = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]])
    pbe0_vals = pbe0_vals[non_dup_pbe0]
    cam_vals = cam_vals[non_dup_cam]

    # molecules with dft values to be split into train/test
    if theory_level == 'CAM-B3LYP':
        X_with_dft = np.delete(X, np.argwhere(np.isnan(cam_vals)), axis=0)
        y_with_dft = np.delete(experimental_vals, np.argwhere(np.isnan(cam_vals)))
        # DFT values for the CAM-B3LYP level of theory
        dft_vals = np.delete(cam_vals, np.argwhere(np.isnan(cam_vals)))
        # molecules with no dft vals must go into the training set.
        X_no_dft = np.delete(X, np.argwhere(~np.isnan(cam_vals)), axis=0)
        y_no_dft = np.delete(experimental_vals, np.argwhere(~np.isnan(cam_vals)))
    else:
        X_with_dft = np.delete(X, np.argwhere(np.isnan(pbe0_vals)), axis=0)
        y_with_dft = np.delete(experimental_vals, np.argwhere(np.isnan(pbe0_vals)))
        # DFT values for the PBE0 level of theory
        dft_vals = np.delete(pbe0_vals, np.argwhere(np.isnan(pbe0_vals)))
        # molecules with no dft vals must go into the training set.
        X_no_dft = np.delete(X, np.argwhere(~np.isnan(pbe0_vals)), axis=0)
        y_no_dft = np.delete(experimental_vals, np.argwhere(~np.isnan(pbe0_vals)))

    mae_list = []
    dft_mae_list = []

    # We define the Gaussian Process optimisation objective
    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    print('\nBeginning training loop...')

    for i in range(len(y_with_dft)):

        X_train = np.delete(X_with_dft, i, axis=0)
        y_train = np.delete(y_with_dft, i)
        X_test = X_with_dft[i].reshape(1, -1)
        y_test = y_with_dft[i]
        dft_test = dft_vals[i]

        X_train = np.concatenate((X_train, X_no_dft))
        y_train = np.concatenate((y_train, y_no_dft))
        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)

        # We standardise the outputs but leave the inputs unchanged
        _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test)

        X_train = X_train.astype(np.float64)
        X_test = X_test.astype(np.float64)

        k = Tanimoto()
        m = gpflow.models.GPR(data=(X_train, y_train),
                              mean_function=Constant(np.mean(y_train)),
                              kernel=k,
                              noise_variance=1)

        # Optimise the kernel variance and noise level by the marginal likelihood
        opt = gpflow.optimizers.Scipy()
        opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=100))
        print_summary(m)

        # Output Standardised RMSE and RMSE on Train Set
        y_pred_train, _ = m.predict_f(X_train)
        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
        train_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(y_train),
                                                y_scaler.inverse_transform(y_pred_train)))
        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
        print("Train RMSE: {:.3f}".format(train_rmse))

        # mean and variance GP prediction
        y_pred, y_var = m.predict_f(X_test)
        y_pred = y_scaler.inverse_transform(y_pred)
        y_test = y_scaler.inverse_transform(y_test)

        # Output MAE for this trial
        mae = abs(y_test - y_pred)
        print("MAE: {}".format(mae))

        # Store values in order to compute the mean and standard error of the statistics across trials
        mae_list.append(mae)

        # DFT prediction scores on the same trial
        dft_mae = abs(y_test - dft_test)
        dft_mae_list.append(dft_mae)

    mae_list = np.array(mae_list)
    dft_mae_list = np.array(dft_mae_list)

    print("\nmean GP-Tanimoto MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list),
                                                              np.std(mae_list) / np.sqrt(len(mae_list))))
    print("mean {} MAE: {:.4f} +- {:.4f}\n".format(theory_level, np.mean(dft_mae_list),
                                                   np.std(dft_mae_list) / np.sqrt(len(dft_mae_list))))
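
# The manual np.delete loop above is a leave-one-out split over the DFT-labelled molecules, with the
# DFT-unlabelled molecules appended to every training set. An equivalent sketch using sklearn's splitter
# (a hypothetical refactor, not the project's code):

from sklearn.model_selection import LeaveOneOut


def loo_splits_sketch(X_with_dft, y_with_dft, X_no_dft, y_no_dft):
    # Yields (X_train, y_train, X_test, y_test) with a single held-out DFT-labelled molecule per split.
    for train_idx, test_idx in LeaveOneOut().split(X_with_dft):
        X_train = np.concatenate((X_with_dft[train_idx], X_no_dft))
        y_train = np.concatenate((y_with_dft[train_idx], y_no_dft))
        yield X_train, y_train, X_with_dft[test_idx], y_with_dft[test_idx]
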
def objective_closure():
    return -m.log_marginal_likelihood()

# We standardise the outputs but leave the inputs unchanged. Equivalent to transform_data used in other scripts.
y_train = y_train.reshape(-1, 1)
y_scaler = StandardScaler()
y_train = y_scaler.fit_transform(y_train)

X_train = X_train.astype(np.float64)
X_test = X_test.astype(np.float64)

# Fit GP
k = Tanimoto()
m = gpflow.models.GPR(data=(X_train, y_train),
                      mean_function=Constant(np.mean(y_train)),
                      kernel=k,
                      noise_variance=1)

# Optimise the kernel variance and noise level by the marginal likelihood
opt = gpflow.optimizers.Scipy()
opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=100))
print_summary(m)

# mean and variance GP prediction
def main(path, representation):
    """
    :param path: str specifying path to dataset.
    :param representation: str specifying the molecular representation.
                           One of ['fingerprints', 'fragments', 'fragprints']
    """

    task = 'e_iso_pi'  # Always e_iso_pi for human performance comparison
    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()
    X = featurise_mols(smiles_list, representation)

    # 5 test molecules
    test_smiles = ['BrC1=CC=C(/N=N/C2=CC=CC=C2)C=C1',
                   'O=[N+]([O-])C1=CC=C(/N=N/C2=CC=CC=C2)C=C1',
                   'CC(C=C1)=CC=C1/N=N/C2=CC=C(N(C)C)C=C2',
                   'BrC1=CC([N+]([O-])=O)=CC([N+]([O-])=O)=C1/N=N/C2=CC([H])=C(C=C2[H])N(CC)CC',
                   'ClC%11=CC([N+]([O-])=O)=CC(C#N)=C%11/N=N/C%12=CC([H])=C(C=C%12OC)N(CC)CC']

    # and their indices in the loaded data
    test_smiles_indices = [116, 131, 168, 221, 229]

    X_train = np.delete(X, np.array(test_smiles_indices), axis=0)
    y_train = np.delete(y, np.array(test_smiles_indices))
    X_test = X[[116, 131, 168, 221, 229]]

    # experimental wavelength values in EtOH. Main csv file has 400nm instead of 407nm because measurement was
    # under a different solvent
    y_test = y[[116, 131, 168, 221, 229]]
    y_test[2] = 407.

    y_train = y_train.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    # We standardise the outputs but leave the inputs unchanged
    _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test)

    X_train = X_train.astype(np.float64)
    X_test = X_test.astype(np.float64)

    num_features = np.shape(X)[1]

    # We define the Gaussian Process Regression Model using the Tanimoto kernel
    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    # for plotting confidence-error curves
    rmse_confidence_list = []
    mae_confidence_list = []

    k = Tanimoto()
    m = gpflow.models.GPR(data=(X_train, y_train),
                          mean_function=Constant(np.mean(y_train)),
                          kernel=k,
                          noise_variance=1)

    # Optimise the kernel variance and noise level by the marginal likelihood
    opt = gpflow.optimizers.Scipy()
    opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=100))
    print_summary(m)

    # mean and variance GP prediction
    y_pred, y_var = m.predict_f(X_test)
    y_pred = y_scaler.inverse_transform(y_pred)
    y_test = y_scaler.inverse_transform(y_test)

    # Output Standardised RMSE and RMSE on Train Set
    y_pred_train, _ = m.predict_f(X_train)
    train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
    train_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(y_train),
                                            y_scaler.inverse_transform(y_pred_train)))
    print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
    print("Train RMSE: {:.3f}".format(train_rmse))

    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    per_molecule = abs(y_pred - y_test)

    print("\n Averaged test statistics are")
    print("\nR^2: {:.3f}".format(r2))
    print("RMSE: {:.3f}".format(rmse))
    print("MAE: {:.3f}".format(mae))
    print("\nAbsolute error per molecule is {} ".format(per_molecule))
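
# If nm-scale error bars are wanted alongside the point predictions above, the predictive variance can be
# rescaled through the fitted y_scaler. A sketch, assuming the fitted `m`, `X_test` and `y_scaler` from the
# script above; predict_y adds the learned observation noise to the latent variance:

def predictive_std_in_nm(model, X_test, y_scaler):
    _, y_var_standardised = model.predict_y(X_test)
    # Undo the StandardScaler on the standard deviation (scale only; the mean shift does not affect spreads).
    return np.sqrt(y_var_standardised.numpy()) * y_scaler.scale_
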
                             np.append(X_train, np.ones((len(X_train), 1)) * 3, axis=1)))

    X_test = np.append(X_test, np.ones((len(X_test), 1)) * 3, axis=1)
    X_train = np.append(X_train, np.ones((len(X_train), 1)) * 3, axis=1)

    # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods
    Y_augmented = np.vstack((np.hstack((y_e_iso_pi, np.zeros_like(y_e_iso_pi))),
                             np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))),
                             np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)),
                             np.hstack((y_train, np.ones_like(y_train) * 3))))

    # Fit GP

    # Base kernel
    k = Tanimoto(active_dims=tanimoto_active_dims)
    # set_trainable(k.variance, False)

    # Coregion kernel
    coreg = gpflow.kernels.Coregion(output_dim=output_dim, rank=rank, active_dims=[feature_dim])

    # Create product kernel
    kern = k * coreg

    # This likelihood switches between Gaussian noise with different variances for each f_i:
    lik = gpflow.likelihoods.SwitchedLikelihood([gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian(),
                                                 gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian()])

    # now build the GP model as normal
    m = gpflow.models.VGP((X_augmented, Y_augmented), mean_function=Constant(np.mean(y_train[:, 0])),
                          kernel=kern, likelihood=lik)
def main(path, task, representation, use_pca, n_trials, test_set_size):
    """
    Train a multioutput GP simultaneously on all tasks of the photoswitch dataset.

    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n']
    :param representation: str specifying the molecular representation.
                           One of ['fingerprints', 'fragments', 'fragprints']
    :param use_pca: bool. If True apply PCA to perform Principal Components Regression.
    :param n_trials: int specifying number of random train/test splits to use
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set
    """

    # If True we perform Principal Components Regression
    if use_pca:
        n_components = 100
    else:
        n_components = None

    data_loader_e_iso_pi = TaskDataLoader('e_iso_pi', path)
    data_loader_z_iso_pi = TaskDataLoader('z_iso_pi', path)
    data_loader_e_iso_n = TaskDataLoader('e_iso_n', path)
    data_loader_z_iso_n = TaskDataLoader('z_iso_n', path)

    smiles_list_e_iso_pi, y_e_iso_pi = data_loader_e_iso_pi.load_property_data()
    smiles_list_z_iso_pi, y_z_iso_pi = data_loader_z_iso_pi.load_property_data()
    smiles_list_e_iso_n, y_e_iso_n = data_loader_e_iso_n.load_property_data()
    smiles_list_z_iso_n, y_z_iso_n = data_loader_z_iso_n.load_property_data()

    y_e_iso_pi = y_e_iso_pi.reshape(-1, 1)
    y_z_iso_pi = y_z_iso_pi.reshape(-1, 1)
    y_e_iso_n = y_e_iso_n.reshape(-1, 1)
    y_z_iso_n = y_z_iso_n.reshape(-1, 1)

    X_e_iso_pi = featurise_mols(smiles_list_e_iso_pi, representation)
    X_z_iso_pi = featurise_mols(smiles_list_z_iso_pi, representation)
    X_e_iso_n = featurise_mols(smiles_list_e_iso_n, representation)
    X_z_iso_n = featurise_mols(smiles_list_z_iso_n, representation)

    output_dim = 4  # Number of outputs
    rank = 1  # Rank of W
    feature_dim = len(X_e_iso_pi[0, :])

    tanimoto_active_dims = [i for i in range(feature_dim)]  # active dims for Tanimoto base kernel.
    r2_list = []
    rmse_list = []
    mae_list = []

    print('\nBeginning training loop...')

    for i in range(0, n_trials):

        if task == 'e_iso_pi':
            X_task = X_e_iso_pi
            y_task = y_e_iso_pi
        elif task == 'z_iso_pi':
            X_task = X_z_iso_pi
            y_task = y_z_iso_pi
        elif task == 'e_iso_n':
            X_task = X_e_iso_n
            y_task = y_e_iso_n
        else:
            X_task = X_z_iso_n
            y_task = y_z_iso_n

        X_train, X_test, y_train, y_test = train_test_split(X_task, y_task, test_size=test_set_size, random_state=i)

        X_train = X_train.astype(np.float64)
        X_test = X_test.astype(np.float64)

        if task == 'e_iso_pi':
            # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension
            X_augmented = np.vstack((np.append(X_train, np.zeros((len(X_train), 1)), axis=1),
                                     np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1),
                                     np.append(X_e_iso_n, np.ones((len(X_e_iso_n), 1)) * 2, axis=1),
                                     np.append(X_z_iso_n, np.ones((len(X_z_iso_n), 1)) * 3, axis=1)))

            X_test = np.append(X_test, np.zeros((len(X_test), 1)), axis=1)
            X_train = np.append(X_train, np.zeros((len(X_train), 1)), axis=1)

            # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods
            Y_augmented = np.vstack((np.hstack((y_train, np.zeros_like(y_train))),
                                     np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))),
                                     np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)),
                                     np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3))))

            y_test = np.hstack((y_test, np.zeros_like(y_test)))

        elif task == 'z_iso_pi':
            # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension
            X_augmented = np.vstack((np.append(X_e_iso_pi, np.zeros((len(X_e_iso_pi), 1)), axis=1),
                                     np.append(X_train, np.ones((len(X_train), 1)), axis=1),
                                     np.append(X_e_iso_n, np.ones((len(X_e_iso_n), 1)) * 2, axis=1),
                                     np.append(X_z_iso_n, np.ones((len(X_z_iso_n), 1)) * 3, axis=1)))

            X_test = np.append(X_test, np.ones((len(X_test), 1)), axis=1)
            X_train = np.append(X_train, np.ones((len(X_train), 1)), axis=1)

            # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods
            Y_augmented = np.vstack((np.hstack((y_e_iso_pi, np.zeros_like(y_e_iso_pi))),
                                     np.hstack((y_train, np.ones_like(y_train))),
                                     np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)),
                                     np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3))))

            y_test = np.hstack((y_test, np.ones_like(y_test)))

        elif task == 'e_iso_n':
            # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension
            X_augmented = np.vstack((np.append(X_e_iso_pi, np.zeros((len(X_e_iso_pi), 1)), axis=1),
                                     np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1),
                                     np.append(X_train, np.ones((len(X_train), 1)) * 2, axis=1),
                                     np.append(X_z_iso_n, np.ones((len(X_z_iso_n), 1)) * 3, axis=1)))

            X_test = np.append(X_test, np.ones((len(X_test), 1)) * 2, axis=1)
            X_train = np.append(X_train, np.ones((len(X_train), 1)) * 2, axis=1)

            # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods
            Y_augmented = np.vstack((np.hstack((y_e_iso_pi, np.zeros_like(y_e_iso_pi))),
                                     np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))),
                                     np.hstack((y_train, np.ones_like(y_train) * 2)),
                                     np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3))))

            y_test = np.hstack((y_test, np.ones_like(y_test) * 2))

        else:
            # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension
            X_augmented = np.vstack((np.append(X_e_iso_pi, np.zeros((len(X_e_iso_pi), 1)), axis=1),
                                     np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1),
                                     np.append(X_e_iso_n, np.ones((len(X_e_iso_n), 1)) * 2, axis=1),
                                     np.append(X_train, np.ones((len(X_train), 1)) * 3, axis=1)))

            X_test = np.append(X_test, np.ones((len(X_test), 1)) * 3, axis=1)
            X_train = np.append(X_train, np.ones((len(X_train), 1)) * 3, axis=1)

            # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods
            Y_augmented = np.vstack((np.hstack((y_e_iso_pi, np.zeros_like(y_e_iso_pi))),
                                     np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))),
                                     np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)),
                                     np.hstack((y_train, np.ones_like(y_train) * 3))))

            y_test = np.hstack((y_test, np.ones_like(y_test) * 3))

        # Base kernel
        k = Tanimoto(active_dims=tanimoto_active_dims)
        # set_trainable(k.variance, False)

        # Coregion kernel
        coreg = gpflow.kernels.Coregion(output_dim=output_dim, rank=rank, active_dims=[feature_dim])

        # Create product kernel
        kern = k * coreg

        # This likelihood switches between Gaussian noise with different variances for each f_i:
        lik = gpflow.likelihoods.SwitchedLikelihood([gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian(),
                                                     gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian()])

        # now build the GP model as normal
        m = gpflow.models.VGP((X_augmented, Y_augmented), mean_function=Constant(np.mean(y_train[:, 0])),
                              kernel=kern, likelihood=lik)

        # fit the covariance function parameters
        maxiter = ci_niter(1000)
        gpflow.optimizers.Scipy().minimize(m.training_loss, m.trainable_variables,
                                           options=dict(maxiter=maxiter), method="L-BFGS-B")
        print_summary(m)

        # mean and variance GP prediction
        y_pred, y_var = m.predict_f(X_test)

        # Output Standardised RMSE and RMSE on Train Set. Outputs are not standardised in the multitask setting,
        # so the two values coincide.
        y_pred_train, _ = m.predict_f(X_train)
        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
        print("Train RMSE: {:.3f}".format(train_rmse))

        score = r2_score(y_test[:, 0], y_pred)
        rmse = np.sqrt(mean_squared_error(y_test[:, 0], y_pred))
        mae = mean_absolute_error(y_test[:, 0], y_pred)

        print("\nR^2: {:.3f}".format(score))
        print("RMSE: {:.3f}".format(rmse))
        print("MAE: {:.3f}".format(mae))

        r2_list.append(score)
        rmse_list.append(rmse)
        mae_list.append(mae)

        B = coreg.output_covariance().numpy()
        print("B =", B)
        _ = plt.imshow(B)

    r2_list = np.array(r2_list)
    rmse_list = np.array(rmse_list)
    mae_list = np.array(mae_list)

    print("\nmean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list))))
    print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list))))
    print("mean MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list) / np.sqrt(len(mae_list))))
def main(path, path_to_dft_dataset, representation, theory_level):
    """
    :param path: str specifying path to photoswitches.csv file.
    :param path_to_dft_dataset: str specifying path to dft_comparison.csv file.
    :param representation: str specifying the molecular representation.
                           One of ['fingerprints', 'fragments', 'fragprints']
    :param theory_level: str giving the level of theory to compare against - one of ['CAM-B3LYP', 'PBE0']
    """

    task = 'e_iso_pi'  # e_iso_pi is the only task supported for the TD-DFT comparison
    data_loader = TaskDataLoader(task, path)
    smiles_list, _, pbe0_vals, cam_vals, experimental_vals = data_loader.load_dft_comparison_data(path_to_dft_dataset)

    X = featurise_mols(smiles_list, representation)

    # Keep only non-duplicate entries because we're not considering effects of solvent
    non_duplicate_indices = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]])
    X = X[non_duplicate_indices, :]
    experimental_vals = experimental_vals[non_duplicate_indices]

    non_dup_pbe0 = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]])
    non_dup_cam = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]])
    pbe0_vals = pbe0_vals[non_dup_pbe0]
    cam_vals = cam_vals[non_dup_cam]

    # molecules with dft values to be split into train/test
    if theory_level == 'CAM-B3LYP':
        X_with_dft = np.delete(X, np.argwhere(np.isnan(cam_vals)), axis=0)
        y_with_dft = np.delete(experimental_vals, np.argwhere(np.isnan(cam_vals)))
        # DFT values for the CAM-B3LYP level of theory
        dft_vals = np.delete(cam_vals, np.argwhere(np.isnan(cam_vals)))
        # molecules with no dft vals must go into the training set.
        X_no_dft = np.delete(X, np.argwhere(~np.isnan(cam_vals)), axis=0)
        y_no_dft = np.delete(experimental_vals, np.argwhere(~np.isnan(cam_vals)))
    else:
        X_with_dft = np.delete(X, np.argwhere(np.isnan(pbe0_vals)), axis=0)
        y_with_dft = np.delete(experimental_vals, np.argwhere(np.isnan(pbe0_vals)))
        # DFT values for the PBE0 level of theory
        dft_vals = np.delete(pbe0_vals, np.argwhere(np.isnan(pbe0_vals)))
        # molecules with no dft vals must go into the training set.
        X_no_dft = np.delete(X, np.argwhere(~np.isnan(pbe0_vals)), axis=0)
        y_no_dft = np.delete(experimental_vals, np.argwhere(~np.isnan(pbe0_vals)))

    # Load in the other property values for multitask learning. e_iso_pi is always the task in this instance.
    data_loader_z_iso_pi = TaskDataLoader('z_iso_pi', path)
    data_loader_e_iso_n = TaskDataLoader('e_iso_n', path)
    data_loader_z_iso_n = TaskDataLoader('z_iso_n', path)

    smiles_list_z_iso_pi, y_z_iso_pi = data_loader_z_iso_pi.load_property_data()
    smiles_list_e_iso_n, y_e_iso_n = data_loader_e_iso_n.load_property_data()
    smiles_list_z_iso_n, y_z_iso_n = data_loader_z_iso_n.load_property_data()

    y_z_iso_pi = y_z_iso_pi.reshape(-1, 1)
    y_e_iso_n = y_e_iso_n.reshape(-1, 1)
    y_z_iso_n = y_z_iso_n.reshape(-1, 1)

    X_z_iso_pi = featurise_mols(smiles_list_z_iso_pi, representation)
    X_e_iso_n = featurise_mols(smiles_list_e_iso_n, representation)
    X_z_iso_n = featurise_mols(smiles_list_z_iso_n, representation)

    output_dim = 4  # Number of outputs
    rank = 1  # Rank of W
    feature_dim = len(X_no_dft[0, :])

    tanimoto_active_dims = [i for i in range(feature_dim)]  # active dims for Tanimoto base kernel.

    mae_list = []
    dft_mae_list = []

    # We define the Gaussian Process optimisation objective
    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    print('\nBeginning training loop...')

    for i in range(len(y_with_dft)):

        X_train = np.delete(X_with_dft, i, axis=0)
        y_train = np.delete(y_with_dft, i)
        X_test = X_with_dft[i].reshape(1, -1)
        y_test = y_with_dft[i]
        dft_test = dft_vals[i]

        X_train = np.concatenate((X_train, X_no_dft))
        y_train = np.concatenate((y_train, y_no_dft))
        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)

        X_train = X_train.astype(np.float64)
        X_test = X_test.astype(np.float64)

        # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension
        X_augmented = np.vstack((np.append(X_train, np.zeros((len(X_train), 1)), axis=1),
                                 np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1),
                                 np.append(X_e_iso_n, np.ones((len(X_e_iso_n), 1)) * 2, axis=1),
                                 np.append(X_z_iso_n, np.ones((len(X_z_iso_n), 1)) * 3, axis=1)))

        X_test = np.append(X_test, np.zeros((len(X_test), 1)), axis=1)
        X_train = np.append(X_train, np.zeros((len(X_train), 1)), axis=1)

        # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods
        Y_augmented = np.vstack((np.hstack((y_train, np.zeros_like(y_train))),
                                 np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))),
                                 np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)),
                                 np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3))))

        y_test = np.hstack((y_test, np.zeros_like(y_test)))

        # Base kernel
        k = Tanimoto(active_dims=tanimoto_active_dims)
        # set_trainable(k.variance, False)

        # Coregion kernel
        coreg = gpflow.kernels.Coregion(output_dim=output_dim, rank=rank, active_dims=[feature_dim])

        # Create product kernel
        kern = k * coreg

        # This likelihood switches between Gaussian noise with different variances for each f_i:
        lik = gpflow.likelihoods.SwitchedLikelihood([gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian(),
                                                     gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian()])

        # now build the GP model as normal
        m = gpflow.models.VGP((X_augmented, Y_augmented), mean_function=Constant(np.mean(y_train[:, 0])),
                              kernel=kern, likelihood=lik)

        # fit the covariance function parameters
        maxiter = ci_niter(1000)
        gpflow.optimizers.Scipy().minimize(m.training_loss, m.trainable_variables,
                                           options=dict(maxiter=maxiter), method="L-BFGS-B")
        print_summary(m)

        # Output Standardised RMSE and RMSE on Train Set. Outputs are not standardised in the multitask setting,
        # so the two values coincide.
        y_pred_train, _ = m.predict_f(X_train)
        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
        print("Train RMSE: {:.3f}".format(train_rmse))

        # mean and variance GP prediction
        y_pred, y_var = m.predict_f(X_test)

        # Output MAE for this trial
        mae = abs(y_test[:, 0] - y_pred)
        print("MAE: {}".format(mae))

        # Store values in order to compute the mean and standard error of the statistics across trials
        mae_list.append(mae)

        # DFT prediction scores on the same trial
        dft_mae = abs(y_test[:, 0] - dft_test)
        dft_mae_list.append(dft_mae)

    mae_list = np.array(mae_list)
    dft_mae_list = np.array(dft_mae_list)

    print("\nmean GP-Tanimoto MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list),
                                                              np.std(mae_list) / np.sqrt(len(mae_list))))
    print("mean {} MAE: {:.4f} +- {:.4f}\n".format(theory_level, np.mean(dft_mae_list),
                                                   np.std(dft_mae_list) / np.sqrt(len(dft_mae_list))))
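
# Every script above relies on featurise_mols to map SMILES strings to fixed-length vectors. A minimal sketch of
# the 'fingerprints' branch using RDKit Morgan fingerprints is given below; the function name, radius and bit
# length are illustrative and may differ from the values used by the project's featuriser.

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem


def featurise_fingerprints_sketch(smiles_list, radius=3, n_bits=2048):
    # Build an (n_molecules, n_bits) array of Morgan fingerprint bit vectors.
    feats = np.zeros((len(smiles_list), n_bits))
    for i, smiles in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smiles)
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        arr = np.zeros((n_bits,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        feats[i, :] = arr
    return feats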