def initial_data_split(x, smiles_list, y, seed, split, test_size=0.2, init_size=0.25):
    """
    Splits data into initial/holdout/test sets.

    :param x: input features
    :param smiles_list: list of SMILES strings (used for scaffold splitting)
    :param y: output targets
    :param seed: random seed for the splits
    :param split: 'random' or 'scaffold'
    :param test_size: size of test set expressed as fraction of total data - 20% (Yao's paper)
    :param init_size: size of initial training subset as fraction of training set - 25% (Yao's paper)
    :return: X_init, y_init, X_holdout, y_holdout, X_test, y_test
    """
    if split == 'random':
        X_train, X_test, y_train, y_test = train_test_split(
            x, y, test_size=test_size, random_state=seed)
        X_holdout, X_init, y_holdout, y_init = train_test_split(
            X_train, y_train, test_size=init_size, random_state=seed)
    elif split == 'scaffold':
        train_ind, test_ind = scaffold_split(smiles_list, seed=seed)
        smiles_train = np.array(smiles_list)[train_ind]
        X_train, X_test = x[train_ind], x[test_ind]
        y_train, y_test = y[train_ind], y[test_ind]
        holdout_ind, init_ind = scaffold_split(
            smiles_train.tolist(), sizes=(1 - init_size, init_size), seed=seed)
        X_holdout, X_init = X_train[holdout_ind], X_train[init_ind]
        y_holdout, y_init = y_train[holdout_ind], y_train[init_ind]

    y_init = y_init.reshape(-1, 1)
    y_holdout = y_holdout.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    return X_init, y_init, X_holdout, y_holdout, X_test, y_test
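# Illustrative usage sketch (not from the original repo): shows how initial_data_split
# might be called. The SMILES strings, array shapes, and seed below are placeholders
# chosen only to make the call well-formed; it assumes numpy, train_test_split and
# scaffold_split are imported in this module as above.
if __name__ == '__main__':
    toy_smiles = ['CCO', 'CCN', 'c1ccccc1', 'CC(=O)O'] * 25   # 100 toy molecules
    toy_x = np.random.rand(100, 16)                           # hypothetical feature matrix
    toy_y = np.random.rand(100)                               # hypothetical targets

    X_init, y_init, X_holdout, y_holdout, X_test, y_test = initial_data_split(
        toy_x, toy_smiles, toy_y, seed=0, split='random')
    print(X_init.shape, X_holdout.shape, X_test.shape)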
def main(args):
    warnings.filterwarnings('ignore')

    print('\nTraining SOAP-GP on ' + args.task + ' dataset')
    print('\nGenerating features...')

    smiles_list, y = parse_dataset(args.task, subtask=args.subtask)
    X = np.arange(len(smiles_list)).reshape(-1, 1)  # array of data indices

    # Load the precomputed SOAP kernel matrix for this dataset/subtask
    if args.task != 'IC50':
        rem_mat = np.load(args.kernel_path + args.task + '_soap.npy')
    else:
        print('Subtask: {}'.format(args.subtask))
        rem_mat = np.load(args.kernel_path + args.subtask + '_soap.npy')

    r2_list = []
    rmse_list = []

    print('\nBeginning training loop...')
    j = 0
    for i in range(args.n_runs):
        if args.split == 'random':
            kf = KFold(n_splits=args.n_folds, random_state=i, shuffle=True)
            split_list = kf.split(X)
        elif args.split == 'scaffold':
            train_ind, test_ind = scaffold_split(smiles_list, seed=i)
            split_list = [(train_ind, test_ind)]
        for train_ind, test_ind in split_list:
            X_train, X_test = X[train_ind], X[test_ind]
            y_train, y_test = y[train_ind], y[test_ind]
            X_train = tf.convert_to_tensor(X_train, dtype=tf.float64)
            X_test = tf.convert_to_tensor(X_test, dtype=tf.float64)

            m = train_soapgp(X_train, y_train, rem_mat)

            # mean and variance of the GP prediction
            y_pred, y_var = m.predict_f(X_test)
            score = r2_score(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))

            print("\nR^2: {:.3f}".format(score))
            print("RMSE: {:.3f}".format(rmse))

            r2_list.append(score)
            rmse_list.append(rmse)

            #np.savetxt('results/soapgp_'+args.task+'_split_'+args.split+'_run_'+str(j)+'_ypred.txt', y_pred)
            #np.savetxt('results/soapgp_'+args.task+'_split_'+args.split+'_run_'+str(j)+'_ytest.txt', y_test)
            #np.savetxt('results/soapgp_'+args.task+'_split_'+args.split+'_run_'+str(j)+'_ystd.txt', np.sqrt(y_var))
            j += 1

    print("\nmean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list))))
    print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list))))
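# Illustrative sketch of the command-line interface this main() assumes. The argument
# names are inferred from the args.* attributes used above; the flags, defaults, and
# help strings are placeholders rather than the repository's exact values.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-task', type=str, required=True,
                        help='dataset name, e.g. IC50')
    parser.add_argument('-subtask', type=str, default=None,
                        help='subtask name (used when task is IC50)')
    parser.add_argument('-kernel_path', type=str, default='kernels/',
                        help='directory containing the precomputed SOAP kernel .npy files')
    parser.add_argument('-split', type=str, default='random',
                        help="'random' or 'scaffold'")
    parser.add_argument('-n_runs', type=int, default=3,
                        help='number of repeated runs')
    parser.add_argument('-n_folds', type=int, default=5,
                        help='number of folds for the random split')
    args = parser.parse_args()

    main(args)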
def main(args):
    warnings.filterwarnings('ignore')

    print('\nTraining ECFP-RF on ' + args.task + ' dataset')
    print('\nGenerating features...')

    if args.task == 'IC50':
        print('Subtask: {}'.format(args.subtask))
    smiles_list, y = parse_dataset(args.task, subtask=args.subtask)
    X = np.arange(len(smiles_list)).reshape(-1, 1)  # array of data indices

    r2_list = []
    rmse_list = []

    print('\nBeginning training loop...')
    j = 0
    for i in range(args.n_runs):
        if args.split == 'random':
            kf = KFold(n_splits=args.n_folds, random_state=i, shuffle=True)
            split_list = kf.split(X)
        elif args.split == 'scaffold':
            train_ind, test_ind = scaffold_split(smiles_list, seed=i)
            split_list = [(train_ind, test_ind)]
        for train_ind, test_ind in split_list:
            y_train, y_test = y[train_ind], y[test_ind]

            # Featurise the train/test SMILES with ECFP fingerprints
            smiles_df = pd.DataFrame(smiles_list, columns=['smiles'])
            train_smiles = smiles_df.iloc[train_ind]['smiles'].to_list()
            test_smiles = smiles_df.iloc[test_ind]['smiles'].to_list()
            X_train = np.asarray([generate_fingerprints(s) for s in train_smiles])
            X_test = np.asarray([generate_fingerprints(s) for s in test_smiles])

            m = fit_forest(X_train, y_train)
            y_pred = m.predict(X_test)

            score = r2_score(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))

            print("\nR^2: {:.3f}".format(score))
            print("RMSE: {:.3f}".format(rmse))

            r2_list.append(score)
            rmse_list.append(rmse)

            # np.savetxt('results/ecfp_'+args.task+'_split_'+args.split+'_run_'+str(j)+'_ypred.txt', y_pred)
            # np.savetxt('results/ecfp_'+args.task+'_split_'+args.split+'_run_'+str(j)+'_ytest.txt', y_test)
            j += 1

    print("\nmean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list))))
    print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list))))
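# generate_fingerprints and fit_forest are project helpers that are not shown in this
# listing. The sketch below is one plausible implementation, included only for context:
# it uses RDKit Morgan fingerprints and a scikit-learn random forest, and the radius,
# bit count, and forest hyperparameters are placeholders, not the project's actual values.
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestRegressor
import numpy as np


def generate_fingerprints(smiles, radius=2, n_bits=2048):
    """Morgan (ECFP-like) fingerprint bit vector for a single SMILES string."""
    mol = Chem.MolFromSmiles(smiles)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    arr = np.zeros((n_bits,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr


def fit_forest(X_train, y_train):
    """Fit a random forest regressor on fingerprint features."""
    model = RandomForestRegressor(n_estimators=500, random_state=0)
    model.fit(X_train, np.ravel(y_train))
    return model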
def main(task, split, n_runs, n_fold, n_bits):
    global rem_mat, rem_diag, max_rem
    warnings.filterwarnings('ignore')

    print('\nTraining E3FP-GP on ' + task + ' dataset')
    print('\nGenerating features...')

    if task in PATHS:
        smiles_list, y = parse_dataset(task, PATHS[task])  # NEED TO FIX MALARIA
        X = np.arange(len(smiles_list)).reshape(-1, 1)
    else:
        raise Exception('Must provide dataset')

    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    print('\nBeginning training loop...')

    if n_bits == -1:
        bit_list = [512, 1024, 2048, 4096, 8192]
    else:
        bit_list = [n_bits]

    for bits in bit_list:
        r2_list = []
        rmse_list = []
        logP_list = []
        j = 0
        for i in range(n_runs):
            if split == 'random':
                kf = KFold(n_splits=n_fold, random_state=i, shuffle=True)
                split_list = kf.split(X)
            elif split == 'scaffold':
                train_ind, test_ind = scaffold_split(smiles_list, seed=i)
                split_list = [(train_ind, test_ind)]
            for train_ind, test_ind in split_list:
                X_train, X_test = X[train_ind], X[test_ind]
                y_train, y_test = y[train_ind], y[test_ind]
                y_train, y_test, y_scaler = transform_data(y_train, y_test)

                X_train = tf.convert_to_tensor(X_train, dtype=tf.float64)
                X_test = tf.convert_to_tensor(X_test, dtype=tf.float64)

                # Load the precomputed E3FP similarity matrix and its diagonal
                #rem_mat = np.load('kernels/'+task+'_ecfp_'+str(bits)+'.npy')
                rem_mat = np.load('/rds-d2/user/wjm41/hpc-work/kernels/e3fp/' + task + '_e3fp.npy')
                rem_diag = tf.constant(np.diag(rem_mat), dtype=tf.float64)
                rem_mat = tf.constant(rem_mat, dtype=tf.float64)

                from gpflow.utilities import positive

                class Matern32_rem(gpflow.kernels.Kernel):
                    """Matern-3/2 kernel evaluated on precomputed similarities,
                    looked up via the integer data indices passed in as GP inputs."""

                    def __init__(self):
                        super().__init__(active_dims=[0])
                        self.var = gpflow.Parameter(1.0, transform=positive())
                        self.mag = gpflow.Parameter(1.0, transform=positive())

                    def K(self, X, X2=None, presliced=None):
                        global rem_mat
                        if X2 is None:
                            X2 = X
                        A = tf.cast(X, tf.int32)
                        A = tf.reshape(A, [-1])
                        A2 = tf.reshape(X2, [-1])
                        A2 = tf.cast(A2, tf.int32)
                        K_mat = tf.gather(rem_mat, A, axis=0)
                        K_mat = tf.gather(K_mat, A2, axis=1)
                        z = tf.math.sqrt(3 * K_mat) * self.var
                        K_final = self.mag * (1 + z) * tf.math.exp(-z)
                        return K_final

                    def K_diag(self, X, presliced=None):
                        global rem_diag
                        A = tf.cast(X, tf.int32)
                        K_diag = tf.gather_nd(rem_diag, A)
                        z = tf.math.sqrt(3 * K_diag) * self.var
                        return self.mag * (1 + z) * tf.math.exp(-z)

                k = Matern32_rem() + gpflow.kernels.White(0.1)
                m = gpflow.models.GPR(data=(X_train, y_train), kernel=k)

                opt = gpflow.optimizers.Scipy()
                opt_logs = opt.minimize(objective_closure, m.trainable_variables,
                                        options=dict(maxiter=10000))
                #print_summary(m)

                y_pred, y_var = m.predict_f(X_test)
                y_pred = y_scaler.inverse_transform(y_pred)
                y_test = y_scaler.inverse_transform(y_test)
                y_var = y_scaler.var_ * y_var

                score = r2_score(y_test, y_pred)
                rmse = np.sqrt(mean_squared_error(y_test, y_pred))
                logP = -m.log_likelihood()

                #print("\nR^2: {:.3f}".format(score))
                #print("RMSE: {:.3f}".format(rmse))
                #print("-ve logP: {:.3f}".format(logP))

                r2_list.append(score)
                rmse_list.append(rmse)
                logP_list.append(logP)

                np.savetxt('results/e3fp_'+task+'_split_'+split+'_run_'+str(j)+'_ypred.txt', y_pred)
                np.savetxt('results/e3fp_'+task+'_split_'+split+'_run_'+str(j)+'_ytest.txt', y_test)
                np.savetxt('results/e3fp_'+task+'_split_'+split+'_run_'+str(j)+'_ystd.txt', np.sqrt(y_var))
                j += 1

        r2_list = np.array(r2_list)
        rmse_list = np.array(rmse_list)
        logP_list = np.array(logP_list)

        print("\nbits: {}".format(bits))
        print("mean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list))))
        print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list))))
        print("mean -ve logP: {:.4f} +- {:.4f}\n".format(np.mean(logP_list), np.std(logP_list) / np.sqrt(len(logP_list))))
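# Minimal standalone sketch of the indexing trick used by Matern32_rem above: the GP
# "inputs" are integer row indices into a precomputed similarity matrix, and the kernel
# simply gathers the corresponding block. All values below are toy data, not the
# project's E3FP/SOAP kernels.
import numpy as np
import tensorflow as tf

sim = np.random.rand(5, 5)
sim = (sim + sim.T) / 2                         # toy symmetric similarity matrix
sim_tf = tf.constant(sim, dtype=tf.float64)

idx_a = tf.constant([[0.], [2.]], dtype=tf.float64)        # indices of the first molecule set
idx_b = tf.constant([[1.], [3.], [4.]], dtype=tf.float64)  # indices of the second molecule set

A = tf.reshape(tf.cast(idx_a, tf.int32), [-1])
B = tf.reshape(tf.cast(idx_b, tf.int32), [-1])
block = tf.gather(tf.gather(sim_tf, A, axis=0), B, axis=1)  # (2, 3) kernel block
print(block.numpy())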