# Data-collection loop: on every iteration a sparse GP is trained on the
# current training set, written to disk, read back, and scored on the test
# and training sets.
for iteration in range(num_iters):

    # We fit the GP
    # NOTE(review): M is presumably the number of inducing points and the
    # `0 * X` arguments presumably input variances fixed to zero -- confirm
    # against the SparseGP implementation.
    M = 500
    sgp = SparseGP(X_train, 0 * X_train, y_train, M)
    sgp.train_via_ADAM(X_train, 0 * X_train, y_train, X_test, X_test * 0, y_test, minibatch_size=10 * M, max_iterations=45, learning_rate=0.005)
    save_object(sgp, "results_QED_solo/sgp{}.dat".format(iteration))

    # We load the saved gp (the model that was written to disk just above)
    sgp = load_object("results_QED_solo/sgp{}.dat".format(iteration))

    # Test-set metrics: RMSE of the predictive mean, and the mean Gaussian
    # log-density of the residuals with the predictive variance as scale**2.
    pred, uncert = sgp.predict(X_test, 0 * X_test)
    error = np.sqrt(np.mean((pred - y_test)**2))
    testll = np.mean(sps.norm.logpdf(pred - y_test, scale=np.sqrt(uncert)))
    print('Test RMSE: ', error)
    print('Test ll: ', testll)

    # Training-set RMSE (overwrites `error`; it is not printed within the
    # visible part of this loop).
    pred, uncert = sgp.predict(X_train, 0 * X_train)
    error = np.sqrt(np.mean((pred - y_train)**2))
h = 0 for points in set_store: if h == 0: X_con_tr = points h += 1 else: X_con_tr = np.concatenate((X_con_tr, points)) # Save the latent points and corresponding labels (0 for negative case) num_examples = X_con_tr.shape[0] y_con_tr = np.zeros([num_examples]) save_object(X_con_tr, 'train_test_sets/N_40000_Samples/40000_Neg_X_con_tr_{}.dat'.format(criterion)) save_object(y_con_tr, 'train_test_sets/N_40000_Samples/40000_Neg_y_con_tr_{}.dat'.format(criterion)) save_object(num_examples, 'train_test_sets/N_40000_Samples/40000_num_examples_{}.dat'.format(criterion)) if i == 1: criterion += 5 else: criterion += 10 i += 1 # We collect the positive class latent points o = 1 criterion = 5
# Data-collection loop: on every iteration a sparse GP is trained on the
# current training set, written to disk, read back, and scored on the test
# and training sets.
for iteration in range(num_iters):

    # We fit the GP
    # NOTE(review): M is presumably the number of inducing points and the
    # `0 * X` arguments presumably input variances fixed to zero -- confirm
    # against the SparseGP implementation.
    M = 500
    sgp = SparseGP(X_train, 0 * X_train, y_train, M)
    sgp.train_via_ADAM(X_train, 0 * X_train, y_train, X_test, X_test * 0, y_test, minibatch_size=10 * M, max_iterations=50, learning_rate=0.005)
    save_object(sgp, "results_logP/sgp{}.dat".format(iteration))

    # We load the saved gp (the model that was written to disk just above)
    sgp = load_object("results_logP/sgp{}.dat".format(iteration))

    # Test-set metrics: RMSE of the predictive mean, and the mean Gaussian
    # log-density of the residuals with the predictive variance as scale**2.
    pred, uncert = sgp.predict(X_test, 0 * X_test)
    error = np.sqrt(np.mean((pred - y_test)**2))
    testll = np.mean(sps.norm.logpdf(pred - y_test, scale=np.sqrt(uncert)))
    print('Test RMSE: ', error)
    print('Test ll: ', testll)

    # Training-set RMSE (overwrites `error`; it is not printed within the
    # visible part of this loop).
    pred, uncert = sgp.predict(X_train, 0 * X_train)
    error = np.sqrt(np.mean((pred - y_train)**2))
# Data-collection loop: on every iteration a sparse GP is trained on the
# current training set, written to disk, read back, and scored on the test
# and training sets.
for iteration in range(num_iters):

    # We fit the GP
    # NOTE(review): M is presumably the number of inducing points and the
    # `0 * X` arguments presumably input variances fixed to zero -- confirm
    # against the SparseGP implementation.
    M = 500
    sgp = SparseGP(X_train, 0 * X_train, y_train, M)
    sgp.train_via_ADAM(X_train, 0 * X_train, y_train, X_test, X_test * 0, y_test, minibatch_size=10 * M, max_iterations=50, learning_rate=0.005)
    save_object(sgp, "results_QED_comp/sgp{}.dat".format(iteration))

    # We load the saved gp (the model that was written to disk just above)
    sgp = load_object("results_QED_comp/sgp{}.dat".format(iteration))

    # Test-set metrics: RMSE of the predictive mean, and the mean Gaussian
    # log-density of the residuals with the predictive variance as scale**2.
    pred, uncert = sgp.predict(X_test, 0 * X_test)
    error = np.sqrt(np.mean((pred - y_test)**2))
    testll = np.mean(sps.norm.logpdf(pred - y_test, scale=np.sqrt(uncert)))
    print('Test RMSE: ', error)
    print('Test ll: ', testll)

    # Training-set RMSE (overwrites `error`; it is not printed within the
    # visible part of this loop).
    pred, uncert = sgp.predict(X_train, 0 * X_train)
    error = np.sqrt(np.mean((pred - y_train)**2))
def make_training_data(X_train, num_valid_decodings, start_count, end_count,
                       latent_dim=56):
    """
    Build and save a binary-classification training set (valid / invalid)
    from the per-folder validity labels stored under ``Collated_Data/P<i>/``.

    For every folder index ``i`` the label file
    ``Collated_Data/P<i>/y_con_<num_valid_decodings>.dat`` is loaded, the rows
    of ``X_train`` are split by label (1 = positive, 0 = negative), and the
    per-folder pieces are stacked. The stacked features, labels and example
    counts are then written under ``train_test_sets/Train_Samples/``.

    :param X_train: Latent features of the training data for the autoencoder,
        indexable as a 2-D array of shape (num_points, latent_dim).
    :param num_valid_decodings: threshold identifier used to pick the label
        file (``y_con_<num_valid_decodings>.dat``).
    :param start_count: the index of the first data folder to look through.
    :param end_count: one past the index of the last data folder to look
        through (the folders visited are ``range(start_count, end_count)``).
    :param latent_dim: expected width of a latent vector; defaults to 56,
        the value that was previously hard-coded.
    :raises ValueError: if ``range(start_count, end_count)`` is empty, since
        there would be nothing to save.
    :return: None. The results are written to disk.
    """

    validity_criterion_string = 'y_con_{}.dat'.format(num_valid_decodings)
    X_train = np.asarray(X_train)

    # Per-folder pieces are collected in lists and concatenated once at the
    # end, which avoids re-copying the accumulated arrays on every folder.
    pos_feature_chunks = []
    neg_feature_chunks = []
    pos_label_chunks = []
    neg_label_chunks = []

    for i in range(start_count, end_count):

        labels = np.asarray(load_object(
            'Collated_Data/P{}/{}'.format(i, validity_criterion_string)))
        num_labels = len(labels)

        pos_indices = np.flatnonzero(labels == 1)
        neg_indices = np.flatnonzero(labels == 0)
        num_pos_labels = len(pos_indices)
        num_neg_labels = len(neg_indices)

        # Every label must be exactly 0 or 1.
        assert num_pos_labels + num_neg_labels == num_labels

        # NOTE(review): the label positions of every folder index X_train
        # from row 0; confirm that the caller passes the slice of latent
        # points that corresponds to the folders being read.
        # Integer-array indexing keeps the (n, latent_dim) shape even when
        # n == 0, so a folder without positives (or negatives) no longer
        # trips the shape assertions below.
        X_con_tr_pos = X_train[pos_indices]
        X_con_tr_neg = X_train[neg_indices]

        assert X_con_tr_pos.shape == (num_pos_labels, latent_dim)
        assert X_con_tr_neg.shape == (num_neg_labels, latent_dim)

        pos_feature_chunks.append(X_con_tr_pos)
        neg_feature_chunks.append(X_con_tr_neg)
        pos_label_chunks.append(np.ones([num_pos_labels]))
        neg_label_chunks.append(np.zeros([num_neg_labels]))

    if not pos_feature_chunks:
        raise ValueError(
            'no data folders to read: range({}, {}) is empty'.format(
                start_count, end_count))

    X_con_tr_full_pos = np.concatenate(pos_feature_chunks)
    X_con_tr_full_neg = np.concatenate(neg_feature_chunks)
    y_con_tr_full_pos = np.concatenate(pos_label_chunks)
    y_con_tr_full_neg = np.concatenate(neg_label_chunks)

    num_pos_examples = X_con_tr_full_pos.shape[0]
    num_neg_examples = X_con_tr_full_neg.shape[0]

    positive_dir = 'train_test_sets/Train_Samples/Positive_Latents/'
    negative_dir = 'train_test_sets/Train_Samples/Negative_Latents/'

    save_object(X_con_tr_full_pos,
                positive_dir + 'X_con_tr_pos{}.dat'.format(num_valid_decodings))
    save_object(y_con_tr_full_pos,
                positive_dir + 'Y_con_tr_pos{}.dat'.format(num_valid_decodings))
    save_object(num_pos_examples,
                positive_dir + 'num_pos_examples{}.dat'.format(num_valid_decodings))

    save_object(X_con_tr_full_neg,
                negative_dir + 'X_con_tr_neg{}.dat'.format(num_valid_decodings))
    save_object(y_con_tr_full_neg,
                negative_dir + 'Y_con_tr_neg{}.dat'.format(num_valid_decodings))
    # NOTE(review): the negative count is stored under the name
    # "num_pos_examples". The file name is kept as-is because readers
    # elsewhere may load this exact path; rename both sides together if that
    # is a copy-paste slip.
    save_object(num_neg_examples,
                negative_dir + 'num_pos_examples{}.dat'.format(num_valid_decodings))

    return None
def best_so_far(results_directory, num_iterations):
    """
    Produce two plots from the per-iteration result files found in
    ``results_directory``:

    1) the running best value as a function of the iteration number, starting
       at the first iteration whose first collected point is feasible;
    2) a scatterplot of every collected data point.

    The iteration axis and the running-best values are also saved as
    ``iterations.dat`` and ``best_vals.dat``.

    :param results_directory: directory holding the ``scores<i>.dat``,
        ``con_scores<i>.dat`` and ``next_inputs<i>.dat`` files; the plots are
        saved to the same directory.
    :param num_iterations: the number of data collection iterations to read.
    """

    running_best = []

    # coordinates of collected data points
    collected_x1 = []
    collected_x2 = []

    for step in range(num_iterations):

        # Best score among the evaluations made at this iteration
        scores = load_object(results_directory +
                             "/scores{}.dat".format(step))
        step_best = min(scores)

        con_scores = load_object(results_directory +
                                 "/con_scores{}.dat".format(step))
        first_point_feasible = con_scores[0] == 1

        if not running_best:
            # Nothing is recorded until the first collected point of some
            # iteration is feasible; that iteration seeds the curve.
            if first_point_feasible:
                running_best.append(step_best[0])
        else:
            # NOTE(review): from here on feasibility is no longer checked,
            # so an infeasible point can lower the running best.
            incumbent = min(running_best)
            if step_best[0] < incumbent:
                running_best.append(step_best[0])
            else:
                running_best.append(incumbent)

        # We collect the data points for plotting
        for point in load_object(results_directory +
                                 "/next_inputs{}.dat".format(step)):
            collected_x1.append(point[0])
            collected_x2.append(point[1])

    # The curve covers the last len(running_best) iterations.
    first_plotted = (num_iterations - len(running_best)) + 1
    iterations = range(first_plotted, num_iterations + 1)

    # We plot the best value obtained so far as a function of iterations
    figure = plt.figure(2)
    axes = figure.gca()
    # force axis ticks to be integers
    axes.get_xaxis().set_major_locator(MaxNLocator(integer=True))
    axes.get_yaxis().set_major_locator(MaxNLocator(integer=True))
    plt.xlim(first_plotted, num_iterations)
    plt.xlabel('Function Evaluations')
    plt.ylabel('Best Feasible Value')
    plt.plot(iterations, running_best)
    pylab.savefig(results_directory + "/best_so_far.png")
    plt.close()

    save_object(iterations, results_directory + "/iterations.dat")
    save_object(running_best, results_directory + "/best_vals.dat")

    # We plot the data points collected
    plt.figure(3)
    plt.title('Data Points Collected')
    plt.gca().set_aspect('equal')
    plt.xlim(-5, 10)
    plt.ylim(0, 15)
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.scatter(collected_x1, collected_x2)
    pylab.savefig(results_directory + "/data_collected.png")
    plt.close()
def main(input_directory, output_directory):
    """
    Run the random-sampling data collection experiment on the constrained
    Branin-Hoo problem.

    On every iteration a sparse GP is fitted to the objective values and a
    BNN (BB_alpha) to the constraint labels, both models are saved, one
    uniformly random input is drawn and evaluated, and the training set is
    extended with it. After the last iteration the summary plots are
    produced.

    :param input_directory: directory to which the output of Branin_Sampler.py was saved.
    :param output_directory: directory in which to save the plots.
    :raises ValueError: if the initial training set has no feasible point.
    """

    np.random.seed(2)

    # Load the dataset

    X_bran = genfromtxt(input_directory + '/inputs.csv', delimiter=',', dtype='float32')
    y_con = genfromtxt(input_directory + '/constraint_targets.csv', delimiter=',', dtype='int')
    y_reg = genfromtxt(input_directory + '/branin_targets.csv', delimiter=',', dtype='float32')
    y_reg = y_reg.reshape((-1, 1))

    # We convert constraint targets from one-hot to categorical
    # (1 where the first one-hot column is set, 0 otherwise).
    y_con = (y_con[:, 0] == 1).astype(int)

    n_bran = X_bran.shape[0]  # number of examples

    permutation = np.random.choice(n_bran, n_bran, replace=False)  # We shuffle the data

    # Split boundaries on the shuffled data. The builtin int() replaces
    # np.int, which was removed from NumPy in release 1.24.
    train_start = 40
    test_start = int(np.round(0.8 * n_bran))
    split_end = int(np.round(0.9 * n_bran))

    # NOTE(review): the test rows [test_start, split_end) are also contained
    # in the training rows [train_start, split_end) whenever
    # test_start >= train_start, so the reported test metrics are not
    # computed on held-out data. Left unchanged to keep results reproducible.
    X_shuffled = X_bran[permutation, :]
    y_reg_shuffled = y_reg[permutation]
    y_con_shuffled = y_con[permutation]

    X_tr_bran = X_shuffled[train_start:split_end, :]
    X_te_bran = X_shuffled[test_start:split_end, :]
    y_tr_reg = y_reg_shuffled[train_start:split_end]
    y_te_reg = y_reg_shuffled[test_start:split_end]
    y_tr_con = y_con_shuffled[train_start:split_end]
    y_te_con = y_con_shuffled[test_start:split_end]

    # We plot the data used to initialise the surrogate model

    X1 = X_tr_bran[:, 0]
    X2 = X_tr_bran[:, 1]

    save_object(X1, output_directory + "/X1.dat")
    save_object(X2, output_directory + "/X2.dat")

    # We store the best feasible value found in the training set for reference

    feasible_vals = [[branin(tuple(x))]
                     for x, feasible in zip(X_tr_bran, y_tr_con)
                     if feasible != 0]
    if not feasible_vals:
        raise ValueError('the initial training set contains no feasible point')

    best_tr = min(feasible_vals)
    best_tr = best_tr[0]

    save_object(best_tr, output_directory + "/best_feasible_training_point.dat")

    # We set the number of data collection iterations

    num_iters = 4

    for iteration in range(num_iters):

        # We train the regression model

        # We fit the GP

        M = 20
        sgp = SparseGP(X_tr_bran, 0 * X_tr_bran, y_tr_reg, M)
        sgp.train_via_ADAM(X_tr_bran, 0 * X_tr_bran, y_tr_reg, X_te_bran,
                           X_te_bran * 0, y_te_reg, minibatch_size=M,
                           max_iterations=400, learning_rate=0.005)

        save_object(sgp, output_directory + "/sgp{}.dat".format(iteration))

        # We load the saved gp (the model written just above)

        sgp = load_object(output_directory + "/sgp{}.dat".format(iteration))

        # Test-set and training-set metrics of the GP

        pred, uncert = sgp.predict(X_te_bran, 0 * X_te_bran)
        error = np.sqrt(np.mean((pred - y_te_reg)**2))
        testll = np.mean(
            sps.norm.logpdf(pred - y_te_reg, scale=np.sqrt(uncert)))
        print('Test RMSE: ', error)
        print('Test ll: ', testll)

        pred, uncert = sgp.predict(X_tr_bran, 0 * X_tr_bran)
        error = np.sqrt(np.mean((pred - y_tr_reg)**2))
        trainll = np.mean(
            sps.norm.logpdf(pred - y_tr_reg, scale=np.sqrt(uncert)))
        print('Train RMSE: ', error)
        print('Train ll: ', trainll)

        # we train the constraint network

        # Fixed seed so that BNN training is repeatable across iterations

        seed = 1
        np.random.seed(seed)

        # We load the data

        datasets, n, d, n_labels = load_data(X_tr_bran, y_tr_con, X_te_bran, y_te_con)

        train_set_x, train_set_y = datasets[0]
        test_set_x, test_set_y = datasets[1]

        N_train = train_set_x.get_value(borrow=True).shape[0]
        N_test = test_set_x.get_value(borrow=True).shape[0]
        layer_sizes = [d, 50, n_labels]
        n_samples = 50
        alpha = 0.5
        learning_rate = 0.001
        v_prior = 1.0
        batch_size = 10
        print('... building model')
        sys.stdout.flush()
        bb_alpha = BB_alpha(layer_sizes, n_samples, alpha, learning_rate,
                            v_prior, batch_size, train_set_x, train_set_y,
                            N_train, test_set_x, test_set_y, N_test)
        print('... training')
        sys.stdout.flush()

        test_error, test_ll = bb_alpha.train(400)

        # We save the trained BNN

        sys.setrecursionlimit(4000)  # Required to save the BNN

        save_object(bb_alpha, output_directory + "/bb_alpha{}.dat".format(iteration))

        # We pick the next num_inputs inputs by uniform random sampling.
        # Re-seeding without an argument undoes the fixed seed set above so
        # that the sampled inputs differ between iterations.

        np.random.seed()

        num_inputs = 1

        x1 = np.random.uniform(-5, 10, size=num_inputs)
        x2 = np.random.uniform(0, 15, size=num_inputs)
        random_inputs = np.zeros([num_inputs, 2])
        random_inputs[:, 0] = x1
        random_inputs[:, 1] = x2

        reg_scores = []  # collect y-values for Branin-Hoo function
        con_scores = []  # collect y-values for Constraint function
        probs = []  # collect the probabilities of satisfying the constraint
        log_probs = []  # collect the log probabilities of satisfying the constraint

        for i in range(random_inputs.shape[0]):

            reg_scores.append([branin(tuple(random_inputs[i]))])

            # Feasible region: disc of squared radius 50 centred at (2.5, 7.5)
            if (random_inputs[i][0] - 2.5)**2 + (random_inputs[i][1] - 7.5)**2 <= 50:
                con_scores.append(np.int64(1))
            else:
                con_scores.append(np.int64(0))

            probs.append(
                bb_alpha.prediction_probs(random_inputs[i].reshape(1, d))[0][0][1])
            log_probs.append(
                bb_alpha.pred_log_probs(random_inputs[i].reshape(1, d))[0][0][1])

            print(i)

        # print the value of the Branin-Hoo function at the data points we have acquired

        print(reg_scores)

        # save y-values and (x1,x2)-coordinates of locations chosen for evaluation

        save_object(reg_scores, output_directory + "/scores{}.dat".format(iteration))
        save_object(random_inputs, output_directory + "/next_inputs{}.dat".format(iteration))
        save_object(con_scores, output_directory + "/con_scores{}.dat".format(iteration))
        save_object(probs, output_directory + "/probs{}.dat".format(iteration))
        save_object(log_probs, output_directory + "/log_probs{}.dat".format(iteration))

        # extend labelled training data for next cycle

        X_tr_bran = np.concatenate([X_tr_bran, random_inputs], 0)
        y_tr_reg = np.concatenate([y_tr_reg, np.array(reg_scores)], 0)
        y_tr_con = np.concatenate([y_tr_con, np.array(con_scores)], 0)

    # Plot the best point as a function of the data collection iteration number
    best_so_far(output_directory, num_iters)
    # Plot the contours of the GP regression model
    GP_contours(output_directory, num_iters)
    # Plot the contours of the BNN constraint model
    BNN_contours(output_directory, num_iters)
    # Plot the data used to initialise the model
    initial_data(output_directory)
# molecules, 0 otherwise) y_con = np.zeros([X_latent_with_small_noise.shape[0], ]) for i in range(m): # decode the noisy latent data points to SMILES strings sampler_out = postprocessor.ls_to_smiles([X_latent_with_small_noise[i: (i + 1), :]], decode_attempts, decode_attempts, ) rdmols, valid_smiles, all_smiles, output_reps, distances = sampler_out valid_sens_smiles = [x for x in valid_smiles if len(x) > 5] num_sensible_and_long = sum([all_smiles.count(x) for x in valid_sens_smiles]) num_valid_and_long.append(num_sensible_and_long) valid_smiles_final.append(valid_sens_smiles) if num_sensible_and_long > decode_attempts / 5.0: y_con[i] = 1.0 else: y_con[i] = 0.0 print(i) save_object(y_con, "Small_Noise/y.dat") save_object(X_latent_with_small_noise, "Small_Noise/X.dat") print (datetime.now() - startTime)
else: y_con_20[i] = 0.0 else: y_con_10[i] = 0.0 else: y_con_5[i] = 0.0 print(i) # We save the labels # See the directory Collated_Data/Data_Lexicon for a mapping between the training set indices and the directory number # e.g. the directory number P19 corresponds to indices 168000-173000. # The script make_training_data.py expects to see the directories in the P1 through P19 format. # Create new directories P1, P2, ..., P18 in the Collated_Data folder as required save_object(num_sensible_per_point, "Collated_Data/P19/num_sensible_per_point.dat") save_object(valid_smiles_final, "Collated_Data/P19/valid_smiles.dat") save_object(all_smiles_final, "Collated_Data/P19/all_smiles.dat") save_object(y_con_5, "Collated_Data/P19/y_con_5.dat") save_object(y_con_10, "Collated_Data/P19/y_con_10.dat") save_object(y_con_20, "Collated_Data/P19/y_con_20.dat") save_object(y_con_30, "Collated_Data/P19/y_con_30.dat") save_object(y_con_40, "Collated_Data/P19/y_con_40.dat") save_object(y_con_50, "Collated_Data/P19/y_con_50.dat") save_object(y_con_60, "Collated_Data/P19/y_con_60.dat") save_object(y_con_70, "Collated_Data/P19/y_con_70.dat") save_object(y_con_80, "Collated_Data/P19/y_con_80.dat") save_object(y_con_90, "Collated_Data/P19/y_con_90.dat") print(datetime.now() - startTime)