def initial_data(results_directory):
    """
    Produces a scatterplot of the data used to initialise the surrogate model.

    Loads ``X1.dat`` and ``X2.dat`` from ``results_directory`` and saves the
    scatterplot as ``initial_data.png`` in that same directory.

    :param results_directory: the directory containing the (x1, x2) data.
    """

    X1 = load_object(results_directory + "/X1.dat")
    X2 = load_object(results_directory + "/X2.dat")

    plt.figure(1)
    plt.title('Initial Data')
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.gca().set_aspect('equal', adjustable='box')
    plt.scatter(X1, X2)
    pylab.savefig(results_directory + "/initial_data.png")

    # Close the figure once it is on disk, as the other plotting functions in
    # this file do. Otherwise figure 1 stays registered with pyplot and a
    # later call would draw on top of the old scatterplot.
    plt.close()
def BNN_contours(results_directory, num_iterations):
    """
    Function that plots:

        1) The positive class probabilities of the BNN logistic regression model

    The model saved as ``bb_alpha{num_iterations - 1}.dat`` is evaluated on a
    regular grid over [-5, 10) x [0, 15) and the filled contour plot is saved
    as ``constraint_contour.png``.

    :param results_directory: the directory in which plots are saved.
    :param num_iterations: the number of iterations for which data collection is carried out.
    """
    import sys

    # We load the saved BNN logistic regression model

    bnn = load_object(results_directory +
                      "/bb_alpha{}.dat".format(num_iterations - 1))

    # We prepare the contour grid

    delta = 0.05  # grid spacing
    x = np.arange(-5.0, 10.0, delta)
    y = np.arange(0.0, 15.0, delta)
    X, Y = np.meshgrid(x, y)

    # We flatten the meshgrid into an (n_points, 2) array of (x1, x2) rows.
    # Both coordinates are flattened in the same row-major order, so row
    # k = i * len(x) + j holds the point (x[j], y[i]).

    reshaped_grid = np.column_stack((X.ravel(), Y.ravel()))

    # We plot the constraint probabilities of the BNN logistic regression model
    # which should resemble the disk constraint

    sys.stdout.flush()  # push out any buffered output before predicting
    probs_array = bnn.prediction_probs(reshaped_grid)
    positive_class_probs = probs_array[0][:, 1]

    # contourf expects Z[i, j] to be the value at (x[j], y[i]), i.e. an array
    # of shape (len(y), len(x)); this undoes the row-major flattening above.
    constraint = positive_class_probs.reshape(len(y), len(x))

    plt.figure(6)
    plt.gca().set_aspect('equal', adjustable='box')
    Css = plt.contourf(x, y, constraint, np.arange(0, 1, .1), extend='both')
    plt.colorbar(Css, shrink=0.8, extend='both')

    # Draw on the Axes that already holds the contours. Calling the no-argument
    # axes() function here may add a new, empty Axes on top of the plot.
    axes = plt.gca()

    # presumably marks the point (pi, 2.275) with a red polygon -- TODO confirm
    # against my_polygon_scatter
    my_polygon_scatter(axes, [np.pi], [2.275],
                       radius=.5,
                       resolution=3,
                       alpha=.5,
                       color='r')

    pylab.savefig(results_directory + "/constraint_contour.png")
    plt.close()
# Example #3
# 0
    crit_string = 'y_con_{}.dat'.format(criterion)

    # list for storing latent points

    set_store = []

    # set start and end_count according to numbers in the Collated data folder

    start_count = 1
    end_count = 6

    for n in range(start_count, end_count):

        # open the class labels for the sampled points

        labels = load_object('Collated_Data/N{}/'.format(n) + '{}'.format(crit_string))
        length = len(labels)

        # Extract the indices that correspond to negative class

        neg_indexes = [k for k in range(length) if labels[k] == 0]

        # store the number of data points negative class labels in the sample

        num_negs = len(neg_indexes)

        # retrieve a list of latent points for which the class assigned was negative

        list_latent_points = [x_samp[m] for m in neg_indexes]

        # extract the points from the list and convert to a numpy array
# Example #4
# 0
import scipy.stats as sps
from Constrained_BO.Chemical_Design.autoencoder.latent_space import encode_decode as lasp
from Constrained_BO.Chemical_Design.qed import qed
from rdkit.Chem import MolFromSmiles

from Constrained_BO.black_box_alpha import BB_alpha
from sparse_gp import SparseGP
from Constrained_BO.utils import load_object, save_object, load_data

# Fix the NumPy seed so that the permutations drawn below are reproducible.
np.random.seed(1)

# We load the data

# Classification data: latent features (X) and class labels (y), stored
# separately for the positive and the negative class.

X_tr_pos_con = load_object(
    '../train_test_sets/Train_Samples/Positive_Latents/X_con_tr_pos20.dat')
X_tr_neg_con = load_object(
    '../train_test_sets/Train_Samples/Negative_Latents/X_con_tr_neg20.dat')
y_tr_pos_con = load_object(
    '../train_test_sets/Train_Samples/Positive_Latents/y_con_tr_pos20.dat')
y_tr_neg_con = load_object(
    '../train_test_sets/Train_Samples/Negative_Latents/y_con_tr_neg20.dat')

# Balance the number of samples from each class

# Drawing m indices out of m without replacement is a random permutation of
# the row indices; one permutation is drawn per class.
m = X_tr_pos_con.shape[0]
permute_pos = np.random.choice(m, m, replace=False)
n = X_tr_neg_con.shape[0]
permute_neg = np.random.choice(n, n, replace=False)

# Shuffle the rows of the positive-class features.
X_tr_pos_con = X_tr_pos_con[permute_pos, :]
    M = 500
    sgp = SparseGP(X_train, 0 * X_train, y_train, M)
    sgp.train_via_ADAM(X_train,
                       0 * X_train,
                       y_train,
                       X_test,
                       X_test * 0,
                       y_test,
                       minibatch_size=10 * M,
                       max_iterations=50,
                       learning_rate=0.005)
    save_object(sgp, "results_QED_comp/sgp{}.dat".format(iteration))

    # We load the saved gp

    sgp = load_object("results_QED_comp/sgp{}.dat".format(iteration))

    # We load some previous trained gp

    pred, uncert = sgp.predict(X_test, 0 * X_test)
    error = np.sqrt(np.mean((pred - y_test)**2))
    testll = np.mean(sps.norm.logpdf(pred - y_test, scale=np.sqrt(uncert)))
    print('Test RMSE: ', error)
    print('Test ll: ', testll)

    pred, uncert = sgp.predict(X_train, 0 * X_train)
    error = np.sqrt(np.mean((pred - y_train)**2))
    trainll = np.mean(sps.norm.logpdf(pred - y_train, scale=np.sqrt(uncert)))
    print('Train RMSE: ', error)
    print('Train ll: ', trainll)
# Example #6
# 0
    M = 500
    sgp = SparseGP(X_train, 0 * X_train, y_train, M)
    sgp.train_via_ADAM(X_train,
                       0 * X_train,
                       y_train,
                       X_test,
                       X_test * 0,
                       y_test,
                       minibatch_size=10 * M,
                       max_iterations=50,
                       learning_rate=0.005)
    save_object(sgp, "results_logP/sgp{}.dat".format(iteration))

    # We load the saved gp

    sgp = load_object("results_logP/sgp{}.dat".format(iteration))

    # We load some previous trained gp

    pred, uncert = sgp.predict(X_test, 0 * X_test)
    error = np.sqrt(np.mean((pred - y_test)**2))
    testll = np.mean(sps.norm.logpdf(pred - y_test, scale=np.sqrt(uncert)))
    print('Test RMSE: ', error)
    print('Test ll: ', testll)

    pred, uncert = sgp.predict(X_train, 0 * X_train)
    error = np.sqrt(np.mean((pred - y_train)**2))
    trainll = np.mean(sps.norm.logpdf(pred - y_train, scale=np.sqrt(uncert)))
    print('Train RMSE: ', error)
    print('Train ll: ', trainll)
# Example #7
# 0
def make_training_data(X_train, num_valid_decodings, start_count, end_count,
                       latent_dim=56):
    """
    Function that makes a training data set for binary classification
    of validity based on the number of valid decodings from 100 attempts.
    Assumes that data folders to look through are of the format P1, P2, P3,...

    For every folder the label file ``y_con_{num_valid_decodings}.dat`` is
    loaded, the rows of ``X_train`` are split by label, and the per-class
    features, labels and example counts accumulated over all folders are
    saved under ``train_test_sets/Train_Samples/``.

    :param X_train: Latent features of the training data for the autoencoder.
    :param num_valid_decodings: an int between 0 and 100 representing the
    threshold for classification as valid or invalid.
    :param start_count: the index of the first data folder to look through.
    :param end_count: one past the index of the last data folder to look
    through, i.e. folders P{start_count} ... P{end_count - 1} are read.
    :param latent_dim: the expected number of latent features per example.
    :return: None; the training set of (x, y) pairs is written to disk.
    :raises ValueError: if the folder range is empty.
    """

    if start_count >= end_count:
        raise ValueError(
            'No data folders to read: start_count ({}) must be smaller than '
            'end_count ({}).'.format(start_count, end_count))

    validity_criterion_string = 'y_con_{}.dat'.format(num_valid_decodings)

    # Indexing an array (rather than stacking a list of rows) keeps the
    # (0, latent_dim) shape when a folder has no examples of one class.
    latent_features = np.asarray(X_train)

    # Per-folder pieces are collected here and joined once after the loop.
    pos_feature_chunks = []
    neg_feature_chunks = []
    pos_label_chunks = []
    neg_label_chunks = []

    for i in range(start_count, end_count):

        labels = load_object('Collated_Data/P{}/'.format(i) +
                             '{}'.format(validity_criterion_string))
        num_labels = len(labels)

        # NOTE(review): label k of every folder is paired with X_train[k];
        # confirm that each folder labels the same rows of X_train.
        pos_indices = np.array(
            [k for k in range(num_labels) if labels[k] == 1], dtype=int)
        neg_indices = np.array(
            [k for k in range(num_labels) if labels[k] == 0], dtype=int)

        num_pos_labels = len(pos_indices)
        num_neg_labels = len(neg_indices)

        # Every label must be either 0 or 1.
        assert num_pos_labels + num_neg_labels == num_labels

        X_con_tr_pos = latent_features[pos_indices]
        X_con_tr_neg = latent_features[neg_indices]

        assert X_con_tr_pos.shape == (num_pos_labels, latent_dim)
        assert X_con_tr_neg.shape == (num_neg_labels, latent_dim)

        pos_feature_chunks.append(X_con_tr_pos)
        neg_feature_chunks.append(X_con_tr_neg)
        pos_label_chunks.append(np.ones([num_pos_labels]))
        neg_label_chunks.append(np.zeros([num_neg_labels]))

    X_con_tr_full_pos = np.concatenate(pos_feature_chunks)
    X_con_tr_full_neg = np.concatenate(neg_feature_chunks)
    y_con_tr_full_pos = np.concatenate(pos_label_chunks)
    y_con_tr_full_neg = np.concatenate(neg_label_chunks)

    num_pos_examples = X_con_tr_full_pos.shape[0]
    num_neg_examples = X_con_tr_full_neg.shape[0]

    # NOTE(review): the label files are written with a capital 'Y' and the
    # negative example count is stored under the name 'num_pos_examples'.
    # Both look like slips, but the names are kept as they are because the
    # readers of these files are not visible here -- confirm before renaming.
    outputs = [
        (X_con_tr_full_pos,
         'train_test_sets/Train_Samples/Positive_Latents/X_con_tr_pos{}.dat'),
        (y_con_tr_full_pos,
         'train_test_sets/Train_Samples/Positive_Latents/Y_con_tr_pos{}.dat'),
        (num_pos_examples,
         'train_test_sets/Train_Samples/Positive_Latents/num_pos_examples{}.dat'),
        (X_con_tr_full_neg,
         'train_test_sets/Train_Samples/Negative_Latents/X_con_tr_neg{}.dat'),
        (y_con_tr_full_neg,
         'train_test_sets/Train_Samples/Negative_Latents/Y_con_tr_neg{}.dat'),
        (num_neg_examples,
         'train_test_sets/Train_Samples/Negative_Latents/num_pos_examples{}.dat'),
    ]

    for saved_object, path_template in outputs:
        save_object(saved_object, path_template.format(num_valid_decodings))
def best_so_far(results_directory, num_iterations):
    """
    Function that plots:

        1) The best feasible value obtained so far as a function of the number of iterations
        2) A scatterplot showing the data points collected

    For every iteration the files ``scores{i}.dat``, ``con_scores{i}.dat`` and
    ``next_inputs{i}.dat`` are loaded from ``results_directory``. The trace of
    best feasible values starts at the first iteration that collects a point
    with constraint score 1; evaluations whose constraint score is not 1 never
    enter the trace. Besides the two plots, the plotted iteration numbers and
    the trace are saved as ``iterations.dat`` and ``best_vals.dat``.

    :param results_directory: directory to save the plots to.
    :param num_iterations: the number of iterations for which data collection is being carried out.
    """

    # running minimum over the feasible evaluations, one entry per iteration
    # starting from the first iteration with a feasible point

    best_vals = []

    # coordinates of collected data points

    x1_vals = []
    x2_vals = []

    for iteration in range(num_iterations):

        # We monitor the best value obtained so far

        evaluations = load_object(results_directory +
                                  "/scores{}.dat".format(iteration))
        constraint_value = load_object(results_directory +
                                       "/con_scores{}.dat".format(iteration))

        # Each evaluation is a one-element sequence holding the objective
        # value. Only points that satisfy the constraint may improve the
        # best *feasible* value.

        feasible_scores = [
            score[0]
            for score, satisfied in zip(evaluations, constraint_value)
            if satisfied == 1
        ]

        if best_vals:
            # best_vals is non-increasing, so its last entry is the minimum.
            best_vals.append(min(feasible_scores + [best_vals[-1]]))
        elif feasible_scores:
            # First feasible point found: the trace starts here.
            best_vals.append(min(feasible_scores))

        # We collect the data points for plotting

        next_inputs = load_object(results_directory +
                                  "/next_inputs{}.dat".format(iteration))

        for data_point in next_inputs:
            x1_vals.append(data_point[0])
            x2_vals.append(data_point[1])

    # number of iterations from the first feasible find onwards

    counter = len(best_vals)
    first_plotted = (num_iterations - counter) + 1
    iterations = range(first_plotted, num_iterations + 1)

    # We plot the best value obtained so far as a function of iterations

    plt.figure(2)
    axes = plt.figure(2).gca()
    xa, ya = axes.get_xaxis(), axes.get_yaxis()
    xa.set_major_locator(
        MaxNLocator(integer=True))  # force axis ticks to be integers
    ya.set_major_locator(MaxNLocator(integer=True))
    plt.xlim(first_plotted, num_iterations)
    plt.xlabel('Function Evaluations')
    plt.ylabel('Best Feasible Value')
    plt.plot(iterations, best_vals)
    pylab.savefig(results_directory + "/best_so_far.png")
    plt.close()

    save_object(iterations, results_directory + "/iterations.dat")
    save_object(best_vals, results_directory + "/best_vals.dat")

    # We plot the data points collected

    plt.figure(3)
    plt.title('Data Points Collected')
    plt.gca().set_aspect('equal')
    plt.xlim(-5, 10)
    plt.ylim(0, 15)
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.scatter(x1_vals, x2_vals)
    pylab.savefig(results_directory + "/data_collected.png")
    plt.close()
def GP_contours(results_directory, num_iterations):
    """
    Function that plots:

        1) The predictive mean of the GP regression model
        2) The variance of the GP regression model

    The model saved as ``sgp{num_iterations - 1}.dat`` is evaluated on a
    regular grid over [-5, 10) x [0, 15). The two outputs of ``sgp.predict``
    are saved as ``branin_contour.png`` and ``branin_uncertainty.png``.

    :param results_directory: the directory in which the plots are saved.
    :param num_iterations: the number of iterations for which data collection is carried out.
    """

    # We load the saved GP regression model

    sgp = load_object(results_directory +
                      "/sgp{}.dat".format(num_iterations - 1))

    # We prepare the contour grid

    delta = 0.025  # grid spacing
    x = np.arange(-5.0, 10.0, delta)
    y = np.arange(0.0, 15.0, delta)
    X, Y = np.meshgrid(x, y)

    # We flatten the meshgrid into an (n_points, 2) array of (x1, x2) rows.
    # Both coordinates are flattened in the same row-major order, so row
    # k = i * len(x) + j holds the point (x[j], y[i]).

    reshaped_grid = np.column_stack((X.ravel(), Y.ravel()))

    # We plot the predictive mean and variance of the GP regression model

    # The second argument is an all-zero array of the same shape as the grid
    # (presumably the input variances -- TODO confirm against SparseGP).
    pred, uncert = sgp.predict(reshaped_grid, 0 * reshaped_grid)

    # contourf expects Z[i, j] to be the value at (x[j], y[i]), i.e. arrays
    # of shape (len(y), len(x)); this undoes the row-major flattening above.
    grid_shape = (len(y), len(x))
    predictive_mean = pred.reshape(grid_shape)
    predictive_uncert = uncert.reshape(grid_shape)

    _save_gp_contour_plot(4, x, y, predictive_mean,
                          results_directory + "/branin_contour.png")
    _save_gp_contour_plot(5, x, y, predictive_uncert,
                          results_directory + "/branin_uncertainty.png")


def _save_gp_contour_plot(figure_number, x, y, values, output_path):
    """
    Draws one filled contour plot with a colorbar and a polygon marker at
    (pi, 2.275), saves it and closes the figure.

    :param figure_number: the pyplot figure number to draw on.
    :param x: 1-D array of grid coordinates along the horizontal axis.
    :param y: 1-D array of grid coordinates along the vertical axis.
    :param values: array of shape (len(y), len(x)) holding the plotted surface.
    :param output_path: the path the image is saved to.
    """

    plt.figure(figure_number)
    plt.gca().set_aspect('equal', adjustable='box')
    contours = plt.contourf(x, y, values, cmap=cm.viridis_r)
    plt.colorbar(contours, shrink=0.8, extend='both')

    # Draw on the Axes that already holds the contours. Calling the no-argument
    # axes() function here may add a new, empty Axes on top of the plot.
    axes = plt.gca()

    # presumably marks the point (pi, 2.275) with a red polygon -- TODO confirm
    # against my_polygon_scatter
    my_polygon_scatter(axes, [np.pi], [2.275],
                       radius=.5,
                       resolution=3,
                       alpha=.5,
                       color='r')
    pylab.savefig(output_path)
    plt.close()
def main(input_directory, output_directory):
    """
    Runs a random-sampling data collection loop on the constrained Branin-Hoo
    problem and plots the results.

    In each iteration a sparse GP is fitted to the objective values and a
    BB-alpha network to the constraint labels, both models are saved, a
    uniformly random input is evaluated and the labelled point is appended to
    the training data. After the loop the summary plots are produced.

    :param input_directory: directory to which the output of Branin_Sampler.py was saved.
    :param output_directory: directory in which to save the plots.
    :raises ValueError: if the initial training set has no feasible point.
    """

    np.random.seed(2)

    # Load the dataset

    X_bran = genfromtxt(input_directory + '/inputs.csv',
                        delimiter=',',
                        dtype='float32')
    y_con = genfromtxt(input_directory + '/constraint_targets.csv',
                       delimiter=',',
                       dtype='int')
    y_reg = genfromtxt(input_directory + '/branin_targets.csv',
                       delimiter=',',
                       dtype='float32')
    y_reg = y_reg.reshape((-1, 1))

    # We convert constraint targets from one-hot to categorical: a row whose
    # first column is 1 becomes class 1, every other row becomes class 0.

    y_con = (y_con[:, 0] == 1).astype(int)

    n_bran = X_bran.shape[0]  # number of examples

    permutation = np.random.choice(n_bran, n_bran,
                                   replace=False)  # We shuffle the data

    X_shuffled = X_bran[permutation, :]
    y_reg_shuffled = y_reg[permutation]
    y_con_shuffled = y_con[permutation]

    # Training rows are [40, 0.9 * n) and test rows are [0.8 * n, 0.9 * n) of
    # the shuffled data. The built-in int is used because the np.int alias no
    # longer exists in current NumPy releases.
    # NOTE(review): whenever 0.8 * n >= 40 the test rows are a subset of the
    # training rows, so the test metrics are not held-out metrics -- confirm
    # that this overlap is intended.

    train_start = 40
    test_start = int(np.round(0.8 * n_bran))
    split_end = int(np.round(0.9 * n_bran))

    X_tr_bran = X_shuffled[train_start:split_end, :]
    X_te_bran = X_shuffled[test_start:split_end, :]

    y_tr_reg = y_reg_shuffled[train_start:split_end]
    y_te_reg = y_reg_shuffled[test_start:split_end]
    y_tr_con = y_con_shuffled[train_start:split_end]
    y_te_con = y_con_shuffled[test_start:split_end]

    # We save the data used to initialise the surrogate model for plotting

    X1 = X_tr_bran[:, 0]
    X2 = X_tr_bran[:, 1]

    save_object(X1, output_directory + "/X1.dat")
    save_object(X2, output_directory + "/X2.dat")

    # We store the best feasible value found in the training set for reference

    feasible_vals = [
        branin(tuple(point))
        for point, label in zip(X_tr_bran, y_tr_con)
        if label != 0
    ]

    if not feasible_vals:
        raise ValueError(
            'The initial training set contains no feasible point.')

    best_tr = min(feasible_vals)

    save_object(best_tr,
                output_directory + "/best_feasible_training_point.dat")

    # We set the number of data collection iterations

    num_iters = 4

    for iteration in range(num_iters):

        # We train the regression model

        # We fit the GP with M inducing points

        M = 20

        sgp = SparseGP(X_tr_bran, 0 * X_tr_bran, y_tr_reg, M)
        sgp.train_via_ADAM(X_tr_bran,
                           0 * X_tr_bran,
                           y_tr_reg,
                           X_te_bran,
                           X_te_bran * 0,
                           y_te_reg,
                           minibatch_size=M,
                           max_iterations=400,
                           learning_rate=0.005)

        save_object(sgp, output_directory + "/sgp{}.dat".format(iteration))

        # We load the saved gp back, so the metrics below are computed with
        # the model exactly as it was stored on disk

        sgp = load_object(output_directory + "/sgp{}.dat".format(iteration))

        # We report the fit of the GP on the test and training data

        pred, uncert = sgp.predict(X_te_bran, 0 * X_te_bran)
        error = np.sqrt(np.mean((pred - y_te_reg)**2))
        testll = np.mean(
            sps.norm.logpdf(pred - y_te_reg, scale=np.sqrt(uncert)))
        print('Test RMSE: ', error)
        print('Test ll: ', testll)

        pred, uncert = sgp.predict(X_tr_bran, 0 * X_tr_bran)
        error = np.sqrt(np.mean((pred - y_tr_reg)**2))
        trainll = np.mean(
            sps.norm.logpdf(pred - y_tr_reg, scale=np.sqrt(uncert)))
        print('Train RMSE: ', error)
        print('Train ll: ', trainll)

        # we train the constraint network

        # We fix the random seed used while building and training the network

        seed = 1
        np.random.seed(seed)

        # We load the data

        datasets, _, d, n_labels = load_data(X_tr_bran, y_tr_con, X_te_bran,
                                             y_te_con)

        train_set_x, train_set_y = datasets[0]
        test_set_x, test_set_y = datasets[1]

        N_train = train_set_x.get_value(borrow=True).shape[0]
        N_test = test_set_x.get_value(borrow=True).shape[0]
        layer_sizes = [d, 50, n_labels]
        n_samples = 50
        alpha = 0.5
        learning_rate = 0.001
        v_prior = 1.0
        batch_size = 10
        print('... building model')
        sys.stdout.flush()
        bb_alpha = BB_alpha(layer_sizes, n_samples, alpha, learning_rate,
                            v_prior, batch_size, train_set_x, train_set_y,
                            N_train, test_set_x, test_set_y, N_test)
        print('... training')
        sys.stdout.flush()

        # The values returned by train() are not used here.
        bb_alpha.train(400)

        # We save the trained BNN

        sys.setrecursionlimit(4000)  # Required to save the BNN

        save_object(bb_alpha,
                    output_directory + "/bb_alpha{}.dat".format(iteration))

        # We pick the next input(s) based on random sampling. Seeding without
        # an argument re-seeds from a fresh source, so the sampled points
        # differ from run to run.

        np.random.seed()

        num_inputs = 1

        x1 = np.random.uniform(-5, 10, size=num_inputs)
        x2 = np.random.uniform(0, 15, size=num_inputs)
        random_inputs = np.zeros([num_inputs, 2])
        random_inputs[:, 0] = x1
        random_inputs[:, 1] = x2

        reg_scores = []  # collect y-values for Branin-Hoo function
        con_scores = []  # collect y-values for Constraint function
        probs = []  # collect the probabilities of satisfying the constraint
        log_probs = [
        ]  # collect the log probabilities of satisfying the constraint

        for i, point in enumerate(random_inputs):

            reg_scores.append([branin(tuple(point))])

            # The feasible region is the disk of squared radius 50 centred
            # at (2.5, 7.5).

            if (point[0] - 2.5)**2 + (point[1] - 7.5)**2 <= 50:
                con_scores.append(np.int64(1))
            else:
                con_scores.append(np.int64(0))

            probs.append(
                bb_alpha.prediction_probs(point.reshape(1, d))[0][0][1])
            log_probs.append(
                bb_alpha.pred_log_probs(point.reshape(1, d))[0][0][1])

            print(i)

        # print the value of the Branin-Hoo function at the data points we have acquired

        print(reg_scores)

        # save y-values and (x1,x2)-coordinates of locations chosen for evaluation

        save_object(reg_scores,
                    output_directory + "/scores{}.dat".format(iteration))
        save_object(random_inputs,
                    output_directory + "/next_inputs{}.dat".format(iteration))
        save_object(con_scores,
                    output_directory + "/con_scores{}.dat".format(iteration))
        save_object(probs, output_directory + "/probs{}.dat".format(iteration))
        save_object(log_probs,
                    output_directory + "/log_probs{}.dat".format(iteration))

        # extend labelled training data for next cycle

        X_tr_bran = np.concatenate([X_tr_bran, random_inputs], 0)
        y_tr_reg = np.concatenate([y_tr_reg, np.array(reg_scores)], 0)
        y_tr_con = np.concatenate([y_tr_con, np.array(con_scores)], 0)

    # Plot the best point as a function of the data collection iteration number
    best_so_far(output_directory, num_iters)

    # Plot the contours of the GP regression model
    GP_contours(output_directory, num_iters)

    # Plot the contours of the BNN constraint model
    BNN_contours(output_directory, num_iters)

    # Plot the data used to initialise the model
    initial_data(output_directory)