Example #1
def test_upper_bound_few_inducing_points():
    """
    Test for upper bound for regression marginal likelihood
    """
    model_vfe = gpflow.models.SGPR(
        (DatumUpper.X, DatumUpper.Y),
        gpflow.kernels.SquaredExponential(),
        inducing_variable=DatumUpper.X[:10, :].copy(),
        mean_function=Constant())
    opt = gpflow.optimizers.Scipy()

    @tf.function(autograph=False)
    def model_vfe_closure():
        return -model_vfe.log_marginal_likelihood()

    opt.minimize(model_vfe_closure,
                 variables=model_vfe.trainable_variables,
                 options=dict(maxiter=500))

    full_gp = gpflow.models.GPR((DatumUpper.X, DatumUpper.Y),
                                kernel=gpflow.kernels.SquaredExponential(),
                                mean_function=Constant())
    full_gp.kernel.lengthscale.assign(model_vfe.kernel.lengthscale)
    full_gp.kernel.variance.assign(model_vfe.kernel.variance)
    full_gp.likelihood.variance.assign(model_vfe.likelihood.variance)
    full_gp.mean_function.c.assign(model_vfe.mean_function.c)

    lml_upper = model_vfe.upper_bound()
    lml_vfe = model_vfe.log_marginal_likelihood()
    lml_full_gp = full_gp.log_marginal_likelihood()

    assert lml_vfe < lml_full_gp
    assert lml_full_gp < lml_upper
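
A minimal standalone sketch of the ordering the test above asserts, assuming the GPflow 2.x API (SGPR.elbo(), SGPR.upper_bound(), GPR.log_marginal_likelihood()): with identical hyperparameters, the collapsed variational bound lies below the exact marginal likelihood, which in turn lies below the Titsias-style upper bound.

import numpy as np
import gpflow

rng = np.random.RandomState(0)
X = rng.rand(30, 1)
Y = np.sin(6 * X) + 0.1 * rng.randn(30, 1)

# Sparse model with a handful of inducing points, and an exact GPR sharing
# the same (default) kernel and likelihood hyperparameters.
sgpr = gpflow.models.SGPR((X, Y), gpflow.kernels.SquaredExponential(),
                          inducing_variable=X[:5].copy())
gpr = gpflow.models.GPR((X, Y), kernel=gpflow.kernels.SquaredExponential())

elbo = sgpr.elbo().numpy()
lml = gpr.log_marginal_likelihood().numpy()
upper = sgpr.upper_bound().numpy()
assert elbo < lml < upper
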
Example #2
def make_single_layer_models(X, Y, Z):
    D = X.shape[1]
    Y_mean, Y_std = np.average(Y), np.std(Y)

    m_sgpr = SGPR(X,
                  Y,
                  RBF(D, variance=Y_std**2),
                  Z.copy(),
                  mean_function=Constant(Y_mean))
    m_svgp = SVGP(X,
                  Y,
                  RBF(D, variance=Y_std**2),
                  Gaussian(),
                  Z.copy(),
                  mean_function=Constant(Y_mean))
    m_fitc = GPRFITC(X,
                     Y,
                     RBF(D, variance=Y_std**2),
                     Z.copy(),
                     mean_function=Constant(Y_mean))

    for m in [m_sgpr, m_svgp, m_fitc]:
        m.mean_function.fixed = True
        m.likelihood.variance = 0.1 * Y_std
    return m_sgpr, m_svgp, m_fitc
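
A hedged usage sketch for the helper above, with synthetic data; it assumes the GPflow 1.x-era imports the function itself relies on (SGPR, SVGP, GPRFITC, RBF, Gaussian, Constant).

import numpy as np

X = np.random.randn(200, 3)
Y = np.sin(X[:, :1]) + 0.1 * np.random.randn(200, 1)
Z = X[:20].copy()  # 20 inducing inputs taken from the data

m_sgpr, m_svgp, m_fitc = make_single_layer_models(X, Y, Z)
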
Example #3
def _prepare_models():
    """
    Prepare models to make sure the coregionalized model with diagonal coregion kernel and
    with fixed lengthscale is equivalent with normal GP regression.
    """
    # 1. Two independent VGPs for two sets of data
    k0 = gpflow.kernels.SquaredExponential()
    k0.lengthscale.trainable = False
    k1 = gpflow.kernels.SquaredExponential()
    k1.lengthscale.trainable = False
    vgp0 = gpflow.models.VGP((Datum.X[0], Datum.Y[0]),
                             kernel=k0,
                             mean_function=Constant(),
                             likelihood=gpflow.likelihoods.Gaussian(), num_latent=1)
    vgp1 = gpflow.models.VGP((Datum.X[1], Datum.Y[1]),
                             kernel=k1,
                             mean_function=Constant(),
                             likelihood=gpflow.likelihoods.Gaussian(), num_latent=1)
    # 2. Coregionalized GPR
    kc = gpflow.kernels.SquaredExponential(active_dims=[0, 1])
    kc.lengthscale.trainable = False
    kc.variance.trainable = False  # variance is handled by the coregion kernel
    coreg = gpflow.kernels.Coregion(output_dim=2, rank=1, active_dims=[2])
    coreg.W.trainable = False
    lik = gpflow.likelihoods.SwitchedLikelihood([gpflow.likelihoods.Gaussian(),
                                                 gpflow.likelihoods.Gaussian()]
                                                )
    mean_c = gpflow.mean_functions.SwitchedMeanFunction(
        [gpflow.mean_functions.Constant(), gpflow.mean_functions.Constant()])
    cvgp = gpflow.models.VGP((Datum.X_augumented, Datum.Y_augumented),
                             kernel=kc * coreg,
                             mean_function=mean_c,
                             likelihood=lik,
                             num_latent=1
                             )

    # Train them for a small number of iterations

    opt = gpflow.optimizers.Scipy()

    @tf.function(autograph=False)
    def vgp0_closure():
        return - vgp0.log_marginal_likelihood()

    @tf.function(autograph=False)
    def vgp1_closure():
        return - vgp1.log_marginal_likelihood()

    @tf.function(autograph=False)
    def cvgp_closure():
        return - cvgp.log_marginal_likelihood()

    opt.minimize(vgp0_closure, variables=vgp0.trainable_variables,
                 options=dict(maxiter=1000), method='BFGS')
    opt.minimize(vgp1_closure, variables=vgp1.trainable_variables,
                 options=dict(maxiter=1000), method='BFGS')
    opt.minimize(cvgp_closure, variables=cvgp.trainable_variables,
                 options=dict(maxiter=1000), method='BFGS')

    return vgp0, vgp1, cvgp
Example #4
def _create_approximate_models():
    """
    1) Variational GP (with the likelihood set to Gaussian)
    2) Sparse variational GP (likelihood is Gaussian, inducing points
       at the data)
    3) Sparse variational GP (as above, but with the whitening rotation
       of the inducing variables)
    4) Sparse variational GP Regression (as above, but there the inducing
       variables are 'collapsed' out, as in Titsias 2009)
    5) FITC Sparse GP Regression
    """
    model_1 = gpflow.models.VGP(
        (Datum.X, Datum.Y),
        kernel=gpflow.kernels.SquaredExponential(),
        likelihood=gpflow.likelihoods.Gaussian(),
        mean_function=gpflow.mean_functions.Constant(),
    )

    model_2 = gpflow.models.SVGP(
        kernel=gpflow.kernels.SquaredExponential(),
        likelihood=gpflow.likelihoods.Gaussian(),
        inducing_variable=Datum.X.copy(),
        q_diag=False,
        whiten=False,
        mean_function=gpflow.mean_functions.Constant(),
        num_latent_gps=Datum.Y.shape[1],
    )
    gpflow.set_trainable(model_2.inducing_variable, False)

    model_3 = gpflow.models.SVGP(
        kernel=gpflow.kernels.SquaredExponential(),
        likelihood=gpflow.likelihoods.Gaussian(),
        inducing_variable=Datum.X.copy(),
        q_diag=False,
        whiten=True,
        mean_function=gpflow.mean_functions.Constant(),
        num_latent_gps=Datum.Y.shape[1],
    )
    gpflow.set_trainable(model_3.inducing_variable, False)

    model_4 = gpflow.models.SGPR(
        (Datum.X, Datum.Y),
        kernel=gpflow.kernels.SquaredExponential(),
        inducing_variable=Datum.X.copy(),
        mean_function=Constant(),
    )
    gpflow.set_trainable(model_4.inducing_variable, False)

    model_5 = gpflow.models.GPRFITC(
        (Datum.X, Datum.Y),
        kernel=gpflow.kernels.SquaredExponential(),
        inducing_variable=Datum.X.copy(),
        mean_function=Constant(),
    )
    gpflow.set_trainable(model_5.inducing_variable, False)

    return model_1, model_2, model_3, model_4, model_5
Example #5
def test_switched_mean_function(N, D):
    """
    Test for the SwitchedMeanFunction.
    """
    X = np.hstack([rng.randn(N, D), 1.0 * rng.randint(0, 2, N).reshape(-1, 1)])
    zeros, ones = Constant(np.zeros(1)), Constant(np.ones(1))
    switched_mean = SwitchedMeanFunction([zeros, ones])

    np_list = np.array([0., 1.])
    result_ref = (np_list[X[:, D].astype(default_int())]).reshape(-1, 1)
    result = switched_mean(X)

    assert_allclose(result, result_ref)
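
A small self-contained illustration (assuming GPflow 2.x) of the behaviour the test above verifies: SwitchedMeanFunction treats the last column of X as an integer index and applies the corresponding mean function row by row.

import numpy as np
from gpflow.mean_functions import Constant, SwitchedMeanFunction

X = np.array([[0.3, 0.0],
              [0.1, 1.0],
              [0.7, 0.0]])  # last column selects mean function 0 or 1
switched = SwitchedMeanFunction([Constant(np.zeros(1)), Constant(np.ones(1))])
print(switched(X).numpy())  # -> [[0.], [1.], [0.]]
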
Example #6
def make_dgp(X, Y, Z, L):
    D = X.shape[1]
    Y_mean, Y_std = np.average(Y), np.std(Y)

    # the layer shapes are defined by the kernel dims, so here all hidden layers are D dimensional
    kernels = []
    for l in range(L):
        kernels.append(RBF(D, lengthscales=1., variance=1.))

    # between layer noise (doesn't actually make much difference but we include it anyway)
    for kernel in kernels[:-1]:
        kernel += White(D, variance=1e-5)

    mb = 10000 if X.shape[0] > 10000 else None
    model = DGP(X, Y, Z, kernels, Gaussian(), num_samples=1, minibatch_size=mb)

    # same final layer inits we used for the single layer model
    model.layers[-1].kern.variance = Y_std**2
    model.likelihood.variance = Y_std * 0.1
    model.layers[-1].mean_function = Constant(Y_mean)
    model.layers[-1].mean_function.fixed = True

    # start the inner layers almost deterministically
    for layer in model.layers[:-1]:
        layer.q_sqrt = layer.q_sqrt.value * 1e-5

    return model
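
A hedged usage sketch; DGP is assumed to come from the Doubly-Stochastic-DGP package used alongside the GPflow 1.x helpers above, and X, Y, Z are the same arrays used for the single-layer baselines.

model = make_dgp(X, Y, Z, L=2)  # a two-layer DGP on the same data
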
Example #7
def mean_function_factory(mean_function_name, D_in, D_out):
    if mean_function_name == "Zero":
        return Zero(output_dim=D_out)
    elif mean_function_name == "Constant":
        return Constant(c=rng.rand(D_out))
    elif mean_function_name == "Linear":
        return Linear(A=rng.rand(D_in, D_out), b=rng.rand(D_out))
    else:
        return None
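
A hedged usage sketch for the factory above; it relies on the module-level rng used throughout these tests, recreated here so the snippet runs on its own.

import numpy as np
from gpflow.mean_functions import Zero, Constant, Linear

rng = np.random.RandomState(0)
mean_fn = mean_function_factory("Linear", D_in=3, D_out=2)
X = rng.randn(5, 3)
print(mean_fn(X).shape)  # expected: (5, 2), i.e. X @ A + b
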
Example #8
    def fit(self, X_train, y_train):
        y_train_scaled = self.y_scaler.fit_transform(y_train.reshape(-1, 1))
        k = Tanimoto()
        self.m = gpflow.models.GPR(
            data=(X_train.astype(np.float64), y_train_scaled),
            mean_function=Constant(np.mean(y_train_scaled)),
            kernel=k,
            noise_variance=1)
        opt = gpflow.optimizers.Scipy()
        opt.minimize(self.objective_closure,
                     self.m.trainable_variables,
                     options=dict(maxiter=self.maxiter))
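
The fit method above references self.objective_closure and self.maxiter, which are not shown in this snippet. A hypothetical companion method, assuming the GPflow 2.x GPR API, might look like:

    # Hypothetical helper assumed by fit() above (not part of the original snippet):
    # the objective handed to the Scipy optimizer.
    def objective_closure(self):
        return -self.m.log_marginal_likelihood()
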
Example #9
    def _fit(self, X, F, data):

        if self.regr == 'constant':
            mf = Constant()
        elif self.regr == 'linear':
            mf = Linear(numpy.ones((X.shape[1], 1)), numpy.ones((1, 1)))

        if self.kernel == 'linear':
            kernel = gpflow.kernels.Linear(X.shape[1], ARD=self.ARD)
        elif self.kernel == 'rbf':
            kernel = gpflow.kernels.RBF(X.shape[1], ARD=self.ARD)
        elif self.kernel == 'polynomial':
            kernel = gpflow.kernels.Polynomial(X.shape[1], ARD=self.ARD)

        m = gpflow.gpr.GPR(X,
                           numpy.array([F]).T,
                           kern=kernel,
                           mean_function=mf)
        m.optimize()
        self.model = m
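
The _fit above targets the legacy GPflow 0.x/1.x API (gpflow.gpr.GPR, kern=, m.optimize()). A rough GPflow 2.x equivalent of the RBF/constant-mean case, offered as a hedged sketch rather than a drop-in replacement:

import numpy as np
import gpflow
from gpflow.mean_functions import Constant

def fit_gpr_v2(X, F):
    # ARD in GPflow 2.x is expressed by giving the kernel one lengthscale per input dimension.
    kernel = gpflow.kernels.RBF(lengthscales=np.ones(X.shape[1]))
    model = gpflow.models.GPR(
        (np.asarray(X, dtype=np.float64), np.asarray(F, dtype=np.float64).reshape(-1, 1)),
        kernel=kernel,
        mean_function=Constant())
    gpflow.optimizers.Scipy().minimize(model.training_loss, model.trainable_variables)
    return model
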
Example #10
def _prepare_models():
    """
    Prepare models to make sure the coregionalized model with diagonal coregion kernel and
    with fixed lengthscales is equivalent with normal GP regression.
    """
    # 1. Two independent VGPs for two sets of data
    k0 = gpflow.kernels.SquaredExponential()
    set_trainable(k0.lengthscales, False)
    k1 = gpflow.kernels.SquaredExponential()
    set_trainable(k1.lengthscales, False)
    vgp0 = gpflow.models.VGP(
        (Datum.X[0], Datum.Y[0]),
        kernel=k0,
        mean_function=Constant(),
        likelihood=gpflow.likelihoods.Gaussian(),
        num_latent_gps=1,
    )
    vgp1 = gpflow.models.VGP(
        (Datum.X[1], Datum.Y[1]),
        kernel=k1,
        mean_function=Constant(),
        likelihood=gpflow.likelihoods.Gaussian(),
        num_latent_gps=1,
    )
    # 2. Coregionalized VGP
    kc = gpflow.kernels.SquaredExponential(active_dims=[0, 1])
    set_trainable(kc.lengthscales, False)
    set_trainable(kc.variance,
                  False)  # variance is handled by the Coregion kernel
    coreg = gpflow.kernels.Coregion(output_dim=2, rank=1, active_dims=[2])
    coreg.W.assign(np.zeros((2, 1)))  # zero correlation between outputs
    set_trainable(coreg.W, False)
    lik = gpflow.likelihoods.SwitchedLikelihood(
        [gpflow.likelihoods.Gaussian(),
         gpflow.likelihoods.Gaussian()])
    mean_c = gpflow.mean_functions.SwitchedMeanFunction(
        [gpflow.mean_functions.Constant(),
         gpflow.mean_functions.Constant()])
    cvgp = gpflow.models.VGP(
        (Datum.X_augmented, Datum.Y_augmented),
        kernel=kc * coreg,
        mean_function=mean_c,
        likelihood=lik,
        num_latent_gps=1,
    )

    # Train them for a small number of iterations

    opt = gpflow.optimizers.Scipy()
    opt.minimize(
        vgp0.training_loss,
        variables=vgp0.trainable_variables,
        options=dict(maxiter=1000),
        method="BFGS",
    )
    opt.minimize(
        vgp1.training_loss,
        variables=vgp1.trainable_variables,
        options=dict(maxiter=1000),
        method="BFGS",
    )
    opt.minimize(
        cvgp.training_loss,
        variables=cvgp.trainable_variables,
        options=dict(maxiter=1000),
        method="BFGS",
    )

    return vgp0, vgp1, cvgp
Example #11
def main(path, path_to_dft_dataset, task, representation, theory_level):
    """
    :param path: str specifying path to photoswitches.csv file.
    :param path_to_dft_dataset: str specifying path to dft_comparison.csv file.
    :param task: str specifying the task. e_iso_pi only supported task for the TD-DFT comparison.
    :param representation: str specifying the molecular representation. One of ['fingerprints', 'fragments', 'fragprints']
    :param theory_level: str giving the level of theory to compare against - CAM-B3LYP or PBE0 ['CAM-B3LYP', 'PBE0']
    """

    data_loader = TaskDataLoader(task, path)
    smiles_list, _, pbe0_vals, cam_vals, experimental_vals = data_loader.load_dft_comparison_data(path_to_dft_dataset)

    X = featurise_mols(smiles_list, representation)

    # Keep only non-duplicate entries because we're not considering effects of solvent

    non_duplicate_indices = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]])
    X = X[non_duplicate_indices, :]
    experimental_vals = experimental_vals[non_duplicate_indices]
    non_dup_pbe0 = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]])
    non_dup_cam = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]])
    pbe0_vals = pbe0_vals[non_dup_pbe0]
    cam_vals = cam_vals[non_dup_cam]

    # molecules with dft values to be split into train/test
    if theory_level == 'CAM-B3LYP':
        X_with_dft = np.delete(X, np.argwhere(np.isnan(cam_vals)), axis=0)
        y_with_dft = np.delete(experimental_vals, np.argwhere(np.isnan(cam_vals)))
        # DFT values for the CAM-B3LYP level of theory
        dft_vals = np.delete(cam_vals, np.argwhere(np.isnan(cam_vals)))
        # molecules with no dft vals must go into the training set.
        X_no_dft = np.delete(X, np.argwhere(~np.isnan(cam_vals)), axis=0)
        y_no_dft = np.delete(experimental_vals, np.argwhere(~np.isnan(cam_vals)))
    else:
        X_with_dft = np.delete(X, np.argwhere(np.isnan(pbe0_vals)), axis=0)
        y_with_dft = np.delete(experimental_vals, np.argwhere(np.isnan(pbe0_vals)))
        # DFT values for the PBE0 level of theory
        dft_vals = np.delete(pbe0_vals, np.argwhere(np.isnan(pbe0_vals)))
        # molecules with no dft vals must go into the training set.
        X_no_dft = np.delete(X, np.argwhere(~np.isnan(pbe0_vals)), axis=0)
        y_no_dft = np.delete(experimental_vals, np.argwhere(~np.isnan(pbe0_vals)))

    mae_list = []
    dft_mae_list = []

    # We define the Gaussian Process optimisation objective

    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    print('\nBeginning training loop...')

    for i in range(len(y_with_dft)):

        X_train = np.delete(X_with_dft, i, axis=0)
        y_train = np.delete(y_with_dft, i)
        X_test = X_with_dft[i].reshape(1, -1)
        y_test = y_with_dft[i]
        dft_test = dft_vals[i]

        X_train = np.concatenate((X_train, X_no_dft))
        y_train = np.concatenate((y_train, y_no_dft))
        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)

        #  We standardise the outputs but leave the inputs unchanged

        _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test)

        X_train = X_train.astype(np.float64)
        X_test = X_test.astype(np.float64)

        k = Tanimoto()
        m = gpflow.models.GPR(data=(X_train, y_train), mean_function=Constant(np.mean(y_train)), kernel=k, noise_variance=1)

        # Optimise the kernel variance and noise level by the marginal likelihood

        opt = gpflow.optimizers.Scipy()
        opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=100))
        print_summary(m)

        # Output Standardised RMSE and RMSE on Train Set

        y_pred_train, _ = m.predict_f(X_train)
        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
        train_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(y_train), y_scaler.inverse_transform(y_pred_train)))
        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
        print("Train RMSE: {:.3f}".format(train_rmse))

        # mean and variance GP prediction

        y_pred, y_var = m.predict_f(X_test)
        y_pred = y_scaler.inverse_transform(y_pred)
        y_test = y_scaler.inverse_transform(y_test)

        # Output MAE for this trial

        mae = abs(y_test - y_pred)

        print("MAE: {}".format(mae))

        # Store values in order to compute the mean and standard error of the statistics across trials

        mae_list.append(mae)

        # DFT prediction scores on the same trial

        dft_mae = abs(y_test - dft_test)

        dft_mae_list.append(dft_mae)

    mae_list = np.array(mae_list)
    dft_mae_list = np.array(dft_mae_list)

    print("\nmean GP-Tanimoto MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list)/np.sqrt(len(mae_list))))

    print("mean {} MAE: {:.4f} +- {:.4f}\n".format(theory_level, np.mean(dft_mae_list), np.std(dft_mae_list)/np.sqrt(len(dft_mae_list))))
Example #12
def _create_approximate_models():
    """
    1) Variational GP (with the likelihood set to Gaussian)
    2) Sparse variational GP (likelihood is Gaussian, inducing points
       at the data)
    3) Sparse variational GP (as above, but with the whitening rotation
       of the inducing variables)
    4) Sparse variational GP Regression (as above, but there the inducing
       variables are 'collapsed' out, as in Titsias 2009)
    5) FITC Sparse GP Regression
    """
    model_1 = gpflow.models.VGP((Datum.X, Datum.Y),
                                gpflow.kernels.SquaredExponential(),
                                likelihood=gpflow.likelihoods.Gaussian(),
                                mean_function=gpflow.mean_functions.Constant())
    model_2 = gpflow.models.SVGP(
        gpflow.kernels.SquaredExponential(),
        gpflow.likelihoods.Gaussian(),
        inducing_variable=Datum.X.copy(),
        q_diag=False,
        mean_function=gpflow.mean_functions.Constant(),
        num_latent=Datum.Y.shape[1])
    gpflow.utilities.set_trainable(model_2.inducing_variable, False)
    model_3 = gpflow.models.SVGP(
        kernel=gpflow.kernels.SquaredExponential(),
        likelihood=gpflow.likelihoods.Gaussian(),
        inducing_variable=Datum.X.copy(),
        q_diag=False,
        whiten=True,
        mean_function=gpflow.mean_functions.Constant(),
        num_latent=Datum.Y.shape[1])
    gpflow.utilities.set_trainable(model_3.inducing_variable, False)
    model_4 = gpflow.models.GPRFITC((Datum.X, Datum.Y),
                                    kernel=gpflow.kernels.SquaredExponential(),
                                    inducing_variable=Datum.X.copy(),
                                    mean_function=Constant())
    gpflow.utilities.set_trainable(model_4.inducing_variable, False)
    model_5 = gpflow.models.SGPR((Datum.X, Datum.Y),
                                 gpflow.kernels.SquaredExponential(),
                                 inducing_variable=Datum.X.copy(),
                                 mean_function=Constant())
    gpflow.utilities.set_trainable(model_5.inducing_variable, False)

    # Train models

    opt = gpflow.optimizers.Scipy()

    @tf.function(autograph=False)
    def model_1_closure():
        return -model_1.log_marginal_likelihood()

    @tf.function(autograph=False)
    def model_2_closure():
        return -model_2.elbo(Datum.data)

    @tf.function(autograph=False)
    def model_3_closure():
        return -model_3.elbo(Datum.data)

    @tf.function(autograph=False)
    def model_4_closure():
        return -model_4.log_marginal_likelihood()

    @tf.function(autograph=False)
    def model_5_closure():
        return -model_5.log_marginal_likelihood()

    opt.minimize(model_1_closure,
                 variables=model_1.trainable_variables,
                 options=dict(maxiter=300))
    opt.minimize(model_2_closure,
                 variables=model_2.trainable_variables,
                 options=dict(maxiter=300))
    opt.minimize(model_3_closure,
                 variables=model_3.trainable_variables,
                 options=dict(maxiter=300))
    opt.minimize(model_4_closure,
                 variables=model_4.trainable_variables,
                 options=dict(maxiter=300))
    opt.minimize(model_5_closure,
                 variables=model_5.trainable_variables,
                 options=dict(maxiter=300))

    return model_1, model_2, model_3, model_4, model_5
Example #13
def main(path, task, representation, use_pca, n_trials, test_set_size):
    """
    Train a multioutput GP simultaneously on all tasks of the photoswitch dataset.

    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n']
    :param representation: str specifying the molecular representation. One of ['fingerprints', 'fragments', 'fragprints']
    :param use_pca: bool. If True apply PCA to perform Principal Components Regression.
    :param n_trials: int specifying number of random train/test splits to use
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set
    """

    # If True we perform Principal Components Regression

    if use_pca:
        n_components = 100
    else:
        n_components = None

    data_loader_e_iso_pi = TaskDataLoader('e_iso_pi', path)
    data_loader_z_iso_pi = TaskDataLoader('z_iso_pi', path)
    data_loader_e_iso_n = TaskDataLoader('e_iso_n', path)
    data_loader_z_iso_n = TaskDataLoader('z_iso_n', path)

    smiles_list_e_iso_pi, y_e_iso_pi = data_loader_e_iso_pi.load_property_data()
    smiles_list_z_iso_pi, y_z_iso_pi = data_loader_z_iso_pi.load_property_data()
    smiles_list_e_iso_n, y_e_iso_n = data_loader_e_iso_n.load_property_data()
    smiles_list_z_iso_n, y_z_iso_n = data_loader_z_iso_n.load_property_data()

    y_e_iso_pi = y_e_iso_pi.reshape(-1, 1)
    y_z_iso_pi = y_z_iso_pi.reshape(-1, 1)
    y_e_iso_n = y_e_iso_n.reshape(-1, 1)
    y_z_iso_n = y_z_iso_n.reshape(-1, 1)

    X_e_iso_pi = featurise_mols(smiles_list_e_iso_pi, representation)
    X_z_iso_pi = featurise_mols(smiles_list_z_iso_pi, representation)
    X_e_iso_n = featurise_mols(smiles_list_e_iso_n, representation)
    X_z_iso_n = featurise_mols(smiles_list_z_iso_n, representation)

    output_dim = 4  # Number of outputs
    rank = 1  # Rank of W
    feature_dim = len(X_e_iso_pi[0, :])

    tanimoto_active_dims = [i for i in range(feature_dim)]  # active dims for Tanimoto base kernel.

    r2_list = []
    rmse_list = []
    mae_list = []

    print('\nBeginning training loop...')

    for i in range(0, n_trials):

        if task == 'e_iso_pi':
            X_task = X_e_iso_pi
            y_task = y_e_iso_pi
        elif task == 'z_iso_pi':
            X_task = X_z_iso_pi
            y_task = y_z_iso_pi
        elif task == 'e_iso_n':
            X_task = X_e_iso_n
            y_task = y_e_iso_n
        else:
            X_task = X_z_iso_n
            y_task = y_z_iso_n

        X_train, X_test, y_train, y_test = train_test_split(
            X_task, y_task, test_size=test_set_size, random_state=i)

        X_train = X_train.astype(np.float64)
        X_test = X_test.astype(np.float64)

        if task == 'e_iso_pi':
            # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension
            X_augmented = np.vstack((np.append(X_train,
                                               np.zeros((len(X_train), 1)),
                                               axis=1),
                                     np.append(X_z_iso_pi,
                                               np.ones((len(X_z_iso_pi), 1)),
                                               axis=1),
                                     np.append(X_e_iso_n,
                                               np.ones(
                                                   (len(X_e_iso_n), 1)) * 2,
                                               axis=1),
                                     np.append(X_z_iso_n,
                                               np.ones(
                                                   (len(X_z_iso_n), 1)) * 3,
                                               axis=1)))

            X_test = np.append(X_test, np.zeros((len(X_test), 1)), axis=1)
            X_train = np.append(X_train, np.zeros((len(X_train), 1)), axis=1)

            # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods
            Y_augmented = np.vstack(
                (np.hstack((y_train, np.zeros_like(y_train))),
                 np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))),
                 np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)),
                 np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3))))

            y_test = np.hstack((y_test, np.zeros_like(y_test)))

        elif task == 'z_iso_pi':
            # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension
            X_augmented = np.vstack((np.append(X_e_iso_pi,
                                               np.zeros((len(X_e_iso_pi), 1)),
                                               axis=1),
                                     np.append(X_train,
                                               np.ones((len(X_train), 1)),
                                               axis=1),
                                     np.append(X_e_iso_n,
                                               np.ones(
                                                   (len(X_e_iso_n), 1)) * 2,
                                               axis=1),
                                     np.append(X_z_iso_n,
                                               np.ones(
                                                   (len(X_z_iso_n), 1)) * 3,
                                               axis=1)))

            X_test = np.append(X_test, np.ones((len(X_test), 1)), axis=1)
            X_train = np.append(X_train, np.ones((len(X_train), 1)), axis=1)

            # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods
            Y_augmented = np.vstack(
                (np.hstack((y_e_iso_pi, np.zeros_like(y_e_iso_pi))),
                 np.hstack((y_train, np.ones_like(y_train))),
                 np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)),
                 np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3))))

            y_test = np.hstack((y_test, np.ones_like(y_test)))

        elif task == 'e_iso_n':
            # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension
            X_augmented = np.vstack((np.append(X_e_iso_pi,
                                               np.zeros((len(X_e_iso_pi), 1)),
                                               axis=1),
                                     np.append(X_z_iso_pi,
                                               np.ones((len(X_z_iso_pi), 1)),
                                               axis=1),
                                     np.append(X_train,
                                               np.ones((len(X_train), 1)) * 2,
                                               axis=1),
                                     np.append(X_z_iso_n,
                                               np.ones(
                                                   (len(X_z_iso_n), 1)) * 3,
                                               axis=1)))

            X_test = np.append(X_test, np.ones((len(X_test), 1)) * 2, axis=1)
            X_train = np.append(X_train,
                                np.ones((len(X_train), 1)) * 2,
                                axis=1)

            # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods
            Y_augmented = np.vstack(
                (np.hstack((y_e_iso_pi, np.zeros_like(y_e_iso_pi))),
                 np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))),
                 np.hstack((y_train, np.ones_like(y_train) * 2)),
                 np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3))))

            y_test = np.hstack((y_test, np.ones_like(y_test) * 2))

        else:
            # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension
            X_augmented = np.vstack((np.append(X_e_iso_pi,
                                               np.zeros((len(X_e_iso_pi), 1)),
                                               axis=1),
                                     np.append(X_z_iso_pi,
                                               np.ones((len(X_z_iso_pi), 1)),
                                               axis=1),
                                     np.append(X_e_iso_n,
                                               np.ones(
                                                   (len(X_e_iso_n), 1)) * 2,
                                               axis=1),
                                     np.append(X_train,
                                               np.ones((len(X_train), 1)) * 3,
                                               axis=1)))

            X_test = np.append(X_test, np.ones((len(X_test), 1)) * 3, axis=1)
            X_train = np.append(X_train,
                                np.ones((len(X_train), 1)) * 3,
                                axis=1)

            # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods
            Y_augmented = np.vstack(
                (np.hstack((y_e_iso_pi, np.zeros_like(y_e_iso_pi))),
                 np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))),
                 np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)),
                 np.hstack((y_train, np.ones_like(y_train) * 3))))

            y_test = np.hstack((y_test, np.ones_like(y_test) * 3))

        # Base kernel
        k = Tanimoto(active_dims=tanimoto_active_dims)
        #set_trainable(k.variance, False)

        # Coregion kernel
        coreg = gpflow.kernels.Coregion(output_dim=output_dim,
                                        rank=rank,
                                        active_dims=[feature_dim])

        # Create product kernel
        kern = k * coreg

        # This likelihood switches between Gaussian noise with different variances for each f_i:
        lik = gpflow.likelihoods.SwitchedLikelihood([
            gpflow.likelihoods.Gaussian(),
            gpflow.likelihoods.Gaussian(),
            gpflow.likelihoods.Gaussian(),
            gpflow.likelihoods.Gaussian()
        ])

        # now build the GP model as normal
        m = gpflow.models.VGP((X_augmented, Y_augmented),
                              mean_function=Constant(np.mean(y_train[:, 0])),
                              kernel=kern,
                              likelihood=lik)

        # fit the covariance function parameters
        maxiter = ci_niter(1000)
        gpflow.optimizers.Scipy().minimize(
            m.training_loss,
            m.trainable_variables,
            options=dict(maxiter=maxiter),
            method="L-BFGS-B",
        )
        print_summary(m)

        # mean and variance GP prediction

        y_pred, y_var = m.predict_f(X_test)

        # Output Standardised RMSE and RMSE on Train Set

        y_pred_train, _ = m.predict_f(X_train)
        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
        print("Train RMSE: {:.3f}".format(train_rmse))

        score = r2_score(y_test[:, 0], y_pred)
        rmse = np.sqrt(mean_squared_error(y_test[:, 0], y_pred))
        mae = mean_absolute_error(y_test[:, 0], y_pred)

        print("\nR^2: {:.3f}".format(score))
        print("RMSE: {:.3f}".format(rmse))
        print("MAE: {:.3f}".format(mae))

        r2_list.append(score)
        rmse_list.append(rmse)
        mae_list.append(mae)

        B = coreg.output_covariance().numpy()
        print("B =", B)
        _ = plt.imshow(B)

    r2_list = np.array(r2_list)
    rmse_list = np.array(rmse_list)
    mae_list = np.array(mae_list)

    print("\nmean R^2: {:.4f} +- {:.4f}".format(
        np.mean(r2_list),
        np.std(r2_list) / np.sqrt(len(r2_list))))
    print("mean RMSE: {:.4f} +- {:.4f}".format(
        np.mean(rmse_list),
        np.std(rmse_list) / np.sqrt(len(rmse_list))))
    print("mean MAE: {:.4f} +- {:.4f}\n".format(
        np.mean(mae_list),
        np.std(mae_list) / np.sqrt(len(mae_list))))
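
As a hedged aside (assuming GPflow 2.x), the B matrix printed inside the training loop above is the coregionalisation matrix B = W Wᵀ + diag(kappa), assembled from the Coregion kernel's W and kappa parameters; a standalone check:

import numpy as np
import gpflow

coreg_demo = gpflow.kernels.Coregion(output_dim=4, rank=1, active_dims=[0])
B = coreg_demo.output_covariance().numpy()
W, kappa = coreg_demo.W.numpy(), coreg_demo.kappa.numpy()
assert np.allclose(B, W @ W.T + np.diag(kappa))
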
Example #14
        best_nlpd_kernel = ''

        gapped_rates = np.reshape(gapped_flux[i, :], (-1, 1))
        ground_truth_rates = ground_truth_flux_matrix[i, :]

        # Standardize the count rates

        flux_scaler = StandardScaler()
        gapped_rates = flux_scaler.fit_transform(gapped_rates)

        for k in kernel_list:

            name = kernel_dict[k]

            m = gpflow.models.GPR(data=(train_times, gapped_rates),
                                  mean_function=Constant(
                                      np.mean(gapped_rates)),
                                  kernel=k,
                                  noise_variance=np.float64(0.001))

            if fix_noise:
                fixed_noise = np.float64(0.001)  # was 0.05 previously
                set_trainable(
                    m.likelihood.variance, False
                )  # We don't want to optimise the noise level in this case.
                m.likelihood.variance = fixed_noise

            opt = gpflow.optimizers.Scipy()

            # If Cholesky decomposition error, then skip

            try:
Example #15
    kernel_dict = {
        kernel_list[0]: 'RBF_Kernel',
        kernel_list[1]: 'Matern_12_Kernel',
        kernel_list[2]: 'Matern_32_Kernel',
        kernel_list[3]: 'Matern_52_Kernel',
        kernel_list[4]: 'Rational_Quadratic_Kernel'
    }

    for k in kernel_list:

        name = kernel_dict[k]

        # GP uses a constant mean function, where the constant is set to be the empirical average of the standardised
        # counts

        m = gpflow.models.GPR(data=(time, uv_band_flux),
                              mean_function=Constant(np.mean(uv_band_flux)),
                              kernel=k,
                              noise_variance=1)

        if fix_noise:

            # Fix a noise level to be the average experimental error observed in the dataset (0.037) for magnitudes
            # Noise level is 2.0364e-15 for the flux values.
            # Standardisation destroys this information so setting noise to be mean of standardised values divided by
            # the SNR in the original space.

            fixed_noise = np.mean(np.abs(uv_band_flux / snr))
            set_trainable(
                m.likelihood.variance, False
            )  # We don't want to optimise the noise level in this case.
            m.likelihood.variance = fixed_noise
Example #16
from gpflow.config import default_int
from gpflow.inducing_variables import InducingPoints
from gpflow.mean_functions import Additive, Constant, Linear, Product, SwitchedMeanFunction, Zero

rng = np.random.RandomState(99021)


class Datum:
    input_dim, output_dim = 3, 2
    N, Ntest, M = 20, 30, 10


_mean_functions = [
    Zero(),
    Linear(A=rng.randn(Datum.input_dim, Datum.output_dim), b=rng.randn(Datum.output_dim, 1).reshape(-1)),
    Constant(c=rng.randn(Datum.output_dim, 1).reshape(-1))
]
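
The Additive and Product imports above compose two mean functions element-wise (sum and product respectively); a brief hedged sketch of the resulting output shape, reusing the rng and Datum defined above:

_X_demo = rng.randn(Datum.N, Datum.input_dim)
_additive_demo = Additive(
    Constant(c=rng.randn(Datum.output_dim)),
    Linear(A=rng.randn(Datum.input_dim, Datum.output_dim), b=rng.randn(Datum.output_dim)))
assert _additive_demo(_X_demo).shape == (Datum.N, Datum.output_dim)
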


@pytest.mark.parametrize('mean_function_1', _mean_functions)
@pytest.mark.parametrize('mean_function_2', _mean_functions)
@pytest.mark.parametrize('operation', ['+', 'x'])
def test_mean_functions_output_shape(mean_function_1, mean_function_2, operation):
    """
    Test the output shape for basic and compositional mean functions, also
    check that the combination of mean functions returns the correct class
    """
    X = np.random.randn(Datum.N, Datum.input_dim)
    Y = mean_function_1(X)
    # basic output shape check
    assert Y.shape in [(Datum.N, Datum.output_dim), (Datum.N, 1)]
Example #17
    def objective_closure():
        return -m.log_marginal_likelihood()

    #  We standardise the outputs but leave the inputs unchanged. Equivalent to transform data used in other scripts.

    y_train = y_train.reshape(-1, 1)
    y_scaler = StandardScaler()
    y_train = y_scaler.fit_transform(y_train)

    X_train = X_train.astype(np.float64)
    X_test = X_test.astype(np.float64)

    # Fit GP

    k = Tanimoto()
    m = gpflow.models.GPR(data=(X_train, y_train),
                          mean_function=Constant(np.mean(y_train)),
                          kernel=k,
                          noise_variance=1)

    # Optimise the kernel variance and noise level by the marginal likelihood

    opt = gpflow.optimizers.Scipy()
    opt.minimize(objective_closure,
                 m.trainable_variables,
                 options=dict(maxiter=100))
    print_summary(m)

    # mean and variance GP prediction

    y_pred, y_var = m.predict_f(X_test)
    y_pred = y_scaler.inverse_transform(y_pred)
Example #18
    kernel_dict = {
        kernel_list[0]: 'RBF_Kernel',
        kernel_list[1]: 'Matern_12_Kernel',
        kernel_list[2]: 'Matern_32_Kernel',
        kernel_list[3]: 'Matern_52_Kernel',
        kernel_list[4]: 'Rational_Quadratic_Kernel'
    }

    for k in kernel_list:

        name = kernel_dict[k]

        # GP uses a constant mean function, where the constant is set to be the empirical average of the standardised
        # counts

        m = gpflow.models.GPR(data=(time, counts),
                              mean_function=Constant(np.mean(counts)),
                              kernel=k,
                              noise_variance=1)
        if fix_noise:

            # Fix a noise level to be a jitter of 1e-4 because the log transform means we lose access to the
            # empirical values.
            # The SNR is ca. 16 in the original data so it's possible to impose this in the standardised data as well.

            fixed_noise = np.mean(
                np.abs(counts / snr)
            )  # 0.05, current val, fixed_noise = np.float64(0.0001) previously
            set_trainable(
                m.likelihood.variance, False
            )  # We don't want to optimise the noise level in this case.
            m.likelihood.variance = fixed_noise
Example #19
def main(path, representation):
    """
    :param path: str specifying path to dataset.
    :param representation: str specifying the molecular representation. One of ['fingerprints', 'fragments', 'fragprints']
    """

    task = 'e_iso_pi'  # Always e_iso_pi for human performance comparison
    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()
    X = featurise_mols(smiles_list, representation)

    # 5 test molecules

    test_smiles = [
        'BrC1=CC=C(/N=N/C2=CC=CC=C2)C=C1',
        'O=[N+]([O-])C1=CC=C(/N=N/C2=CC=CC=C2)C=C1',
        'CC(C=C1)=CC=C1/N=N/C2=CC=C(N(C)C)C=C2',
        'BrC1=CC([N+]([O-])=O)=CC([N+]([O-])=O)=C1/N=N/C2=CC([H])=C(C=C2[H])N(CC)CC',
        'ClC%11=CC([N+]([O-])=O)=CC(C#N)=C%11/N=N/C%12=CC([H])=C(C=C%12OC)N(CC)CC'
    ]

    # and their indices in the loaded data
    test_smiles_indices = [116, 131, 168, 221, 229]

    X_train = np.delete(X, np.array(test_smiles_indices), axis=0)
    y_train = np.delete(y, np.array(test_smiles_indices))
    X_test = X[[116, 131, 168, 221, 229]]

    # experimental wavelength values in EtOH. Main csv file has 400nm instead of 407nm because measurement was
    # under a different solvent
    y_test = y[[116, 131, 168, 221, 229]]
    y_test[2] = 407.

    y_train = y_train.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    #  We standardise the outputs but leave the inputs unchanged

    _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test,
                                                     y_test)

    X_train = X_train.astype(np.float64)
    X_test = X_test.astype(np.float64)

    num_features = np.shape(X)[1]

    # We define the Gaussian Process Regression Model using the Tanimoto kernel

    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    # for plotting confidence-error curves

    rmse_confidence_list = []
    mae_confidence_list = []

    k = Tanimoto()
    m = gpflow.models.GPR(data=(X_train, y_train),
                          mean_function=Constant(np.mean(y_train)),
                          kernel=k,
                          noise_variance=1)

    # Optimise the kernel variance and noise level by the marginal likelihood

    opt = gpflow.optimizers.Scipy()
    opt.minimize(objective_closure,
                 m.trainable_variables,
                 options=dict(maxiter=100))
    print_summary(m)

    # mean and variance GP prediction

    y_pred, y_var = m.predict_f(X_test)
    y_pred = y_scaler.inverse_transform(y_pred)
    y_test = y_scaler.inverse_transform(y_test)

    # Output Standardised RMSE and RMSE on Train Set

    y_pred_train, _ = m.predict_f(X_train)
    train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
    train_rmse = np.sqrt(
        mean_squared_error(y_scaler.inverse_transform(y_train),
                           y_scaler.inverse_transform(y_pred_train)))
    print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
    print("Train RMSE: {:.3f}".format(train_rmse))

    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    per_molecule = abs(y_pred - y_test)

    print("\n Averaged test statistics are")
    print("\nR^2: {:.3f}".format(r2))
    print("RMSE: {:.3f}".format(rmse))
    print("MAE: {:.3f}".format(mae))
    print("\nAbsolute error per molecule is {} ".format(per_molecule))
Example #20
def main(path, task, n_trials, test_set_size, use_rmse_conf, kernel, N):
    """
    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['Photoswitch', 'ESOL', 'FreeSolv', 'Lipophilicity']
    :param n_trials: int specifying number of random train/test splits to use
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set
    :param use_rmse_conf: bool specifying whether to compute the rmse confidence-error curves or the mae confidence-
    error curves. True is the option for rmse.
    :param kernel: str specifying the kernel to be used. One of ['ShortestPath', ]
    :param N: int specifying how many molecules to keep after truncating the SMILES list (for faster computation)
    """

    start_time = time.time()
    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()

    # List truncation for faster computation
    smiles_list = smiles_list[0:N]
    y = y[0:N]

    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    r2_list = []
    rmse_list = []
    mae_list = []

    # We pre-allocate arrays for plotting confidence-error curves

    _, _, _, y_test = train_test_split(
        smiles_list, y, test_size=test_set_size)  # To get test set size

    # Photoswitch dataset requires 80/20 splitting. Other datasets are 80/10/10.

    if task != 'Photoswitch':
        split_in_two = int(len(y_test) / 2)
        n_test = split_in_two
    else:
        n_test = len(y_test)

    rmse_confidence_list = np.zeros((n_trials, n_test))
    mae_confidence_list = np.zeros((n_trials, n_test))

    print('\nBeginning training loop...')

    for i in range(0, n_trials):

        X_train, X_test, y_train, y_test = train_test_split(
            smiles_list, y, test_size=test_set_size, random_state=i)

        if task != 'Photoswitch':

            # Artificially create a 80/10/10 train/validation/test split discarding the validation set.
            split_in_two = int(len(y_test) / 2)
            X_test = X_test[0:split_in_two]
            y_test = y_test[0:split_in_two]

            y_train = y_train.reshape(-1, 1)
            y_test = y_test.reshape(-1, 1)

            X_train = np.asarray(X_train)
            X_test = np.asarray(X_test)

            print('kernel is ', kernel)

            if kernel == 'PUTH':
                k = GP.kernels.PUTH()

            elif kernel == 'CW':
                k = GP.kernels.CWgeo()

            elif kernel == 'MK':
                k = GP.kernels.MK()

            elif kernel == 'SP':
                k = GP.kernels.SP()

            elif kernel == 'SSP':
                k = GP.kernels.SSP()

            elif kernel == 'T':
                k = GP.kernels.T()

            elif kernel == 'WL':
                k = GP.kernels.WL()

            m = gpflow.models.GPR(data=(X_train, y_train),
                                  mean_function=Constant(np.mean(y_train)),
                                  kernel=k,
                                  noise_variance=1)

            # Optimise the kernel variance and noise level by the marginal likelihood

            optimizer = tf.optimizers.Adam(learning_rate=0.1)
            print_summary(m)

            with tf.GradientTape(watch_accessed_variables=False) as tape:
                tape.watch(m.trainable_variables)
                ll = m.maximum_log_likelihood_objective()
                objective = -ll
                gradients = tape.gradient(objective, m.trainable_variables)
            optimizer.apply_gradients(zip(gradients, m.trainable_variables))
            print_summary(m)

            # mean and variance GP prediction

            y_pred, y_var = m.predict_f(X_test)

            y_pred = y_pred.numpy()

            # Compute scores for confidence curve plotting.

            ranked_confidence_list = np.argsort(y_var, axis=0).flatten()

            for k in range(len(y_test)):
                # Construct the RMSE error for each level of confidence

                conf = ranked_confidence_list[0:k + 1]
                rmse = np.sqrt(mean_squared_error(y_test[conf], y_pred[conf]))
                rmse_confidence_list[i, k] = rmse

                # Construct the MAE error for each level of confidence

                mae = mean_absolute_error(y_test[conf], y_pred[conf])
                mae_confidence_list[i, k] = mae

            # Output Standardised RMSE and RMSE on Train Set

            y_pred_train, _ = m.predict_f(X_train)
            train_rmse_stan = np.sqrt(mean_squared_error(
                y_train, y_pred_train))
            train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
            print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
            print("Train RMSE: {:.3f}".format(train_rmse))

            score = r2_score(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            mae = mean_absolute_error(y_test, y_pred)

            print("\nR^2: {:.3f}".format(score))
            print("RMSE: {:.3f}".format(rmse))
            print("MAE: {:.3f}".format(mae))

            r2_list.append(score)
            rmse_list.append(rmse)
            mae_list.append(mae)

    r2_list = np.array(r2_list)
    rmse_list = np.array(rmse_list)
    mae_list = np.array(mae_list)

    print("\nDataset: {}".format(task))
    print("\nKernel: {}".format(kernel))
    print("\nmean R^2: {:.4f} +- {:.4f}".format(
        np.mean(r2_list),
        np.std(r2_list) / np.sqrt(len(r2_list))))
    print("mean RMSE: {:.4f} +- {:.4f}".format(
        np.mean(rmse_list),
        np.std(rmse_list) / np.sqrt(len(rmse_list))))
    print("mean MAE: {:.4f} +- {:.4f}\n".format(
        np.mean(mae_list),
        np.std(mae_list) / np.sqrt(len(mae_list))))

    #### modify to include kernel and dataset info in outputs, use {} command

    outF = open("results.txt", "a")
    outF.write("\n")
    outF.write("\nDataset: {}".format(task))
    outF.write("\nKernel: {}".format(kernel))
    outF.write("\nTime taken: {}".format(time.time() - start_time))
    outF.write("\nmean R^2: {:.4f} +- {:.4f}".format(
        np.mean(r2_list),
        np.std(r2_list) / np.sqrt(len(r2_list))))
    outF.write("\nmean RMSE: {:.4f} +- {:.4f}".format(
        np.mean(rmse_list),
        np.std(rmse_list) / np.sqrt(len(rmse_list))))
    outF.write("\nmean MAE: {:.4f} +- {:.4f}\n".format(
        np.mean(mae_list),
        np.std(mae_list) / np.sqrt(len(mae_list))))
    outF.close()

    # Plot confidence-error curves

    confidence_percentiles = np.arange(
        1e-14, 100, 100 / len(y_test)
    )  # 1e-14 instead of 0 to stop weirdness with len(y_test) = 29
Example #21
    # Base kernel
    k = Tanimoto(active_dims=tanimoto_active_dims)
    # set_trainable(k.variance, False)

    # Coregion kernel
    coreg = gpflow.kernels.Coregion(output_dim=output_dim, rank=rank, active_dims=[feature_dim])

    # Create product kernel
    kern = k * coreg

    # This likelihood switches between Gaussian noise with different variances for each f_i:
    lik = gpflow.likelihoods.SwitchedLikelihood([gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian(),
                                                 gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian()])

    # now build the GP model as normal
    m = gpflow.models.VGP((X_augmented, Y_augmented), mean_function=Constant(np.mean(y_train[:, 0])), kernel=kern,
                          likelihood=lik)

    # fit the covariance function parameters
    maxiter = ci_niter(1000)
    gpflow.optimizers.Scipy().minimize(m.training_loss, m.trainable_variables, options=dict(maxiter=maxiter),
                                       method="L-BFGS-B", )
    print_summary(m)

    # mean and variance GP prediction

    y_pred, y_var = m.predict_f(X_test)

    # Output Standardised RMSE and RMSE on Train Set

    y_pred_train, _ = m.predict_f(X_train)
Example #22
def main(path, task, representation, use_pca, n_trials, test_set_size, use_rmse_conf):
    """
    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n']
    :param representation: str specifying the molecular representation. One of ['fingerprints', 'fragments', 'fragprints']
    :param use_pca: bool. If True apply PCA to perform Principal Components Regression.
    :param n_trials: int specifying number of random train/test splits to use
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set
    :param use_rmse_conf: bool specifying whether to compute the rmse confidence-error curves or the mae confidence-
    error curves. True is the option for rmse.
    """

    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()
    X = featurise_mols(smiles_list, representation)

    # If True we perform Principal Components Regression

    if use_pca:
        n_components = 100
    else:
        n_components = None

    # We define the Gaussian Process Regression Model using the Tanimoto kernel

    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    r2_list = []
    rmse_list = []
    mae_list = []

    # We pre-allocate arrays for plotting confidence-error curves

    _, _, _, y_test = train_test_split(X, y, test_size=test_set_size)  # To get test set size
    n_test = len(y_test)

    rmse_confidence_list = np.zeros((n_trials, n_test))
    mae_confidence_list = np.zeros((n_trials, n_test))

    print('\nBeginning training loop...')

    for i in range(0, n_trials):

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size, random_state=i)

        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)

        #  We standardise the outputs but leave the inputs unchanged

        _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test, n_components=n_components, use_pca=use_pca)

        X_train = X_train.astype(np.float64)
        X_test = X_test.astype(np.float64)

        k = Tanimoto()
        m = gpflow.models.GPR(data=(X_train, y_train), mean_function=Constant(np.mean(y_train)), kernel=k, noise_variance=1)

        # e_iso_pi best params:
        # {'learner': RandomForestRegressor(max_features=0.9348473830061558, n_estimators=381,
        #                       n_jobs=1, random_state=2, verbose=False)}
        # e_iso_n best params:
        # {'learner': RandomForestRegressor(bootstrap=False, max_features=0.09944870853556087,
        #                                   min_samples_leaf=3, n_estimators=1295, n_jobs=1,
        #                                   random_state=0, verbose=False)}
        # z_iso_pi best params:
        # {'learner': RandomForestRegressor(max_depth=4, max_features=0.33072121415416944,
        #                                   n_estimators=2755, n_jobs=1, random_state=2,
        #                                   verbose=False)}
        # z_iso_n best params:
        # {'learner': RandomForestRegressor(max_features=None, n_estimators=892, n_jobs=1,
        #                                   random_state=3, verbose=False)}

        regr_rf = RandomForestRegressor(max_features=None, n_estimators=892, n_jobs=1,
                                           random_state=3, verbose=False)
        regr_rf.fit(X_train, y_train)

        # Optimise the kernel variance and noise level by the marginal likelihood

        opt = gpflow.optimizers.Scipy()
        opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=100))
        print_summary(m)

        # mean and variance GP prediction and RF prediction

        y_pred, y_var = m.predict_f(X_test)
        y_pred_rf = regr_rf.predict(X_test)
        y_pred_av = (y_pred + y_pred_rf.reshape(-1, 1)) / 2.0
        y_pred = y_scaler.inverse_transform(y_pred_av)
        y_test = y_scaler.inverse_transform(y_test)

        # Output Standardised RMSE and RMSE on Train Set

        y_pred_train, _ = m.predict_f(X_train)
        y_pred_train_rf = regr_rf.predict(X_train)
        y_pred_train = (y_pred_train + y_pred_train_rf.reshape(-1, 1)) / 2.0
        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
        train_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(y_train), y_scaler.inverse_transform(y_pred_train)))
        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
        print("Train RMSE: {:.3f}".format(train_rmse))

        score = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)

        print("\nR^2: {:.3f}".format(score))
        print("RMSE: {:.3f}".format(rmse))
        print("MAE: {:.3f}".format(mae))

        r2_list.append(score)
        rmse_list.append(rmse)
        mae_list.append(mae)

    r2_list = np.array(r2_list)
    rmse_list = np.array(rmse_list)
    mae_list = np.array(mae_list)

    print("\nmean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list)/np.sqrt(len(r2_list))))
    print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list)/np.sqrt(len(rmse_list))))
    print("mean MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list)/np.sqrt(len(mae_list))))
Example #23
def test_models_with_mean_functions_changes(model_class):
    """
    Simply check that all models give a different predictive mean with a
    constant mean function than with a zero mean function, while the
    predictive variance stays unchanged.

    For compositions of mean functions, check that multiplication/addition of
    a constant changes the prediction, whereas addition of zero/
    multiplication with one does not.
    """
    data = rng.randn(Datum.N, Datum.input_dim), rng.randn(Datum.N, 1)
    predict_at = rng.randn(Datum.Ntest, Datum.input_dim)
    inducing_variable = InducingPoints(Z=rng.randn(Datum.M, Datum.input_dim))
    kernel = gpflow.kernels.Matern32()
    likelihood = gpflow.likelihoods.Gaussian()
    zero_mean = Zero()
    non_zero_mean = Constant(c=np.ones(1) * 10)

    if model_class in [gpflow.models.GPR]:
        model_zero_mean = model_class(data, kernel=kernel, mean_function=zero_mean)
        model_non_zero_mean = model_class(data, kernel=kernel, mean_function=non_zero_mean)
    elif model_class in [gpflow.models.VGP]:
        model_zero_mean = model_class(data, likelihood=likelihood, kernel=kernel, mean_function=zero_mean)
        model_non_zero_mean = model_class(data, likelihood=likelihood, kernel=kernel, mean_function=non_zero_mean)
    elif model_class in [gpflow.models.SVGP]:
        model_zero_mean = model_class(kernel=kernel,
                                      likelihood=likelihood,
                                      inducing_variable=inducing_variable,
                                      mean_function=zero_mean)
        model_non_zero_mean = model_class(kernel=kernel,
                                          likelihood=likelihood,
                                          inducing_variable=inducing_variable,
                                          mean_function=non_zero_mean)
    elif model_class in [gpflow.models.SGPR, gpflow.models.GPRFITC]:
        model_zero_mean = model_class(data,
                                      kernel=kernel,
                                      inducing_variable=inducing_variable,
                                      mean_function=zero_mean)
        model_non_zero_mean = model_class(data,
                                          kernel=kernel,
                                          inducing_variable=inducing_variable,
                                          mean_function=non_zero_mean)
    elif model_class in [gpflow.models.SGPMC]:
        model_zero_mean = model_class(data,
                                      kernel=kernel,
                                      likelihood=likelihood,
                                      inducing_variable=inducing_variable,
                                      mean_function=zero_mean)
        model_non_zero_mean = model_class(data,
                                          kernel=kernel,
                                          likelihood=likelihood,
                                          inducing_variable=inducing_variable,
                                          mean_function=non_zero_mean)
    elif model_class in [gpflow.models.GPMC]:
        model_zero_mean = model_class(data,
                                      kernel=kernel,
                                      likelihood=likelihood,
                                      mean_function=zero_mean)
        model_non_zero_mean = model_class(data,
                                          kernel=kernel,
                                          likelihood=likelihood,
                                          mean_function=non_zero_mean)
    else:
        raise NotImplementedError

    mu_zero, var_zero = model_zero_mean.predict_f(predict_at)
    mu_non_zero, var_non_zero = model_non_zero_mean.predict_f(predict_at)
    # predictive variance remains unchanged after modifying mean function
    assert np.all(var_zero.numpy() == var_non_zero.numpy())
    # predictive mean changes after modifying mean function
    assert not np.all(mu_zero.numpy() == mu_non_zero.numpy())
def main(path, representation):
    """
    :param path: str specifying path to dataset.
    :param representation: str specifying the molecular representation. One of ['fingerprints', 'fragments', 'fragprints']
    """

    task = 'e_iso_pi'  # the task is always e_iso_pi for the human performance comparison
    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()
    X = featurise_mols(smiles_list, representation)

    # 5 test molecules

    test_smiles = [
        'BrC1=CC=C(/N=N/C2=CC=CC=C2)C=C1',
        'O=[N+]([O-])C1=CC=C(/N=N/C2=CC=CC=C2)C=C1',
        'CC(C=C1)=CC=C1/N=N/C2=CC=C(N(C)C)C=C2',
        'BrC1=CC([N+]([O-])=O)=CC([N+]([O-])=O)=C1/N=N/C2=CC([H])=C(C=C2[H])N(CC)CC',
        'ClC%11=CC([N+]([O-])=O)=CC(C#N)=C%11/N=N/C%12=CC([H])=C(C=C%12OC)N(CC)CC'
    ]

    # and their indices in the loaded data
    test_smiles_indices = [116, 131, 168, 221, 229]

    X_train = np.delete(X, np.array(test_smiles_indices), axis=0)
    y_train = np.delete(y, np.array(test_smiles_indices))
    X_test = X[test_smiles_indices]

    # Experimental wavelength values in EtOH. The main csv file lists 400 nm instead of 407 nm for the
    # third test molecule because that measurement was taken in a different solvent.
    y_test = y[test_smiles_indices]
    y_test[2] = 407.

    y_train = y_train.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    # #  We standardise the outputs but leave the inputs unchanged
    #
    # _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test)

    X_train = X_train.astype(np.float64)
    X_test = X_test.astype(np.float64)

    data_loader_z_iso_pi = TaskDataLoader('z_iso_pi', path)
    data_loader_e_iso_n = TaskDataLoader('e_iso_n', path)
    data_loader_z_iso_n = TaskDataLoader('z_iso_n', path)

    smiles_list_z_iso_pi, y_z_iso_pi = data_loader_z_iso_pi.load_property_data()
    smiles_list_e_iso_n, y_e_iso_n = data_loader_e_iso_n.load_property_data()
    smiles_list_z_iso_n, y_z_iso_n = data_loader_z_iso_n.load_property_data()

    y_z_iso_pi = y_z_iso_pi.reshape(-1, 1)
    y_e_iso_n = y_e_iso_n.reshape(-1, 1)
    y_z_iso_n = y_z_iso_n.reshape(-1, 1)

    X_z_iso_pi = featurise_mols(smiles_list_z_iso_pi, representation)
    X_e_iso_n = featurise_mols(smiles_list_e_iso_n, representation)
    X_z_iso_n = featurise_mols(smiles_list_z_iso_n, representation)

    output_dim = 4  # Number of outputs
    rank = 1  # Rank of W
    feature_dim = len(X_train[0, :])

    tanimoto_active_dims = [i for i in range(feature_dim)]  # active dims for Tanimoto base kernel.

    # We define the Gaussian Process Regression Model using the Tanimoto kernel

    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension
    X_augmented = np.vstack((np.append(X_train,
                                       np.zeros((len(X_train), 1)),
                                       axis=1),
                             np.append(X_z_iso_pi,
                                       np.ones((len(X_z_iso_pi), 1)),
                                       axis=1),
                             np.append(X_e_iso_n,
                                       np.ones((len(X_e_iso_n), 1)) * 2,
                                       axis=1),
                             np.append(X_z_iso_n,
                                       np.ones((len(X_z_iso_n), 1)) * 3,
                                       axis=1)))
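    # X_augmented has shape (N_total, feature_dim + 1): the final column holds an integer output index
    # (0-3) which the Coregion kernel reads via active_dims=[feature_dim].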

    X_test = np.append(X_test, np.zeros((len(X_test), 1)), axis=1)
    X_train = np.append(X_train, np.zeros((len(X_train), 1)), axis=1)

    # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods
    Y_augmented = np.vstack(
        (np.hstack((y_train, np.zeros_like(y_train))),
         np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))),
         np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)),
         np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3))))
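    # Likewise, the final column of Y_augmented tells the SwitchedLikelihood which of its Gaussian
    # likelihoods (and hence which learned noise variance) applies to each observation.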

    y_test = np.hstack((y_test, np.zeros_like(y_test)))

    # Base kernel
    k = Tanimoto(active_dims=tanimoto_active_dims)
    # set_trainable(k.variance, False)

    # Coregion kernel
    coreg = gpflow.kernels.Coregion(output_dim=output_dim,
                                    rank=rank,
                                    active_dims=[feature_dim])

    # Create product kernel
    kern = k * coreg
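    # The covariance between augmented points (x, p) and (x', q) is then k_Tanimoto(x, x') * B[p, q],
    # where B = W W^T + diag(kappa) is the output_dim x output_dim coregionalisation matrix.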

    # This likelihood switches between Gaussian noise with different variances for each f_i:
    lik = gpflow.likelihoods.SwitchedLikelihood([
        gpflow.likelihoods.Gaussian(),
        gpflow.likelihoods.Gaussian(),
        gpflow.likelihoods.Gaussian(),
        gpflow.likelihoods.Gaussian()
    ])

    # now build the GP model as normal
    m = gpflow.models.VGP((X_augmented, Y_augmented),
                          mean_function=Constant(np.mean(y_train[:, 0])),
                          kernel=kern,
                          likelihood=lik)

    # fit the covariance function parameters
    maxiter = ci_niter(1000)
    gpflow.optimizers.Scipy().minimize(
        m.training_loss,
        m.trainable_variables,
        options=dict(maxiter=maxiter),
        method="L-BFGS-B",
    )
    print_summary(m)

    # mean and variance GP prediction

    y_pred, y_var = m.predict_f(X_test)

    # Output Standardised RMSE and RMSE on Train Set

    y_pred_train, _ = m.predict_f(X_train)
    # The outputs were not standardised above (the transform_data call is commented out), so the
    # "standardised" and raw train RMSE coincide here.
    train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
    print("Train RMSE: {:.3f}".format(train_rmse))

    r2 = r2_score(y_test[:, 0], y_pred)
    rmse = np.sqrt(mean_squared_error(y_test[:, 0], y_pred))
    mae = mean_absolute_error(y_test[:, 0], y_pred)
    per_molecule = np.abs(np.asarray(y_pred).flatten() - y_test[:, 0])

    print("\n Averaged test statistics are")
    print("\nR^2: {:.3f}".format(r2))
    print("RMSE: {:.3f}".format(rmse))
    print("MAE: {:.3f}".format(mae))
    print("\nAbsolute error per molecule is {} ".format(per_molecule))
Example #25
def main(path, task, representation, use_pca, n_trials, test_set_size,
         use_rmse_conf):
    """
    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n']
    :param representation: str specifying the molecular representation. One of ['fingerprints', 'fragments', 'fragprints']
    :param use_pca: bool. If True apply PCA to perform Principal Components Regression.
    :param n_trials: int specifying number of random train/test splits to use
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set
    :param use_rmse_conf: bool specifying whether to compute the RMSE confidence-error curves or the MAE
    confidence-error curves. True selects RMSE.
    """

    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()
    X = featurise_mols(smiles_list, representation)

    # If True we perform Principal Components Regression

    if use_pca:
        n_components = 100
    else:
        n_components = None

    # We define the Gaussian Process Regression Model using the Tanimoto kernel

    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    r2_list = []
    rmse_list = []
    mae_list = []

    # We pre-allocate arrays for plotting confidence-error curves

    _, _, _, y_test = train_test_split(
        X, y, test_size=test_set_size)  # To get test set size
    n_test = len(y_test)

    rmse_confidence_list = np.zeros((n_trials, n_test))
    mae_confidence_list = np.zeros((n_trials, n_test))

    print('\nBeginning training loop...')

    for i in range(0, n_trials):

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_set_size, random_state=i)

        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)

        #  We standardise the outputs but leave the inputs unchanged

        _, y_train, _, y_test, y_scaler = transform_data(
            X_train,
            y_train,
            X_test,
            y_test,
            n_components=n_components,
            use_pca=use_pca)

        X_train = X_train.astype(np.float64)
        X_test = X_test.astype(np.float64)

        k = Tanimoto()
        m = gpflow.models.GPR(data=(X_train, y_train),
                              mean_function=Constant(np.mean(y_train)),
                              kernel=k,
                              noise_variance=1)
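        # The constant mean is initialised at the mean of the standardised training targets and
        # noise_variance=1 sets the initial Gaussian likelihood variance; both are refined by the optimiser below.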

        # Optimise the kernel variance and noise level by the marginal likelihood

        opt = gpflow.optimizers.Scipy()
        opt.minimize(objective_closure,
                     m.trainable_variables,
                     options=dict(maxiter=10000))
        print_summary(m)

        # mean and variance GP prediction

        y_pred, y_var = m.predict_f(X_test)
        y_pred = y_scaler.inverse_transform(y_pred)
        y_test = y_scaler.inverse_transform(y_test)

        # Compute scores for confidence curve plotting.

        ranked_confidence_list = np.argsort(y_var, axis=0).flatten()
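        # argsort is ascending in predictive variance, so the earliest indices correspond to the most
        # confident predictions; the loop below accumulates errors over increasingly large "most confident" prefixes.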

        for j in range(len(y_test)):

            # Construct the RMSE error for each level of confidence

            conf = ranked_confidence_list[0:j + 1]
            rmse = np.sqrt(mean_squared_error(y_test[conf], y_pred[conf]))
            rmse_confidence_list[i, j] = rmse

            # Construct the MAE error for each level of confidence

            mae = mean_absolute_error(y_test[conf], y_pred[conf])
            mae_confidence_list[i, j] = mae

        # Output Standardised RMSE and RMSE on Train Set

        y_pred_train, _ = m.predict_f(X_train)
        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
        train_rmse = np.sqrt(
            mean_squared_error(y_scaler.inverse_transform(y_train),
                               y_scaler.inverse_transform(y_pred_train)))
        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
        print("Train RMSE: {:.3f}".format(train_rmse))

        score = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)

        print("\nR^2: {:.3f}".format(score))
        print("RMSE: {:.3f}".format(rmse))
        print("MAE: {:.3f}".format(mae))

        r2_list.append(score)
        rmse_list.append(rmse)
        mae_list.append(mae)

    r2_list = np.array(r2_list)
    rmse_list = np.array(rmse_list)
    mae_list = np.array(mae_list)

    print("\nmean R^2: {:.4f} +- {:.4f}".format(
        np.mean(r2_list),
        np.std(r2_list) / np.sqrt(len(r2_list))))
    print("mean RMSE: {:.4f} +- {:.4f}".format(
        np.mean(rmse_list),
        np.std(rmse_list) / np.sqrt(len(rmse_list))))
    print("mean MAE: {:.4f} +- {:.4f}\n".format(
        np.mean(mae_list),
        np.std(mae_list) / np.sqrt(len(mae_list))))

    # Plot confidence-error curves

    confidence_percentiles = np.arange(
        1e-14, 100, 100 / len(y_test)
    )  # 1e-14 instead of 0 to stop weirdness with len(y_test) = 29

    if use_rmse_conf:

        rmse_mean = np.mean(rmse_confidence_list, axis=0)
        rmse_std = np.std(rmse_confidence_list, axis=0)

        # We flip because we want the most confident predictions on the right-hand side of the plot

        rmse_mean = np.flip(rmse_mean)
        rmse_std = np.flip(rmse_std)

        # One-sigma error bars

        lower = rmse_mean - rmse_std
        upper = rmse_mean + rmse_std

        plt.plot(confidence_percentiles, rmse_mean, label='mean')
        plt.fill_between(confidence_percentiles, lower, upper, alpha=0.2)
        plt.xlabel('Confidence Percentile')
        plt.ylabel('RMSE (nm)')
        plt.ylim([0, np.max(upper) + 1])
        plt.xlim([0, 100 * ((len(y_test) - 1) / len(y_test))])
        plt.yticks(np.arange(0, np.max(upper) + 1, 5.0))
        plt.savefig(
            task +
            '/results/gpr/{}_confidence_curve_rmse.png'.format(representation))
        plt.show()

    else:

        # We plot the Mean-absolute error confidence-error curves

        mae_mean = np.mean(mae_confidence_list, axis=0)
        mae_std = np.std(mae_confidence_list, axis=0)

        mae_mean = np.flip(mae_mean)
        mae_std = np.flip(mae_std)

        lower = mae_mean - mae_std
        upper = mae_mean + mae_std

        plt.plot(confidence_percentiles, mae_mean, label='mean')
        plt.fill_between(confidence_percentiles, lower, upper, alpha=0.2)
        plt.xlabel('Confidence Percentile')
        plt.ylabel('MAE (nm)')
        plt.ylim([0, np.max(upper) + 1])
        plt.xlim([0, 100 * ((len(y_test) - 1) / len(y_test))])
        plt.yticks(np.arange(0, np.max(upper) + 1, 5.0))
        plt.savefig(
            task +
            '/results/gpr/{}_confidence_curve_mae.png'.format(representation))
        plt.show()
Example #26
def main(path, path_to_dft_dataset, representation, theory_level):
    """
    :param path: str specifying path to photoswitches.csv file.
    :param path_to_dft_dataset: str specifying path to dft_comparison.csv file.
    :param representation: str specifying the molecular representation. One of ['fingerprints', 'fragments', 'fragprints']
    :param theory_level: str specifying the level of theory to compare against. One of ['CAM-B3LYP', 'PBE0']
    """

    task = 'e_iso_pi'  # e_iso_pi is the only task supported for the TD-DFT comparison
    data_loader = TaskDataLoader(task, path)
    smiles_list, _, pbe0_vals, cam_vals, experimental_vals = data_loader.load_dft_comparison_data(path_to_dft_dataset)

    X = featurise_mols(smiles_list, representation)

    # Keep only non-duplicate entries because we're not considering effects of solvent

    non_duplicate_indices = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]])
    X = X[non_duplicate_indices, :]
    experimental_vals = experimental_vals[non_duplicate_indices]
    pbe0_vals = pbe0_vals[non_duplicate_indices]
    cam_vals = cam_vals[non_duplicate_indices]

    # molecules with dft values to be split into train/test
    if theory_level == 'CAM-B3LYP':
        X_with_dft = np.delete(X, np.argwhere(np.isnan(cam_vals)), axis=0)
        y_with_dft = np.delete(experimental_vals, np.argwhere(np.isnan(cam_vals)))
        # DFT values for the CAM-B3LYP level of theory
        dft_vals = np.delete(cam_vals, np.argwhere(np.isnan(cam_vals)))
        # molecules with no dft vals must go into the training set.
        X_no_dft = np.delete(X, np.argwhere(~np.isnan(cam_vals)), axis=0)
        y_no_dft = np.delete(experimental_vals, np.argwhere(~np.isnan(cam_vals)))
    else:
        X_with_dft = np.delete(X, np.argwhere(np.isnan(pbe0_vals)), axis=0)
        y_with_dft = np.delete(experimental_vals, np.argwhere(np.isnan(pbe0_vals)))
        # DFT values for the PBE0 level of theory
        dft_vals = np.delete(pbe0_vals, np.argwhere(np.isnan(pbe0_vals)))
        # molecules with no dft vals must go into the training set.
        X_no_dft = np.delete(X, np.argwhere(~np.isnan(pbe0_vals)), axis=0)
        y_no_dft = np.delete(experimental_vals, np.argwhere(~np.isnan(pbe0_vals)))

    # Load in the other property values for multitask learning. e_iso_pi is always the task in this instance.

    data_loader_z_iso_pi = TaskDataLoader('z_iso_pi', path)
    data_loader_e_iso_n = TaskDataLoader('e_iso_n', path)
    data_loader_z_iso_n = TaskDataLoader('z_iso_n', path)

    smiles_list_z_iso_pi, y_z_iso_pi = data_loader_z_iso_pi.load_property_data()
    smiles_list_e_iso_n, y_e_iso_n = data_loader_e_iso_n.load_property_data()
    smiles_list_z_iso_n, y_z_iso_n = data_loader_z_iso_n.load_property_data()

    y_z_iso_pi = y_z_iso_pi.reshape(-1, 1)
    y_e_iso_n = y_e_iso_n.reshape(-1, 1)
    y_z_iso_n = y_z_iso_n.reshape(-1, 1)

    X_z_iso_pi = featurise_mols(smiles_list_z_iso_pi, representation)
    X_e_iso_n = featurise_mols(smiles_list_e_iso_n, representation)
    X_z_iso_n = featurise_mols(smiles_list_z_iso_n, representation)

    output_dim = 4  # Number of outputs
    rank = 1  # Rank of W
    feature_dim = len(X_no_dft[0, :])

    tanimoto_active_dims = [i for i in range(feature_dim)]  # active dims for Tanimoto base kernel.

    mae_list = []
    dft_mae_list = []

    # We define the Gaussian Process optimisation objective

    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    print('\nBeginning training loop...')

    for i in range(len(y_with_dft)):

        X_train = np.delete(X_with_dft, i, axis=0)
        y_train = np.delete(y_with_dft, i)
        X_test = X_with_dft[i].reshape(1, -1)
        y_test = y_with_dft[i]
        dft_test = dft_vals[i]
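        # Leave-one-out split: molecule i (which has a DFT value at this level of theory) is the single
        # held-out test point; all remaining molecules, with or without DFT values, go into the training set.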

        X_train = np.concatenate((X_train, X_no_dft))
        y_train = np.concatenate((y_train, y_no_dft))
        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)

        X_train = X_train.astype(np.float64)
        X_test = X_test.astype(np.float64)

        # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension
        X_augmented = np.vstack((np.append(X_train, np.zeros((len(X_train), 1)), axis=1),
                                 np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1),
                                 np.append(X_e_iso_n, np.ones((len(X_e_iso_n), 1)) * 2, axis=1),
                                 np.append(X_z_iso_n, np.ones((len(X_z_iso_n), 1)) * 3, axis=1)))

        X_test = np.append(X_test, np.zeros((len(X_test), 1)), axis=1)
        X_train = np.append(X_train, np.zeros((len(X_train), 1)), axis=1)

        # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods
        Y_augmented = np.vstack((np.hstack((y_train, np.zeros_like(y_train))),
                                 np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))),
                                 np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)),
                                 np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3))))

        y_test = np.hstack((y_test, np.zeros_like(y_test)))

        # Base kernel
        k = Tanimoto(active_dims=tanimoto_active_dims)
        #set_trainable(k.variance, False)

        # Coregion kernel
        coreg = gpflow.kernels.Coregion(output_dim=output_dim, rank=rank, active_dims=[feature_dim])

        # Create product kernel
        kern = k * coreg

        # This likelihood switches between Gaussian noise with different variances for each f_i:
        lik = gpflow.likelihoods.SwitchedLikelihood([gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian(),
                                                     gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian()])

        # now build the GP model as normal
        m = gpflow.models.VGP((X_augmented, Y_augmented), mean_function=Constant(np.mean(y_train[:, 0])), kernel=kern, likelihood=lik)

        # fit the covariance function parameters
        maxiter = ci_niter(1000)
        gpflow.optimizers.Scipy().minimize(m.training_loss, m.trainable_variables, options=dict(maxiter=maxiter), method="L-BFGS-B",)
        print_summary(m)

        # Output Standardised RMSE and RMSE on Train Set

        y_pred_train, _ = m.predict_f(X_train)
        # The outputs are not standardised in this script, so the "standardised" and raw train RMSE coincide.
        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
        print("Train RMSE: {:.3f}".format(train_rmse))

        # mean and variance GP prediction

        y_pred, y_var = m.predict_f(X_test)

        # Output MAE for this trial

        mae = abs(y_test[:, 0] - y_pred)

        print("MAE: {}".format(mae))

        # Store values in order to compute the mean and standard error of the statistics across trials

        mae_list.append(mae)

        # DFT prediction scores on the same trial

        dft_mae = abs(y_test[:, 0] - dft_test)

        dft_mae_list.append(dft_mae)

    mae_list = np.array(mae_list)
    dft_mae_list = np.array(dft_mae_list)

    print("\nmean GP-Tanimoto MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list)/np.sqrt(len(mae_list))))
    print("mean {} MAE: {:.4f} +- {:.4f}\n".format(theory_level, np.mean(dft_mae_list), np.std(dft_mae_list)/np.sqrt(len(dft_mae_list))))
Example #27
def main(path, task, representation, use_pca, n_trials, test_set_size,
         use_rmse_conf, precompute_repr):
    """
    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['Photoswitch', 'ESOL', 'FreeSolv', 'Lipophilicity']
    :param representation: str specifying the molecular representation. One of ['SMILES', 'fingerprints', 'fragments', 'fragprints']
    :param use_pca: bool. If True apply PCA to perform Principal Components Regression.
    :param n_trials: int specifying number of random train/test splits to use
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set
    :param use_rmse_conf: bool specifying whether to compute the RMSE confidence-error curves or the MAE
    confidence-error curves. True selects RMSE.
    :param precompute_repr: bool indicating whether to precompute the molecular representations and save them to disk.
    """

    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()

    print('\nBeginning augmentation...')
    start_time = time.time()
    x, smiles_card, y = augmentation(np.array(smiles_list),
                                     y,
                                     15,
                                     canon=False,
                                     rotate=True)
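    # (Assumption: augmentation produces up to 15 non-canonical, rotated SMILES strings per molecule and
    # replicates the labels to match; smiles_card records how many augmented strings each molecule yielded.)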
    print('\nFinished augmentation after', time.time() - start_time)

    print('\nBeginning representation...')
    start_time = time.time()
    X = featurise_mols(x, representation)
    print('\nFinished representation after', time.time() - start_time)

    if precompute_repr:
        if representation == 'SMILES':
            with open(
                    f'precomputed_representations/{task}_{representation}.txt',
                    'w') as f:
                for smiles in X:
                    f.write(smiles + '\n')
        else:
            np.savetxt(
                f'precomputed_representations/{task}_{representation}.txt', X)

    # If True we perform Principal Components Regression

    if use_pca:
        n_components = 100
    else:
        n_components = None

    # We define the Gaussian Process Regression Model using the Tanimoto kernel

    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    r2_list = []
    rmse_list = []
    mae_list = []

    # We pre-allocate arrays for plotting confidence-error curves

    _, _, _, y_test = train_test_split(
        X, y, test_size=test_set_size)  # To get test set size

    # Photoswitch dataset requires 80/20 splitting. Other datasets are 80/10/10.

    if task != 'Photoswitch':
        split_in_two = int(len(y_test) / 2)
        n_test = split_in_two
    else:
        n_test = len(y_test)

    rmse_confidence_list = np.zeros((n_trials, n_test))
    mae_confidence_list = np.zeros((n_trials, n_test))

    print('\nBeginning training loop...')

    for i in range(0, n_trials):

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_set_size, random_state=i)

        if representation == 'SMILES':

            np.savetxt(f'fixed_train_test_splits/{task}/X_train_split_{i}.txt',
                       X_train,
                       fmt="%s")
            np.savetxt(f'fixed_train_test_splits/{task}/X_test_split_{i}.txt',
                       X_test,
                       fmt="%s")
            np.savetxt(f'fixed_train_test_splits/{task}/y_train_split_{i}.txt',
                       y_train)
            np.savetxt(f'fixed_train_test_splits/{task}/y_test_split_{i}.txt',
                       y_test)

        else:

            if task != 'Photoswitch':

                # Artificially create a 80/10/10 train/validation/test split discarding the validation set.
                split_in_two = int(len(y_test) / 2)
                X_test = X_test[0:split_in_two]
                y_test = y_test[0:split_in_two]

            y_train = y_train.reshape(-1, 1)
            y_test = y_test.reshape(-1, 1)

            #  We standardise the outputs but leave the inputs unchanged

            _, y_train, _, y_test, y_scaler = transform_data(
                X_train,
                y_train,
                X_test,
                y_test,
                n_components=n_components,
                use_pca=use_pca)

            X_train = X_train.astype(np.float64)
            X_test = X_test.astype(np.float64)

            k = Tanimoto()
            m = gpflow.models.GPR(data=(X_train, y_train),
                                  mean_function=Constant(np.mean(y_train)),
                                  kernel=k,
                                  noise_variance=1)

            # Optimise the kernel variance and noise level by the marginal likelihood

            opt = gpflow.optimizers.Scipy()
            opt.minimize(objective_closure,
                         m.trainable_variables,
                         options=dict(maxiter=100))
            print_summary(m)

            # mean and variance GP prediction

            y_pred, y_var = m.predict_f(X_test)
            y_pred = y_scaler.inverse_transform(y_pred)
            y_test = y_scaler.inverse_transform(y_test)

            # Compute scores for confidence curve plotting.

            ranked_confidence_list = np.argsort(y_var, axis=0).flatten()

            for j in range(len(y_test)):

                # Construct the RMSE error for each level of confidence

                conf = ranked_confidence_list[0:j + 1]
                rmse = np.sqrt(mean_squared_error(y_test[conf], y_pred[conf]))
                rmse_confidence_list[i, j] = rmse

                # Construct the MAE error for each level of confidence

                mae = mean_absolute_error(y_test[conf], y_pred[conf])
                mae_confidence_list[i, j] = mae

            # Output Standardised RMSE and RMSE on Train Set

            y_pred_train, _ = m.predict_f(X_train)
            train_rmse_stan = np.sqrt(mean_squared_error(
                y_train, y_pred_train))
            train_rmse = np.sqrt(
                mean_squared_error(y_scaler.inverse_transform(y_train),
                                   y_scaler.inverse_transform(y_pred_train)))
            print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
            print("Train RMSE: {:.3f}".format(train_rmse))

            score = r2_score(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            mae = mean_absolute_error(y_test, y_pred)

            print("\nR^2: {:.3f}".format(score))
            print("RMSE: {:.3f}".format(rmse))
            print("MAE: {:.3f}".format(mae))

            r2_list.append(score)
            rmse_list.append(rmse)
            mae_list.append(mae)

    if representation != 'SMILES':

        r2_list = np.array(r2_list)
        rmse_list = np.array(rmse_list)
        mae_list = np.array(mae_list)

        print("\nmean R^2: {:.4f} +- {:.4f}".format(
            np.mean(r2_list),
            np.std(r2_list) / np.sqrt(len(r2_list))))
        print("mean RMSE: {:.4f} +- {:.4f}".format(
            np.mean(rmse_list),
            np.std(rmse_list) / np.sqrt(len(rmse_list))))
        print("mean MAE: {:.4f} +- {:.4f}\n".format(
            np.mean(mae_list),
            np.std(mae_list) / np.sqrt(len(mae_list))))

        # Plot confidence-error curves

        confidence_percentiles = np.arange(
            1e-14, 100, 100 / len(y_test)
        )  # 1e-14 instead of 0 to stop weirdness with len(y_test) = 29

        if use_rmse_conf:

            rmse_mean = np.mean(rmse_confidence_list, axis=0)
            rmse_std = np.std(rmse_confidence_list, axis=0)

            # We flip because we want the most confident predictions on the right-hand side of the plot

            rmse_mean = np.flip(rmse_mean)
            rmse_std = np.flip(rmse_std)

            # One-sigma error bars

            lower = rmse_mean - rmse_std
            upper = rmse_mean + rmse_std

            plt.plot(confidence_percentiles, rmse_mean, label='mean')
            plt.fill_between(confidence_percentiles, lower, upper, alpha=0.2)
            plt.xlabel('Confidence Percentile')
            plt.ylabel('RMSE')
            plt.ylim([0, np.max(upper) + 1])
            plt.xlim([0, 100 * ((len(y_test) - 1) / len(y_test))])
            plt.yticks(np.arange(0, np.max(upper) + 1, 5.0))
            plt.savefig(task +
                        '/results/tanimoto/{}_confidence_curve_rmse.png'.
                        format(representation))
            plt.show()

        else:

            # We plot the Mean-absolute error confidence-error curves

            mae_mean = np.mean(mae_confidence_list, axis=0)
            mae_std = np.std(mae_confidence_list, axis=0)

            mae_mean = np.flip(mae_mean)
            mae_std = np.flip(mae_std)

            lower = mae_mean - mae_std
            upper = mae_mean + mae_std

            plt.plot(confidence_percentiles, mae_mean, label='mean')
            plt.fill_between(confidence_percentiles, lower, upper, alpha=0.2)
            plt.xlabel('Confidence Percentile')
            plt.ylabel('MAE')
            plt.ylim([0, np.max(upper) + 1])
            plt.xlim([0, 100 * ((len(y_test) - 1) / len(y_test))])
            plt.yticks(np.arange(0, np.max(upper) + 1, 5.0))
            plt.savefig(task +
                        '/results/tanimoto/{}_confidence_curve_mae.png'.format(
                            representation))
            plt.show()