Example #1
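The snippets below assume the following module-level imports; the helpers extract_data and maximize_marginal_likelihood, and the output_path variable used by the grid searches, are defined elsewhere in the same module:

import itertools
import os

import numpy as np
import gpflow as gp
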
def add_feature_by_feature(feature_order, outdir, lsm, nvm):
    """
    Initialises the model with the given hyperparameter multipliers and optimises the
    hyperparameters for an increasing number of features, added in the specified order.
    Args:
        feature_order: the order in which to add the features
        outdir: the directory in which the output directories are to be created
        lsm: the length scale multiplier
        nvm: the noise variance multiplier

    Returns: None; creates a new directory for each feature count, in which the result
             file and (optional) plots are saved
    """

    index_set = []

    # make sure the output directory exists, create it if not
    os.makedirs(outdir, exist_ok=True)

    for feature in feature_order:

        print(f'Proceeding with feature {feature}')

        # create an output directory for each feature number
        index_set.append(feature)
        step_directory = outdir + f'/number_features{len(index_set)}'

        try:
            os.mkdir(step_directory)
        except OSError:
            print(f'The run for {len(index_set)} features already exists.')
        else:
            train, test, col_order = extract_data(index_set)

            # setting initial parameters
            signal_variance = 1
            noise_variance = np.float64(nvm)
            length_scales = np.ones(len(index_set)) * lsm

            # define the kernel and the model
            rbf = gp.kernels.SquaredExponential(variance=signal_variance, lengthscales=length_scales)
            lin = gp.kernels.Linear()
            k = rbf + lin
            m = gp.models.GPR(data=(train['features'], train['affinity']), kernel=k, mean_function=None)
            m.likelihood.variance.assign(noise_variance)

            opt = gp.optimizers.Scipy()

            print(f'Running feature set length {len(index_set)}')

            maximize_marginal_likelihood(kernel=k, model=m, optimizer=opt, output_directory=step_directory,
                                         testing_data=test, feature_names=col_order,
                                         plot_ARD=True, plot_params=False)
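
A minimal usage sketch (the feature names and output directory below are hypothetical; extract_data must recognise the names):

feature_order = ['logP', 'molecular_weight', 'num_rotatable_bonds']  # hypothetical feature names

# grow the feature set one feature at a time, initialising each model with a
# length-scale multiplier of 0.2 and a noise-variance multiplier of 0.4
add_feature_by_feature(feature_order, outdir='results/feature_sweep', lsm=0.2, nvm=0.4)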
Example #2
def rbf_grid(noise_variance, signal_variance, length_scales, training_data,
             testing_data):
    """
    A function that performs a hyperparameter grid search for the RBF kernel,
    as implemented in GPflow 2.
    Args:
        noise_variance: the likelihood variance value around which the grid search is to be done
        signal_variance: the kernel variance value around which the grid search is to be done
        length_scales: the kernel lengthscale value around which the grid search is to be done
        training_data: the data on which to train the model
        testing_data: the data on which to test the model

    Returns: None; the results are stored in automatically created output directories.
             Note: relies on a module-level output_path variable for the output location.
    """

    # make sure the main output directory exists, create it if not
    os.makedirs(output_path, exist_ok=True)

    # make sure the kernel-specific output directory exists, create it if not
    output_directory = output_path + '/rbf'
    os.makedirs(output_directory, exist_ok=True)

    # specify multipliers for the different parameter values at which to initialise the model
    signal_variance_multiplier = [1]
    length_scale_multiplier = [0.01, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1, 1.5]
    noise_variance_multiplier = [0.5, 0.1, 0.05, 0.01]

    # iterate through the Cartesian product of the multipliers specified above
    for svm, lsm, nvm in itertools.product(signal_variance_multiplier,
                                           length_scale_multiplier,
                                           noise_variance_multiplier):

        # create an instance directory for the specified parameter pair
        # skip if it already exists (e.g. during a re-run with previously evaluated param pairs)
        instance_directory = output_directory + f'/svm_{svm}_lsm_{lsm}_nvm_{nvm}'
        try:
            os.mkdir(instance_directory)
        except OSError:
            print(
                f'Calculation already complete for svm {svm}, lsm {lsm}, nvm {nvm}.'
            )
        else:

            # specify the respective model, kernel and optimiser and run the optimisation
            k = gp.kernels.SquaredExponential(variance=(signal_variance * svm),
                                              lengthscales=(length_scales *
                                                            lsm))
            m = gp.models.GPR(data=(training_data['features_train'],
                                    training_data['affinity_train']),
                              kernel=k,
                              mean_function=None)
            m.likelihood.variance.assign(noise_variance * nvm)

            opt = gp.optimizers.Scipy()

            print(
                f'Starting {k.name} kernel with svm {svm}, lsm {lsm}, nvm {nvm}.'
            )

            maximize_marginal_likelihood(kernel=k,
                                         model=m,
                                         optimizer=opt,
                                         output_directory=instance_directory,
                                         testing_data=testing_data,
                                         plot_params=False,
                                         plot_ARD=True)
            # NOTE: make sure the GPR script reads model.kernel.lengthscales
            # (single kernel) rather than model.kernel.kernels[0].lengthscales (composite kernels)

            print(
                f'Finished {k.name} kernel with svm {svm}, lsm {lsm}, nvm {nvm}.'
            )
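
A usage sketch, assuming training_data and testing_data dictionaries with the keys used above ('features_train', 'affinity_train') and a module-level output_path; the centre values are hypothetical, and the grid then visits 1 * 9 * 4 = 36 multiplier combinations around them:

n_features = training_data['features_train'].shape[1]
rbf_grid(noise_variance=np.float64(0.4),
         signal_variance=1.0,
         length_scales=np.ones(n_features) * 0.2,
         training_data=training_data,
         testing_data=testing_data)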
Example #3
def polynomial_grid(noise_variance, training_data, testing_data):
    """
    A function that performs a hyperparameter grid search for the polynomial kernel,
    as implemented in GPflow 2.
    Args:
        noise_variance: the likelihood variance value with which each model is initialised
        training_data: the data on which to train the model
        testing_data: the data on which to test the model

    Returns: None; the results are stored in automatically created output directories.
             Note: relies on a module-level output_path variable for the output location.
    """

    # make sure the main output directory exists, create it if not
    os.makedirs(output_path, exist_ok=True)

    # make sure the kernel-specific output directory exists, create it if not
    output_directory = output_path + '/poly'
    os.makedirs(output_directory, exist_ok=True)

    # specify the parameter values at which to initialise the model
    degree = [4, 5, 6, 7, 8, 9, 10]
    variance = [0.5, 1, 2]

    # iterate through the Cartesian product of the values specified above
    for deg, var in itertools.product(degree, variance):

        # create an instance directory for the specified parameter pair
        # skip if it already exists (e.g. during a re-run with previously evaluated param pairs)
        instance_directory = output_directory + f'/deg_{deg}_lvar_{var}_nvm_1'
        try:
            os.mkdir(instance_directory)
        except OSError:
            print(f'Calculation already complete for deg {deg}, var {var}.')
        else:

            # specify the respective model, kernel and optimiser and run the optimisation
            k = gp.kernels.Polynomial(degree=deg, variance=var)
            m = gp.models.GPR(data=(training_data['features_train'],
                                    training_data['affinity_train']),
                              kernel=k,
                              mean_function=None)
            m.likelihood.variance.assign(noise_variance)

            opt = gp.optimizers.Scipy()

            print(
                f'Starting {k.name} kernel with deg {deg}, var {var}, nvm 1.')

            maximize_marginal_likelihood(kernel=k,
                                         model=m,
                                         optimizer=opt,
                                         output_directory=instance_directory,
                                         testing_data=testing_data,
                                         plot_params=False,
                                         plot_ARD=False)

            print(
                f'Finished {k.name} kernel with deg {deg}, var {var}, nvm 1.')
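
A usage sketch; the grid visits 7 * 3 = 21 (degree, variance) pairs, each initialised with the same likelihood variance (the value below is hypothetical):

polynomial_grid(noise_variance=np.float64(0.4),
                training_data=training_data,
                testing_data=testing_data)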
Example #4
def increasing_random_samples(sample_sizes, feature_order, output_path):
    """
    Takes random samples of the training data and optimises the model on each reduced set.
    Creates an output directory for each instance in which a result file is written.
    Args:
        sample_sizes: the numbers of data points to sample
        feature_order: the order of the features by relevance
        output_path: the directory in which the output subdirectories are to be created

    Returns: None; creates a directory tree with the relevant subdirectories and result files
    """

    # create the relevant output subdirectory
    sample_dir = output_path + '/random_sampling'
    os.makedirs(sample_dir, exist_ok=True)

    # set initial hyperparameters
    signal_variance = 1
    noise_variance = np.float64(0.4)
    length_scales = np.ones(len(feature_order)) * 0.2

    # create a subdirectory for each sample size
    for sample_size in sample_sizes:

        size_dir = sample_dir + f'/sample_size_{sample_size}'
        os.makedirs(size_dir, exist_ok=True)

        # define the number of repeats for each sample size range; note that GPs scale
        # cubically in the number of training points
        if sample_size <= 1500:
            repeats = 10
        elif 1500 < sample_size < 3000:
            repeats = 5
        else:
            repeats = 1

        # create a subdirectory for each instantiation
        for i in range(repeats):

            run_dir = size_dir + f'/run_{i}'
            try:
                os.mkdir(run_dir)
            except OSError:
                print(f"Run {i} for sample size {sample_size} already exists.")
            else:
                # read in the training sample
                train, test, col_order = extract_data(feature_order, sample_size)

                # define the kernel afresh for each run so that optimised
                # hyperparameters do not carry over between repetitions
                rbf = gp.kernels.SquaredExponential(variance=signal_variance, lengthscales=length_scales)
                lin = gp.kernels.Linear()
                k = rbf + lin

                # initialise and optimise the model
                m = gp.models.GPR(data=(train['features'], train['affinity']), kernel=k, mean_function=None)
                m.likelihood.variance.assign(noise_variance)

                opt = gp.optimizers.Scipy()

                print(f'Running sample size {sample_size}, repetition {i}')

                maximize_marginal_likelihood(kernel=k, model=m, optimizer=opt, output_directory=run_dir,
                                             testing_data=test, feature_names=col_order,
                                             plot_ARD=False, plot_params=False)
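
A usage sketch with hypothetical sample sizes; smaller samples get more repetitions because exact GP regression scales cubically in the number of training points:

sample_sizes = [250, 500, 1000, 1500, 2000, 3000]
increasing_random_samples(sample_sizes, feature_order, output_path='results')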