import itertools
import os

import numpy as np
import gpflow as gp

# NOTE: `output_path`, `extract_data()` and `maximize_marginal_likelihood()`
# are assumed to be defined elsewhere in this project, as in the original
# module.


def add_feature_by_feature(feature_order, outdir, lsm, nvm):
    """
    Initialises the model with the given hyperparameter multipliers and
    optimises the hyperparameters for an increasing number of features,
    added in the specified order.

    Args:
        feature_order: the order in which to add the features
        outdir: the directory in which the output directories are to be created
        lsm: the length scale multiplier
        nvm: the noise variance multiplier

    Returns:
        creates a new directory for each instance in which the result file
        and (optional) plots are saved
    """
    index_set = []
    # make sure the output directory exists, create it if not
    try:
        os.mkdir(outdir)
    except OSError:
        pass
    for feature in feature_order:
        print('Proceeding with feature ' + feature)
        # create an output directory for each feature count
        index_set.append(feature)
        step_directory = outdir + f'/number_features{len(index_set)}'
        try:
            os.mkdir(step_directory)
        except OSError:
            print(f'The run for {len(index_set)} features already exists.')
        else:
            train, test, col_order = extract_data(index_set)
            # set the initial hyperparameters
            signal_variance = 1
            noise_variance = np.float64(nvm)
            length_scales = np.ones(len(index_set)) * lsm
            # define the kernel and the model
            rbf = gp.kernels.SquaredExponential(variance=signal_variance,
                                                lengthscales=length_scales)
            lin = gp.kernels.Linear()
            k = rbf + lin
            m = gp.models.GPR(data=(train['features'], train['affinity']),
                              kernel=k,
                              mean_function=None)
            m.likelihood.variance.assign(noise_variance)
            opt = gp.optimizers.Scipy()
            print(f'Running feature set length {len(index_set)}')
            maximize_marginal_likelihood(kernel=k,
                                         model=m,
                                         optimizer=opt,
                                         output_directory=step_directory,
                                         testing_data=test,
                                         feature_names=col_order,
                                         plot_ARD=True,
                                         plot_params=False)
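# The try/except around os.mkdir above doubles as a completed-run check: a
# directory that already exists marks a previously evaluated configuration.
# Below is a minimal sketch of that idiom in isolation; the helper name is
# hypothetical and not part of the original module. It relies on os.mkdir
# raising FileExistsError (a subclass of OSError) for existing directories.
def _claim_run_directory(path):
    """Return True if `path` was newly created (the run should proceed),
    False if it already existed (the run is skipped as already evaluated)."""
    try:
        os.mkdir(path)
    except FileExistsError:
        return False
    return True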
def rbf_grid(noise_variance, signal_variance, length_scales, training_data,
             testing_data):
    """
    Performs a hyperparameter grid search for the RBF kernel, as implemented
    in GPflow2.

    Args:
        noise_variance: the likelihood variance value around which the grid
            search is to be done
        signal_variance: the kernel variance value around which the grid
            search is to be done
        length_scales: the kernel lengthscale values around which the grid
            search is to be done
        training_data: the data on which to train the model
        testing_data: the data on which to test the model

    Returns:
        automatically creates the appropriate output directories into which
        the results are stored
    """
    # check whether the main output directory exists/can be created
    try:
        os.mkdir(output_path)
    except OSError:
        pass
    # check whether the kernel-specific output directory exists/can be created
    output_directory = output_path + '/rbf'
    try:
        os.mkdir(output_directory)
    except OSError:
        pass
    # specify multipliers for the different parameter values at which to
    # initialise the model
    signal_variance_multiplier = [1]
    length_scale_multiplier = [0.01, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1, 1.5]
    noise_variance_multiplier = [0.5, 0.1, 0.05, 0.01]
    # iterate through all combinations of the above-specified multipliers
    for svm, lsm, nvm in itertools.product(signal_variance_multiplier,
                                           length_scale_multiplier,
                                           noise_variance_multiplier):
        # create an instance directory for the specified parameter combination;
        # skip if it already exists (e.g. during a re-run with previously
        # evaluated parameter combinations)
        instance_directory = output_directory + f'/svm_{svm}_lsm_{lsm}_nvm_{nvm}'
        try:
            os.mkdir(instance_directory)
        except OSError:
            print(f'Calculation already complete for svm {svm}, lsm {lsm}, '
                  f'nvm {nvm}.')
        else:
            # specify the model, kernel and optimiser and run the optimisation
            k = gp.kernels.SquaredExponential(
                variance=(signal_variance * svm),
                lengthscales=(length_scales * lsm))
            m = gp.models.GPR(data=(training_data['features_train'],
                                    training_data['affinity_train']),
                              kernel=k,
                              mean_function=None)
            m.likelihood.variance.assign(noise_variance * nvm)
            opt = gp.optimizers.Scipy()
            print(f'Starting {k.name} kernel with svm {svm}, lsm {lsm}, '
                  f'nvm {nvm}.')
            maximize_marginal_likelihood(kernel=k,
                                         model=m,
                                         optimizer=opt,
                                         output_directory=instance_directory,
                                         testing_data=testing_data,
                                         plot_params=False,
                                         plot_ARD=True)
            # NOTE: this is a single (non-composite) kernel, so make sure the
            # GPR script reads model.kernel.lengthscales and not
            # model.kernel.kernels[0].lengthscales (which only applies to
            # composite kernels)
            print(f'Finished {k.name} kernel with svm {svm}, lsm {lsm}, '
                  f'nvm {nvm}.')
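# A minimal sketch of the NOTE above; the helper name is hypothetical and not
# part of the original module. In GPflow2, a plain SquaredExponential kernel
# exposes its lengthscales directly, whereas a composite Sum kernel (such as
# the rbf + lin combination used elsewhere in this file) exposes its
# components through .kernels.
def _ard_lengthscales(model):
    """Return the RBF lengthscales of a GPR model, whether its kernel is a
    plain SquaredExponential or a composite kernel with the RBF component
    first."""
    kernel = model.kernel
    if isinstance(kernel, gp.kernels.SquaredExponential):
        return kernel.lengthscales
    # composite kernels (e.g. SquaredExponential + Linear) hold their
    # components in a list; the RBF part is the first component here
    return kernel.kernels[0].lengthscales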
def polynomial_grid(noise_variance, training_data, testing_data):
    """
    Performs a hyperparameter grid search for the polynomial kernel, as
    implemented in GPflow2.

    Args:
        noise_variance: the likelihood variance value at which to initialise
            the model
        training_data: the data on which to train the model
        testing_data: the data on which to test the model

    Returns:
        automatically creates the appropriate output directories into which
        the results are stored
    """
    # check whether the main output directory exists/can be created
    try:
        os.mkdir(output_path)
    except OSError:
        pass
    # check whether the kernel-specific output directory exists/can be created
    output_directory = output_path + '/poly'
    try:
        os.mkdir(output_directory)
    except OSError:
        pass
    # specify the different parameter values at which to initialise the model
    degree = [4, 5, 6, 7, 8, 9, 10]
    variance = [0.5, 1, 2]
    # iterate through all combinations of the above-specified values
    for deg, var in itertools.product(degree, variance):
        # create an instance directory for the specified parameter pair;
        # skip if it already exists (e.g. during a re-run with previously
        # evaluated parameter pairs)
        instance_directory = output_directory + f'/deg_{deg}_lvar_{var}_nvm_1'
        try:
            os.mkdir(instance_directory)
        except OSError:
            print(f'Calculation already complete for deg {deg}, var {var}.')
        else:
            # specify the model, kernel and optimiser and run the optimisation
            k = gp.kernels.Polynomial(degree=deg, variance=var)
            m = gp.models.GPR(data=(training_data['features_train'],
                                    training_data['affinity_train']),
                              kernel=k,
                              mean_function=None)
            m.likelihood.variance.assign(noise_variance)
            opt = gp.optimizers.Scipy()
            print(f'Starting {k.name} kernel with deg {deg}, var {var}, nvm 1.')
            maximize_marginal_likelihood(kernel=k,
                                         model=m,
                                         optimizer=opt,
                                         output_directory=instance_directory,
                                         testing_data=testing_data,
                                         plot_params=False,
                                         plot_ARD=False)
            print(f'Finished {k.name} kernel with deg {deg}, var {var}, nvm 1.')
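# Illustrative driver (hypothetical, not part of the original module) showing
# how the two grid searches above could be run from one set of initial
# hyperparameters; the initial noise variance of 0.4 is an assumption taken
# from increasing_random_samples below, and the data dicts are assumed to be
# keyed 'features_train'/'affinity_train' as in the functions above.
def run_grid_searches(training_data, testing_data):
    n_features = training_data['features_train'].shape[1]
    rbf_grid(noise_variance=np.float64(0.4),
             signal_variance=1,
             length_scales=np.ones(n_features),
             training_data=training_data,
             testing_data=testing_data)
    polynomial_grid(noise_variance=np.float64(0.4),
                    training_data=training_data,
                    testing_data=testing_data)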
def increasing_random_samples(sample_sizes, feature_order, output_path):
    """
    Takes random samples of the training data and optimises the model on the
    reduced set. Creates an output directory for each instance in which a
    result file is written.

    Args:
        sample_sizes: the numbers of data points to sample
        feature_order: the order of the features by relevance
        output_path: the directory in which the output directories are to be
            created

    Returns:
        a directory with the relevant subdirectories and result files
    """
    # create the relevant output subdirectory
    sample_dir = output_path + '/random_sampling'
    try:
        os.mkdir(sample_dir)
    except OSError:
        pass
    # set the initial hyperparameters
    signal_variance = 1
    noise_variance = np.float64(0.4)
    length_scales = np.ones(len(feature_order)) * 0.2
    # create a subdirectory for each sample size
    for sample_size in sample_sizes:
        size_dir = sample_dir + f'/sample_size_{sample_size}'
        try:
            os.mkdir(size_dir)
        except OSError:
            pass
        # define the number of repeats for each sample size range; note that
        # GPs scale cubically in the number of training points
        if sample_size <= 1500:
            repeats = 10
        elif 1500 < sample_size < 3000:
            repeats = 5
        else:
            repeats = 1
        # create a subdirectory for each instantiation
        for i in range(repeats):
            run_dir = size_dir + f'/run_{i}'
            try:
                os.mkdir(run_dir)
            except OSError:
                print(f'Run {i} for sample size {sample_size} already exists.')
            else:
                # read in the training sample
                train, test, col_order = extract_data(feature_order,
                                                      sample_size)
                # re-create the kernel for every run so that each optimisation
                # starts from the same initial hyperparameters (a single
                # shared kernel object would carry optimised values over
                # between runs)
                rbf = gp.kernels.SquaredExponential(
                    variance=signal_variance,
                    lengthscales=length_scales)
                lin = gp.kernels.Linear()
                k = rbf + lin
                # initialise and solve the model
                m = gp.models.GPR(data=(train['features'], train['affinity']),
                                  kernel=k,
                                  mean_function=None)
                m.likelihood.variance.assign(noise_variance)
                opt = gp.optimizers.Scipy()
                print(f'Running sample size {sample_size}, repetition {i}')
                maximize_marginal_likelihood(kernel=k,
                                             model=m,
                                             optimizer=opt,
                                             output_directory=run_dir,
                                             testing_data=test,
                                             feature_names=col_order,
                                             plot_ARD=False,
                                             plot_params=False)
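# Illustrative invocation (hypothetical paths, feature names and sample
# sizes, chosen only to show the expected argument shapes): a real run would
# use the ARD-ranked feature order produced by an earlier experiment.
if __name__ == '__main__':
    example_feature_order = ['feature_a', 'feature_b', 'feature_c']
    increasing_random_samples(sample_sizes=[500, 1500, 3000],
                              feature_order=example_feature_order,
                              output_path='results')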