def debug_laplace():
    '''
    Debugging aid: re-runs one hard-coded scored kernel through the GPML
    optimisation script on the remote machine ("fear") and prints the
    re-scored kernel together with the Hessian found in the MATLAB output.

    Takes no arguments and returns nothing; the data set path and the
    kernel under investigation are fixed in the body.
    '''
    # Load data set (only the training inputs / targets are used below)
    X, y, _D, _Xtest, _ytest = gpml.load_mat(
        '../data/kfold_data/r_concrete_500_fold_10_of_10.mat', y_dim=1)

    # Load the suspicious kernel, rebuilt from its repr string.  The pieces
    # below are joined by implicit concatenation into one literal.
    sk = fk.repr_string_to_kernel(
        'ScoredKernel(k_opt=ProductKernel([ '
        'MaskKernel(ndim=8, active_dimension=0, '
        'base_kernel=CubicKernel(offset=1.757755, output_variance=7.084045)), '
        'MaskKernel(ndim=8, active_dimension=7, '
        'base_kernel=SqExpPeriodicKernel(lengthscale=-2.701080, '
        'period=-0.380918, output_variance=-0.071214)) ]), '
        'nll=6348.096611, laplace_nle=-184450132.068237, '
        'bic_nle=12720.630212, noise=[-1.77276072])')

    # Promote 1-d arrays to column matrices before saving them.
    if X.ndim == 1:
        X = X[:, nax]
    if y.ndim == 1:
        y = y[:, nax]
    ndata = y.shape[0]

    # Write the regression data to a temporary .mat file and ship it to fear.
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X, 'y': y})
    cblparallel.copy_to_remote(data_file)

    # Fill in the MATLAB template for this one kernel.
    template = gpml.OPTIMIZE_KERNEL_CODE
    data_name = data_file.split('/')[-1]
    gpml_location = cblparallel.gpml_path(local_computation=False)
    family_expression = sk.k_opt.gpml_kernel_expression()
    hyperparameters = '[ %s ]' % ' '.join(
        str(p) for p in sk.k_opt.param_vector())
    script = template % {
        'datafile': data_name,
        # N.B. cblparallel manages output files, so this placeholder is
        # passed through for it to fill in later.
        'writefile': '%(output_file)s',
        'gpml_path': gpml_location,
        'kernel_family': family_expression,
        'kernel_params': hyperparameters,
        'noise': str(sk.noise),
        'iters': str(300),
    }

    #### Need to be careful with % signs
    #### For the moment, cblparallel expects no single % signs - FIXME
    script = re.sub('% ', '%% ', script)

    # Test hook: substitutions applied to the generated script.  As written
    # the replacement text equals the pattern text.
    script = re.sub('delta = 1e-6', 'delta = 1e-6', script)
    #script = re.sub('hyp.lik = [-1.77276072]', 'hyp.lik = [-0.77276072]', script)

    output_file = cblparallel.run_batch_on_fear(
        [script], language='matlab', max_jobs=600)[0]

    # Read in results
    output = gpml.read_outputs(output_file)
    result = ScoredKernel.from_matlab_output(
        output, sk.k_opt.family(), ndata)
    print(result)
    print(output.hessian)

    # Tidy up the local output file, then the temporary data file
    # (perhaps on the cluster server).
    os.remove(output_file)
    cblparallel.remove_temp_file(data_file, local_computation=False)
def debug_laplace():
    '''
    Reproduces the evaluation of a single, hard-coded ScoredKernel whose
    scores look wrong: the kernel is optimised again via the GPML MATLAB
    template on fear, and the resulting ScoredKernel and the Hessian from
    the MATLAB output are printed.

    No parameters, no return value.
    '''
    # Load data set
    loaded = gpml.load_mat(
        '../data/kfold_data/r_concrete_500_fold_10_of_10.mat', y_dim=1)
    X, y, D, Xtest, ytest = loaded

    # Load the suspicious kernel
    kernel_repr = 'ScoredKernel(k_opt=ProductKernel([ MaskKernel(ndim=8, active_dimension=0, base_kernel=CubicKernel(offset=1.757755, output_variance=7.084045)), MaskKernel(ndim=8, active_dimension=7, base_kernel=SqExpPeriodicKernel(lengthscale=-2.701080, period=-0.380918, output_variance=-0.071214)) ]), nll=6348.096611, laplace_nle=-184450132.068237, bic_nle=12720.630212, noise=[-1.77276072])'
    sk = fk.repr_string_to_kernel(kernel_repr)

    # Create some code to evaluate it; vectors become single-column matrices.
    X = X[:, nax] if X.ndim == 1 else X
    y = y[:, nax] if y.ndim == 1 else y
    ndata = y.shape[0]

    # Create data file holding the regression data
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X, 'y': y})

    # Move to fear
    cblparallel.copy_to_remote(data_file)

    substitutions = {
        'datafile': data_file.split('/')[-1],
        'writefile': '%(output_file)s',  # N.B. cblparallel manages output files
        'gpml_path': cblparallel.gpml_path(local_computation=False),
        'kernel_family': sk.k_opt.gpml_kernel_expression(),
        'kernel_params': '[ %s ]' % ' '.join(
            [str(p) for p in sk.k_opt.param_vector()]),
        'noise': str(sk.noise),
        'iters': str(300),
    }
    scripts = [gpml.OPTIMIZE_KERNEL_CODE % substitutions]

    #### Need to be careful with % signs
    #### For the moment, cblparallel expects no single % signs - FIXME
    # Test: the second substitution is a hook for perturbing the script; at
    # present its replacement text is the same as its pattern text.
    for pattern, replacement in (('% ', '%% '),
                                 ('delta = 1e-6', 'delta = 1e-6')):
        scripts[0] = re.sub(pattern, replacement, scripts[0])
    #scripts[0] = re.sub('hyp.lik = [-1.77276072]', 'hyp.lik = [-0.77276072]', scripts[0])

    output_files = cblparallel.run_batch_on_fear(
        scripts, language='matlab', max_jobs=600)
    output_file = output_files[0]

    # Read in results
    output = gpml.read_outputs(output_file)
    family = sk.k_opt.family()
    result = ScoredKernel.from_matlab_output(output, family, ndata)
    print(result)
    print(output.hessian)
    os.remove(output_file)

    # Remove temporary data file (perhaps on the cluster server)
    cblparallel.remove_temp_file(data_file, local_computation=False)
def evaluate_kernels(kernels, X, y, verbose=True, noise=None, iters=300,
                     local_computation=False, zip_files=False, max_jobs=500,
                     zero_mean=False, random_seed=0):
    '''
    Sets up the kernel optimisation and nll calculation experiments, returns
    the results as scored kernels

    Input:
     - kernels           - A list of kernels (i.e. not scored kernels)
     - X                 - A matrix (data_points x dimensions) of input locations
     - y                 - A matrix (data_points x 1) of output values
     - verbose           - If true, print progress messages (also forwarded to
                           cblparallel)
     - noise             - Noise value written into each script; when None it
                           defaults to log(var(y) / 10)
     - iters             - Written into each script as the 'iters' field
     - local_computation - If true, run the scripts locally; otherwise copy
                           the data to fear and run them there
     - zip_files         - Forwarded to cblparallel.run_batch_on_fear
     - max_jobs          - Forwarded to cblparallel.run_batch_on_fear
     - zero_mean         - If true, use gpml.OPTIMIZE_KERNEL_CODE_ZERO_MEAN
                           instead of gpml.OPTIMIZE_KERNEL_CODE
     - random_seed       - Written into each script as the 'seed' field
    Return:
     - A list of ScoredKernel objects
    '''
    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1:
        X = X[:, nax]
    if y.ndim == 1:
        y = y[:, nax]
    ndata = y.shape[0]

    # Set default noise using a heuristic.
    if noise is None:
        noise = np.log(np.var(y) / 10)

    # Create data file
    if verbose:
        print('Creating data file locally')
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X, 'y': y})  # Save regression data

    # Move to fear if necessary
    if not local_computation:
        if verbose:
            print('Moving data file to fear')
        cblparallel.copy_to_remote(data_file)

    # Create a list of MATLAB scripts to assess and optimise parameters for
    # each kernel
    if verbose:
        print('Creating scripts')

    # Neither of these depends on the kernel, so work them out once.
    if zero_mean:
        template = gpml.OPTIMIZE_KERNEL_CODE_ZERO_MEAN
    else:
        template = gpml.OPTIMIZE_KERNEL_CODE
    data_name = data_file.split('/')[-1]

    scripts = []
    for kernel in kernels:
        x = kernel.param_vector()
        # Entries of the parameter vector that are lists go into
        # 'eff_dimensions' (with their positions in 'dim_positions'); all
        # other entries go into 'kernel_params'.
        parameters = {
            'datafile': data_name,
            # N.B. cblparallel manages output files
            'writefile': '%(output_file)s',
            'gpml_path': cblparallel.gpml_path(local_computation),
            'kernel_family': kernel.gpml_kernel_expression(),
            'kernel_params': '[ %s ]' % ' '.join(
                str(p) for p in x if not isinstance(p, list)),
            'eff_dimensions': '[ %s ]' % ';'.join(
                str(p) for p in x if isinstance(p, list)),
            'dim_positions': '[ %s ]' % ' '.join(
                str(pos) for (pos, p) in enumerate(x)
                if isinstance(p, list)),
            'noise': str(noise),
            'iters': str(iters),
            'seed': str(random_seed),
        }
        # Diagnostic output; kept, but now silenced by verbose=False like
        # every other message in this function.
        if verbose:
            print(' '.join([parameters['kernel_params'],
                            parameters['eff_dimensions'],
                            parameters['dim_positions']]))
        #### Need to be careful with % signs
        #### For the moment, cblparallel expects no single % signs - FIXME
        scripts.append(re.sub('% ', '%% ', template % parameters))

    # Send to cblparallel and save output_files
    if verbose:
        print('Sending scripts to cblparallel')
    if local_computation:
        output_files = cblparallel.run_batch_locally(
            scripts, language='matlab', max_cpu=1.1, job_check_sleep=5,
            submit_sleep=0.1, max_running_jobs=10, verbose=verbose)
    else:
        output_files = cblparallel.run_batch_on_fear(
            scripts, language='matlab', max_jobs=max_jobs, verbose=verbose,
            zip_files=zip_files)

    # Read in results
    results = [None] * len(kernels)
    for (i, output_file) in enumerate(output_files):
        if verbose:
            print('Reading output file %d of %d: %s'
                  % (i + 1, len(kernels), output_file))
        results[i] = ScoredKernel.from_matlab_output(
            gpml.read_outputs(output_file), kernels[i].family(), ndata)

    # Tidy up local output files
    for (i, output_file) in enumerate(output_files):
        if verbose:
            print('Removing output file %d of %d' % (i + 1, len(kernels)))
        os.remove(output_file)

    # Remove temporary data file (perhaps on the cluster server)
    cblparallel.remove_temp_file(data_file, local_computation)

    # Return results i.e. list of ScoredKernel objects
    return results
def evaluate_kernels(kernels, X, y, verbose=True, noise=None, iters=300,
                     local_computation=False, zip_files=False, max_jobs=500,
                     zero_mean=False, random_seed=0):
    '''
    Optimises and scores each kernel by generating one MATLAB script per
    kernel, running the scripts through cblparallel (locally or on fear)
    and parsing the outputs.

    Input:
     - kernels - A list of kernels (i.e. not scored kernels)
     - X       - A matrix (data_points x dimensions) of input locations
     - y       - A matrix (data_points x 1) of output values
     - noise   - Defaults to log(var(y) / 10) when None
     - zero_mean - Selects gpml.OPTIMIZE_KERNEL_CODE_ZERO_MEAN as template
     - remaining arguments are written into the scripts (iters,
       random_seed) or forwarded to cblparallel (verbose,
       local_computation, zip_files, max_jobs)
    Return:
     - A list of ScoredKernel objects
    '''
    # One-dimensional arrays are promoted to single-column matrices.
    X = X[:, nax] if X.ndim == 1 else X
    y = y[:, nax] if y.ndim == 1 else y
    ndata = y.shape[0]

    # Heuristic default for the noise.
    if noise is None:
        noise = np.log(np.var(y)/10)

    # Save the regression data to a temporary .mat file.
    if verbose:
        print('Creating data file locally')
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X, 'y': y})

    # Remote runs need the data on fear as well.
    if not local_computation:
        if verbose:
            print('Moving data file to fear')
        cblparallel.copy_to_remote(data_file)

    # Build one MATLAB script per kernel.
    if verbose:
        print('Creating scripts')
    n_kernels = len(kernels)
    scripts = [None] * n_kernels
    for (idx, kernel) in enumerate(kernels):
        data_name = data_file.split('/')[-1]
        gpml_location = cblparallel.gpml_path(local_computation)
        family_expression = kernel.gpml_kernel_expression()
        hyperparameters = '[ %s ]' % ' '.join(
            [str(p) for p in kernel.param_vector()])
        parameters = {
            'datafile': data_name,
            # N.B. cblparallel manages output files
            'writefile': '%(output_file)s',
            'gpml_path': gpml_location,
            'kernel_family': family_expression,
            'kernel_params': hyperparameters,
            'noise': str(noise),
            'iters': str(iters),
            'seed': str(random_seed),
        }
        template = (gpml.OPTIMIZE_KERNEL_CODE_ZERO_MEAN if zero_mean
                    else gpml.OPTIMIZE_KERNEL_CODE)
        #### Need to be careful with % signs
        #### For the moment, cblparallel expects no single % signs - FIXME
        scripts[idx] = re.sub('% ', '%% ', template % parameters)

    # Hand the scripts to cblparallel and collect the output file names.
    if verbose:
        print('Sending scripts to cblparallel')
    if not local_computation:
        output_files = cblparallel.run_batch_on_fear(
            scripts, language='matlab', max_jobs=max_jobs, verbose=verbose,
            zip_files=zip_files)
    else:
        output_files = cblparallel.run_batch_locally(
            scripts, language='matlab', max_cpu=1.1, job_check_sleep=5,
            submit_sleep=0.1, max_running_jobs=10, verbose=verbose)

    # Parse each output file into a ScoredKernel.
    results = [None] * len(kernels)
    for (count, output_file) in enumerate(output_files, 1):
        if verbose:
            print('Reading output file %d of %d' % (count, len(kernels)))
        matlab_output = gpml.read_outputs(output_file)
        results[count - 1] = ScoredKernel.from_matlab_output(
            matlab_output, kernels[count - 1].family(), ndata)

    # Delete the local output files.
    for (count, output_file) in enumerate(output_files, 1):
        if verbose:
            print('Removing output file %d of %d' % (count, len(kernels)))
        os.remove(output_file)

    # Remove temporary data file (perhaps on the cluster server)
    cblparallel.remove_temp_file(data_file, local_computation)

    # Return results i.e. list of ScoredKernel objects
    return results