Example #1
def parse_results(results_filenames, max_level=None):
    '''
    Returns the best kernel in an experiment output file as a ScoredKernel
    '''
    if not isinstance(results_filenames, list):
        # Backward compatibility with specifying a single file
        results_filenames = [results_filenames]
    # Read relevant lines of file(s)
    result_tuples = []
    for results_filename in results_filenames:
        lines = []
        with open(results_filename) as results_file:
            score = None
            for line in results_file:
                if line.startswith('score = '):
                    score = line[8:-2]
                elif line.startswith("GPModel"):
                    lines.append(line)
                elif (max_level is not None) and (len(re.findall('Level [0-9]+', line)) > 0):
                    level = int(line.split(' ')[2])
                    if level > max_level:
                        break
        result_tuples += [ff.repr_to_model(line.strip()) for line in lines]
    if score is not None:
        best_tuple = sorted(result_tuples, key=lambda a_model : GPModel.score(a_model, score))[0]
    else:
        best_tuple = sorted(result_tuples, key=GPModel.score)[0]
    return best_tuple
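
For context, below is a minimal, self-contained sketch of the line-oriented results format that parse_results scans for. The sample text and the scan_results_text helper are reconstructions inferred from the string prefixes used above, not taken from a real experiment file, and they stand in for the ff.repr_to_model / GPModel.score path purely for illustration.

import re

# Hypothetical excerpt of an experiment output file, reconstructed from the
# prefixes parse_results() looks for ('score = ', 'GPModel', 'Level <n>').
SAMPLE_OUTPUT = """\
score = bic

%%%%% Level 0 %%%%%

GPModel(mean=MeanZero(), kernel=SqExpKernel(), likelihood=LikGauss(), nll=123.4, ndata=100)

%%%%% Level 1 %%%%%

GPModel(mean=MeanZero(), kernel=SumKernel(), likelihood=LikGauss(), nll=110.2, ndata=100)
"""

def scan_results_text(text, max_level=None):
    """Mimic the line scan in parse_results(): collect the score name and GPModel reprs."""
    score, reprs = None, []
    for line in text.splitlines():
        if line.startswith('score = '):
            score = line[8:].strip()
        elif line.startswith('GPModel'):
            reprs.append(line.strip())
        elif max_level is not None and re.findall('Level [0-9]+', line):
            level = int(line.split(' ')[2])
            if level > max_level:
                break  # stop reading once the requested search depth is exceeded
    return score, reprs

score, reprs = scan_results_text(SAMPLE_OUTPUT, max_level=0)
assert score == 'bic' and len(reprs) == 1  # only the Level 0 model is kept
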
Example #2
def parse_results(results_filenames, max_level=None):
    '''
    Returns the best kernel in an experiment output file as a ScoredKernel
    '''
    if not isinstance(results_filenames, list):
        # Backward compatibility with specifying a single file
        results_filenames = [results_filenames]
    # Read relevant lines of file(s)
    result_tuples = []
    for results_filename in results_filenames:
        lines = []
        with open(results_filename) as results_file:
            score = None
            for line in results_file:
                if line.startswith('score = '):
                    score = line[8:-2]
                elif line.startswith("GPModel"):
                    lines.append(line)
                elif (max_level is not None) and (len(
                        re.findall('Level [0-9]+', line)) > 0):
                    level = int(line.split(' ')[2])
                    if level > max_level:
                        break
        result_tuples += [ff.repr_to_model(line.strip()) for line in lines]
    if score is not None:
        best_tuple = sorted(
            result_tuples,
            key=lambda a_model: GPModel.score(a_model, score))[0]
    else:
        best_tuple = sorted(result_tuples, key=GPModel.score)[0]
    return best_tuple
def evaluate_models(models,
                    X,
                    y,
                    verbose=True,
                    iters=300,
                    local_computation=False,
                    zip_files=False,
                    max_jobs=500,
                    random_seed=0,
                    subset=False,
                    subset_size=250,
                    full_iters=0,
                    bundle_size=1):

    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1: X = X[:, nax]
    if y.ndim == 1: y = y[:, nax]
    ndata = y.shape[0]

    # Create data file
    if verbose:
        print 'Creating data file locally'
    data_file = cblparallel.create_temp_file('.mat')

    scipy.io.savemat(data_file, {'X': X, 'y': y})

    # Move to fear if necessary
    if not local_computation:
        if verbose:
            print 'Moving data file to fear'
        cblparallel.copy_to_remote(data_file)

    # Create a list of MATLAB scripts to assess and optimise parameters for each kernel
    if verbose:
        print 'Creating scripts'
    scripts = [None] * len(models)
    for (i, model) in enumerate(models):
        parameters = {
            'datafile':
            data_file.split('/')[-1],
            'writefile':
            '%(output_file)s',  # N.B. cblparallel manages output files
            'gpml_path':
            cblparallel.gpml_path(local_computation),
            'mean_syntax':
            model.mean.get_gpml_expression(dimensions=X.shape[1]),
            'mean_params':
            '[ %s ]' % ' '.join(str(p) for p in model.mean.param_vector),
            'kernel_syntax':
            model.kernel.get_gpml_expression(dimensions=X.shape[1]),
            'kernel_params':
            '[ %s ]' % ' '.join(str(p) for p in model.kernel.param_vector),
            'lik_syntax':
            model.likelihood.get_gpml_expression(dimensions=X.shape[1]),
            'lik_params':
            '[ %s ]' % ' '.join(str(p) for p in model.likelihood.param_vector),
            'inference':
            model.likelihood.gpml_inference_method,
            'iters':
            str(iters),
            'seed':
            str(np.random.randint(2**31)),
            'subset':
            'true' if subset else 'false',
            'subset_size':
            str(subset_size),
            'full_iters':
            str(full_iters)
        }

        scripts[i] = gpml.OPTIMIZE_KERNEL_CODE % parameters
        #### Need to be careful with % signs
        #### For the moment, cblparallel expects no single % signs - FIXME
        scripts[i] = re.sub('% ', '%% ', scripts[i])

    # Send to cblparallel and save output_files
    if verbose:
        print 'Sending scripts to cblparallel'
    if local_computation:
        output_files = cblparallel.run_batch_locally(scripts,
                                                     language='matlab',
                                                     max_cpu=1.1,
                                                     job_check_sleep=5,
                                                     submit_sleep=0.1,
                                                     max_running_jobs=10,
                                                     verbose=verbose)
    else:
        output_files = cblparallel.run_batch_on_fear(scripts,
                                                     language='matlab',
                                                     max_jobs=max_jobs,
                                                     verbose=verbose,
                                                     zip_files=zip_files,
                                                     bundle_size=bundle_size)

    # Read in results
    results = [None] * len(models)
    for (i, output_file) in enumerate(output_files):
        if verbose:
            print 'Reading output file %d of %d' % (i + 1, len(models))
        results[i] = GPModel.from_matlab_output(gpml.read_outputs(output_file),
                                                models[i], ndata)

    # Tidy up local output files
    for (i, output_file) in enumerate(output_files):
        if verbose:
            print 'Removing output file %d of %d' % (i + 1, len(models))
        os.remove(output_file)
    # Remove temporary data file (perhaps on the cluster server)
    cblparallel.remove_temp_file(data_file, local_computation)

    # Return results i.e. list of ScoredKernel objects
    return results
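
The scripts[i] = gpml.OPTIMIZE_KERNEL_CODE % parameters step above is plain Python %-formatting of a MATLAB template. The sketch below uses a made-up toy template (the real OPTIMIZE_KERNEL_CODE lives in gpml.py and is far longer) to show why literal percent signs have to be written as '%%' and why '%(output_file)s' is deliberately passed through unexpanded so that cblparallel can substitute it later.

import re

# Toy stand-in for gpml.OPTIMIZE_KERNEL_CODE, for illustration only.
# '%%' is how a literal '%' (MATLAB's comment character) survives %-formatting.
TOY_TEMPLATE = """\
%% Load data and optimise hyperparameters
load '%(datafile)s'
hyp = minimize(hyp, @gp, -%(iters)s, %(inference)s, meanfunc, covfunc, likfunc, X, y);
save('%(writefile)s', 'hyp');
"""

parameters = {
    'datafile': 'data_0.mat',
    'writefile': '%(output_file)s',  # left intact for cblparallel to fill in later
    'inference': '@infExact',
    'iters': '300',
}

script = TOY_TEMPLATE % parameters
assert '%(output_file)s' in script       # the placeholder survives substitution
assert script.startswith('% Load data')  # '%%' collapsed to a single literal '%'

# Same guard as above: double any remaining lone '% ' so a later round of
# %-substitution does not choke on MATLAB comments.
script = re.sub('% ', '%% ', script)
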
Example #4
def perform_kernel_search(X, y, D, experiment_data_file_name, results_filename, exp):
    '''Search for the best kernel, in parallel on fear or local machine.'''
    
    # Initialise random seeds - randomness may be used in e.g. data subsetting

    utils.misc.set_all_random_seeds(exp.random_seed)
    
    # Create location, scale and minimum period parameters to pass around for initialisations

    data_shape = {}
    data_shape['x_mean'] = [np.mean(X[:,dim]) for dim in range(X.shape[1])]
    data_shape['y_mean'] = np.mean(y) #### TODO - should this be modified for non real valued data
    data_shape['x_sd'] = np.log([np.std(X[:,dim]) for dim in range(X.shape[1])])
    data_shape['y_sd'] = np.log(np.std(y)) #### TODO - should this be modified for non real valued data
    data_shape['y_min'] = np.min(y)
    data_shape['y_max'] = np.max(y)
    data_shape['x_min'] = [np.min(X[:,dim]) for dim in range(X.shape[1])]
    data_shape['x_max'] = [np.max(X[:,dim]) for dim in range(X.shape[1])]

    # Initialise period at a multiple of the shortest / average distance between points, to prevent Nyquist problems.

    if exp.period_heuristic_type == 'none':
        data_shape['min_period'] = None
    elif exp.period_heuristic_type == 'min':
        data_shape['min_period'] = np.log([exp.period_heuristic * utils.misc.min_abs_diff(X[:,i]) for i in range(X.shape[1])])
    elif exp.period_heuristic_type == 'average':
        data_shape['min_period'] = np.log([exp.period_heuristic * np.ptp(X[:,i]) / X.shape[0] for i in range(X.shape[1])])
    elif exp.period_heuristic_type == 'both':
        data_shape['min_period'] = np.log([max(exp.period_heuristic * utils.misc.min_abs_diff(X[:,i]), exp.period_heuristic * np.ptp(X[:,i]) / X.shape[0]) for i in range(X.shape[1])])
    else:
        warnings.warn('Unrecognised period heuristic type : using most conservative heuristic')
        data_shape['min_period'] = np.log([max(exp.period_heuristic * utils.misc.min_abs_diff(X[:,i]), exp.period_heuristic * np.ptp(X[:,i]) / X.shape[0]) for i in range(X.shape[1])])

    data_shape['max_period'] = [np.log((1.0/exp.max_period_heuristic)*(data_shape['x_max'][i] - data_shape['x_min'][i])) for i in range(X.shape[1])]

    # Initialise mean, kernel and likelihood

    m = eval(exp.mean)
    k = eval(exp.kernel)
    l = eval(exp.lik)
    current_models = [ff.GPModel(mean=m, kernel=k, likelihood=l, ndata=y.size)]

    print '\n\nStarting search with this model:\n'
    print current_models[0].pretty_print()
    print ''

    # Perform the initial expansion

    current_models = grammar.expand_models(D=D, models=current_models, base_kernels=exp.base_kernels, rules=exp.search_operators)

    # Convert to additive form if desired

    if exp.additive_form:
        current_models = [model.additive_form() for model in current_models]
        current_models = ff.remove_duplicates(current_models)   

    # Set up lists to record search
    
    all_results = [] # List of scored kernels
    results_sequence = [] # List of lists of results, indexed by level of expansion.
    nan_sequence = [] # List of list of nan scored results
    oob_sequence = [] # List of list of out of bounds results
    best_models = None

    # Other setup

    best_score = np.inf
    
    # Perform search
    for depth in range(exp.max_depth):
        
        if exp.debug:
            current_models = current_models[0:4]
             
        # Add random restarts to kernels
        current_models = ff.add_random_restarts(current_models, exp.n_rand, exp.sd, data_shape=data_shape)

        # Print result of expansion
        if exp.debug:
            print '\nRandomly restarted kernels\n'
            for model in current_models:
                print model.pretty_print()
        
        # Remove any redundancy introduced into kernel expressions
        current_models = [model.simplified() for model in current_models]
        # Print result of simplification
        if exp.debug:
            print '\nSimplified kernels\n'
            for model in current_models:
                print model.pretty_print()
        current_models = ff.remove_duplicates(current_models)
        # Print result of duplicate removal
        if exp.debug:
            print '\nDuplicate removed kernels\n'
            for model in current_models:
                print model.pretty_print()
        
        # Add jitter to parameter values (empirically discovered to help optimiser)
        current_models = ff.add_jitter(current_models, exp.jitter_sd)
        # Print result of jitter
        if exp.debug:
            print '\nJittered kernels\n'
            for model in current_models:
                print model.pretty_print()
        
        # Add the previous best models - in case we just need to optimise more rather than changing structure
        if best_models is not None:
            for a_model in best_models:
                current_models = current_models + [a_model.copy()] + ff.add_jitter_to_models([a_model.copy() for dummy in range(exp.n_rand)], exp.jitter_sd)
        
        # Randomise the order of the models to distribute computational load evenly
        np.random.shuffle(current_models)

        # Print current models
        if exp.debug:
            print '\nKernels to be evaluated\n'
            for model in current_models:
                print model.pretty_print()
        
        # Optimise parameters of and score the kernels
        new_results = jc.evaluate_models(current_models, X, y, verbose=exp.verbose, local_computation=exp.local_computation,
                                          zip_files=True, max_jobs=exp.max_jobs, iters=exp.iters, random_seed=exp.random_seed,
                                          subset=exp.subset, subset_size=exp.subset_size, full_iters=exp.full_iters, bundle_size=exp.bundle_size)
            
        # Remove models that were optimised to be out of bounds (this is similar to a 0-1 prior)
        oob_results = [a_model for a_model in new_results if a_model.out_of_bounds(data_shape)]
        new_results = [a_model for a_model in new_results if not a_model.out_of_bounds(data_shape)]
        oob_results = sorted(oob_results, key=lambda a_model : GPModel.score(a_model, exp.score), reverse=True)
        oob_sequence.append(oob_results)
        
        # Some of the scores may have failed - remove nans to prevent sorting algorithms messing up
        (new_results, nan_results) = remove_nan_scored_models(new_results, exp.score)
        nan_sequence.append(nan_results)
        assert(len(new_results) > 0) # FIXME - Need correct control flow if this happens

        # Sort the new results
        new_results = sorted(new_results, key=lambda a_model : GPModel.score(a_model, exp.score), reverse=True)

        print '\nAll new results\n'
        for result in new_results:
            print 'NLL=%0.1f' % result.nll, 'BIC=%0.1f' % result.bic, 'AIC=%0.1f' % result.aic, 'PL2=%0.3f' % result.pl2, result.pretty_print()

        all_results = all_results + new_results
        all_results = sorted(all_results, key=lambda a_model : GPModel.score(a_model, exp.score), reverse=True)

        results_sequence.append(all_results)
        
        # Extract the best k kernels from the new all_results
        best_results = sorted(new_results, key=lambda a_model : GPModel.score(a_model, exp.score))[0:exp.k]

        # Print best kernels
        if exp.debug:
            print '\nBest models\n'
            for model in best_results:
                print model.pretty_print()
        
        # Expand the best models
        current_models = grammar.expand_models(D=D, models=best_results, base_kernels=exp.base_kernels, rules=exp.search_operators)

        # Print expansion
        if exp.debug:
            print '\nExpanded models\n'
            for model in current_models:
                print model.pretty_print()
        
        # Convert to additive form if desired
        if exp.additive_form:
            current_models = [model.additive_form() for model in current_models]
            current_models = ff.remove_duplicates(current_models)   

            # Print expansion
            if exp.debug:
                print '\nConverted into additive\n'
                for model in current_models:
                    print model.pretty_print()
        
        # Reduce number of kernels when in debug mode
        if exp.debug:
            current_models = current_models[0:4]

        # Write all_results to a temporary file at each level.
        all_results = sorted(all_results, key=lambda a_model : GPModel.score(a_model, exp.score), reverse=True)
        with open(results_filename + '.unfinished', 'w') as outfile:
            outfile.write('Experiment all_results for\n datafile = %s\n\n %s \n\n' \
                          % (experiment_data_file_name, experiment_fields_to_str(exp)))
            for (i, all_results) in enumerate(results_sequence):
                outfile.write('\n%%%%%%%%%% Level %d %%%%%%%%%%\n\n' % i)
                if exp.verbose_results:
                    for result in all_results:
                        print >> outfile, result  
                else:
                    # Only print top k kernels - i.e. those used to seed the next level of the search
                    for result in sorted(all_results, key=lambda a_model : GPModel.score(a_model, exp.score))[0:exp.k]:
                        print >> outfile, result 
        # Write nan scored kernels to a log file
        with open(results_filename + '.nans', 'w') as outfile:
            outfile.write('Experiment nan results for\n datafile = %s\n\n %s \n\n' \
                          % (experiment_data_file_name, experiment_fields_to_str(exp)))
            for (i, nan_results) in enumerate(nan_sequence):
                outfile.write('\n%%%%%%%%%% Level %d %%%%%%%%%%\n\n' % i)
                for result in nan_results:
                    print >> outfile, result  
        # Write oob kernels to a log file
        with open(results_filename + '.oob', 'w') as outfile:
            outfile.write('Experiment oob results for\n datafile = %s\n\n %s \n\n' \
                          % (experiment_data_file_name, experiment_fields_to_str(exp)))
            for (i, oob_results) in enumerate(oob_sequence):
                outfile.write('\n%%%%%%%%%% Level %d %%%%%%%%%%\n\n' % i)
                for result in oob_results:
                    print >> outfile, result

        # Have we hit a stopping criterion?
        if 'no_improvement' in exp.stopping_criteria:
            new_best_score = min(GPModel.score(a_model, exp.score) for a_model in new_results)
            if new_best_score < best_score - exp.improvement_tolerance:
                best_score = new_best_score
            else:
                # Insufficient improvement
                print 'Insufficient improvement to score - stopping search'
                break
    
    # Rename temporary results file to actual results file                
    os.rename(results_filename + '.unfinished', results_filename)
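
The period heuristics above differ only in which input spacing they multiply by exp.period_heuristic: the smallest gap between points ('min'), the average spacing ptp / N ('average'), or the larger of the two ('both'), all stored in log-space. Below is a small self-contained numpy sketch with a simplified stand-in for utils.misc.min_abs_diff, shown on a toy 1-D input.

import numpy as np

def min_abs_diff(x):
    # Simplified stand-in for utils.misc.min_abs_diff:
    # the smallest gap between sorted input locations.
    xs = np.sort(np.asarray(x).ravel())
    return np.min(np.abs(np.diff(xs)))

X = np.linspace(0.0, 10.0, 101)[:, np.newaxis]  # toy 1-D input, spacing 0.1
period_heuristic = 5.0                          # plays the role of exp.period_heuristic

min_dist = min_abs_diff(X[:, 0])                # smallest gap between points
avg_dist = np.ptp(X[:, 0]) / X.shape[0]         # range divided by number of points

min_period = {
    'min':     np.log([period_heuristic * min_dist]),
    'average': np.log([period_heuristic * avg_dist]),
    'both':    np.log([max(period_heuristic * min_dist,
                           period_heuristic * avg_dist)]),
}
# Everything is kept in log-space, matching data_shape['min_period'] above.
assert min_period['both'][0] >= min_period['average'][0]
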
Example #5
def perform_kernel_search(X, y, D, experiment_data_file_name, results_filename,
                          exp):
    '''Search for the best kernel, in parallel on fear or local machine.'''

    # Initialise random seeds - randomness may be used in e.g. data subsetting

    utils.misc.set_all_random_seeds(exp.random_seed)

    # Create location, scale and minimum period parameters to pass around for initialisations

    data_shape = {}
    data_shape['x_mean'] = [np.mean(X[:, dim]) for dim in range(X.shape[1])]
    data_shape['y_mean'] = np.mean(
        y)  #### TODO - should this be modified for non real valued data
    data_shape['x_sd'] = np.log(
        [np.std(X[:, dim]) for dim in range(X.shape[1])])
    data_shape['y_sd'] = np.log(np.std(
        y))  #### TODO - should this be modified for non real valued data
    data_shape['y_min'] = np.min(y)
    data_shape['y_max'] = np.max(y)
    data_shape['x_min'] = [np.min(X[:, dim]) for dim in range(X.shape[1])]
    data_shape['x_max'] = [np.max(X[:, dim]) for dim in range(X.shape[1])]
    data_shape['x_min_abs_diff'] = np.log(
        [utils.misc.min_abs_diff(X[:, i]) for i in range(X.shape[1])])

    # Initialise period at a multiple of the shortest / average distance between points, to prevent Nyquist problems.

    if exp.period_heuristic_type == 'none':
        data_shape['min_period'] = None
    elif exp.period_heuristic_type == 'min':
        data_shape['min_period'] = np.log([
            exp.period_heuristic * utils.misc.min_abs_diff(X[:, i])
            for i in range(X.shape[1])
        ])
    elif exp.period_heuristic_type == 'average':
        data_shape['min_period'] = np.log([
            exp.period_heuristic * np.ptp(X[:, i]) / X.shape[0]
            for i in range(X.shape[1])
        ])
    elif exp.period_heuristic_type == 'both':
        data_shape['min_period'] = np.log([
            max(exp.period_heuristic * utils.misc.min_abs_diff(X[:, i]),
                exp.period_heuristic * np.ptp(X[:, i]) / X.shape[0])
            for i in range(X.shape[1])
        ])
    else:
        warnings.warn(
            'Unrecognised period heuristic type : using most conservative heuristic'
        )
        data_shape['min_period'] = np.log([
            max(exp.period_heuristic * utils.misc.min_abs_diff(X[:, i]),
                exp.period_heuristic * np.ptp(X[:, i]) / X.shape[0])
            for i in range(X.shape[1])
        ])

    data_shape['max_period'] = [
        np.log((1.0 / exp.max_period_heuristic) *
               (data_shape['x_max'][i] - data_shape['x_min'][i]))
        for i in range(X.shape[1])
    ]

    # Initialise mean, kernel and likelihood

    m = eval(exp.mean)
    k = eval(exp.kernel)
    l = eval(exp.lik)
    current_models = [ff.GPModel(mean=m, kernel=k, likelihood=l, ndata=y.size)]

    print '\n\nStarting search with this model:\n'
    print current_models[0].pretty_print()
    print ''

    # Perform the initial expansion

    current_models = grammar.expand_models(D=D,
                                           models=current_models,
                                           base_kernels=exp.base_kernels,
                                           rules=exp.search_operators)

    # Convert to additive form if desired

    if exp.additive_form:
        current_models = [model.additive_form() for model in current_models]
        current_models = ff.remove_duplicates(current_models)

    # Set up lists to record search

    all_results = []  # List of scored kernels
    results_sequence = []  # List of lists of results, indexed by level of expansion.
    nan_sequence = []  # List of list of nan scored results
    oob_sequence = []  # List of list of out of bounds results
    best_models = None

    # Other setup

    best_score = np.inf

    # Perform search
    for depth in range(exp.max_depth):

        if exp.debug:
            current_models = current_models[0:4]

        # Add random restarts to kernels
        current_models = ff.add_random_restarts(current_models,
                                                exp.n_rand,
                                                exp.sd,
                                                data_shape=data_shape)

        # Print result of expansion
        if exp.debug:
            print '\nRandomly restarted kernels\n'
            for model in current_models:
                print model.pretty_print()

        # Remove any redundancy introduced into kernel expressions
        current_models = [model.simplified() for model in current_models]
        # Print result of simplification
        if exp.debug:
            print '\nSimplified kernels\n'
            for model in current_models:
                print model.pretty_print()
        current_models = ff.remove_duplicates(current_models)
        # Print result of duplicate removal
        if exp.debug:
            print '\nDuplicate removed kernels\n'
            for model in current_models:
                print model.pretty_print()

        # Add jitter to parameter values (empirically discovered to help optimiser)
        current_models = ff.add_jitter(current_models, exp.jitter_sd)
        # Print result of jitter
        if exp.debug:
            print '\nJittered kernels\n'
            for model in current_models:
                print model.pretty_print()

        # Add the previous best models - in case we just need to optimise more rather than changing structure
        if best_models is not None:
            for a_model in best_models:
                current_models = current_models + [
                    a_model.copy()
                ] + ff.add_jitter_to_models(
                    [a_model.copy()
                     for dummy in range(exp.n_rand)], exp.jitter_sd)

        # Randomise the order of the models to distribute computational load evenly
        np.random.shuffle(current_models)

        # Print current models
        if exp.debug:
            print '\nKernels to be evaluated\n'
            for model in current_models:
                print model.pretty_print()

        # Optimise parameters of and score the kernels
        new_results = jc.my_evaluate_models(
            current_models,
            X,
            y,
            verbose=exp.verbose,
            local_computation=exp.local_computation,
            zip_files=True,
            max_jobs=exp.max_jobs,
            iters=exp.iters,
            random_seed=exp.random_seed,
            subset=exp.subset,
            subset_size=exp.subset_size,
            full_iters=exp.full_iters,
            bundle_size=exp.bundle_size)

        # Remove models that were optimised to be out of bounds (this is similar to a 0-1 prior)
        oob_results = [
            a_model for a_model in new_results
            if a_model.out_of_bounds(data_shape)
        ]
        new_results = [
            a_model for a_model in new_results
            if not a_model.out_of_bounds(data_shape)
        ]
        #new_results = [a_model for a_model in new_results]
        #oob_results = [a_model for a_model in new_results]
        oob_results = sorted(
            oob_results,
            key=lambda a_model: GPModel.score(a_model, exp.score),
            reverse=True)
        oob_sequence.append(oob_results)

        # Some of the scores may have failed - remove nans to prevent sorting algorithms messing up
        (new_results,
         nan_results) = remove_nan_scored_models(new_results, exp.score)
        nan_sequence.append(nan_results)
        assert (len(new_results) > 0
                )  # FIXME - Need correct control flow if this happens

        # Sort the new results
        new_results = sorted(
            new_results,
            key=lambda a_model: GPModel.score(a_model, exp.score),
            reverse=True)

        print '\nAll new results\n'
        for result in new_results:
            print 'NLL=%0.1f' % result.nll, 'BIC=%0.1f' % result.bic, 'AIC=%0.1f' % result.aic, 'PL2=%0.3f' % result.pl2, result.pretty_print(
            )

        all_results = all_results + new_results
        all_results = sorted(
            all_results,
            key=lambda a_model: GPModel.score(a_model, exp.score),
            reverse=True)

        results_sequence.append(all_results)

        # Extract the best k kernels from the new all_results
        best_results = sorted(
            new_results,
            key=lambda a_model: GPModel.score(a_model, exp.score))[0:exp.k]

        # Print best kernels
        if exp.debug:
            print '\nBest models\n'
            for model in best_results:
                print model.pretty_print()

        # Expand the best models
        current_models = grammar.expand_models(D=D,
                                               models=best_results,
                                               base_kernels=exp.base_kernels,
                                               rules=exp.search_operators)

        # Print expansion
        if exp.debug:
            print '\nExpanded models\n'
            for model in current_models:
                print model.pretty_print()

        # Convert to additive form if desired
        if exp.additive_form:
            current_models = [
                model.additive_form() for model in current_models
            ]
            current_models = ff.remove_duplicates(current_models)

            # Print expansion
            if exp.debug:
                print '\nConverted into additive\n'
                for model in current_models:
                    print model.pretty_print()

        # Reduce number of kernels when in debug mode
        if exp.debug:
            current_models = current_models[0:4]

        # Write all_results to a temporary file at each level.
        all_results = sorted(
            all_results,
            key=lambda a_model: GPModel.score(a_model, exp.score),
            reverse=True)
        with open(results_filename + '.unfinished', 'w') as outfile:
            outfile.write('Experiment all_results for\n datafile = %s\n\n %s \n\n' \
                          % (experiment_data_file_name, experiment_fields_to_str(exp)))
            for (i, all_results) in enumerate(results_sequence):
                outfile.write('\n%%%%%%%%%% Level %d %%%%%%%%%%\n\n' % i)
                if exp.verbose_results:
                    for result in all_results:
                        print >> outfile, result
                else:
                    # Only print top k kernels - i.e. those used to seed the next level of the search
                    i = 0
                    for result in sorted(all_results,
                                         key=lambda a_model: GPModel.score(
                                             a_model, exp.score))[0:exp.k]:
                        print >> outfile, result
                        scipy.io.savemat(
                            results_filename + 'lvl_' + str(depth) + '_' +
                            str(i) + '.mat1', result.gpml_result)
                        i += 1
        # Write nan scored kernels to a log file
        with open(results_filename + '.nans', 'w') as outfile:
            outfile.write('Experiment nan results for\n datafile = %s\n\n %s \n\n' \
                          % (experiment_data_file_name, experiment_fields_to_str(exp)))
            for (i, nan_results) in enumerate(nan_sequence):
                outfile.write('\n%%%%%%%%%% Level %d %%%%%%%%%%\n\n' % i)
                for result in nan_results:
                    print >> outfile, result
        # Write oob kernels to a log file
        with open(results_filename + '.oob', 'w') as outfile:
            outfile.write('Experiment oob results for\n datafile = %s\n\n %s \n\n' \
                          % (experiment_data_file_name, experiment_fields_to_str(exp)))
            for (i, oob_results) in enumerate(oob_sequence):
                outfile.write('\n%%%%%%%%%% Level %d %%%%%%%%%%\n\n' % i)
                for result in oob_results:
                    print >> outfile, result

        # Have we hit a stopping criterion?
        if 'no_improvement' in exp.stopping_criteria:
            new_best_score = min(
                GPModel.score(a_model, exp.score) for a_model in new_results)
            if new_best_score < best_score - exp.improvement_tolerance:
                best_score = new_best_score
            else:
                # Insufficient improvement
                print 'Insufficient improvement to score - stopping search'
                break

    # Rename temporary results file to actual results file
    os.rename(results_filename + '.unfinished', results_filename)
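
Both versions of the search share the 'no_improvement' stopping criterion: stop as soon as the best score at a level fails to beat the running best by more than exp.improvement_tolerance (scores are minimised, so lower is better). Here is a stripped-down sketch of that control flow, with made-up per-level best scores in place of GPModel.score.

import numpy as np

improvement_tolerance = 0.1                         # plays the role of exp.improvement_tolerance
level_best_scores = [500.0, 450.0, 449.99, 430.0]   # made-up best score per search level

best_score = np.inf
stopped_at = None
for depth, new_best_score in enumerate(level_best_scores):
    if new_best_score < best_score - improvement_tolerance:
        best_score = new_best_score   # sufficient improvement: keep searching
    else:
        stopped_at = depth            # insufficient improvement: stop the search
        break

assert best_score == 450.0 and stopped_at == 2
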
Example #6
def evaluate_models(models, X, y, verbose=True, iters=300, local_computation=False, zip_files=False, max_jobs=500, random_seed=0, subset=False, subset_size=250, full_iters=0, bundle_size=1):
   
    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1: X = X[:, nax]
    if y.ndim == 1: y = y[:, nax]
    ndata = y.shape[0]
    
    # Create data file
    if verbose:
        print 'Creating data file locally'
    data_file = cblparallel.create_temp_file('.mat')

    scipy.io.savemat(data_file, {'X': X, 'y': y})
    
    # Move to fear if necessary
    if not local_computation:
        if verbose:
            print 'Moving data file to fear'
        cblparallel.copy_to_remote(data_file)
    
    # Create a list of MATLAB scripts to assess and optimise parameters for each kernel
    if verbose:
        print 'Creating scripts'
    scripts = [None] * len(models)
    for (i, model) in enumerate(models):
        parameters = {'datafile': data_file.split('/')[-1],
                      'writefile': '%(output_file)s', # N.B. cblparallel manages output files
                      'gpml_path': cblparallel.gpml_path(local_computation),
                      'mean_syntax': model.mean.get_gpml_expression(dimensions=X.shape[1]),
                      'mean_params': '[ %s ]' % ' '.join(str(p) for p in model.mean.param_vector),
                      'kernel_syntax': model.kernel.get_gpml_expression(dimensions=X.shape[1]),
                      'kernel_params': '[ %s ]' % ' '.join(str(p) for p in model.kernel.param_vector),
                      'lik_syntax': model.likelihood.get_gpml_expression(dimensions=X.shape[1]),
                      'lik_params': '[ %s ]' % ' '.join(str(p) for p in model.likelihood.param_vector),
                      'inference': model.likelihood.gpml_inference_method,
                      'iters': str(iters),
                      'seed': str(np.random.randint(2**31)),
                      'subset': 'true' if subset else 'false',
                      'subset_size' : str(subset_size),
                      'full_iters' : str(full_iters)}

        scripts[i] = gpml.OPTIMIZE_KERNEL_CODE % parameters
        #### Need to be careful with % signs
        #### For the moment, cblparallel expects no single % signs - FIXME
        scripts[i] = re.sub('% ', '%% ', scripts[i])
    
    # Send to cblparallel and save output_files
    if verbose:
        print 'Sending scripts to cblparallel'
    if local_computation:
        output_files = cblparallel.run_batch_locally(scripts, language='matlab', max_cpu=1.1, job_check_sleep=5, submit_sleep=0.1, max_running_jobs=10, verbose=verbose)  
    else:
        output_files = cblparallel.run_batch_on_fear(scripts, language='matlab', max_jobs=max_jobs, verbose=verbose, zip_files=zip_files, bundle_size=bundle_size)  
    
    # Read in results
    results = [None] * len(models)
    for (i, output_file) in enumerate(output_files):
        if verbose:
            print 'Reading output file %d of %d' % (i + 1, len(models))
        results[i] = GPModel.from_matlab_output(gpml.read_outputs(output_file), models[i], ndata)
    
    # Tidy up local output files
    for (i, output_file) in enumerate(output_files):
        if verbose:
            print 'Removing output file %d of %d' % (i + 1, len(models)) 
        os.remove(output_file)
    # Remove temporary data file (perhaps on the cluster server)
    cblparallel.remove_temp_file(data_file, local_computation)
    
    # Return results i.e. list of ScoredKernel objects
    return results
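
The first two lines of evaluate_models only ensure that X and y reach MATLAB as column matrices; nax appears to be an alias for numpy.newaxis in this codebase. Below is a minimal sketch of that reshape and the temporary .mat round-trip, using an ordinary tempfile in place of cblparallel.create_temp_file.

import os
import tempfile
import numpy as np
import scipy.io

nax = np.newaxis                 # assumed meaning of the nax alias used above

X = np.arange(5.0)               # unidimensional inputs
y = 2.0 * np.arange(5.0)
if X.ndim == 1: X = X[:, nax]    # -> shape (5, 1), a column matrix
if y.ndim == 1: y = y[:, nax]

# Ordinary temp file standing in for cblparallel.create_temp_file('.mat').
handle, data_file = tempfile.mkstemp(suffix='.mat')
os.close(handle)
scipy.io.savemat(data_file, {'X': X, 'y': y})

loaded = scipy.io.loadmat(data_file)
assert loaded['X'].shape == (5, 1) and loaded['y'].shape == (5, 1)
os.remove(data_file)             # tidy up, as evaluate_models does
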
Example #7
def evaluate_models(
    models,
    X,
    y,
    verbose=True,
    iters=300,
    local_computation=False,
    zip_files=False,
    max_jobs=500,
    random_seed=0,
    subset=False,
    subset_size=250,
    full_iters=0,
    bundle_size=1,
):

    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1:
        X = X[:, nax]
    if y.ndim == 1:
        y = y[:, nax]
    ndata = y.shape[0]

    # Create data file
    if verbose:
        print "Creating data file locally"
    data_file = cblparallel.create_temp_file(".mat")

    scipy.io.savemat(data_file, {"X": X, "y": y})

    # Move to fear if necessary
    if not local_computation:
        if verbose:
            print "Moving data file to fear"
        cblparallel.copy_to_remote(data_file)

    # Create a list of MATLAB scripts to assess and optimise parameters for each kernel
    if verbose:
        print "Creating scripts"
    scripts = [None] * len(models)
    for (i, model) in enumerate(models):
        parameters = {
            "datafile": data_file.split("/")[-1],
            "writefile": "%(output_file)s",  # N.B. cblparallel manages output files
            "gpml_path": cblparallel.gpml_path(local_computation),
            "mean_syntax": model.mean.get_gpml_expression(dimensions=X.shape[1]),
            "mean_params": "[ %s ]" % " ".join(str(p) for p in model.mean.param_vector),
            "kernel_syntax": model.kernel.get_gpml_expression(dimensions=X.shape[1]),
            "kernel_params": "[ %s ]" % " ".join(str(p) for p in model.kernel.param_vector),
            "lik_syntax": model.likelihood.get_gpml_expression(dimensions=X.shape[1]),
            "lik_params": "[ %s ]" % " ".join(str(p) for p in model.likelihood.param_vector),
            "inference": model.likelihood.gpml_inference_method,
            "iters": str(iters),
            "seed": str(np.random.randint(2 ** 31)),
            "subset": "true" if subset else "false",
            "subset_size": str(subset_size),
            "full_iters": str(full_iters),
        }

        scripts[i] = gpml.OPTIMIZE_KERNEL_CODE % parameters
        #### Need to be careful with % signs
        #### For the moment, cblparallel expects no single % signs - FIXME
        scripts[i] = re.sub("% ", "%% ", scripts[i])

    # Send to cblparallel and save output_files
    if verbose:
        print "Sending scripts to cblparallel"
    if local_computation:
        output_files = cblparallel.run_batch_locally(
            scripts,
            language="matlab",
            max_cpu=1.1,
            job_check_sleep=5,
            submit_sleep=0.1,
            max_running_jobs=10,
            verbose=verbose,
        )
    else:
        output_files = cblparallel.run_batch_on_fear(
            scripts, language="matlab", max_jobs=max_jobs, verbose=verbose, zip_files=zip_files, bundle_size=bundle_size
        )

    # Read in results
    results = [None] * len(models)
    for (i, output_file) in enumerate(output_files):
        if verbose:
            print "Reading output file %d of %d" % (i + 1, len(models))
        results[i] = GPModel.from_matlab_output(gpml.read_outputs(output_file), models[i], ndata)

    # Tidy up local output files
    for (i, output_file) in enumerate(output_files):
        if verbose:
            print "Removing output file %d of %d" % (i + 1, len(models))
        os.remove(output_file)
    # Remove temporary data file (perhaps on the cluster server)
    cblparallel.remove_temp_file(data_file, local_computation)

    # Return results i.e. list of ScoredKernel objects
    return results
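
Downstream, the returned list is ranked with GPModel.score and a criterion name such as 'bic' (lower is better), as in the search loop above. The toy stand-in below, with a minimal scored-model class in place of GPModel, illustrates how that sorting key picks the best result; the class and its score method are assumptions for illustration only.

class ToyScoredModel(object):
    """Minimal stand-in for a scored GPModel: holds only the fitted criteria."""
    def __init__(self, name, nll, bic, aic):
        self.name, self.nll, self.bic, self.aic = name, nll, bic, aic

    def score(self, criterion='bic'):
        # Assumed behaviour: return the named criterion; lower is better.
        return getattr(self, criterion)

results = [
    ToyScoredModel('SE',       nll=120.0, bic=255.0, aic=248.0),
    ToyScoredModel('SE + Per', nll=100.0, bic=230.0, aic=215.0),
    ToyScoredModel('SE * Lin', nll=105.0, bic=245.0, aic=222.0),
]

best = sorted(results, key=lambda m: m.score('bic'))[0]
assert best.name == 'SE + Per'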