Exemplo n.º 1
0
def parse_results(results_filenames, max_level=None):
    '''
    Returns the lowest-scoring model found in one or more experiment output files.

    Every line starting with "GPModel" is converted with ff.repr_to_model.
    A line starting with "score = " names the scoring criterion handed to
    GPModel.score; without one, GPModel.score is called with its default.

    results_filenames - a single filename or a list of filenames
    max_level - if given, reading of a file stops at the first "Level N"
                line with N greater than max_level

    Raises IndexError if no model lines were read (including when the list
    of filenames is empty).
    '''
    if not isinstance(results_filenames, list):
        # Backward compatibility with specifying a single file
        results_filenames = [results_filenames]
    # Bound up front so that an empty list of files cannot leave it undefined
    score = None
    result_tuples = []
    for results_filename in results_filenames:
        lines = []
        with open(results_filename) as results_file:
            # The criterion is reset per file, so the last file read decides it
            score = None
            for line in results_file:
                if line.startswith('score = '):
                    # Keep the text after the prefix, minus the final two characters
                    score = line[8:-2]
                elif line.startswith('GPModel'):
                    lines.append(line)
                elif max_level is not None:
                    # Take the level from the matched digits rather than from
                    # a fixed token position in the line
                    level_match = re.search(r'Level ([0-9]+)', line)
                    if level_match and int(level_match.group(1)) > max_level:
                        break
        result_tuples.extend(ff.repr_to_model(line.strip()) for line in lines)
    if not result_tuples:
        raise IndexError('No GPModel lines found in %s' % (results_filenames,))
    # min returns the first of any tied models, as sorted(...)[0] would
    if score is not None:
        return min(result_tuples,
                   key=lambda a_model: GPModel.score(a_model, score))
    return min(result_tuples, key=GPModel.score)
Exemplo n.º 2
0
def parse_results(results_filenames, max_level=None):
    '''
    Returns the lowest-scoring model found in one or more experiment output files.

    Every line starting with "GPModel" is converted with ff.repr_to_model.
    A line starting with "score = " names the scoring criterion handed to
    GPModel.score; without one, GPModel.score is called with its default.

    results_filenames - a single filename or a list of filenames
    max_level - if given, reading of a file stops at the first "Level N"
                line with N greater than max_level

    Raises IndexError if no model lines were read (including when the list
    of filenames is empty).
    '''
    if not isinstance(results_filenames, list):
        # Backward compatibility with specifying a single file
        results_filenames = [results_filenames]
    # Bound up front so that an empty list of files cannot leave it undefined
    score = None
    result_tuples = []
    for results_filename in results_filenames:
        lines = []
        with open(results_filename) as results_file:
            # The criterion is reset per file, so the last file read decides it
            score = None
            for line in results_file:
                if line.startswith('score = '):
                    # Keep the text after the prefix, minus the final two characters
                    score = line[8:-2]
                elif line.startswith('GPModel'):
                    lines.append(line)
                elif max_level is not None:
                    # Take the level from the matched digits rather than from
                    # a fixed token position in the line
                    level_match = re.search(r'Level ([0-9]+)', line)
                    if level_match and int(level_match.group(1)) > max_level:
                        break
        result_tuples.extend(ff.repr_to_model(line.strip()) for line in lines)
    if not result_tuples:
        raise IndexError('No GPModel lines found in %s' % (results_filenames,))
    # min returns the first of any tied models, as sorted(...)[0] would
    if score is not None:
        return min(result_tuples,
                   key=lambda a_model: GPModel.score(a_model, score))
    return min(result_tuples, key=GPModel.score)
Exemplo n.º 3
0
def _print_models(heading, models):
    '''Print heading followed by the pretty-printed form of each model.'''
    print(heading)
    for model in models:
        print(model.pretty_print())


def _min_period(X, exp):
    '''Return the log of the smallest allowed period per input dimension, or None.

    The bound is a multiple (exp.period_heuristic) of the shortest and / or
    average distance between points, to prevent Nyquist problems.
    '''
    heuristic_type = exp.period_heuristic_type
    if heuristic_type == 'none':
        # Returning here keeps 'none' from falling through to the fallback below
        return None
    columns = [X[:, i] for i in range(X.shape[1])]
    if heuristic_type == 'min':
        bounds = [exp.period_heuristic * utils.misc.min_abs_diff(column)
                  for column in columns]
    elif heuristic_type == 'average':
        bounds = [exp.period_heuristic * np.ptp(column) / X.shape[0]
                  for column in columns]
    else:
        if heuristic_type != 'both':
            warnings.warn('Unrecognised period heuristic type : using most conservative heuristic')
        bounds = [max(exp.period_heuristic * utils.misc.min_abs_diff(column),
                      exp.period_heuristic * np.ptp(column) / X.shape[0])
                  for column in columns]
    return np.log(bounds)


def _write_model_log(path, title, experiment_data_file_name, exp, sequence):
    '''Write every model in sequence (a list of lists, one per level) to path.'''
    with open(path, 'w') as outfile:
        outfile.write('Experiment %s results for\n datafile = %s\n\n %s \n\n'
                      % (title, experiment_data_file_name, experiment_fields_to_str(exp)))
        for (level, models) in enumerate(sequence):
            outfile.write('\n%%%%%%%%%% Level %d %%%%%%%%%%\n\n' % level)
            for model in models:
                outfile.write(str(model) + '\n')


def perform_kernel_search(X, y, D, experiment_data_file_name, results_filename, exp):
    '''Search for the best kernel, in parallel on fear or local machine.

    Starting from the model described by exp.mean, exp.kernel and exp.lik,
    each level of the search expands the current models with the grammar,
    optimises and scores them with jc.evaluate_models, and expands the exp.k
    lowest-scoring new models to seed the next level. The search runs for at
    most exp.max_depth levels, or stops early when 'no_improvement' is in
    exp.stopping_criteria and the best new score fails to improve on the best
    so far by more than exp.improvement_tolerance.

    X - inputs, indexed as X[:, dim], i.e. one column per input dimension
    y - targets
    D - passed to grammar.expand_models (presumably the input dimensionality)
    experiment_data_file_name - only written into the log file headers
    results_filename - results are written to results_filename + '.unfinished'
                       after every level and renamed to results_filename at
                       the end; models with nan scores go to
                       results_filename + '.nans' and out of bounds models to
                       results_filename + '.oob'
    exp - the experiment settings

    Nothing is returned. The final rename raises OSError if no level was run
    (e.g. exp.max_depth is 0), since the '.unfinished' file is never created.
    '''

    # Initialise random seeds - randomness may be used in e.g. data subsetting
    utils.misc.set_all_random_seeds(exp.random_seed)

    # Create location, scale and period parameters to pass around for initialisations
    columns = [X[:, dim] for dim in range(X.shape[1])]
    data_shape = {}
    data_shape['x_mean'] = [np.mean(column) for column in columns]
    data_shape['y_mean'] = np.mean(y)  # TODO - should this be modified for non real valued data
    data_shape['x_sd'] = np.log([np.std(column) for column in columns])
    data_shape['y_sd'] = np.log(np.std(y))  # TODO - should this be modified for non real valued data
    data_shape['y_min'] = np.min(y)
    data_shape['y_max'] = np.max(y)
    data_shape['x_min'] = [np.min(column) for column in columns]
    data_shape['x_max'] = [np.max(column) for column in columns]
    data_shape['min_period'] = _min_period(X, exp)
    data_shape['max_period'] = [np.log((1.0 / exp.max_period_heuristic) * (upper - lower))
                                for (lower, upper) in zip(data_shape['x_min'], data_shape['x_max'])]

    # Initialise mean, kernel and likelihood
    # NOTE: these experiment fields are evaluated as Python expressions, so the
    # experiment specification must come from a trusted source.
    m = eval(exp.mean)
    k = eval(exp.kernel)
    l = eval(exp.lik)
    current_models = [ff.GPModel(mean=m, kernel=k, likelihood=l, ndata=y.size)]

    print('\n\nStarting search with this model:\n')
    print(current_models[0].pretty_print())
    print('')

    # Perform the initial expansion
    current_models = grammar.expand_models(D=D, models=current_models, base_kernels=exp.base_kernels, rules=exp.search_operators)

    # Convert to additive form if desired
    if exp.additive_form:
        current_models = [model.additive_form() for model in current_models]
        current_models = ff.remove_duplicates(current_models)

    # Set up lists to record search
    all_results = []  # List of scored kernels
    results_sequence = []  # List of lists of results, indexed by level of expansion.
    nan_sequence = []  # List of list of nan scored results
    oob_sequence = []  # List of list of out of bounds results
    # NOTE(review): best_models is never assigned below, so the branch that
    # re-adds the previous best models is currently never taken.
    best_models = None

    # np.inf is the spelling available in every NumPy version
    best_score = np.inf

    def score_of(a_model):
        # Lower scores are better under the experiment's chosen criterion
        return GPModel.score(a_model, exp.score)

    # Perform search
    for depth in range(exp.max_depth):

        # Reduce number of kernels when in debug mode
        if exp.debug:
            current_models = current_models[0:4]

        # Add random restarts to kernels
        current_models = ff.add_random_restarts(current_models, exp.n_rand, exp.sd, data_shape=data_shape)
        if exp.debug:
            _print_models('\nRandomly restarted kernels\n', current_models)

        # Remove any redundancy introduced into kernel expressions
        current_models = [model.simplified() for model in current_models]
        if exp.debug:
            _print_models('\nSimplified kernels\n', current_models)
        current_models = ff.remove_duplicates(current_models)
        if exp.debug:
            _print_models('\nDuplicate removed kernels\n', current_models)

        # Add jitter to parameter values (empirically discovered to help optimiser)
        current_models = ff.add_jitter(current_models, exp.jitter_sd)
        if exp.debug:
            _print_models('\nJittered kernels\n', current_models)

        # Add the previous best models - in case we just need to optimise more rather than changing structure
        if best_models is not None:
            for a_model in best_models:
                current_models = current_models + [a_model.copy()] + ff.add_jitter_to_models([a_model.copy() for dummy in range(exp.n_rand)], exp.jitter_sd)

        # Randomise the order of the model to distribute computational load evenly
        np.random.shuffle(current_models)

        if exp.debug:
            _print_models('\nKernels to be evaluated\n', current_models)

        # Optimise parameters of and score the kernels
        new_results = jc.evaluate_models(current_models, X, y, verbose=exp.verbose, local_computation=exp.local_computation,
                                         zip_files=True, max_jobs=exp.max_jobs, iters=exp.iters, random_seed=exp.random_seed,
                                         subset=exp.subset, subset_size=exp.subset_size, full_iters=exp.full_iters, bundle_size=exp.bundle_size)

        # Remove models that were optimised to be out of bounds (this is similar to a 0-1 prior).
        # Both lists are built in one pass over the unfiltered results, so the
        # out of bounds models are actually recorded.
        in_bounds_results = []
        oob_results = []
        for a_model in new_results:
            if a_model.out_of_bounds(data_shape):
                oob_results.append(a_model)
            else:
                in_bounds_results.append(a_model)
        new_results = in_bounds_results
        oob_sequence.append(sorted(oob_results, key=score_of, reverse=True))

        # Some of the scores may have failed - remove nans to prevent sorting algorithms messing up
        (new_results, nan_results) = remove_nan_scored_models(new_results, exp.score)
        nan_sequence.append(nan_results)
        # FIXME - Need correct control flow if this happens
        assert len(new_results) > 0, 'No models with a usable score at this level'

        # Sort the new results
        new_results = sorted(new_results, key=score_of, reverse=True)

        print('\nAll new results\n')
        for result in new_results:
            print('NLL=%0.1f BIC=%0.1f AIC=%0.1f PL2=%0.3f %s'
                  % (result.nll, result.bic, result.aic, result.pl2, result.pretty_print()))

        # Each level records the cumulative results so far
        all_results = sorted(all_results + new_results, key=score_of, reverse=True)
        results_sequence.append(all_results)

        # Extract the best k kernels from the new results
        best_results = sorted(new_results, key=score_of)[0:exp.k]
        if exp.debug:
            _print_models('\nBest models\n', best_results)

        # Expand the best models
        current_models = grammar.expand_models(D=D, models=best_results, base_kernels=exp.base_kernels, rules=exp.search_operators)
        if exp.debug:
            _print_models('\nExpanded models\n', current_models)

        # Convert to additive form if desired
        if exp.additive_form:
            current_models = [model.additive_form() for model in current_models]
            current_models = ff.remove_duplicates(current_models)
            if exp.debug:
                _print_models('\nConverted into additive\n', current_models)

        # Reduce number of kernels when in debug mode
        if exp.debug:
            current_models = current_models[0:4]

        # Write all results so far to a temporary file at each level.
        with open(results_filename + '.unfinished', 'w') as outfile:
            outfile.write('Experiment all_results for\n datafile = %s\n\n %s \n\n'
                          % (experiment_data_file_name, experiment_fields_to_str(exp)))
            for (level, level_results) in enumerate(results_sequence):
                outfile.write('\n%%%%%%%%%% Level %d %%%%%%%%%%\n\n' % level)
                if not exp.verbose_results:
                    # Only print top k kernels - i.e. those used to seed the next level of the search
                    level_results = sorted(level_results, key=score_of)[0:exp.k]
                for result in level_results:
                    outfile.write(str(result) + '\n')
        # Write nan scored kernels to a log file
        _write_model_log(results_filename + '.nans', 'nan', experiment_data_file_name, exp, nan_sequence)
        # Write oob kernels to a log file
        _write_model_log(results_filename + '.oob', 'oob', experiment_data_file_name, exp, oob_sequence)

        # Have we hit a stopping criterion?
        if 'no_improvement' in exp.stopping_criteria:
            new_best_score = min(score_of(a_model) for a_model in new_results)
            if new_best_score < best_score - exp.improvement_tolerance:
                best_score = new_best_score
            else:
                # Insufficient improvement
                print('Insufficient improvement to score - stopping search')
                break

    # Rename temporary results file to actual results file
    os.rename(results_filename + '.unfinished', results_filename)
Exemplo n.º 4
0
def _print_models(heading, models):
    '''Print heading followed by the pretty-printed form of each model.'''
    print(heading)
    for model in models:
        print(model.pretty_print())


def _min_period(X, exp):
    '''Return the log of the smallest allowed period per input dimension, or None.

    The bound is a multiple (exp.period_heuristic) of the shortest and / or
    average distance between points, to prevent Nyquist problems.
    '''
    heuristic_type = exp.period_heuristic_type
    if heuristic_type == 'none':
        # Returning here keeps 'none' from falling through to the fallback below
        return None
    columns = [X[:, i] for i in range(X.shape[1])]
    if heuristic_type == 'min':
        bounds = [exp.period_heuristic * utils.misc.min_abs_diff(column)
                  for column in columns]
    elif heuristic_type == 'average':
        bounds = [exp.period_heuristic * np.ptp(column) / X.shape[0]
                  for column in columns]
    else:
        if heuristic_type != 'both':
            warnings.warn(
                'Unrecognised period heuristic type : using most conservative heuristic'
            )
        bounds = [max(exp.period_heuristic * utils.misc.min_abs_diff(column),
                      exp.period_heuristic * np.ptp(column) / X.shape[0])
                  for column in columns]
    return np.log(bounds)


def _write_model_log(path, title, experiment_data_file_name, exp, sequence):
    '''Write every model in sequence (a list of lists, one per level) to path.'''
    with open(path, 'w') as outfile:
        outfile.write('Experiment %s results for\n datafile = %s\n\n %s \n\n'
                      % (title, experiment_data_file_name, experiment_fields_to_str(exp)))
        for (level, models) in enumerate(sequence):
            outfile.write('\n%%%%%%%%%% Level %d %%%%%%%%%%\n\n' % level)
            for model in models:
                outfile.write(str(model) + '\n')


def perform_kernel_search(X, y, D, experiment_data_file_name, results_filename,
                          exp):
    '''Search for the best kernel, in parallel on fear or local machine.

    Starting from the model described by exp.mean, exp.kernel and exp.lik,
    each level of the search expands the current models with the grammar,
    optimises and scores them with jc.my_evaluate_models, and expands the
    exp.k lowest-scoring new models to seed the next level. The search runs
    for at most exp.max_depth levels, or stops early when 'no_improvement' is
    in exp.stopping_criteria and the best new score fails to improve on the
    best so far by more than exp.improvement_tolerance.

    X - inputs, indexed as X[:, dim], i.e. one column per input dimension
    y - targets
    D - passed to grammar.expand_models (presumably the input dimensionality)
    experiment_data_file_name - only written into the log file headers
    results_filename - results are written to results_filename + '.unfinished'
                       after every level and renamed to results_filename at
                       the end; models with nan scores go to
                       results_filename + '.nans' and out of bounds models to
                       results_filename + '.oob'. Unless exp.verbose_results
                       is set, the gpml_result of each of the top exp.k
                       models of every level is also saved with
                       scipy.io.savemat under
                       results_filename + 'lvl_<level>_<rank>.mat1'
    exp - the experiment settings

    Nothing is returned. The final rename raises OSError if no level was run
    (e.g. exp.max_depth is 0), since the '.unfinished' file is never created.
    '''

    # Initialise random seeds - randomness may be used in e.g. data subsetting
    utils.misc.set_all_random_seeds(exp.random_seed)

    # Create location, scale and period parameters to pass around for initialisations
    columns = [X[:, dim] for dim in range(X.shape[1])]
    data_shape = {}
    data_shape['x_mean'] = [np.mean(column) for column in columns]
    data_shape['y_mean'] = np.mean(y)  # TODO - should this be modified for non real valued data
    data_shape['x_sd'] = np.log([np.std(column) for column in columns])
    data_shape['y_sd'] = np.log(np.std(y))  # TODO - should this be modified for non real valued data
    data_shape['y_min'] = np.min(y)
    data_shape['y_max'] = np.max(y)
    data_shape['x_min'] = [np.min(column) for column in columns]
    data_shape['x_max'] = [np.max(column) for column in columns]
    data_shape['x_min_abs_diff'] = np.log(
        [utils.misc.min_abs_diff(column) for column in columns])
    data_shape['min_period'] = _min_period(X, exp)
    data_shape['max_period'] = [
        np.log((1.0 / exp.max_period_heuristic) * (upper - lower))
        for (lower, upper) in zip(data_shape['x_min'], data_shape['x_max'])
    ]

    # Initialise mean, kernel and likelihood
    # NOTE: these experiment fields are evaluated as Python expressions, so the
    # experiment specification must come from a trusted source.
    m = eval(exp.mean)
    k = eval(exp.kernel)
    l = eval(exp.lik)
    current_models = [ff.GPModel(mean=m, kernel=k, likelihood=l, ndata=y.size)]

    print('\n\nStarting search with this model:\n')
    print(current_models[0].pretty_print())
    print('')

    # Perform the initial expansion
    current_models = grammar.expand_models(D=D,
                                           models=current_models,
                                           base_kernels=exp.base_kernels,
                                           rules=exp.search_operators)

    # Convert to additive form if desired
    if exp.additive_form:
        current_models = [model.additive_form() for model in current_models]
        current_models = ff.remove_duplicates(current_models)

    # Set up lists to record search
    all_results = []  # List of scored kernels
    results_sequence = []  # List of lists of results, indexed by level of expansion.
    nan_sequence = []  # List of list of nan scored results
    oob_sequence = []  # List of list of out of bounds results
    # NOTE(review): best_models is never assigned below, so the branch that
    # re-adds the previous best models is currently never taken.
    best_models = None

    # np.inf is the spelling available in every NumPy version
    best_score = np.inf

    def score_of(a_model):
        # Lower scores are better under the experiment's chosen criterion
        return GPModel.score(a_model, exp.score)

    # Perform search
    for depth in range(exp.max_depth):

        # Reduce number of kernels when in debug mode
        if exp.debug:
            current_models = current_models[0:4]

        # Add random restarts to kernels
        current_models = ff.add_random_restarts(current_models,
                                                exp.n_rand,
                                                exp.sd,
                                                data_shape=data_shape)
        if exp.debug:
            _print_models('\nRandomly restarted kernels\n', current_models)

        # Remove any redundancy introduced into kernel expressions
        current_models = [model.simplified() for model in current_models]
        if exp.debug:
            _print_models('\nSimplified kernels\n', current_models)
        current_models = ff.remove_duplicates(current_models)
        if exp.debug:
            _print_models('\nDuplicate removed kernels\n', current_models)

        # Add jitter to parameter values (empirically discovered to help optimiser)
        current_models = ff.add_jitter(current_models, exp.jitter_sd)
        if exp.debug:
            _print_models('\nJittered kernels\n', current_models)

        # Add the previous best models - in case we just need to optimise more rather than changing structure
        if best_models is not None:
            for a_model in best_models:
                current_models = current_models + [
                    a_model.copy()
                ] + ff.add_jitter_to_models(
                    [a_model.copy() for dummy in range(exp.n_rand)],
                    exp.jitter_sd)

        # Randomise the order of the model to distribute computational load evenly
        np.random.shuffle(current_models)

        if exp.debug:
            _print_models('\nKernels to be evaluated\n', current_models)

        # Optimise parameters of and score the kernels
        new_results = jc.my_evaluate_models(
            current_models,
            X,
            y,
            verbose=exp.verbose,
            local_computation=exp.local_computation,
            zip_files=True,
            max_jobs=exp.max_jobs,
            iters=exp.iters,
            random_seed=exp.random_seed,
            subset=exp.subset,
            subset_size=exp.subset_size,
            full_iters=exp.full_iters,
            bundle_size=exp.bundle_size)

        # Remove models that were optimised to be out of bounds (this is similar to a 0-1 prior).
        # Both lists are built in one pass over the unfiltered results, so the
        # out of bounds models are actually recorded.
        in_bounds_results = []
        oob_results = []
        for a_model in new_results:
            if a_model.out_of_bounds(data_shape):
                oob_results.append(a_model)
            else:
                in_bounds_results.append(a_model)
        new_results = in_bounds_results
        oob_sequence.append(sorted(oob_results, key=score_of, reverse=True))

        # Some of the scores may have failed - remove nans to prevent sorting algorithms messing up
        (new_results,
         nan_results) = remove_nan_scored_models(new_results, exp.score)
        nan_sequence.append(nan_results)
        # FIXME - Need correct control flow if this happens
        assert len(new_results) > 0, 'No models with a usable score at this level'

        # Sort the new results
        new_results = sorted(new_results, key=score_of, reverse=True)

        print('\nAll new results\n')
        for result in new_results:
            print('NLL=%0.1f BIC=%0.1f AIC=%0.1f PL2=%0.3f %s'
                  % (result.nll, result.bic, result.aic, result.pl2,
                     result.pretty_print()))

        # Each level records the cumulative results so far
        all_results = sorted(all_results + new_results,
                             key=score_of,
                             reverse=True)
        results_sequence.append(all_results)

        # Extract the best k kernels from the new results
        best_results = sorted(new_results, key=score_of)[0:exp.k]
        if exp.debug:
            _print_models('\nBest models\n', best_results)

        # Expand the best models
        current_models = grammar.expand_models(D=D,
                                               models=best_results,
                                               base_kernels=exp.base_kernels,
                                               rules=exp.search_operators)
        if exp.debug:
            _print_models('\nExpanded models\n', current_models)

        # Convert to additive form if desired
        if exp.additive_form:
            current_models = [
                model.additive_form() for model in current_models
            ]
            current_models = ff.remove_duplicates(current_models)
            if exp.debug:
                _print_models('\nConverted into additive\n', current_models)

        # Reduce number of kernels when in debug mode
        if exp.debug:
            current_models = current_models[0:4]

        # Write all results so far to a temporary file at each level.
        with open(results_filename + '.unfinished', 'w') as outfile:
            outfile.write('Experiment all_results for\n datafile = %s\n\n %s \n\n'
                          % (experiment_data_file_name, experiment_fields_to_str(exp)))
            for (level, level_results) in enumerate(results_sequence):
                outfile.write('\n%%%%%%%%%% Level %d %%%%%%%%%%\n\n' % level)
                if exp.verbose_results:
                    for result in level_results:
                        outfile.write(str(result) + '\n')
                else:
                    # Only print top k kernels - i.e. those used to seed the next level of the search
                    top_results = sorted(level_results, key=score_of)[0:exp.k]
                    for (rank, result) in enumerate(top_results):
                        outfile.write(str(result) + '\n')
                        # The .mat1 names carry the current depth, so only the
                        # newest level needs saving; earlier levels were saved
                        # under their own depth when they were the newest.
                        if level == depth:
                            scipy.io.savemat(
                                results_filename + 'lvl_' + str(depth) + '_' +
                                str(rank) + '.mat1', result.gpml_result)
        # Write nan scored kernels to a log file
        _write_model_log(results_filename + '.nans', 'nan',
                         experiment_data_file_name, exp, nan_sequence)
        # Write oob kernels to a log file
        _write_model_log(results_filename + '.oob', 'oob',
                         experiment_data_file_name, exp, oob_sequence)

        # Have we hit a stopping criterion?
        if 'no_improvement' in exp.stopping_criteria:
            new_best_score = min(score_of(a_model) for a_model in new_results)
            if new_best_score < best_score - exp.improvement_tolerance:
                best_score = new_best_score
            else:
                # Insufficient improvement
                print('Insufficient improvement to score - stopping search')
                break

    # Rename temporary results file to actual results file
    os.rename(results_filename + '.unfinished', results_filename)