Example #1
def test_config_parsing_relative_input_paths():
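    """
    Test config file parsing when the train and test input file paths are relative
    """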

    train_dir = '../train'
    train_file = join(train_dir, 'f0.jsonlines')
    test_file = join(train_dir, 'f1.jsonlines')
    output_dir = '../output'
    custom_learner_path_input = join('other', 'majority_class_learner.py')

    # make a simple config file that has relative paths
    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'task': 'evaluate',
                           'train_file': train_file,
                           'test_file': test_file,
                           'learners': "['LogisticRegression']",
                           'log': output_dir,
                           'results': output_dir,
                           'objective': 'f1_score_micro'}

    config_template_path = join(_my_dir, 'configs',
                                'test_relative_paths.template.cfg')
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'relative_paths')

    (experiment_name, task, sampler, fixed_sampler_parameters,
     feature_hasher, hasher_features, id_col, label_col, train_set_name,
     test_set_name, suffix, featuresets, do_shuffle, model_path,
     do_grid_search, grid_objective, probability, results_path,
     pos_label_str, feature_scaling, min_feature_count,
     grid_search_jobs, grid_search_folds, cv_folds, save_cv_folds, do_stratified_folds,
     fixed_parameter_list, param_grid_list, featureset_names, learners,
     prediction_dir, log_path, train_path, test_path, ids_to_floats,
     class_map, custom_learner_path) = _parse_config_file(config_path)
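
The examples on this page all generate their config files through a fill_in_config_options test helper: it takes a .template.cfg path, a dict of option values, and a suffix, writes out a filled-in copy of the template, and returns the path to that copy, which is then fed to _parse_config_file (or run_configuration). The helper itself is not reproduced in these snippets; the sketch below is only an illustration of what such a helper might look like, assuming a configparser-compatible template whose existing options are simply overwritten (the real helper may map options to sections differently and use different file naming).

# Hypothetical sketch only -- not the actual helper used by these tests.
import configparser
from os.path import basename, dirname, join


def fill_in_config_options_sketch(template_path, values_to_fill_dict, sub_prefix):
    """Write a copy of the template with the given option values filled in."""
    config = configparser.ConfigParser()
    config.read(template_path)

    # overwrite every template option for which the caller supplied a value
    for section in config.sections():
        for option in config.options(section):
            if option in values_to_fill_dict:
                config[section][option] = values_to_fill_dict[option]

    # write the filled-in config next to the template, tagged with sub_prefix
    filled_name = basename(template_path).replace('.template.cfg',
                                                  '_{}.cfg'.format(sub_prefix))
    filled_path = join(dirname(template_path), filled_name)
    with open(filled_path, 'w') as config_file:
        config.write(config_file)
    return filled_path
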
Example #2
def test_config_parsing_bad_cv_folds():
    """
    Test to ensure config file parsing raises an error with an invalid cv_folds
    """

    train_dir = join(_my_dir, 'train')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that has a bad value for cv_folds
    # but everything else is correct
    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'task': 'cross_validate',
                           'train_directory': train_dir,
                           'num_cv_folds': 'random',
                           'featuresets': "[['f1', 'f2', 'f3']]",
                           'learners': "['LogisticRegression']",
                           'log': output_dir,
                           'results': output_dir,
                           'objective': 'f1_score_macro'}

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'bad_cv_folds')

    yield check_config_parsing_value_error, config_path
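
The yield at the end makes this a nose-style generator test: each yielded pair of check function and config path runs as a separate test case. The check_config_parsing_value_error helper (and the type_error, key_error, and file_not_found_error variants used in later examples) is defined elsewhere in the test module; a minimal sketch, assuming it simply asserts that _parse_config_file raises the corresponding exception, could look like this:

# Hypothetical sketch only -- the real checkers are defined in the test module.
from nose.tools import assert_raises


def check_config_parsing_value_error_sketch(config_path):
    """Assert that parsing the given config file raises a ValueError."""
    assert_raises(ValueError, _parse_config_file, config_path)
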
Example #3
def test_config_parsing_bad_featurenames():
    # Test to ensure config file parsing raises an error with badly specified
    # featureset names

    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that has badly specified featureset names
    # but everything else is correct
    values_to_fill_dict = {
        'experiment_name': 'config_parsing',
        'task': 'evaluate',
        'train_directory': train_dir,
        'test_directory': test_dir,
        'learners': "['LogisticRegression']",
        'featuresets': ("[['f1', 'f2', 'f3'], ['f4', 'f5', "
                        "'f6']]"),
        'log': output_dir,
        'results': output_dir
    }

    for fname, sub_prefix in zip(["['set_a']", "['1', 2]", "set_a", "1"], [
            'wrong_num_names', 'wrong_type_names', 'wrong_num_and_type1',
            'wrong_num_and_type2'
    ]):
        if fname is not None:
            values_to_fill_dict['featureset_names'] = fname

        config_template_path = join(_my_dir, 'configs',
                                    'test_config_parsing.template.cfg')
        config_path = fill_in_config_options(config_template_path,
                                             values_to_fill_dict, sub_prefix)

        yield check_config_parsing_value_error, config_path
Example #4
def test_config_parsing_bad_hashing():
    """
    Test to ensure config file parsing raises an error when feature_hasher is specified but not hasher_features
    """

    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that sets feature_hasher but not
    # hasher_features, while everything else is correct
    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'task': 'evaluate',
                           'train_directory': train_dir,
                           'test_directory': test_dir,
                           'featuresets': "[['f1', 'f2', 'f3']]",
                           'learners': "['LogisticRegression']",
                           'log': output_dir,
                           'results': output_dir,
                           'feature_hasher': 'True'}

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'bad_hashing')

    yield check_config_parsing_value_error, config_path
Example #5
def test_config_parsing_bad_cv_folds():
    """
    Test to ensure config file parsing raises an error with an invalid cv_folds
    """

    train_dir = join(_my_dir, 'train')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that has a bad value for cv_folds
    # but everything else is correct
    values_to_fill_dict = {
        'experiment_name': 'config_parsing',
        'task': 'cross_validate',
        'train_directory': train_dir,
        'num_cv_folds': 'random',
        'featuresets': "[['f1', 'f2', 'f3']]",
        'learners': "['LogisticRegression']",
        'log': output_dir,
        'results': output_dir,
        'objective': 'f1_score_macro'
    }

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict, 'bad_cv_folds')

    yield check_config_parsing_value_error, config_path
Example #6
def test_config_parsing_bad_objective_and_objectives():
    """
    Test to ensure config file parsing raises an error when both
    grid objectives and objective are given non-default values
    """

    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that sets both objective and objectives
    # but everything else is correct

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')
    values_to_fill_dict = {
        'train_directory': train_dir,
        'log': output_dir,
        'results': output_dir,
        'objectives': "['accuracy']",
        'objective': "accuracy"
    }
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'bad_objective_and_objectives')
    yield check_config_parsing_value_error, config_path
Example #7
def test_config_parsing_bad_objectives():
    """
    Test to ensure config file parsing raises an error when grid objectives are given as a string
    """

    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that specifies objectives as a string
    # but everything else is correct

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')
    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'task': 'evaluate',
                           'train_directory': train_dir,
                           'test_directory': test_dir,
                           'featuresets': "[['f1', 'f2', 'f3']]",
                           'learners': "['LogisticRegression']",
                           'log': output_dir,
                           'results': output_dir,
                           'objectives': "accuracy"}
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'bad_objectives')
    yield check_config_parsing_type_error, config_path
Example #8
def test_default_learning_curve_options():
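    """
    Test that the default learning curve options are used when none are specified
    """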

    train_dir = join(_my_dir, 'train')
    output_dir = join(_my_dir, 'output')

    values_to_fill_dict = {
        'experiment_name': 'config_parsing',
        'task': 'learning_curve',
        'train_directory': train_dir,
        'featuresets': "[['f1', 'f2', 'f3']]",
        'learners': "['LogisticRegression', 'MultinomialNB']",
        'log': output_dir,
        'results': output_dir,
        'objective': 'f1_score_macro'
    }

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'default_learning_curve')

    (experiment_name, task, sampler, fixed_sampler_parameters, feature_hasher,
     hasher_features, id_col, label_col, train_set_name, test_set_name, suffix,
     featuresets, do_shuffle, model_path, do_grid_search, grid_objective,
     probability, results_path, pos_label_str, feature_scaling,
     min_feature_count, grid_search_jobs, grid_search_folds, cv_folds,
     save_cv_folds, do_stratified_folds, fixed_parameter_list, param_grid_list,
     featureset_names, learners, prediction_dir, log_path, train_path,
     test_path, ids_to_floats, class_map, custom_learner_path,
     learning_curve_cv_folds_list,
     learning_curve_train_sizes) = _parse_config_file(config_path)

    eq_(learning_curve_cv_folds_list, [10, 10])
    ok_(np.all(learning_curve_train_sizes == np.linspace(0.1, 1.0, 5)))
Example #9
def test_config_parsing_bad_objectives():
    """
    Test to ensure config file parsing raises an error when grid objectives are given as a string
    """

    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that specifies objectives as a string
    # but everything else is correct

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')
    values_to_fill_dict = {
        'experiment_name': 'config_parsing',
        'task': 'evaluate',
        'train_directory': train_dir,
        'test_directory': test_dir,
        'featuresets': "[['f1', 'f2', 'f3']]",
        'learners': "['LogisticRegression']",
        'log': output_dir,
        'results': output_dir,
        'objectives': "accuracy"
    }
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict, 'bad_objectives')
    yield check_config_parsing_type_error, config_path
Example #10
def test_config_parsing_relative_input_path():
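    """
    Test that a relative train directory in the config file resolves to the expected absolute path
    """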

    train_dir = join('..', 'train')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that uses a relative train directory
    values_to_fill_dict = {
        'experiment_name': 'config_parsing',
        'task': 'cross_validate',
        'train_directory': train_dir,
        'featuresets': "[['f1', 'f2', 'f3']]",
        'learners': "['LogisticRegression']",
        'log': output_dir,
        'results': output_dir,
        'objective': 'f1_score_macro'
    }

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')

    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'mislocated_input_file')

    (experiment_name, task, sampler, fixed_sampler_parameters, feature_hasher,
     hasher_features, id_col, label_col, train_set_name, test_set_name, suffix,
     featuresets, do_shuffle, model_path, do_grid_search, grid_objective,
     probability, results_path, pos_label_str, feature_scaling,
     min_feature_count, grid_search_jobs, grid_search_folds, cv_folds,
     save_cv_folds, do_stratified_folds, fixed_parameter_list, param_grid_list,
     featureset_names, learners, prediction_dir, log_path, train_path,
     test_path, ids_to_floats, class_map,
     custom_learner_path) = _parse_config_file(config_path)

    eq_(normpath(train_path), join(_my_dir, 'train'))
Example #11
def test_config_parsing_bad_task():
    # Test to ensure config file parsing raises an error with invalid or
    # missing task
    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that has a bad task
    # but everything else is correct
    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'train_directory': train_dir,
                           'test_directory': test_dir,
                           'featuresets': "[['f1', 'f2', 'f3']]",
                           'learners': "['LogisticRegression']",
                           'log': output_dir,
                           'results': output_dir}

    for task_value, sub_prefix in zip([None, '', 'procrastinate'],
                                      ['no_task', 'missing_task', 'bad_task']):
        if task_value is not None:
            values_to_fill_dict['task'] = task_value
        config_template_path = join(_my_dir, 'configs',
                                    'test_config_parsing.template.cfg')
        config_path = fill_in_config_options(config_template_path,
                                             values_to_fill_dict,
                                             sub_prefix)

        yield check_config_parsing_value_error, config_path
Example #12
def test_config_parsing_bad_hashing():
    """
    Test to ensure config file parsing raises an error when feature_hasher is specified but not hasher_features
    """

    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that sets feature_hasher but not
    # hasher_features, while everything else is correct
    values_to_fill_dict = {
        'experiment_name': 'config_parsing',
        'task': 'evaluate',
        'train_directory': train_dir,
        'test_directory': test_dir,
        'featuresets': "[['f1', 'f2', 'f3']]",
        'learners': "['LogisticRegression']",
        'log': output_dir,
        'results': output_dir,
        'feature_hasher': 'True'
    }

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict, 'bad_hashing')

    yield check_config_parsing_value_error, config_path
Example #13
def test_config_parsing_bad_featuresets():
    # Test to ensure config file parsing raises an error with badly specified
    # featuresets

    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that has badly specified featuresets
    # but everything else is correct
    values_to_fill_dict = {
        'experiment_name': 'config_parsing',
        'task': 'evaluate',
        'train_directory': train_dir,
        'test_directory': test_dir,
        'learners': "['LogisticRegression']",
        'log': output_dir,
        'results': output_dir
    }

    for featuresets, sub_prefix in zip(
        [None, '[]', "{'f1', 'f2', 'f3'}", "[['f1', 'f2'], 'f3', 'f4']"],
        ['no_feats', 'empty_feats', 'non_list_feats1', 'non_list_feats2']):
        if featuresets is not None:
            values_to_fill_dict['featuresets'] = featuresets

        config_template_path = join(_my_dir, 'configs',
                                    'test_config_parsing.template.cfg')
        config_path = fill_in_config_options(config_template_path,
                                             values_to_fill_dict, sub_prefix)
        yield check_config_parsing_value_error, config_path
Example #14
def test_config_parsing_bad_task():
    # Test to ensure config file parsing raises an error with invalid or
    # missing task
    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that has a bad task
    # but everything else is correct
    values_to_fill_dict = {
        'experiment_name': 'config_parsing',
        'train_directory': train_dir,
        'test_directory': test_dir,
        'featuresets': "[['f1', 'f2', 'f3']]",
        'learners': "['LogisticRegression']",
        'log': output_dir,
        'results': output_dir
    }

    for task_value, sub_prefix in zip([None, '', 'procrastinate'],
                                      ['no_task', 'missing_task', 'bad_task']):
        if task_value is not None:
            values_to_fill_dict['task'] = task_value
        config_template_path = join(_my_dir, 'configs',
                                    'test_config_parsing.template.cfg')
        config_path = fill_in_config_options(config_template_path,
                                             values_to_fill_dict, sub_prefix)

        yield check_config_parsing_value_error, config_path
Example #15
def test_config_parsing_bad_learner():
    # Test to ensure config file parsing raises an error with missing, bad and
    # duplicate learners

    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that has a bad learner specification
    # but everything else is correct
    values_to_fill_dict = {
        'experiment_name': 'config_parsing',
        'task': 'evaluate',
        'train_directory': train_dir,
        'test_directory': test_dir,
        'featuresets': "[['f1', 'f2', 'f3']]",
        'log': output_dir,
        'results': output_dir
    }

    for learners_list, sub_prefix in zip([
            None, '[]', 'LogisticRegression', "['LogisticRegression', "
            "'LogisticRegression']"
    ], [
            'no_learner', 'empty_learner', 'not_list_learner',
            'duplicate_learner'
    ]):
        if learners_list is not None:
            values_to_fill_dict['learners'] = learners_list

        config_template_path = join(_my_dir, 'configs',
                                    'test_config_parsing.template.cfg')
        config_path = fill_in_config_options(config_template_path,
                                             values_to_fill_dict, sub_prefix)
        yield check_config_parsing_value_error, config_path
Example #16
def test_config_parsing_bad_featurenames():
    # Test to ensure config file parsing raises an error with badly specified
    # featureset names

    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that has badly specified featureset names
    # but everything else is correct
    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'task': 'evaluate',
                           'train_directory': train_dir,
                           'test_directory': test_dir,
                           'learners': "['LogisticRegression']",
                           'featuresets': ("[['f1', 'f2', 'f3'], ['f4', 'f5', "
                                           "'f6']]"),
                           'log': output_dir,
                           'results': output_dir}

    for fname, sub_prefix in zip(["['set_a']", "['1', 2]", "set_a", "1"],
                                 ['wrong_num_names', 'wrong_type_names',
                                  'wrong_num_and_type1',
                                  'wrong_num_and_type2']):
        if fname is not None:
            values_to_fill_dict['featureset_names'] = fname

        config_template_path = join(_my_dir, 'configs',
                                    'test_config_parsing.template.cfg')
        config_path = fill_in_config_options(config_template_path,
                                             values_to_fill_dict,
                                             sub_prefix)

        yield check_config_parsing_value_error, config_path
Example #17
def test_config_parsing_bad_scaling():
    # Test to ensure config file parsing raises an error with invalid scaling
    # type

    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that has an invalid feature scaling type
    # but everything else is correct
    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'task': 'evaluate',
                           'train_directory': train_dir,
                           'test_directory': test_dir,
                           'learners': "['LogisticRegression']",
                           'featuresets': ("[['f1', 'f2', 'f3'], ['f4', 'f5', "
                                           "'f6']]"),
                           'log': output_dir,
                           'results': output_dir}

    for scaling_type, sub_prefix in zip(["foo", "True", "False"],
                                        ['bad_scaling1', 'bad_scaling2',
                                         'bad_scaling3']):

        values_to_fill_dict['feature_scaling'] = scaling_type

        config_template_path = join(_my_dir, 'configs',
                                    'test_config_parsing.template.cfg')
        config_path = fill_in_config_options(config_template_path,
                                             values_to_fill_dict,
                                             sub_prefix)

        yield check_config_parsing_value_error, config_path
Example #18
def test_config_parsing_bad_learner():
    # Test to ensure config file parsing raises an error with missing, bad and
    # duplicate learners

    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that has a bad learner specification
    # but everything else is correct
    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'task': 'evaluate',
                           'train_directory': train_dir,
                           'test_directory': test_dir,
                           'featuresets': "[['f1', 'f2', 'f3']]",
                           'log': output_dir,
                           'results': output_dir}

    for learners_list, sub_prefix in zip([None, '[]', 'LogisticRegression',
                                          "['LogisticRegression', "
                                          "'LogisticRegression']"],
                                         ['no_learner', 'empty_learner',
                                          'not_list_learner',
                                          'duplicate_learner']):
        if learners_list is not None:
            values_to_fill_dict['learners'] = learners_list

        config_template_path = join(_my_dir, 'configs',
                                    'test_config_parsing.template.cfg')
        config_path = fill_in_config_options(config_template_path,
                                             values_to_fill_dict,
                                             sub_prefix)
        yield check_config_parsing_value_error, config_path
Example #19
def test_config_parsing_option_in_wrong_section():

    train_dir = join(_my_dir, 'train')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that has an option in the wrong section
    values_to_fill_dict = {
        'experiment_name': 'config_parsing',
        'task': 'cross_validate',
        'train_directory': train_dir,
        'featuresets': "[['f1', 'f2', 'f3']]",
        'learners': "['LogisticRegression']",
        'log': output_dir,
        'results': output_dir,
        'probability': 'true',
        'objective': 'f1_score_macro'
    }

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')

    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'option_in_wrong_section')

    yield check_config_parsing_key_error, config_path
Example #20
def test_config_parsing_bad_featuresets():
    # Test to ensure config file parsing raises an error with badly specified
    # featuresets

    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that has badly specified featuresets
    # but everything else is correct
    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'task': 'evaluate',
                           'train_directory': train_dir,
                           'test_directory': test_dir,
                           'learners': "['LogisticRegression']",
                           'log': output_dir,
                           'results': output_dir}

    for featuresets, sub_prefix in zip([None, '[]', "{'f1', 'f2', 'f3'}",
                                        "[['f1', 'f2'], 'f3', 'f4']"],
                                       ['no_feats', 'empty_feats',
                                        'non_list_feats1', 'non_list_feats2']):
        if featuresets is not None:
            values_to_fill_dict['featuresets'] = featuresets

        config_template_path = join(_my_dir, 'configs',
                                    'test_config_parsing.template.cfg')
        config_path = fill_in_config_options(config_template_path,
                                             values_to_fill_dict,
                                             sub_prefix)
        yield check_config_parsing_value_error, config_path
Example #21
def test_config_parsing_mislocated_input_path():
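    """
    Test that config file parsing raises an error when the train directory cannot be located
    """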

    train_dir = 'train'
    output_dir = join(_my_dir, 'output')

    # make a simple config file that has a mislocated input path
    values_to_fill_dict = {
        'experiment_name': 'config_parsing',
        'task': 'cross_validate',
        'train_directory': train_dir,
        'featuresets': "[['f1', 'f2', 'f3']]",
        'learners': "['LogisticRegression']",
        'log': output_dir,
        'results': output_dir,
        'objective': 'f1_score_macro'
    }

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')

    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'mislocated_input_file')

    yield check_config_parsing_file_not_found_error, config_path
Example #22
def test_config_parsing_bad_scaling():
    # Test to ensure config file parsing raises an error with invalid scaling
    # type

    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that has an invalid feature scaling type
    # but everything else is correct
    values_to_fill_dict = {
        'experiment_name': 'config_parsing',
        'task': 'evaluate',
        'train_directory': train_dir,
        'test_directory': test_dir,
        'learners': "['LogisticRegression']",
        'featuresets': ("[['f1', 'f2', 'f3'], ['f4', 'f5', "
                        "'f6']]"),
        'log': output_dir,
        'results': output_dir
    }

    for scaling_type, sub_prefix in zip(
        ["foo", "True", "False"],
        ['bad_scaling1', 'bad_scaling2', 'bad_scaling3']):

        values_to_fill_dict['feature_scaling'] = scaling_type

        config_template_path = join(_my_dir, 'configs',
                                    'test_config_parsing.template.cfg')
        config_path = fill_in_config_options(config_template_path,
                                             values_to_fill_dict, sub_prefix)

        yield check_config_parsing_value_error, config_path
Example #23
def test_config_parsing_relative_input_paths():

    train_dir = '../train'
    train_file = join(train_dir, 'f0.jsonlines')
    test_file = join(train_dir, 'f1.jsonlines')
    output_dir = '../output'
    custom_learner_path_input = join('other', 'majority_class_learner.py')

    # make a simple config file that has relative paths
    values_to_fill_dict = {
        'experiment_name': 'config_parsing',
        'task': 'evaluate',
        'train_file': train_file,
        'test_file': test_file,
        'learners': "['LogisticRegression']",
        'log': output_dir,
        'results': output_dir,
        'objective': 'f1_score_micro'
    }

    config_template_path = join(_my_dir, 'configs',
                                'test_relative_paths.template.cfg')
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict, 'relative_paths')

    (experiment_name, task, sampler, fixed_sampler_parameters, feature_hasher,
     hasher_features, id_col, label_col, train_set_name, test_set_name, suffix,
     featuresets, do_shuffle, model_path, do_grid_search, grid_objective,
     probability, results_path, pos_label_str, feature_scaling,
     min_feature_count, grid_search_jobs, grid_search_folds, cv_folds,
     save_cv_folds, do_stratified_folds, fixed_parameter_list, param_grid_list,
     featureset_names, learners, prediction_dir, log_path, train_path,
     test_path, ids_to_floats, class_map,
     custom_learner_path) = _parse_config_file(config_path)
Example #24
def test_setting_number_of_cv_folds():
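    """
    Test that the number of cross-validation folds can be set via num_cv_folds
    """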

    train_dir = join(_my_dir, 'train')
    output_dir = join(_my_dir, 'output')
    # make a simple config file that sets num_cv_folds to 5

    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'task': 'cross_validate',
                           'train_directory': train_dir,
                           'featuresets': "[['f1', 'f2', 'f3']]",
                           'learners': "['LogisticRegression']",
                           'log': output_dir,
                           'results': output_dir,
                           'num_cv_folds': "5",
                           'objective': 'f1_score_macro'}

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'default_cv_folds')

    (experiment_name, task, sampler, fixed_sampler_parameters,
     feature_hasher, hasher_features, id_col, label_col, train_set_name,
     test_set_name, suffix, featuresets, do_shuffle, model_path,
     do_grid_search, grid_objective, probability, results_path,
     pos_label_str, feature_scaling, min_feature_count,
     grid_search_jobs, grid_search_folds, cv_folds, save_cv_folds, do_stratified_folds,
     fixed_parameter_list, param_grid_list, featureset_names, learners,
     prediction_dir, log_path, train_path, test_path, ids_to_floats,
     class_map, custom_learner_path) = _parse_config_file(config_path)

    eq_(cv_folds, 5)
Example #25
def test_config_parsing_bad_test():
    # Test to ensure config file parsing raises an error with invalid test path
    # specifications

    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that has a bad test path specification
    # but everything else is correct
    values_to_fill_dict = {
        'experiment_name': 'config_parsing',
        'task': 'evaluate',
        'train_directory': train_dir,
        'learners': "['LogisticRegression']",
        'featuresets': ("[['f1', 'f2', 'f3'], ['f4', 'f5', "
                        "'f6']]"),
        'log': output_dir,
        'results': output_dir
    }

    for sub_prefix in [
            'both_test_path_and_file', 'nonexistent_test_path',
            'nonexistent_test_file'
    ]:

        if sub_prefix == 'both_test_path_and_file':
            test_fh = tempfile.NamedTemporaryFile(suffix='jsonlines',
                                                  prefix=join(
                                                      _my_dir, 'other',
                                                      ('test_config_'
                                                       'parsing_')))
            values_to_fill_dict['test_file'] = test_fh.name
            values_to_fill_dict['test_directory'] = test_dir

        elif sub_prefix == 'nonexistent_test_path':
            values_to_fill_dict['test_directory'] = join(test_dir, 'foo')

        elif sub_prefix == 'nonexistent_test_file':
            values_to_fill_dict['test_file'] = 'foo.jsonlines'

        config_template_path = join(_my_dir, 'configs',
                                    'test_config_parsing.template.cfg')
        config_path = fill_in_config_options(config_template_path,
                                             values_to_fill_dict, sub_prefix)

        yield check_config_parsing_value_error, config_path

        if sub_prefix == 'both_test_path_and_file':
            test_fh.close()
Example #26
def test_config_parsing_bad_train():
    # Test to ensure config file parsing raises an error with invalid train
    # path specifications

    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that has a bad train path specification
    # but everything else is correct
    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'task': 'evaluate',
                           'test_directory': test_dir,
                           'learners': "['LogisticRegression']",
                           'featuresets': ("[['f1', 'f2', 'f3'], ['f4', 'f5', "
                                           "'f6']]"),
                           'log': output_dir,
                           'results': output_dir}

    for sub_prefix in ['no_train_path_or_file',
                       'both_train_path_and_file',
                       'nonexistent_train_path',
                       'nonexistent_test_file']:

        if sub_prefix == 'both_train_path_and_file':
            train_fh = tempfile.NamedTemporaryFile(suffix='jsonlines',
                                                   prefix=join(_my_dir,
                                                               'other',
                                                               ('test_config_'
                                                                'parsing_')))
            values_to_fill_dict['train_file'] = train_fh.name
            values_to_fill_dict['train_directory'] = train_dir

        elif sub_prefix == 'nonexistent_train_path':
            values_to_fill_dict['train_directory'] = join(train_dir, 'foo')

        elif sub_prefix == 'nonexistent_test_file':
            values_to_fill_dict['train_file'] = 'foo.jsonlines'

        config_template_path = join(_my_dir, 'configs',
                                    'test_config_parsing.template.cfg')
        config_path = fill_in_config_options(config_template_path,
                                             values_to_fill_dict,
                                             sub_prefix)

        yield check_config_parsing_value_error, config_path

        if sub_prefix == 'both_train_path_and_file':
            train_fh.close()
Example #27
def test_setting_fixed_parameters():
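    """
    Test that fixed parameters specified in the config file are parsed correctly
    """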

    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that sets fixed_parameters

    values_to_fill_dict = {
        'experiment_name': 'config_parsing',
        'task': 'evaluate',
        'train_directory': train_dir,
        'test_directory': test_dir,
        'featuresets': "[['f1', 'f2', 'f3']]",
        'learners': "['LinearSVC']",
        'log': output_dir,
        'results': output_dir,
        'fixed_parameters': "[{'C': [1e-6, 0.001, 1, 10, 100, 1e5]}]",
        'objective': 'f1_score_macro'
    }

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'fixed_parameters')

    (experiment_name, task, sampler, fixed_sampler_parameters, feature_hasher,
     hasher_features, id_col, label_col, train_set_name, test_set_name, suffix,
     featuresets, do_shuffle, model_path, do_grid_search, grid_objective,
     probability, results_path, pos_label_str, feature_scaling,
     min_feature_count, grid_search_jobs, grid_search_folds, cv_folds,
     save_cv_folds, do_stratified_folds, fixed_parameter_list, param_grid_list,
     featureset_names, learners, prediction_dir, log_path, train_path,
     test_path, ids_to_floats, class_map, custom_learner_path,
     learning_curve_cv_folds_list,
     learning_curve_train_sizes) = _parse_config_file(config_path)

    eq_(fixed_parameter_list[0]['C'][0], 1e-6)
    eq_(fixed_parameter_list[0]['C'][1], 1e-3)
    eq_(fixed_parameter_list[0]['C'][2], 1)
    eq_(fixed_parameter_list[0]['C'][3], 10)
    eq_(fixed_parameter_list[0]['C'][4], 100)
    eq_(fixed_parameter_list[0]['C'][5], 1e5)
Example #28
def test_setting_fixed_parameters():

    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that sets fixed_parameters

    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'task': 'evaluate',
                           'train_directory': train_dir,
                           'test_directory': test_dir,
                           'featuresets': "[['f1', 'f2', 'f3']]",
                           'learners': "['LinearSVC']",
                           'log': output_dir,
                           'results': output_dir,
                           'fixed_parameters': "[{'C': [1e-6, 0.001, 1, 10, 100, 1e5]}]",
                           'objective': 'f1_score_macro'}

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'fixed_parameters')

    (experiment_name, task, sampler, fixed_sampler_parameters,
     feature_hasher, hasher_features, id_col, label_col, train_set_name,
     test_set_name, suffix, featuresets, do_shuffle, model_path,
     do_grid_search, grid_objective, probability, results_path,
     pos_label_str, feature_scaling, min_feature_count,
     grid_search_jobs, grid_search_folds, cv_folds, save_cv_folds, do_stratified_folds,
     fixed_parameter_list, param_grid_list, featureset_names, learners,
     prediction_dir, log_path, train_path, test_path, ids_to_floats,
     class_map, custom_learner_path) = _parse_config_file(config_path)

    eq_(fixed_parameter_list[0]['C'][0], 1e-6)
    eq_(fixed_parameter_list[0]['C'][1], 1e-3)
    eq_(fixed_parameter_list[0]['C'][2], 1)
    eq_(fixed_parameter_list[0]['C'][3], 10)
    eq_(fixed_parameter_list[0]['C'][4], 100)
    eq_(fixed_parameter_list[0]['C'][5], 1e5)
Example #29
def test_config_parsing_mislocated_input_path():

    train_dir = 'train'
    output_dir = join(_my_dir, 'output')

    # make a simple config file that has a mislocated input path
    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'task': 'cross_validate',
                           'train_directory': train_dir,
                           'featuresets': "[['f1', 'f2', 'f3']]",
                           'learners': "['LogisticRegression']",
                           'log': output_dir,
                           'results': output_dir,
                           'objective': 'f1_score_macro'}

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')

    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'mislocated_input_file')

    yield check_config_parsing_file_not_found_error, config_path
Example #30
def test_config_parsing_option_in_wrong_section():

    train_dir = join(_my_dir, 'train')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that has an option in the wrong section
    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'task': 'cross_validate',
                           'train_directory': train_dir,
                           'featuresets': "[['f1', 'f2', 'f3']]",
                           'learners': "['LogisticRegression']",
                           'log': output_dir,
                           'results': output_dir,
                           'probability': 'true',
                           'objective': 'f1_score_macro'}

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')

    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'option_in_wrong_section')

    yield check_config_parsing_key_error, config_path
Example #31
def test_config_parsing_bad_objective_and_objectives():
    """
    Test to ensure config file parsing raises an error when both
    grid objectives and objective are given non-default values
    """

    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that sets both objective and objectives
    # but everything else is correct

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')
    values_to_fill_dict = {'train_directory': train_dir,
                           'log': output_dir,
                           'results': output_dir,
                           'objectives': "['accuracy']",
                           'objective': "accuracy"}
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'bad_objective_and_objectives')
    yield check_config_parsing_value_error, config_path
Example #32
def test_config_parsing_bad_task_paths():
    # Test to ensure config file parsing raises an error with various
    # incorrectly set path

    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that has an invalid combination of task
    # and paths, while everything else is correct
    values_to_fill_dict = {
        'experiment_name': 'config_parsing',
        'train_directory': train_dir,
        'learners': "['LogisticRegression']",
        'featuresets': ("[['f1', 'f2', 'f3'], ['f4', 'f5', "
                        "'f6']]"),
        'log': output_dir
    }

    for sub_prefix in [
            'predict_no_test', 'evaluate_no_test', 'xv_with_test_path',
            'train_with_test_path', 'xv_with_test_file',
            'train_with_test_file', 'train_with_results',
            'predict_with_results', 'train_no_model', 'train_with_predictions',
            'xv_with_model'
    ]:

        if sub_prefix == 'predict_no_test':
            values_to_fill_dict['task'] = 'predict'
            values_to_fill_dict['predictions'] = output_dir

        elif sub_prefix == 'evaluate_no_test':
            values_to_fill_dict['task'] = 'evaluate'
            values_to_fill_dict['results'] = output_dir

        elif sub_prefix == 'xv_with_test_path':
            values_to_fill_dict['task'] = 'cross_validate'
            values_to_fill_dict['results'] = output_dir
            values_to_fill_dict['test_directory'] = test_dir

        elif sub_prefix == 'train_with_test_path':
            values_to_fill_dict['task'] = 'train'
            values_to_fill_dict['models'] = output_dir
            values_to_fill_dict['test_directory'] = test_dir

        elif sub_prefix == 'xv_with_test_file':
            values_to_fill_dict['task'] = 'cross_validate'
            values_to_fill_dict['results'] = output_dir
            test_fh1 = tempfile.NamedTemporaryFile(suffix='jsonlines',
                                                   prefix=join(
                                                       _my_dir, 'other',
                                                       ('test_config_'
                                                        'parsing_')))
            values_to_fill_dict['test_file'] = test_fh1.name

        elif sub_prefix == 'train_with_test_file':
            values_to_fill_dict['task'] = 'train'
            values_to_fill_dict['models'] = output_dir
            test_fh2 = tempfile.NamedTemporaryFile(suffix='jsonlines',
                                                   prefix=join(
                                                       _my_dir, 'other',
                                                       ('test_config_'
                                                        'parsing_')))

            values_to_fill_dict['test_file'] = test_fh2.name

        elif sub_prefix == 'train_with_results':
            values_to_fill_dict['task'] = 'train'
            values_to_fill_dict['models'] = output_dir
            values_to_fill_dict['results'] = output_dir

        elif sub_prefix == 'predict_with_results':
            values_to_fill_dict['task'] = 'predict'
            values_to_fill_dict['test_directory'] = test_dir
            values_to_fill_dict['predictions'] = output_dir
            values_to_fill_dict['results'] = output_dir

        elif sub_prefix == 'train_no_model':
            values_to_fill_dict['task'] = 'train'

        elif sub_prefix == 'train_with_predictions':
            values_to_fill_dict['task'] = 'train'
            values_to_fill_dict['models'] = output_dir
            values_to_fill_dict['predictions'] = output_dir

        elif sub_prefix == 'xv_with_model':
            values_to_fill_dict['task'] = 'cross_validate'
            values_to_fill_dict['results'] = output_dir
            values_to_fill_dict['models'] = output_dir

        config_template_path = join(_my_dir, 'configs',
                                    'test_config_parsing.template.cfg')
        config_path = fill_in_config_options(config_template_path,
                                             values_to_fill_dict, sub_prefix)

        yield check_config_parsing_value_error, config_path

        if sub_prefix == 'xv_with_test_file':
            test_fh1.close()

        elif sub_prefix == 'train_with_test_file':
            test_fh2.close()
Example #33
def test_config_parsing_bad_task_paths():
    # Test to ensure config file parsing raises an error with various
    # incorrectly set path

    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that has an invalid combination of task
    # and paths, while everything else is correct
    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'train_directory': train_dir,
                           'learners': "['LogisticRegression']",
                           'featuresets': ("[['f1', 'f2', 'f3'], ['f4', 'f5', "
                                           "'f6']]"),
                           'log': output_dir}

    for sub_prefix in ['predict_no_test', 'evaluate_no_test',
                       'xv_with_test_path', 'train_with_test_path',
                       'xv_with_test_file', 'train_with_test_file',
                       'train_with_results', 'predict_with_results',
                       'train_no_model', 'train_with_predictions',
                       'xv_with_model']:

        if sub_prefix == 'predict_no_test':
            values_to_fill_dict['task'] = 'predict'
            values_to_fill_dict['predictions'] = output_dir

        elif sub_prefix == 'evaluate_no_test':
            values_to_fill_dict['task'] = 'evaluate'
            values_to_fill_dict['results'] = output_dir

        elif sub_prefix == 'xv_with_test_path':
            values_to_fill_dict['task'] = 'cross_validate'
            values_to_fill_dict['results'] = output_dir
            values_to_fill_dict['test_directory'] = test_dir

        elif sub_prefix == 'train_with_test_path':
            values_to_fill_dict['task'] = 'train'
            values_to_fill_dict['models'] = output_dir
            values_to_fill_dict['test_directory'] = test_dir

        elif sub_prefix == 'xv_with_test_file':
            values_to_fill_dict['task'] = 'cross_validate'
            values_to_fill_dict['results'] = output_dir
            test_fh1 = tempfile.NamedTemporaryFile(suffix='jsonlines',
                                                   prefix=join(_my_dir,
                                                               'other',
                                                               ('test_config_'
                                                                'parsing_')))
            values_to_fill_dict['test_file'] = test_fh1.name

        elif sub_prefix == 'train_with_test_file':
            values_to_fill_dict['task'] = 'train'
            values_to_fill_dict['models'] = output_dir
            test_fh2 = tempfile.NamedTemporaryFile(suffix='jsonlines',
                                                   prefix=join(_my_dir,
                                                               'other',
                                                               ('test_config_'
                                                                'parsing_')))

            values_to_fill_dict['test_file'] = test_fh2.name

        elif sub_prefix == 'train_with_results':
            values_to_fill_dict['task'] = 'train'
            values_to_fill_dict['models'] = output_dir
            values_to_fill_dict['results'] = output_dir

        elif sub_prefix == 'predict_with_results':
            values_to_fill_dict['task'] = 'predict'
            values_to_fill_dict['test_directory'] = test_dir
            values_to_fill_dict['predictions'] = output_dir
            values_to_fill_dict['results'] = output_dir

        elif sub_prefix == 'train_no_model':
            values_to_fill_dict['task'] = 'train'

        elif sub_prefix == 'train_with_predictions':
            values_to_fill_dict['task'] = 'train'
            values_to_fill_dict['models'] = output_dir
            values_to_fill_dict['predictions'] = output_dir

        elif sub_prefix == 'xv_with_model':
            values_to_fill_dict['task'] = 'cross_validate'
            values_to_fill_dict['results'] = output_dir
            values_to_fill_dict['models'] = output_dir

        config_template_path = join(_my_dir, 'configs',
                                    'test_config_parsing.template.cfg')
        config_path = fill_in_config_options(config_template_path,
                                             values_to_fill_dict,
                                             sub_prefix)

        yield check_config_parsing_value_error, config_path

        if sub_prefix == 'xv_with_test_file':
            test_fh1.close()

        elif sub_prefix == 'train_with_test_file':
            test_fh2.close()
Example #34
def check_xval_fancy_results_file(do_grid_search,
                                  use_folds_file,
                                  use_folds_file_for_grid_search,
                                  use_additional_metrics):
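    """
    Run a cross-validation experiment with the given options and verify the
    contents of the results file it produces
    """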

    train_path = join(_my_dir, 'train', 'f0.jsonlines')
    output_dir = join(_my_dir, 'output')

    # make a simple config file for cross-validation
    values_to_fill_dict = {'experiment_name': 'test_fancy_xval',
                           'train_file': train_path,
                           'task': 'cross_validate',
                           'grid_search': 'true',
                           'objectives': "['f1_score_micro']",
                           'featureset_names': '["f0"]',
                           'num_cv_folds': '6',
                           'grid_search_folds': '4',
                           'learners': "['LogisticRegression']",
                           'log': output_dir,
                           'predictions': output_dir,
                           'results': output_dir}

    folds_file_path = join(_my_dir, 'train', 'folds_file_test.csv')
    if use_folds_file:
        values_to_fill_dict['folds_file'] = folds_file_path
    values_to_fill_dict['grid_search'] = str(do_grid_search)
    values_to_fill_dict['use_folds_file_for_grid_search'] = str(use_folds_file_for_grid_search)

    if use_additional_metrics:
        if PY2:
            values_to_fill_dict['metrics'] = str([b"accuracy", b"unweighted_kappa"])
        else:
            values_to_fill_dict['metrics'] = str(["accuracy", "unweighted_kappa"])

    config_template_path = join(_my_dir,
                                'configs',
                                'test_fancy.template.cfg')

    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'xval')

    # run the experiment
    run_configuration(config_path, quiet=True)

    # now make sure that the results file was produced
    results_file_path = join(_my_dir, 'output', 'test_fancy_xval_f0_LogisticRegression.results')
    ok_(exists(results_file_path))

    # read in all the lines and look at the lines up to where we print the "Total Time"
    with open(results_file_path, 'r') as resultsf:
        results_lines = resultsf.readlines()
        end_idx = [results_lines.index(l) for l in results_lines if l.startswith('Total Time:')][0]
        results_lines = results_lines[:end_idx+1]

        # read in the "keys" and "values" separated by colons into a dictionary
        results_dict = dict([rl.strip().split(': ') for rl in results_lines])

    # check that the fields we expect in the results file are there
    # and the ones that we do not expect aren't
    if do_grid_search:
        eq_(results_dict['Grid Search'], 'True')
        eq_(results_dict['Grid Objective Function'], 'f1_score_micro')
    else:
        eq_(results_dict['Grid Search'], 'False')
        ok_('Grid Search Folds' not in results_dict)
        ok_('Grid Objective Function' not in results_dict)

    if use_folds_file:
        eq_(results_dict['Number of Folds'], '5 via folds file')
        ok_('Stratified Folds' not in results_dict)
        eq_(results_dict['Specified Folds File'], folds_file_path)
        if do_grid_search:
            if use_folds_file_for_grid_search:
                eq_(results_dict['Grid Search Folds'], '5 via folds file')
                eq_(results_dict['Using Folds File for Grid Search'], 'True')
            else:
                eq_(results_dict['Grid Search Folds'], '4')
                eq_(results_dict['Using Folds File for Grid Search'], 'False')
    else:
        eq_(results_dict['Number of Folds'], '6')
        eq_(results_dict['Stratified Folds'], 'True')
        ok_('Using Folds File for Grid Search' not in results_dict)
        ok_('Specified Folds File' not in results_dict)
        if do_grid_search:
            eq_(results_dict['Grid Search Folds'], '4')

    if use_additional_metrics:
        expected_metrics = [b"accuracy", b"unweighted_kappa"] if PY2 else ["accuracy", "unweighted_kappa"]

        eq_(sorted(literal_eval(results_dict['Additional Evaluation Metrics'])),
            sorted(expected_metrics))
Example #35
def check_grid_search_cv_results(task, do_grid_search):
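    """
    Run the given task for two learners and compare the grid search CV results
    in the output against the expected results stored on disk
    """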
    learners = ['LogisticRegression', 'SVC']
    expected_path = join(_my_dir, 'other', 'cv_results')
    time_field = lambda x: x.endswith('_time')
    train_path = join(_my_dir, 'train', 'f0.jsonlines')
    output_dir = join(_my_dir, 'output')

    exp_name = ('test_grid_search_cv_results_{}_{}'
                .format(task, "gs" if do_grid_search else "nogs"))

    # make a simple config file for cross-validation
    values_to_fill_dict = {'experiment_name': exp_name,
                           'train_file': train_path,
                           'task': task,
                           'grid_search': json.dumps(do_grid_search),
                           'objectives': "['f1_score_micro']",
                           'featureset_names': "['f0']",
                           'learners': '{}'.format(json.dumps(learners)),
                           'log': output_dir,
                           'results': output_dir}
    if task == 'train':
        values_to_fill_dict['models'] = output_dir
    elif task == 'cross_validate':
        values_to_fill_dict['predictions'] = output_dir
    elif task in ['evaluate', 'predict']:
        values_to_fill_dict['predictions'] = output_dir
        values_to_fill_dict['test_file'] = \
            values_to_fill_dict['train_file']

    # When the task is learning_curve, grid search is automatically
    # turned off even if it was requested, so turn it off here as well
    # since the end result is the same
    elif task == 'learning_curve':
        values_to_fill_dict['metrics'] = values_to_fill_dict.pop('objectives')
        if do_grid_search:
            do_grid_search = False

    config_template_path = join(_my_dir,
                                'configs',
                                'test_cv_results.template.cfg')

    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         task)

    # run the experiment
    if task in ['train', 'predict']:
        if do_grid_search:
            run_configuration(config_path, quiet=True)
        else:
            assert_raises(ValueError, run_configuration, config_path, quiet=True)
            # Short-circuit the test since a ValueError is
            # expected and is fatal
            return
    else:
        run_configuration(config_path, quiet=True)

    # now make sure that the results json file was produced
    for learner in learners:
        results_file_name = ('{}_f0_{}.results.json'
                             .format(exp_name, learner))
        actual_results_file_path = join(_my_dir, 'output',
                                        results_file_name)
        expected_results_file_path = join(expected_path,
                                          results_file_name)
        ok_(exists(actual_results_file_path))
        with open(expected_results_file_path) as expected, \
             open(actual_results_file_path) as actual:
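            # only the first JSON line of each results file is compared: it
            # parses to a list of result dicts for cross_validate, evaluate,
            # and learning_curve, and to a single dict for train/predict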
            expected_lines = [json.loads(line) for line in expected][0]
            actual_lines = [json.loads(line) for line in actual][0]
            assert len(expected_lines) == len(actual_lines)
            if task == 'cross_validate':
                # All but the last line will have grid search-related
                # results
                for (expected_gs_cv_results,
                     actual_gs_cv_results) in zip(expected_lines[:-1],
                                                  actual_lines[:-1]):
                    assert len(expected_gs_cv_results) == len(actual_gs_cv_results)
                    for field in ['grid_score', 'grid_search_cv_results']:
                        if do_grid_search:
                            assert (set(expected_gs_cv_results)
                                    .intersection(actual_gs_cv_results) ==
                                    set(expected_gs_cv_results))
                            if field == 'grid_score':
                                assert expected_gs_cv_results[field] == \
                                       actual_gs_cv_results[field]
                            else:
                                for subfield in expected_gs_cv_results[field]:
                                    if time_field(subfield): continue
                                    assert expected_gs_cv_results[field][subfield] == \
                                           actual_gs_cv_results[field][subfield]
                        else:
                            if field == 'grid_score':
                                assert actual_gs_cv_results[field] == 0.0
                            else:
                                assert actual_gs_cv_results[field] is None
                # The last line should be for the "average" and should
                # not contain any grid search results
                assert actual_lines[-1]['fold'] == 'average'
                for field in ['grid_score', 'grid_search_cv_results']:
                    assert field not in actual_lines[-1]
            elif task == 'evaluate':
                for (expected_gs_cv_results,
                     actual_gs_cv_results) in zip(expected_lines,
                                                  actual_lines):
                    assert len(expected_gs_cv_results) == len(actual_gs_cv_results)
                    for field in ['grid_score', 'grid_search_cv_results']:
                        if do_grid_search:
                            assert (set(expected_gs_cv_results)
                                    .intersection(actual_gs_cv_results) ==
                                    set(expected_gs_cv_results))
                            if field == 'grid_score':
                                assert expected_gs_cv_results[field] == \
                                       actual_gs_cv_results[field]
                            else:
                                for subfield in expected_gs_cv_results[field]:
                                    if time_field(subfield): continue
                                    assert expected_gs_cv_results[field][subfield] == \
                                           actual_gs_cv_results[field][subfield]
                        else:
                            if field == 'grid_score':
                                assert actual_gs_cv_results[field] == 0.0
                            else:
                                assert actual_gs_cv_results[field] is None
            elif task in ['train', 'predict']:
                expected_gs_cv_results = expected_lines
                actual_gs_cv_results = actual_lines
                assert set(expected_gs_cv_results).intersection(actual_gs_cv_results) == \
                       set(expected_gs_cv_results)
                for field in ['grid_score', 'grid_search_cv_results']:
                    if field == 'grid_score':
                        assert expected_gs_cv_results[field] == \
                               actual_gs_cv_results[field]
                    else:
                        for subfield in expected_gs_cv_results[field]:
                            if time_field(subfield): continue
                            assert expected_gs_cv_results[field][subfield] == \
                                   actual_gs_cv_results[field][subfield]
            else:
                for expected_line, actual_line in zip(expected_lines,
                                                      actual_lines):
                    expected_fields = set(list(expected_line))
                    actual_fields = set(list(actual_line))
                    assert expected_fields.intersection(actual_fields) == \
                           expected_fields
                    assert all(field not in actual_fields
                               for field in ['grid_score',
                                             'grid_search_cv_results'])
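
A minimal driver sketch, not part of the original suite, showing how
check_grid_search_cv_results could be exercised as a nose-style generator
test (the same yield-based convention used elsewhere in this module); the
test name and the exact parameter grid here are assumptions.

from itertools import product

def test_grid_search_cv_results():
    # run the check for every task/grid-search combination; the check
    # itself short-circuits the combinations that are expected to fail
    for task, do_grid_search in product(['cross_validate', 'evaluate',
                                         'learning_curve', 'train',
                                         'predict'],
                                        [True, False]):
        yield check_grid_search_cv_results, task, do_grid_search
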
Example #36
def check_xval_fancy_results_file(do_grid_search, use_folds_file,
                                  use_folds_file_for_grid_search,
                                  use_additional_metrics):

    train_path = join(_my_dir, 'train', 'f0.jsonlines')
    output_dir = join(_my_dir, 'output')

    # make a simple config file for cross-validation
    values_to_fill_dict = {
        'experiment_name': 'test_fancy_xval',
        'train_file': train_path,
        'task': 'cross_validate',
        'featureset_names': '["f0"]',
        'num_cv_folds': '6',
        'grid_search_folds': '4',
        'learners': "['LogisticRegression']",
        'log': output_dir,
        'predictions': output_dir,
        'results': output_dir
    }

    folds_file_path = join(_my_dir, 'train', 'folds_file_test.csv')
    if use_folds_file:
        values_to_fill_dict['folds_file'] = folds_file_path
    values_to_fill_dict['grid_search'] = str(do_grid_search)
    values_to_fill_dict['use_folds_file_for_grid_search'] = str(
        use_folds_file_for_grid_search)

    if use_additional_metrics:
        if PY2:
            values_to_fill_dict['metrics'] = str(
                [b"accuracy", b"unweighted_kappa"])
        else:
            values_to_fill_dict['metrics'] = str(
                ["accuracy", "unweighted_kappa"])

    config_template_path = join(_my_dir, 'configs', 'test_fancy.template.cfg')

    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict, 'xval')

    # run the experiment
    run_configuration(config_path, quiet=True)

    # now make sure that the results file was produced
    results_file_path = join(_my_dir, 'output',
                             'test_fancy_xval_f0_LogisticRegression.results')
    ok_(exists(results_file_path))

    # read in all the lines and look at the lines up to where we print the "Total Time"
    with open(results_file_path, 'r') as resultsf:
        results_lines = resultsf.readlines()
        end_idx = [
            results_lines.index(l) for l in results_lines
            if l.startswith('Total Time:')
        ][0]
        results_lines = results_lines[:end_idx + 1]

        # read in the "keys" and "values" separated by colons into a dictionary
        results_dict = dict([rl.strip().split(': ') for rl in results_lines])

    # check that the fields we expect in the results file are there
    # and the ones that we do not expect aren't
    if do_grid_search:
        eq_(results_dict['Grid Search'], 'True')
        eq_(results_dict['Grid Objective Function'], 'f1_score_micro')
    else:
        eq_(results_dict['Grid Search'], 'False')
        ok_('Grid Search Folds' not in results_dict)
        ok_('Grid Objective Function' not in results_dict)

    if use_folds_file:
        eq_(results_dict['Number of Folds'], '5 via folds file')
        ok_('Stratified Folds' not in results_dict)
        eq_(results_dict['Specified Folds File'], folds_file_path)
        if do_grid_search:
            if use_folds_file_for_grid_search:
                eq_(results_dict['Grid Search Folds'], '5 via folds file')
                eq_(results_dict['Using Folds File for Grid Search'], 'True')
            else:
                eq_(results_dict['Grid Search Folds'], '4')
                eq_(results_dict['Using Folds File for Grid Search'], 'False')
    else:
        eq_(results_dict['Number of Folds'], '6')
        eq_(results_dict['Stratified Folds'], 'True')
        ok_('Using Folds File for Grid Search' not in results_dict)
        ok_('Specified Folds File' not in results_dict)
        if do_grid_search:
            eq_(results_dict['Grid Search Folds'], '4')

    if use_additional_metrics:
        expected_metrics = [b"accuracy", b"unweighted_kappa"
                            ] if PY2 else ["accuracy", "unweighted_kappa"]

        eq_(
            sorted(literal_eval(
                results_dict['Additional Evaluation Metrics'])),
            sorted(expected_metrics))
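
A similar sketch, again an assumption rather than the project's actual test,
of a generator that drives check_xval_fancy_results_file over every
combination of its four boolean flags (the real suite may skip some
combinations).

from itertools import product

def test_xval_fancy_results_file():
    # all 16 combinations of grid search, folds file, folds file for grid
    # search, and additional metrics
    for flags in product([True, False], repeat=4):
        (do_grid_search, use_folds_file,
         use_folds_file_for_grid_search, use_additional_metrics) = flags
        yield (check_xval_fancy_results_file, do_grid_search, use_folds_file,
               use_folds_file_for_grid_search, use_additional_metrics)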