Example #1
def test_featureset_creation_from_dataframe_with_string_labels():

    dftest = pd.DataFrame({
        "id": [1, 2],
        "score": ['yes', 'no'],
        "text": ["a b", "b c"]
    })
    dftest.set_index("id", inplace=True)
    test_feat_dict_list = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}]
    test_dict_vectorizer = DictVectorizer()
    Xtest = test_dict_vectorizer.fit_transform(test_feat_dict_list)
    fs_test = FeatureSet('test',
                         ids=dftest.index.values,
                         labels=dftest['score'].values,
                         features=Xtest,
                         vectorizer=test_dict_vectorizer)

    output_path = join(_my_dir, "other", "test_string_labels_df.jsonlines")
    test_writer = NDJWriter(output_path, fs_test)
    test_writer.write()

    # read the written file back into a featureset and confirm that the
    # two featuresets are equal
    fs_test2 = NDJReader.for_path(output_path, ids_to_floats=True).read()

    assert fs_test == fs_test2
Example #2
def check_generate_predictions_console(use_threshold=False):

    # create some simple classification data without feature hashing
    train_fs, test_fs = make_classification_data(num_examples=1000,
                                                 num_features=5)

    # save the test feature set to an NDJ file
    input_file = join(_my_dir, 'test',
                      'test_generate_predictions.jsonlines')
    writer = NDJWriter(input_file, test_fs)
    writer.write()

    # create a learner that uses an SGD classifier
    learner = Learner('SGDClassifier', probability=use_threshold)

    # train the learner with grid search
    learner.train(train_fs, grid_search=True)

    # get the predictions on the test featureset
    predictions = learner.predict(test_fs)

    # if we asked for probabilities, then use the threshold
    # to convert them into binary predictions
    if use_threshold:
        threshold = 0.6
        predictions = [int(p[1] >= threshold) for p in predictions]
    else:
        predictions = predictions.tolist()
        threshold = None

    # save the learner to a file
    model_file = join(_my_dir, 'output',
                      'test_generate_predictions_console.model')
    learner.save(model_file)

    # now call main() from generate_predictions.py
    generate_cmd = []
    if use_threshold:
        generate_cmd.append('-t {}'.format(threshold))
    generate_cmd.extend([model_file, input_file])

    # we need to capture stdout since that's what main() writes to
    err = ''
    try:
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = mystdout = StringIO()
        sys.stderr = mystderr = StringIO()
        gp.main(generate_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
        predictions_after_saving = [int(x) for x in out.strip().split('\n')]
        eq_(predictions, predictions_after_saving)
    finally:
        sys.stdout = old_stdout
        sys.stderr = old_stderr
        print(err)
Example #3
def test_write_hashed_featureset():
    """
    Test to check that hashed featuresets cannot be written out
    """
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     use_feature_hashing=True,
                                     feature_bins=2,
                                     random_state=1234)
    output_dir = join(_my_dir, 'output')
    writer = NDJWriter(join(output_dir, 'foo.jsonlines'), fs)
    writer.write()
Example #5
def make_learning_curve_data():

    # Load in the digits data set
    digits = load_digits()
    X, y = digits.data, digits.target

    # create featureset with all features
    feature_names = ['f{:02}'.format(n) for n in range(X.shape[1])]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))
    fs1 = FeatureSet('train1',
                     features=features,
                     labels=y,
                     ids=list(range(X.shape[0])))

    # Write this feature set to file
    train_path = join(_my_dir, 'train', 'test_learning_curve1.jsonlines')
    writer = NDJWriter(train_path, fs1)
    writer.write()

    # create featureset with all except the last feature
    feature_names = ['f{:02}'.format(n) for n in range(X.shape[1])]
    features = []
    for row in X:
        features.append(dict(zip(feature_names[:-1], row)))
    fs2 = FeatureSet('train2',
                     features=features,
                     labels=y,
                     ids=list(range(X.shape[0])))

    # Write this feature set to file
    train_path = join(_my_dir, 'train', 'test_learning_curve2.jsonlines')
    writer = NDJWriter(train_path, fs2)
    writer.write()
Example #6
def create_jsonlines_feature_files(path):

    # we only need to create the feature files if they
    # don't already exist under the given path
    feature_files_to_create = [
        join(path, 'f{}.jsonlines'.format(i)) for i in range(6)
    ]
    if all([exists(ff) for ff in feature_files_to_create]):
        return
    else:
        num_examples = 1000
        np.random.seed(1234567890)

        # Create lists we will write files from
        ids = []
        features = []
        labels = []
        for j in range(num_examples):
            y = "dog" if j % 2 == 0 else "cat"
            ex_id = "{}{}".format(y, j)
            x = {
                "f{}".format(feat_num): np.random.randint(0, 4)
                for feat_num in range(5)
            }
            x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
            ids.append(ex_id)
            labels.append(y)
            features.append(x)

        for i in range(5):
            file_path = join(path, 'f{}.jsonlines'.format(i))
            sub_features = []
            for example_num in range(num_examples):
                feat_num = i
                x = {
                    "f{}".format(feat_num):
                    features[example_num]["f{}".format(feat_num)]
                }
                sub_features.append(x)
            fs = FeatureSet('ablation_cv',
                            ids,
                            features=sub_features,
                            labels=labels)

            writer = NDJWriter(file_path, fs)
            writer.write()

        # now write out the last file which is basically
        # identical to the last featureset we wrote
        # except that it has two extra instances
        fs = FeatureSet(
            'extra',
            ids +
            ['cat{}'.format(num_examples), 'dog{}'.format(num_examples + 1)],
            features=sub_features + [{}, {}],
            labels=labels + ['cat', 'dog'])
        file_path = join(path, 'f5.jsonlines')
        writer = NDJWriter(file_path, fs)
        writer.write()
Example #7
def create_jsonlines_feature_files(path):

    # we only need to create the feature files if they
    # don't already exist under the given path
    feature_files_to_create = [
        join(path, 'f{}.jsonlines'.format(i)) for i in range(5)
    ]
    if all([exists(ff) for ff in feature_files_to_create]):
        return
    else:
        num_examples = 1000
        np.random.seed(1234567890)

        # Create lists we will write files from
        ids = []
        features = []
        labels = []
        for j in range(num_examples):
            y = "dog" if j % 2 == 0 else "cat"
            ex_id = "{}{}".format(y, j)
            x = {
                "f{}".format(feat_num): np.random.randint(0, 4)
                for feat_num in range(5)
            }
            x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
            ids.append(ex_id)
            labels.append(y)
            features.append(x)

        for i in range(5):
            file_path = join(path, 'f{}.jsonlines'.format(i))
            sub_features = []
            for example_num in range(num_examples):
                feat_num = i
                x = {
                    "f{}".format(feat_num):
                    features[example_num]["f{}".format(feat_num)]
                }
                sub_features.append(x)
            fs = FeatureSet('ablation_cv',
                            ids,
                            features=sub_features,
                            labels=labels)
            writer = NDJWriter(file_path, fs)
            writer.write()
Example #9
def test_writing_ndj_featureset_with_string_ids():
    test_dict_vectorizer = DictVectorizer()
    test_feat_dict_list = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}]
    Xtest = test_dict_vectorizer.fit_transform(test_feat_dict_list)
    fs_test = FeatureSet('test',
                         ids=['1', '2'],
                         labels=[1, 2],
                         features=Xtest,
                         vectorizer=test_dict_vectorizer)
    output_path = join(_my_dir, "other", "test_string_ids.jsonlines")
    test_writer = NDJWriter(output_path, fs_test)
    test_writer.write()

    # read the written file back into a featureset and confirm that the
    # two featuresets are equal
    fs_test2 = NDJReader.for_path(output_path).read()

    assert fs_test == fs_test2
Example #10
def test_custom_learner_model_loading():
    num_labels = 10

    class_weights = [(0.5 / (num_labels - 1))
                     for x in range(num_labels - 1)] + [0.5]
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train',
                      'test_model_custom_learner.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test',
                     'test_model_custom_learner.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    # run the configuration that trains the custom model and saves it
    cfgfile = 'test_model_save_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)

    run_configuration(config_path, quiet=True)

    # load the predictions from disk into memory
    # and delete the predictions file
    outprefix = 'test_model_custom_learner'
    pred_file = join(_my_dir, 'output',
                     '{}_{}_CustomLogisticRegressionWrapper'
                     '.predictions'.format(outprefix,
                                           outprefix))
    preds1 = read_predictions(pred_file)
    os.unlink(pred_file)

    # run the configuration that loads the saved model
    # and generates the predictions again
    cfgfile = 'test_model_load_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)

    run_configuration(config_path, overwrite=False, quiet=True)

    # load the newly generated predictions
    preds2 = read_predictions(pred_file)

    # make sure that they are the same as before
    assert_array_equal(preds1, preds2)
Example #11
def make_ablation_data():
    # Remove old CV data
    for old_file in glob.glob(join(_my_dir, 'output',
                                   'ablation_cv_*.results')):
        os.remove(old_file)

    num_examples = 1000

    np.random.seed(1234567890)

    # Create lists we will write files from
    ids = []
    features = []
    labels = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {
            "f{}".format(feat_num): np.random.randint(0, 4)
            for feat_num in range(5)
        }
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        labels.append(y)
        features.append(x)

    for i in range(5):
        train_path = join(_my_dir, 'train', 'f{}.jsonlines'.format(i))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i
            x = {
                "f{}".format(feat_num):
                features[example_num]["f{}".format(feat_num)]
            }
            sub_features.append(x)
        train_fs = FeatureSet('ablation_cv',
                              ids,
                              features=sub_features,
                              labels=labels)
        writer = NDJWriter(train_path, train_fs)
        writer.write()
Example #12
def make_learning_curve_data():

    # Load in the digits data set
    digits = load_digits()
    X, y = digits.data, digits.target

    # create featureset with all features
    feature_names = ['f{:02}'.format(n) for n in range(X.shape[1])]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))
    fs1 = FeatureSet('train1', features=features, labels=y, ids=list(range(X.shape[0])))

    # Write this feature set to file
    train_path = join(_my_dir, 'train', 'test_learning_curve1.jsonlines')
    writer = NDJWriter(train_path, fs1)
    writer.write()

    # create featureset with all except the last feature
    feature_names = ['f{:02}'.format(n) for n in range(X.shape[1])]
    features = []
    for row in X:
        features.append(dict(zip(feature_names[:-1], row)))
    fs2 = FeatureSet('train2', features=features, labels=y, ids=list(range(X.shape[0])))

    # Write this feature set to file
    train_path = join(_my_dir, 'train', 'test_learning_curve2.jsonlines')
    writer = NDJWriter(train_path, fs2)
    writer.write()
Example #13
def make_summary_data():
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=2,
                                                 num_features=3,
                                                 non_negative=True)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train', 'test_summary.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test', 'test_summary.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()
Example #15
def make_ablation_data():
    # Remove old CV data
    for old_file in glob.glob(join(_my_dir, 'output',
                                   'ablation_cv_*.results')):
        os.remove(old_file)

    num_examples = 1000

    np.random.seed(1234567890)

    # Create lists we will write files from
    ids = []
    features = []
    labels = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {"f{}".format(feat_num): np.random.randint(0, 4) for feat_num in
             range(5)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        labels.append(y)
        features.append(x)

    for i in range(5):
        train_path = join(_my_dir, 'train', 'f{}.jsonlines'.format(i))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i
            x = {"f{}".format(feat_num):
                 features[example_num]["f{}".format(feat_num)]}
            sub_features.append(x)
        train_fs = FeatureSet('ablation_cv', ids, features=sub_features,
                              labels=labels)
        writer = NDJWriter(train_path, train_fs)
        writer.write()
Example #16
def make_class_map_data():
    # Create training file
    train_path = join(_my_dir, 'train', 'test_class_map.jsonlines')
    ids = []
    labels = []
    features = []
    class_names = ['beagle', 'cat', 'dachsund', 'cat']
    for i in range(1, 101):
        y = class_names[i % 4]
        ex_id = "{}{}".format(y, i)
        # note that f1 and f5 are missing in all instances but f4 is not
        x = {"f2": i + 1, "f3": i + 2, "f4": i + 5}
        ids.append(ex_id)
        labels.append(y)
        features.append(x)
    train_fs = FeatureSet('train_class_map',
                          ids,
                          features=features,
                          labels=labels)
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Create test file
    test_path = join(_my_dir, 'test', 'test_class_map.jsonlines')
    ids = []
    labels = []
    features = []
    for i in range(1, 51):
        y = class_names[i % 4]
        ex_id = "{}{}".format(y, i)
        # f1 and f5 are not missing in any instances here but f4 is
        x = {"f1": i, "f2": i + 2, "f3": i % 10, "f5": i * 2}
        ids.append(ex_id)
        labels.append(y)
        features.append(x)
    test_fs = FeatureSet('test_class_map',
                         ids,
                         features=features,
                         labels=labels)
    writer = NDJWriter(test_path, test_fs)
    writer.write()
Example #17
def test_logistic_custom_learner():
    num_labels = 10

    class_weights = [(0.5 / (num_labels - 1))
                     for x in range(num_labels - 1)] + [0.5]
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train',
                      'test_logistic_custom_learner.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test',
                     'test_logistic_custom_learner.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    cfgfile = 'test_logistic_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)

    run_configuration(config_path, quiet=True)

    outprefix = 'test_logistic_custom_learner'
    preds = read_predictions(join(_my_dir, 'output',
                                  ('{}_{}_CustomLogisticRegressionWrapper'
                                   '.predictions'.format(outprefix,
                                                         outprefix))))

    expected = read_predictions(join(_my_dir, 'output',
                                     ('{}_{}_LogisticRegression.predictions'
                                      .format(outprefix, outprefix))))

    assert_array_equal(preds, expected)
Example #18
def test_majority_class_custom_learner():
    num_labels = 10

    # This will make data where the last class happens about 50% of the time.
    class_weights = [(0.5 / (num_labels - 1))
                     for x in range(num_labels - 1)] + [0.5]
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train',
                      'test_majority_class_custom_learner.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test',
                     'test_majority_class_custom_learner.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    cfgfile = 'test_majority_class_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)

    run_configuration(config_path, quiet=True)

    outprefix = 'test_majority_class_custom_learner'

    preds = read_predictions(
        join(_my_dir, 'output',
             ('{}_{}_MajorityClassLearner_predictions.tsv'.format(
                 outprefix, outprefix))))
    expected = np.array([float(num_labels - 1) for x in preds])
    assert_array_equal(preds, expected)
Example #19
def make_summary_data():
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=2,
                                                 num_features=3,
                                                 non_negative=True)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train', 'test_summary.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test', 'test_summary.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()
Example #20
def test_logistic_custom_learner():
    num_labels = 10

    class_weights = [(0.5 / (num_labels - 1))
                     for x in range(num_labels - 1)] + [0.5]
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train',
                      'test_logistic_custom_learner.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test',
                     'test_logistic_custom_learner.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    cfgfile = 'test_logistic_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)

    run_configuration(config_path, quiet=True)

    outprefix = 'test_logistic_custom_learner'
    preds = read_predictions(join(_my_dir, 'output',
                                  ('{}_{}_CustomLogisticRegressionWrapper'
                                   '_predictions.tsv'.format(outprefix,
                                                             outprefix))))

    expected = read_predictions(join(_my_dir, 'output',
                                     ('{}_{}_LogisticRegression_predictions.tsv'
                                      .format(outprefix, outprefix))))

    assert_array_equal(preds, expected)
Example #21
def test_majority_class_custom_learner():
    num_labels = 10

    # This will make data where the last class happens about 50% of the time.
    class_weights = [(0.5 / (num_labels - 1))
                     for x in range(num_labels - 1)] + [0.5]
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train',
                      'test_majority_class_custom_learner.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test',
                     'test_majority_class_custom_learner.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    cfgfile = 'test_majority_class_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)

    run_configuration(config_path, quiet=True)

    outprefix = 'test_majority_class_custom_learner'

    preds = read_predictions(join(_my_dir, 'output',
                                  ('{}_{}_MajorityClassLearner_predictions.tsv'
                                   .format(outprefix, outprefix))))
    expected = np.array([float(num_labels - 1) for x in preds])
    assert_array_equal(preds, expected)
Example #22
def make_class_map_data():
    # Create training file
    train_path = join(_my_dir, 'train', 'test_class_map.jsonlines')
    ids = []
    labels = []
    features = []
    class_names = ['beagle', 'cat', 'dachsund', 'cat']
    for i in range(1, 101):
        y = class_names[i % 4]
        ex_id = "{}{}".format(y, i)
        # note that f1 and f5 are missing in all instances but f4 is not
        x = {"f2": i + 1, "f3": i + 2, "f4": i + 5}
        ids.append(ex_id)
        labels.append(y)
        features.append(x)
    train_fs = FeatureSet('train_class_map', ids, features=features,
                          labels=labels)
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Create test file
    test_path = join(_my_dir, 'test', 'test_class_map.jsonlines')
    ids = []
    labels = []
    features = []
    for i in range(1, 51):
        y = class_names[i % 4]
        ex_id = "{}{}".format(y, i)
        # f1 and f5 are not missing in any instances here but f4 is
        x = {"f1": i, "f2": i + 2, "f3": i % 10, "f5": i * 2}
        ids.append(ex_id)
        labels.append(y)
        features.append(x)
    test_fs = FeatureSet('test_class_map', ids, features=features,
                         labels=labels)
    writer = NDJWriter(test_path, test_fs)
    writer.write()
Example #23
def test_fancy_output():
    """
    Test the descriptive statistics output in the results file for a regressor
    """
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                num_features=3)

    # train a regression model using the train feature set
    learner = Learner('LinearRegression')
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # evaluate the trained model using the test feature set
    resultdict = learner.evaluate(test_fs)
    actual_stats_from_api = dict(resultdict[2]['descriptive']['actual'])
    pred_stats_from_api = dict(resultdict[2]['descriptive']['predicted'])

    # write out the training and test feature set
    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    train_writer = NDJWriter(join(train_dir, 'fancy_train.jsonlines'),
                             train_fs)
    train_writer.write()
    test_writer = NDJWriter(join(test_dir, 'fancy_test.jsonlines'), test_fs)
    test_writer.write()

    # now get the config file template, fill it in and run it
    # so that we can get a results file
    config_template_path = join(_my_dir, 'configs',
                                'test_regression_fancy_output.template.cfg')
    config_path = fill_in_config_paths_for_fancy_output(config_template_path)

    run_configuration(config_path, quiet=True)

    # read in the results file and get the descriptive statistics
    actual_stats_from_file = {}
    pred_stats_from_file = {}
    with open(
            join(output_dir, ('regression_fancy_output_train_fancy_train.'
                              'jsonlines_test_fancy_test.jsonlines'
                              '_LinearRegression.results')), 'r') as resultf:

        result_output = resultf.read().strip().split('\n')
        for desc_stat_line in result_output[26:30]:
            desc_stat_line = desc_stat_line.strip()
            if not desc_stat_line:
                continue
            else:
                m = re.search(
                    r'([A-Za-z]+)\s+=\s+(-?[0-9]+.?[0-9]*)\s+'
                    r'\((actual)\),\s+(-?[0-9]+.?[0-9]*)\s+'
                    r'\((predicted)\)', desc_stat_line)
                stat_type, actual_value, _, pred_value, _ = m.groups()
                actual_stats_from_file[stat_type.lower()] = float(actual_value)
                pred_stats_from_file[stat_type.lower()] = float(pred_value)

    for stat_type in actual_stats_from_api:

        assert_almost_equal(actual_stats_from_file[stat_type],
                            actual_stats_from_api[stat_type],
                            places=4)

        assert_almost_equal(pred_stats_from_file[stat_type],
                            pred_stats_from_api[stat_type],
                            places=4)
Example #24
def test_fancy_output():
    """
    Test the descriptive statistics output in the results file for a regressor
    """
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                num_features=3)

    # train a regression model using the train feature set
    learner = Learner('LinearRegression')
    learner.train(train_fs, grid_objective='pearson')

    # evaluate the trained model using the test feature set
    resultdict = learner.evaluate(test_fs)
    actual_stats_from_api = dict(resultdict[2]['descriptive']['actual'])
    pred_stats_from_api = dict(resultdict[2]['descriptive']['predicted'])

    # write out the training and test feature set
    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    train_writer = NDJWriter(join(train_dir, 'fancy_train.jsonlines'),
                             train_fs)
    train_writer.write()
    test_writer = NDJWriter(join(test_dir, 'fancy_test.jsonlines'), test_fs)
    test_writer.write()

    # now get the config file template, fill it in and run it
    # so that we can get a results file
    config_template_path = join(_my_dir, 'configs',
                                'test_regression_fancy_output.template.cfg')
    config_path = fill_in_config_paths_for_fancy_output(config_template_path)

    run_configuration(config_path, quiet=True)

    # read in the results file and get the descriptive statistics
    actual_stats_from_file = {}
    pred_stats_from_file = {}
    with open(join(output_dir, ('regression_fancy_output_train_fancy_train.'
                                'jsonlines_test_fancy_test.jsonlines'
                                '_LinearRegression.results')),
              'r') as resultf:

        result_output = resultf.read().strip().split('\n')
        for desc_stat_line in result_output[27:31]:
            desc_stat_line = desc_stat_line.strip()
            if not desc_stat_line:
                continue
            else:
                m = re.search(r'([A-Za-z]+)\s+=\s+(-?[0-9]+.?[0-9]*)\s+'
                              r'\((actual)\),\s+(-?[0-9]+.?[0-9]*)\s+'
                              r'\((predicted)\)', desc_stat_line)
                stat_type, actual_value, _, pred_value, _ = m.groups()
                actual_stats_from_file[stat_type.lower()] = float(actual_value)
                pred_stats_from_file[stat_type.lower()] = float(pred_value)

    for stat_type in actual_stats_from_api:

        assert_almost_equal(actual_stats_from_file[stat_type],
                            actual_stats_from_api[stat_type],
                            places=4)

        assert_almost_equal(pred_stats_from_file[stat_type],
                            pred_stats_from_api[stat_type],
                            places=4)