Example #1
def test_merge_missing_labels():
    """
    Test to ensure that labels are successfully copied when merging
    """

    # create a feature set
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create a different feature set with no labels specified
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      feature_prefix='g',
                                      empty_labels=True,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # merge the two featuresets in different orders
    fs12 = fs1 + fs2
    fs21 = fs2 + fs1

    # make sure that the labels are the same after merging
    assert_array_equal(fs12.labels, fs1.labels)
    assert_array_equal(fs21.labels, fs1.labels)
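
A minimal sketch of the merge behaviour exercised here, assuming SKLL's public FeatureSet API (the constructor vectorizes a list of feature dicts with a DictVectorizer, and a featureset built without labels behaves like the empty_labels case above):

import numpy as np
from skll.data import FeatureSet

ids = np.array(['EXAMPLE_1', 'EXAMPLE_2'])
fs_labeled = FeatureSet('labeled', ids, labels=np.array([0, 1]),
                        features=[{'f01': 1.0}, {'f01': 2.0}])
fs_unlabeled = FeatureSet('unlabeled', ids,
                          features=[{'g01': 3.0}, {'g01': 4.0}])

# the merged featureset should carry the labels from fs_labeled
merged = fs_labeled + fs_unlabeled
print(merged.labels)  # expected: [0 1]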
Example #2
def test_subtract():
    """
    Test to ensure that subtraction works
    """

    # create a feature set
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=2,
                                      train_test_ratio=1.0,
                                      random_state=1234)

    # create a different feature set with the same feature names
    # but different feature values
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=2,
                                      num_labels=2,
                                      train_test_ratio=1.0,
                                      random_state=5678)

    # subtract fs2 from fs1, i.e., the features in fs2
    # should be removed from fs1 but nothing else should change
    fs = fs1 - fs2

    # ensure that the labels are the same in fs and fs1
    assert_array_equal(fs.labels, fs1.labels)

    # ensure that there are only two features left
    eq_(fs.features.shape[1], 2)

    # and that they are f03 and f04
    assert_array_equal(np.array(fs.vectorizer.feature_names_), ['f03', 'f04'])
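
The set-difference semantics can be illustrated with a self-contained sketch under the same FeatureSet assumptions as above: subtraction removes, by name, every feature that appears in the right-hand featureset, leaving IDs and labels untouched.

import numpy as np
from skll.data import FeatureSet

ids = np.array(['EXAMPLE_1', 'EXAMPLE_2'])
fs_all = FeatureSet('all', ids, labels=np.array([0, 1]),
                    features=[{'f01': 1.0, 'f02': 2.0},
                              {'f01': 3.0, 'f02': 4.0}])
fs_drop = FeatureSet('drop', ids, labels=np.array([0, 1]),
                     features=[{'f01': 9.0}, {'f01': 9.0}])

# only f02 should survive the subtraction
fs_diff = fs_all - fs_drop
print(fs_diff.vectorizer.feature_names_)  # expected: ['f02']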
Example #3
def check_filter_labels(inverse=False):

    # create a feature set
    fs, _ = make_classification_data(num_examples=1000,
                                     num_features=4,
                                     num_labels=5,
                                     train_test_ratio=1.0)

    # keep just the instances with labels 0, 1, and 2
    labels_to_filter = [0, 1, 2]

    # do the actual filtering
    fs.filter(labels=labels_to_filter, inverse=inverse)

    # make sure that we removed the right things
    if inverse:
        ids_kept = fs.ids[np.where(
            np.logical_not(np.in1d(fs.labels, labels_to_filter)))]
    else:
        ids_kept = fs.ids[np.where(np.in1d(fs.labels, labels_to_filter))]

    assert_array_equal(fs.ids, np.array(ids_kept))

    # make sure that number of ids, labels and features are the same
    eq_(fs.ids.shape[0], fs.labels.shape[0])
    eq_(fs.labels.shape[0], fs.features.shape[0])
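
The boolean masks above are built with np.in1d (np.isin in newer NumPy), which tests membership element-wise; a tiny illustration:

import numpy as np

labels = np.array([0, 1, 2, 3, 4])
mask = np.in1d(labels, [0, 1, 2])
print(mask)          # [ True  True  True False False]
print(labels[mask])  # [0 1 2]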
Example #4
def test_learner_api_load_into_existing_instance():
    """
    Check that `Learner.load()` works as expected
    """

    # create a LinearSVC instance and train it on some data
    learner1 = Learner('LinearSVC')
    (train_fs,
     test_fs) = make_classification_data(num_examples=200,
                                         num_features=5,
                                         use_feature_hashing=False,
                                         non_negative=True)
    learner1.train(train_fs, grid_search=False)

    # now use `load()` to replace the existing instance with a
    # different saved learner
    other_model_file = join(_my_dir, 'other', 'test_load_saved_model.{}.model'.format(sys.version_info[0]))
    learner1.load(other_model_file)

    # now load the saved model into another instance using the class method
    # `from_file()`
    learner2 = Learner.from_file(other_model_file)

    # check that the two instances are now basically the same
    eq_(learner1.model_type, learner2.model_type)
    eq_(learner1.model_params, learner2.model_params)
    eq_(learner1.model_kwargs, learner2.model_kwargs)
Example #5
def test_string_feature():
    """
    Test to make sure that string-valued features are properly
    encoded as binary features
    """
    # create a featureset that is derived from an original
    # set of features containing 3 numeric features and
    # one string-valued feature that can take six possible
    # values from 'a' to 'f'. This means that the
    # featureset will have 3 numeric + 6 binary features.
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     one_string_feature=True,
                                     num_string_values=6,
                                     train_test_ratio=1.0)

    # confirm that the number of features is as expected
    eq_(fs.features.shape, (100, 9))

    # confirm the feature names
    eq_(fs.vectorizer.feature_names_, ['f01', 'f02', 'f03',
                                       'f04=a', 'f04=b', 'f04=c',
                                       'f04=d', 'f04=e', 'f04=f'])

    # confirm that the final six features are binary
    assert_array_equal(fs.features[:, [3, 4, 5, 6, 7, 8]].data, 1)
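
The 'f04=a' through 'f04=f' expansion is standard scikit-learn DictVectorizer behaviour: a string value v stored under key k becomes a binary feature named 'k=v'. A self-contained illustration:

from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)
X = dv.fit_transform([{'f01': 1.2, 'f04': 'a'},
                      {'f01': 3.4, 'f04': 'b'}])
print(dv.feature_names_)  # ['f01', 'f04=a', 'f04=b']
print(X)  # row 0 one-hot encodes 'a', row 1 one-hot encodes 'b'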
Example #6
def check_train_and_score_function(model_type):
    """
    Check that the _train_and_score() function works as expected
    """

    # create train and test data
    (train_fs,
     test_fs) = make_classification_data(num_examples=500,
                                         train_test_ratio=0.7,
                                         num_features=5,
                                         use_feature_hashing=False,
                                         non_negative=True)

    # call _train_and_score() on this data
    estimator_name = 'LogisticRegression' if model_type == 'classifier' else 'Ridge'
    metric = 'accuracy' if model_type == 'classifier' else 'pearson'
    learner1 = Learner(estimator_name)
    train_score1, test_score1 = _train_and_score(learner1, train_fs, test_fs, metric)

    # this should yield identical results when training another instance
    # of the same learner without grid search and shuffling and evaluating
    # that instance on the train and the test set
    learner2 = Learner(estimator_name)
    learner2.train(train_fs, grid_search=False, shuffle=False)
    train_score2 = learner2.evaluate(train_fs, output_metrics=[metric])[-1][metric]
    test_score2 = learner2.evaluate(test_fs, output_metrics=[metric])[-1][metric]

    eq_(train_score1, train_score2)
    eq_(test_score1, test_score2)
Example #7
def make_single_file_featureset_data():
    """
    Write a training file and a test file for tests that check whether
    specifying train_file and test_file actually works.
    """
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=2,
                                                 num_features=3,
                                                 non_negative=False)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train', 'train_single_file.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test', 'test_single_file.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    # Also write another test feature set that has fewer features than the training set
    test_fs.filter(features=['f01', 'f02'])
    test_path = join(_my_dir, 'test', 'test_single_file_subset.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()
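
For reference, NDJWriter emits newline-delimited JSON: each line of the files written above is a standalone object which, assuming SKLL's documented jsonlines keys, should look roughly like this (values are illustrative):

{"id": "EXAMPLE_0", "y": 0, "x": {"f01": -1.3, "f02": 0.2, "f03": 2.1}}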
Example #8
def test_skll_convert_libsvm_map():
    """
    Test to check whether the --reuse_libsvm_map option works for skll_convert
    """

    # create some simple classification data
    orig_fs, _ = make_classification_data(train_test_ratio=1.0,
                                          one_string_feature=True)

    # now write out this feature set as a libsvm file
    orig_libsvm_file = join(_my_dir, 'other',
                            'test_skll_convert_libsvm_map.libsvm')
    writer = LibSVMWriter(orig_libsvm_file, orig_fs, quiet=True)
    writer.write()

    # now make a copy of the dataset
    swapped_fs = copy.deepcopy(orig_fs)

    # now modify this new featureset to swap the first two columns
    del swapped_fs.vectorizer.vocabulary_['f01']
    del swapped_fs.vectorizer.vocabulary_['f02']
    swapped_fs.vectorizer.vocabulary_['f01'] = 1
    swapped_fs.vectorizer.vocabulary_['f02'] = 0
    tmp = swapped_fs.features[:, 0]
    swapped_fs.features[:, 0] = swapped_fs.features[:, 1]
    swapped_fs.features[:, 1] = tmp

    # now write out this new feature set as a MegaM file
    swapped_megam_file = join(_my_dir, 'other',
                              'test_skll_convert_libsvm_map.megam')
    writer = MegaMWriter(swapped_megam_file, swapped_fs, quiet=True)
    writer.write()

    # now run skll_convert to convert this into a libsvm file
    # but using the mapping specified in the first libsvm file
    converted_libsvm_file = join(_my_dir, 'other',
                                 'test_skll_convert_libsvm_map2.libsvm')

    # now call skll convert's main function
    skll_convert_cmd = [
        '--reuse_libsvm_map', orig_libsvm_file, '--quiet', orig_libsvm_file,
        converted_libsvm_file
    ]
    err = ''
    try:
        old_stderr = sys.stderr
        sys.stderr = mystderr = StringIO()
        sk.main(skll_convert_cmd)
        err = mystderr.getvalue()
    finally:
        sys.stderr = old_stderr
        print(err)

    # now read the converted libsvm file into a featureset
    reader = LibSVMReader(converted_libsvm_file, quiet=True)
    converted_fs = reader.read()

    # now ensure that this new featureset and the original
    # featureset are the same
    eq_(orig_fs, converted_fs)
Example #9
def check_print_model_weights(task='classification'):

    # create some simple classification or regression data
    if task == 'classification':
        train_fs, _ = make_classification_data(train_test_ratio=0.8)
    else:
        train_fs, _, _ = make_regression_data(num_features=4,
                                              train_test_ratio=0.8)

    # now train the appropriate model
    if task == 'classification':
        learner = Learner('LogisticRegression')
        learner.train(train_fs)
    else:
        learner = Learner('LinearRegression')
        learner.train(train_fs, grid_objective='pearson')

    # now save the model to disk
    model_file = join(_my_dir, 'output',
                      'test_print_model_weights.model')
    learner.save(model_file)

    # now call print_model_weights main() and capture the output
    print_model_weights_cmd = [model_file]
    err = ''
    try:
        old_stderr = sys.stderr
        old_stdout = sys.stdout
        sys.stderr = mystderr = StringIO()
        sys.stdout = mystdout = StringIO()
        pmw.main(print_model_weights_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
    finally:
        sys.stderr = old_stderr
        sys.stdout = old_stdout
        print(err)

    # now parse the output of the print_model_weights command
    # and get the intercept and the feature values
    if task == 'classification':
        lines_to_parse = [l for l in out.split('\n')[1:] if l]
        intercept = safe_float(lines_to_parse[0].split('\t')[0])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[2], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_[0])
        assert_allclose(learner.model.coef_[0], feature_values)
    else:
        lines_to_parse = [l for l in out.split('\n') if l]
        intercept = safe_float(lines_to_parse[0].split('=')[1])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[1], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)
Example #10
def check_generate_predictions_console(use_threshold=False):

    # create some simple classification data without feature hashing
    train_fs, test_fs = make_classification_data(num_examples=1000,
                                                 num_features=5)

    # save the test feature set to an NDJ file
    input_file = join(_my_dir, 'test',
                      'test_generate_predictions.jsonlines')
    writer = NDJWriter(input_file, test_fs)
    writer.write()

    # create a learner that uses an SGD classifier
    learner = Learner('SGDClassifier', probability=use_threshold)

    # train the learner with grid search
    learner.train(train_fs, grid_search=True)

    # get the predictions on the test featureset
    predictions = learner.predict(test_fs)

    # if we asked for probabilities, then use the threshold
    # to convert them into binary predictions
    if use_threshold:
        threshold = 0.6
        predictions = [int(p[1] >= threshold) for p in predictions]
    else:
        predictions = predictions.tolist()
        threshold = None

    # save the learner to a file
    model_file = join(_my_dir, 'output',
                      'test_generate_predictions_console.model')
    learner.save(model_file)

    # now call main() from generate_predictions.py
    generate_cmd = []
    if use_threshold:
        generate_cmd.append('-t {}'.format(threshold))
    generate_cmd.extend([model_file, input_file])

    # we need to capture stdout since that's what main() writes to
    err = ''
    try:
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = mystdout = StringIO()
        sys.stderr = mystderr = StringIO()
        gp.main(generate_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
        predictions_after_saving = [int(x) for x in out.strip().split('\n')]
        eq_(predictions, predictions_after_saving)
    finally:
        sys.stdout = old_stdout
        sys.stderr = old_stderr
        print(err)
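
The manual swapping of sys.stdout and sys.stderr above can be written more compactly with contextlib (Python 3.5+); a sketch reusing gp.main and generate_cmd from this test:

import contextlib
from io import StringIO

out, err = StringIO(), StringIO()
with contextlib.redirect_stdout(out), contextlib.redirect_stderr(err):
    gp.main(generate_cmd)
predictions_after_saving = [int(x) for x in out.getvalue().strip().split('\n')]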
Example #11
def test_custom_learner_model_loading():
    num_labels = 10

    class_weights = [(0.5 / (num_labels - 1))
                     for x in range(num_labels - 1)] + [0.5]
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train',
                      'test_model_custom_learner.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test',
                     'test_model_custom_learner.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    # run the configuration that trains the custom model and saves it
    cfgfile = 'test_model_save_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)

    run_configuration(config_path, quiet=True)

    # read the predictions from disk into memory
    # and delete the predictions file
    outprefix = 'test_model_custom_learner'
    pred_file = join(_my_dir, 'output',
                     '{}_{}_CustomLogisticRegressionWrapper'
                     '.predictions'.format(outprefix,
                                           outprefix))
    preds1 = read_predictions(pred_file)
    os.unlink(pred_file)

    # run the configuration that loads the saved model
    # and generates the predictions again
    cfgfile = 'test_model_load_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)

    run_configuration(config_path, overwrite=False, quiet=True)

    # load the newly generated predictions
    preds2 = read_predictions(pred_file)

    # make sure that they are the same as before
    assert_array_equal(preds1, preds2)
Example #12
def test_merge_different_vectorizers():
    """
    Test to ensure rejection of merging featuresets with different vectorizers
    """

    # create one featureset that uses a DictVectorizer
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create another featureset using hashing
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      feature_prefix='g',
                                      num_labels=3,
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True)
    # This should raise a ValueError
    fs1 + fs2
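
The bare fs1 + fs2 on the last line asserts nothing by itself. Presumably the original test module marks this test, and the other "this should raise a ValueError" tests in this listing, with nose's raises decorator, which the listing has stripped; a sketch:

from nose.tools import raises

@raises(ValueError)
def test_merge_different_vectorizers():
    ...  # body as above; the test passes only if a ValueError is raised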
Example #13
def test_length():
    """
    Test whether len() returns the number of instances
    """

    # create a featureset
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     train_test_ratio=1.0)

    eq_(len(fs), 100)
Example #14
def test_empty_labels():
    """
    Test to check behaviour when labels is None
    """

    # create a feature set with empty labels
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     empty_labels=True,
                                     train_test_ratio=1.0)
    assert np.isnan(fs.labels).all()
Example #15
def check_learner_api_grid_search_no_objective(task='train'):

    (train_fs, test_fs) = make_classification_data(num_examples=500,
                                                   train_test_ratio=0.7,
                                                   num_features=5,
                                                   use_feature_hashing=False,
                                                   non_negative=True)
    learner = Learner('LogisticRegression')
    if task == 'train':
        _ = learner.train(train_fs)
    else:
        _ = learner.cross_validate(train_fs)
Example #16
def test_write_hashed_featureset():
    """
    Test to check that hashed featuresets cannot be written out
    """
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     use_feature_hashing=True,
                                     feature_bins=2,
                                     random_state=1234)
    output_dir = join(_my_dir, 'output')
    writer = NDJWriter(join(output_dir, 'foo.jsonlines'), fs)
    writer.write()
Example #17
def test_new_labels_in_test_set():
    """
    Test classification experiment with an unseen label in the test set.
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)
    # add new labels to the test set
    test_fs.labels[-3:] = 3

    learner = Learner('SVC')
    learner.train(train_fs, grid_search=False)
    res = learner.evaluate(test_fs)
    yield check_results_with_unseen_labels, res, 4, [3]
    yield assert_almost_equal, res[1], 0.3
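
The trailing yield statements make this a nose test generator: each yielded tuple is unpacked as (check_function, *args) and run as a separate test case, equivalent to calling:

check_results_with_unseen_labels(res, 4, [3])
assert_almost_equal(res[1], 0.3)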
Example #18
def test_all_new_labels_in_test():
    """
    Test classification with all labels in test set unseen
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)
    # change all test labels
    test_fs.labels = test_fs.labels+3

    learner = Learner('SVC')
    learner.train(train_fs, grid_search=False)
    res = learner.evaluate(test_fs)
    yield check_results_with_unseen_labels, res, 6, [3, 4, 5]
    yield assert_almost_equal, res[1], 0
Example #19
def test_merge_different_hashers():
    """
    Test to ensure rejection of merging featuresets with different FeatureHashers
    """

    # create a feature set with 4 feature hashing bins
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=10,
                                      num_labels=3,
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True,
                                      feature_bins=4)

    # create a second feature set with 3 feature hashing bins
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=10,
                                      num_labels=3,
                                      feature_prefix='g',
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True,
                                      feature_bins=3)
    # This should raise a ValueError
    fs1 + fs2
Example #20
def check_predict(model, use_feature_hashing=False):
    """
    This tests whether the predict task runs and generates the same
    number of predictions as samples in the test set. The specified
    model indicates whether to generate random regression
    or classification data.
    """

    # create the random data for the given model
    if model._estimator_type == 'regressor':
        train_fs, test_fs, _ = \
            make_regression_data(use_feature_hashing=use_feature_hashing,
                                 feature_bins=5)
    # feature hashing will not work for Naive Bayes since it requires
    # non-negative feature values
    elif model.__name__ == 'MultinomialNB':
        train_fs, test_fs = \
            make_classification_data(use_feature_hashing=False,
                                     non_negative=True)
    else:
        train_fs, test_fs = \
            make_classification_data(use_feature_hashing=use_feature_hashing,
                                     feature_bins=25)

    # create the learner with the specified model
    learner = Learner(model.__name__)

    # now train the learner on the training data; feature hashing,
    # when specified, was already applied when creating the data
    learner.train(train_fs, grid_search=False)

    # now make predictions on the test set
    predictions = learner.predict(test_fs)

    # make sure we have the same number of outputs as the
    # number of test set samples
    eq_(len(predictions), test_fs.features.shape[0])
Example #21
def test_merge_different_labels_same_ids():
    """
    Test to ensure rejection of merging featuresets that have conflicting labels
    """

    # create a feature set
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create a different feature set that has everything
    # the same but has different labels for the same IDs
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      feature_prefix='g',
                                      train_test_ratio=1.0)

    # artificially modify the class labels
    fs2.labels = fs2.labels + 1

    # This should raise a ValueError
    fs1 + fs2
Example #22
def test_filter_with_hashing():
    """
    Test to ensure rejection of filtering by features when using hashing
    """

    # create a feature set
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=5,
                                     num_labels=3,
                                     train_test_ratio=1.0,
                                     use_feature_hashing=True,
                                     feature_bins=2)

    # attempting to filter by feature names should raise a ValueError
    # since this featureset uses feature hashing
    fs.filter(features=['f1', 'f4'])
Example #23
def test_iteration_without_dictvectorizer():
    """
    Test that iteration is only allowed when the vectorizer is a DictVectorizer
    """

    # create a feature set
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     train_test_ratio=1.0,
                                     use_feature_hashing=True,
                                     feature_bins=2)
    # This should raise a ValueError
    for _ in fs:
        pass
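
This rejection, like the filtering one above, makes sense because feature hashing is one-way: scikit-learn's FeatureHasher maps feature names to column indices with a hash function, so the names cannot be recovered for iteration or per-feature filtering. A quick illustration:

from sklearn.feature_extraction import FeatureHasher

fh = FeatureHasher(n_features=4)
X = fh.transform([{'f1': 1.0, 'f4': 2.0}])
print(X.toarray())  # columns are anonymous hash buckets, not named features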
Example #24
def make_summary_data():
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=2,
                                                 num_features=3,
                                                 non_negative=True)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train', 'test_summary.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test', 'test_summary.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()
Example #25
def test_new_labels_in_test_set_change_order():
    """
    Test classification with an unseen label in the test set when the new label falls between the existing labels
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)
    # change train labels to create a gap
    train_fs.labels = train_fs.labels * 10
    # add new test labels
    test_fs.labels = test_fs.labels * 10
    test_fs.labels[-3:] = 15

    learner = Learner('SVC')
    learner.train(train_fs, grid_search=False)
    res = learner.evaluate(test_fs)
    yield check_results_with_unseen_labels, res, 4, [15]
    yield assert_almost_equal, res[1], 0.3
Example #26
def check_generate_predictions(use_feature_hashing=False,
                               use_threshold=False,
                               test_on_subset=False):

    # create some simple classification feature sets for training and testing
    train_fs, test_fs = make_classification_data(
        num_examples=1000,
        num_features=5,
        use_feature_hashing=use_feature_hashing,
        feature_bins=4)

    # create a learner that uses an SGD classifier
    learner = Learner('SGDClassifier', probability=use_threshold)

    # train the learner with grid search
    learner.train(train_fs, grid_search=True)

    # if we are asked to use only a subset, then filter out one of
    # the features if we are not using feature hashing; do nothing if
    # we are using feature hashing
    if test_on_subset and not use_feature_hashing:
        test_fs.filter(features=['f01', 'f02', 'f03', 'f04'])

    # get the predictions on the test featureset
    predictions = learner.predict(test_fs)

    # if we asked for probabilities, then use the threshold
    # to convert them into binary predictions
    if use_threshold:
        threshold = 0.6
        predictions = [int(p[1] >= threshold) for p in predictions]
    else:
        predictions = predictions.tolist()
        threshold = None

    # save the learner to a file
    model_file = join(_my_dir, 'output', 'test_generate_predictions.model')
    learner.save(model_file)

    # now use Predictor to generate the predictions and make
    # sure that they are the same as before saving the model
    p = gp.Predictor(model_file, threshold=threshold)
    predictions_after_saving = p.predict(test_fs)

    eq_(predictions, predictions_after_saving)
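
With probability=True, each entry of predictions is a row of per-class probabilities, so the thresholding above reduces to predicting class 1 whenever its probability clears the cutoff:

import numpy as np

probs = np.array([[0.3, 0.7],
                  [0.5, 0.5]])
preds = [int(p[1] >= 0.6) for p in probs]
print(preds)  # [1, 0]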
Example #27
def test_logistic_custom_learner():
    num_labels = 10

    class_weights = [(0.5 / (num_labels - 1))
                     for x in range(num_labels - 1)] + [0.5]
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train',
                      'test_logistic_custom_learner.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test',
                     'test_logistic_custom_learner.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    cfgfile = 'test_logistic_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)

    run_configuration(config_path, quiet=True)

    outprefix = 'test_logistic_custom_learner'
    preds = read_predictions(join(_my_dir, 'output',
                                  ('{}_{}_CustomLogisticRegressionWrapper'
                                   '.predictions'.format(outprefix,
                                                         outprefix))))

    expected = read_predictions(join(_my_dir, 'output',
                                     ('{}_{}_LogisticRegression.predictions'
                                      .format(outprefix, outprefix))))

    assert_array_equal(preds, expected)
Example #28
def test_logistic_custom_learner():
    num_labels = 10

    class_weights = [(0.5 / (num_labels - 1))
                     for x in range(num_labels - 1)] + [0.5]
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train',
                      'test_logistic_custom_learner.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test',
                     'test_logistic_custom_learner.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    cfgfile = 'test_logistic_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)

    run_configuration(config_path, quiet=True)

    outprefix = 'test_logistic_custom_learner'
    preds = read_predictions(join(_my_dir, 'output',
                                  ('{}_{}_CustomLogisticRegressionWrapper'
                                   '_predictions.tsv'.format(outprefix,
                                                             outprefix))))

    expected = read_predictions(join(_my_dir, 'output',
                                     ('{}_{}_LogisticRegression_predictions.tsv'
                                      .format(outprefix, outprefix))))

    assert_array_equal(preds, expected)
Example #29
def check_filter_ids(inverse=False):

    # create a feature set
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     train_test_ratio=1.0)

    # keep just the IDs after EXAMPLE_50, or do the inverse
    ids_to_filter = ['EXAMPLE_{}'.format(i) for i in range(51, 101)]
    if inverse:
        ids_kept = ['EXAMPLE_{}'.format(i) for i in range(1, 51)]
    else:
        ids_kept = ids_to_filter
    fs.filter(ids=ids_to_filter, inverse=inverse)

    # make sure that we removed the right things
    assert_array_equal(fs.ids, np.array(ids_kept))

    # make sure that number of ids, labels and features are the same
    eq_(fs.ids.shape[0], fs.labels.shape[0])
    eq_(fs.labels.shape[0], fs.features.shape[0])
Example #30
def test_mlp_classification():
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=3,
                                                 num_features=5)

    # train an MLPClassifier on the training data and evaluate on the
    # test data
    learner = Learner('MLPClassifier')
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=ConvergenceWarning)
        learner.train(train_fs, grid_search=False)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to the actual
    # test FeatureSet labels that we generated using
    # make_classification_data, by checking the accuracy
    accuracy = accuracy_score(predictions, test_fs.labels)
    assert_almost_equal(accuracy, 0.858, places=3)
Example #31
def test_majority_class_custom_learner():
    num_labels = 10

    # This will make data where the last class happens about 50% of the time.
    class_weights = [(0.5 / (num_labels - 1))
                     for x in range(num_labels - 1)] + [0.5]
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train',
                      'test_majority_class_custom_learner.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test',
                     'test_majority_class_custom_learner.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    cfgfile = 'test_majority_class_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)

    run_configuration(config_path, quiet=True)

    outprefix = 'test_majority_class_custom_learner'

    preds = read_predictions(
        join(_my_dir, 'output',
             ('{}_{}_MajorityClassLearner_predictions.tsv'.format(
                 outprefix, outprefix))))
    expected = np.array([float(num_labels - 1) for x in preds])
    assert_array_equal(preds, expected)
Example #32
def test_mlp_classification():
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=3,
                                                 num_features=5)

    # train an MLPClassifier on the training data and evaluate on the
    # test data
    learner = Learner('MLPClassifier')
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=ConvergenceWarning)
        learner.train(train_fs, grid_search=True)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to the actual
    # test FeatureSet labels that we generated using
    # make_classification_data, by checking the accuracy
    accuracy = accuracy_score(predictions, test_fs.labels)
    assert_almost_equal(accuracy, 0.825)
Example #33
def check_filter_features(inverse=False):

    # create a feature set
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=5,
                                     num_labels=3,
                                     train_test_ratio=1.0)

    # store the features in a separate matrix before filtering
    X = fs.features.todense()

    # keep just features f01 and f04, or drop them if inverting
    fs.filter(features=['f01', 'f04'], inverse=inverse)

    # make sure that we have the right number of feature columns
    # depending on whether we are inverting
    feature_shape = (100, 3) if inverse else (100, 2)
    eq_(fs.features.shape, feature_shape)

    # and that they are the first and fourth columns of X
    # if not inverting, or the second, third, and fifth
    # columns if inverting
    if inverse:
        feature_columns = X[:, [1, 2, 4]]
    else:
        feature_columns = X[:, [0, 3]]

    assert (fs.features.todense() == feature_columns).all()

    # make sure that the feature names that we kept are also correct
    feature_names = ['f02', 'f03', 'f05'] if inverse else ['f01', 'f04']
    assert_array_equal(np.array(fs.vectorizer.feature_names_),
                       feature_names)

    # make sure that number of ids, labels and features are the same
    eq_(fs.ids.shape[0], fs.labels.shape[0])
    eq_(fs.labels.shape[0], fs.features.shape[0])