Example #1
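All of these snippets are excerpted from a single regression test module, so none of them repeats its imports. Below is a minimal sketch of what they collectively assume; the module paths are best guesses against SKLL's public API, and make_regression_data, fill_in_config_paths_for_fancy_output, and _my_dir are test-local helpers defined elsewhere in the suite, not library functions.

import math
import re
import warnings
from os.path import join

import numpy as np
from nose.tools import assert_almost_equal, assert_greater, assert_less
from numpy.testing import assert_allclose, assert_array_equal
from scipy.stats import pearsonr
from sklearn.exceptions import ConvergenceWarning

from skll import Learner, run_configuration
from skll.data import FeatureSet, NDJWriter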
def check_linear_models(name,
                        use_feature_hashing=False,
                        use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        (train_fs,
         test_fs,
         weightdict) = make_regression_data(num_examples=5000,
                                            num_features=10,
                                            use_feature_hashing=True,
                                            feature_bins=5)
    else:
        train_fs, test_fs, weightdict = make_regression_data(num_examples=2000,
                                                             num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the learned weights are close to the weights
    # that we got from make_regression_data. Compare the ceilings
    # first, since matching ceilings should be enough to rule out
    # anything catastrophic. However, with feature hashing the
    # ceilings are sometimes not identical, so when that check
    # fails we also compare the rounded weights. At least one of
    # those two equalities _must_ be satisfied.

    # get the weights for this trained model
    learned_weights = learner.model_params[0]

    for feature_name in learned_weights:
        learned_w_ceil = math.ceil(learned_weights[feature_name])
        given_w_ceil = math.ceil(weightdict[feature_name])
        learned_w_round = round(learned_weights[feature_name], 0)
        given_w_round = round(weightdict[feature_name], 0)
        ceil_equal = learned_w_ceil == given_w_ceil
        round_equal = learned_w_round == given_w_round
        either_equal = ceil_equal or round_equal
        assert either_equal

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)
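Helpers like this are usually fanned out by a nose-style test generator rather than called directly. A hypothetical driver (the learner names below are illustrative, not taken from the original suite):

def test_linear_models():
    for name in ['LinearRegression', 'Ridge', 'Lasso']:
        for use_feature_hashing in [False, True]:
            for use_rescaling in [False, True]:
                yield (check_linear_models, name,
                       use_feature_hashing, use_rescaling)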
Example #2
def check_ensemble_models(name,
                          use_feature_hashing=False,
                          use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the feature importances are as expected: five
    # values with feature hashing (one per hash bin) and three
    # without (one per original feature)
    if name.endswith('AdaBoostRegressor'):
        if use_feature_hashing:
            expected_feature_importances = [0.749811,
                                            0.001373,
                                            0.23357,
                                            0.011691,
                                            0.003554]
        else:
            expected_feature_importances = [0.10266744, 0.18681777, 0.71051479]
    else:
        expected_feature_importances = ([0.735756,
                                         0.001034,
                                         0.242734,
                                         0.015836,
                                         0.00464] if use_feature_hashing else
                                        [0.082621,
                                         0.166652,
                                         0.750726])

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)
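Since rtol is zeroed out, the importance check is purely absolute: each learned importance may deviate from its expected value by at most atol=0.01. A self-contained illustration of that tolerance:

import numpy as np

# passes: every element is within 0.01 of its expected value
np.testing.assert_allclose([0.103, 0.187, 0.710],
                           [0.10266744, 0.18681777, 0.71051479],
                           atol=1e-2, rtol=0)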
Example #3
def check_tree_models(name,
                      use_feature_hashing=False,
                      use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the feature importances are as expected (five
    # values with feature hashing, three without)
    if name.endswith('DecisionTreeRegressor'):
        expected_feature_importances = ([0.730811,
                                         0.001834,
                                         0.247603,
                                         0.015241,
                                         0.004511] if use_feature_hashing else
                                        [0.08926899,
                                         0.15585068,
                                         0.75488033])
    else:
        expected_feature_importances = ([0.733654,
                                         0.002528,
                                         0.245527,
                                         0.013664,
                                         0.004627] if use_feature_hashing else
                                        [0.07974267,
                                         0.16121895,
                                         0.75903838])

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)
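The endswith branch above distinguishes exactly two tree learners. A hypothetical driver covering both (the second name is inferred from that branch, not confirmed by the original suite):

def test_tree_models():
    for name in ['DecisionTreeRegressor', 'RandomForestRegressor']:
        yield check_tree_models, name, False, False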
Example #4
def test_feature_merging_order_invariance():
    """
    Test whether featuresets with different orders of IDs can be merged
    """

    # First, randomly generate two feature sets and then make sure they have
    # the same labels.
    train_fs1, _, _ = make_regression_data()
    train_fs2, _, _ = make_regression_data(start_feature_num=3,
                                           random_state=87654321)
    train_fs2.labels = train_fs1.labels.copy()

    # make a shuffled copy of feature set 2
    shuffled_indices = list(range(len(train_fs2.ids)))
    np.random.seed(123456789)
    np.random.shuffle(shuffled_indices)
    train_fs2_ids_shuf = train_fs2.ids[shuffled_indices]
    train_fs2_labels_shuf = train_fs2.labels[shuffled_indices]
    train_fs2_features_shuf = train_fs2.features[shuffled_indices]
    train_fs2_shuf = FeatureSet("f2_shuf",
                                train_fs2_ids_shuf,
                                labels=train_fs2_labels_shuf,
                                features=train_fs2_features_shuf,
                                vectorizer=train_fs2.vectorizer)

    # merge feature set 1 with feature set 2 and with its shuffled version
    merged_fs = train_fs1 + train_fs2
    merged_fs_shuf = train_fs1 + train_fs2_shuf

    # check that the two merged versions are the same
    feature_names = (train_fs1.vectorizer.get_feature_names() +
                     train_fs2.vectorizer.get_feature_names())
    assert_array_equal(merged_fs.vectorizer.get_feature_names(), feature_names)
    assert_array_equal(merged_fs_shuf.vectorizer.get_feature_names(),
                       feature_names)

    assert_array_equal(merged_fs.labels, train_fs1.labels)
    assert_array_equal(merged_fs.labels, train_fs2.labels)
    assert_array_equal(merged_fs.labels, merged_fs_shuf.labels)

    assert_array_equal(merged_fs.ids, train_fs1.ids)
    assert_array_equal(merged_fs.ids, train_fs2.ids)
    assert_array_equal(merged_fs.ids, merged_fs_shuf.ids)

    assert_array_equal(merged_fs.features[:, 0:2].todense(),
                       train_fs1.features.todense())
    assert_array_equal(merged_fs.features[:, 2:4].todense(),
                       train_fs2.features.todense())
    assert_array_equal(merged_fs.features.todense(),
                       merged_fs_shuf.features.todense())

    assert not np.all(merged_fs.features[:, 0:2].todense() ==
                      merged_fs.features[:, 2:4].todense())
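The final assertion is a sanity guard rather than part of the order-invariance check itself: if the merge had accidentally written the same values into both column blocks, the block-wise equality assertions above would pass vacuously, so the two blocks are explicitly required to differ somewhere.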
Example #5
def check_invalid_regression_grid_objective(learner, grid_objective):
    """
    Checks whether the grid objective function is valid for this regressor
    """
    (train_fs, _, _) = make_regression_data()
    clf = Learner(learner)
    clf.train(train_fs, grid_objective=grid_objective)
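Note that this helper contains no assertion of its own: it is expected to raise. A caller would therefore wrap it in something like the hypothetical driver below (the exception type and the learner/objective pair are assumptions; the same pattern applies to check_invalid_regression_metric in the next example):

from nose.tools import assert_raises

def test_invalid_grid_objectives():
    # pair a regressor with a classification-only objective
    for learner, objective in [('LinearRegression', 'accuracy')]:
        yield (assert_raises, ValueError,
               check_invalid_regression_grid_objective,
               learner, objective)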
Example #6
def check_invalid_regression_metric(learner, metric, by_itself=False):
    """
    Checks that invalid metrics raise exceptions
    """
    (train_fs, test_fs, _) = make_regression_data()
    clf = Learner(learner)
    clf.train(train_fs, grid_search=False)
    output_metrics = [metric] if by_itself else ['pearson', metric]
    clf.evaluate(test_fs, output_metrics=output_metrics)
Example #7
def check_rescaling(name, grid_search=False):

    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                sd_noise=4,
                                                num_features=3)

    # instantiate the given learner and its rescaled counterpart
    learner = Learner(name)
    rescaled_learner = Learner('Rescaled' + name)

    # train both the regular regressor and the rescaled regressor
    # with and without using grid search
    if grid_search:
        learner.train(train_fs, grid_search=True, grid_objective='pearson')
        rescaled_learner.train(train_fs,
                               grid_search=True,
                               grid_objective='pearson')
    else:
        learner.train(train_fs, grid_search=False)
        rescaled_learner.train(train_fs, grid_search=False)

    # now generate both sets of predictions on the test feature set
    predictions = learner.predict(test_fs)
    rescaled_predictions = rescaled_learner.predict(test_fs)

    # ... and on the training feature set
    train_predictions = learner.predict(train_fs)
    rescaled_train_predictions = rescaled_learner.predict(train_fs)

    # make sure that the two sets of test predictions are almost
    # perfectly correlated, since the only difference is that one
    # set has been rescaled
    assert_almost_equal(pearsonr(predictions, rescaled_predictions)[0],
                        1.0,
                        places=3)

    # make sure that the standard deviation of the rescaled test set
    # predictions is higher than the standard deviation of the regular test set
    # predictions
    p_std = np.std(predictions)
    rescaled_p_std = np.std(rescaled_predictions)
    assert_greater(rescaled_p_std, p_std)

    # make sure that the standard deviation of the rescaled predictions
    # on the TRAINING set (not the TEST) is closer to the standard
    # deviation of the training set labels than the standard deviation
    # of the regular predictions.
    train_y_std = np.std(train_fs.labels)
    train_p_std = np.std(train_predictions)
    rescaled_train_p_std = np.std(rescaled_train_predictions)
    assert_less(abs(rescaled_train_p_std - train_y_std),
                abs(train_p_std - train_y_std))
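For context, a minimal sketch of the idea behind SKLL's Rescaled* learners, as implied by the assertions above: raw predictions are z-scored against the prediction distribution observed at training time, then mapped onto the training label distribution. This is a simplified assumption about the mechanism, not SKLL's exact implementation (which also handles clipping and degenerate cases):

import numpy as np

def rescale_predictions(raw_preds, train_preds, train_labels):
    # z-score against the training-time prediction distribution ...
    z = (raw_preds - np.mean(train_preds)) / np.std(train_preds)
    # ... then map onto the training label distribution
    return z * np.std(train_labels) + np.mean(train_labels)

This is why the rescaled training predictions' standard deviation lands closer to the training labels' standard deviation, which is exactly what the final assert_less verifies.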
Example #8
def check_non_linear_models(name,
                            use_feature_hashing=False,
                            use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, weightdict = make_regression_data(
            num_examples=5000,
            num_features=10,
            use_feature_hashing=True,
            feature_bins=5)
    else:
        train_fs, test_fs, weightdict = make_regression_data(num_examples=2000,
                                                             num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # Note that we cannot check the feature weights here
    # since `model_params` is not defined for non-linear
    # kernels.

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)
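A hypothetical driver for this helper; SVR with its default RBF kernel is the canonical non-linear case (the name list is illustrative):

def test_non_linear_models():
    for name in ['SVR']:
        for use_rescaling in [False, True]:
            yield check_non_linear_models, name, False, use_rescaling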
Example #9
def test_additional_metrics():
    """
    Test additional metrics in the results file for a regressor
    """
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                num_features=3)

    # train a regression model using the train feature set
    learner = Learner('LinearRegression')
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # evaluate the trained model using the test feature set
    results = learner.evaluate(test_fs, output_metrics=['spearman',
                                                        'kendall_tau'])

    # check that the values for the additional metrics are as expected
    additional_scores_dict = results[-1]
    assert_almost_equal(additional_scores_dict['spearman'], 0.9996, places=4)
    assert_almost_equal(additional_scores_dict['kendall_tau'], 0.9847, places=4)
Example #10
def check_adaboost_regression(base_estimator):
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                sd_noise=4,
                                                num_features=3)

    # train an AdaBoostRegressor on the training data and evaluate
    # it on the test data
    learner = Learner('AdaBoostRegressor',
                      model_kwargs={'base_estimator': base_estimator})
    learner.train(train_fs, grid_search=False)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)
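A hypothetical driver; SKLL accepts the base estimator as a string in model_kwargs and resolves it to the corresponding sklearn estimator (the particular names are illustrative):

def test_adaboost_regression():
    for base_estimator in ['DecisionTreeRegressor', 'SVR']:
        yield check_adaboost_regression, base_estimator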
Example #11
def check_ransac_regression(base_estimator, pearson_value):
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                sd_noise=4,
                                                num_features=3)

    # train a RANSACRegressor on the training data and evaluate
    # it on the test data
    model_kwargs = {'base_estimator': base_estimator} if base_estimator else {}
    learner = Learner('RANSACRegressor', model_kwargs=model_kwargs)
    learner.train(train_fs, grid_search=False)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated and the value
    # of the correlation is as expected
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, pearson_value)
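The correlation floor is parameterized here because RANSAC's fit quality depends on the base estimator. A hypothetical driver (the pairs are illustrative; None exercises sklearn's default base estimator):

def test_ransac_regression():
    for base_estimator, pearson_value in [(None, 0.95),
                                          ('LinearRegression', 0.95)]:
        yield check_ransac_regression, base_estimator, pearson_value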
Example #12
def check_mlp_regression(use_rescaling=False):
    train_fs, test_fs, _ = make_regression_data(num_examples=500,
                                                sd_noise=4,
                                                num_features=5)

    # train an MLPRegressor on the training data and evaluate
    # it on the test data
    name = 'RescaledMLPRegressor' if use_rescaling else 'MLPRegressor'
    learner = Learner(name)
    # we don't want to see any convergence warnings during training
    # (grid search is disabled here)
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=ConvergenceWarning)
        learner.train(train_fs, grid_search=False)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.98)
Example #13
def test_fancy_output():
    """
    Test the descriptive statistics output in the results file for a regressor
    """
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                num_features=3)

    # train a regression model using the train feature set
    learner = Learner('LinearRegression')
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # evaluate the trained model using the test feature set
    resultdict = learner.evaluate(test_fs)
    actual_stats_from_api = dict(resultdict[2]['descriptive']['actual'])
    pred_stats_from_api = dict(resultdict[2]['descriptive']['predicted'])

    # write out the training and test feature set
    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    train_writer = NDJWriter(join(train_dir, 'fancy_train.jsonlines'),
                             train_fs)
    train_writer.write()
    test_writer = NDJWriter(join(test_dir, 'fancy_test.jsonlines'), test_fs)
    test_writer.write()

    # now get the config file template, fill it in and run it
    # so that we can get a results file
    config_template_path = join(_my_dir, 'configs',
                                'test_regression_fancy_output.template.cfg')
    config_path = fill_in_config_paths_for_fancy_output(config_template_path)

    run_configuration(config_path, quiet=True)

    # read in the results file and get the descriptive statistics
    actual_stats_from_file = {}
    pred_stats_from_file = {}
    with open(
            join(output_dir, ('regression_fancy_output_train_fancy_train.'
                              'jsonlines_test_fancy_test.jsonlines'
                              '_LinearRegression.results')), 'r') as resultf:

        result_output = resultf.read().strip().split('\n')
        for desc_stat_line in result_output[26:30]:
            desc_stat_line = desc_stat_line.strip()
            if not desc_stat_line:
                continue
            # each descriptive-statistics line in the results file
            # looks like: "Mean = 1.234 (actual), 1.235 (predicted)"
            m = re.search(
                r'([A-Za-z]+)\s+=\s+(-?[0-9]+\.?[0-9]*)\s+'
                r'\((actual)\),\s+(-?[0-9]+\.?[0-9]*)\s+'
                r'\((predicted)\)', desc_stat_line)
            stat_type, actual_value, _, pred_value, _ = m.groups()
            actual_stats_from_file[stat_type.lower()] = float(actual_value)
            pred_stats_from_file[stat_type.lower()] = float(pred_value)

    for stat_type in actual_stats_from_api:

        assert_almost_equal(actual_stats_from_file[stat_type],
                            actual_stats_from_api[stat_type],
                            places=4)

        assert_almost_equal(pred_stats_from_file[stat_type],
                            pred_stats_from_api[stat_type],
                            places=4)
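For reference, the hard-coded slice result_output[26:30] pins the four descriptive-statistics lines of the results file. Judging by the regex, each line looks roughly like this (stat names and values are illustrative):

Min = -4.1324 (actual), -4.1201 (predicted)
Max = 4.9871 (actual), 4.9932 (predicted)
Mean = 0.0123 (actual), 0.0125 (predicted)
Std = 2.3456 (actual), 2.3441 (predicted)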