def check_linear_models(name,
                        use_feature_hashing=False,
                        use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        (train_fs,
         test_fs,
         weightdict) = make_regression_data(num_examples=5000,
                                            num_features=10,
                                            use_feature_hashing=True,
                                            feature_bins=5)
    else:
        train_fs, test_fs, weightdict = make_regression_data(num_examples=2000,
                                                             num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created;
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the learned weights are close to the weights
    # that we got from make_regression_data. Comparing the ceilings
    # of the weights should be enough to make sure nothing
    # catastrophic happened. However, sometimes with feature hashing
    # the ceilings are not exactly identical, so when that fails we
    # also check that the rounded weights are the same. One of those
    # two equalities _must_ be satisfied.

    # get the weights for this trained model
    learned_weights = learner.model_params[0]

    for feature_name in learned_weights:
        learned_w_ceil = math.ceil(learned_weights[feature_name])
        given_w_ceil = math.ceil(weightdict[feature_name])
        learned_w_round = round(learned_weights[feature_name], 0)
        given_w_round = round(weightdict[feature_name], 0)
        ceil_equal = learned_w_ceil == given_w_ceil
        round_equal = learned_w_round == given_w_round
        either_equal = ceil_equal or round_equal
        assert either_equal

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)

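# A minimal sketch (not part of the original suite) of how check_linear_models
# could be driven as a nose-style test generator, which the nose.tools
# assertions used in this module suggest is the runner in use; the learner
# names below are illustrative assumptions, not the exact set exercised here.
def test_linear_models():
    for name in ['LinearRegression', 'Ridge', 'Lasso']:
        for use_feature_hashing in [False, True]:
            for use_rescaling in [False, True]:
                yield (check_linear_models, name, use_feature_hashing,
                       use_rescaling)
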
def check_ensemble_models(name,
                          use_feature_hashing=False,
                          use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created;
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the feature importances are as expected
    if name.endswith('AdaBoostRegressor'):
        if use_feature_hashing:
            expected_feature_importances = [0.749811, 0.001373, 0.23357,
                                            0.011691, 0.003554]
        else:
            expected_feature_importances = [0.10266744, 0.18681777, 0.71051479]
    else:
        expected_feature_importances = ([0.735756, 0.001034, 0.242734,
                                         0.015836, 0.00464]
                                        if use_feature_hashing else
                                        [0.082621, 0.166652, 0.750726])
    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)

def check_tree_models(name,
                      use_feature_hashing=False,
                      use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created;
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the feature importances are as expected
    if name.endswith('DecisionTreeRegressor'):
        expected_feature_importances = ([0.730811, 0.001834, 0.247603,
                                         0.015241, 0.004511]
                                        if use_feature_hashing else
                                        [0.08926899, 0.15585068, 0.75488033])
    else:
        expected_feature_importances = ([0.733654, 0.002528, 0.245527,
                                         0.013664, 0.004627]
                                        if use_feature_hashing else
                                        [0.07974267, 0.16121895, 0.75903838])
    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)

def test_feature_merging_order_invariance():
    """
    Test whether featuresets with different orders of IDs can be merged
    """

    # first, randomly generate two feature sets and then make sure they have
    # the same labels
    train_fs1, _, _ = make_regression_data()
    train_fs2, _, _ = make_regression_data(start_feature_num=3,
                                           random_state=87654321)
    train_fs2.labels = train_fs1.labels.copy()

    # make a shuffled copy of feature set 2
    shuffled_indices = list(range(len(train_fs2.ids)))
    np.random.seed(123456789)
    np.random.shuffle(shuffled_indices)
    train_fs2_ids_shuf = train_fs2.ids[shuffled_indices]
    train_fs2_labels_shuf = train_fs2.labels[shuffled_indices]
    train_fs2_features_shuf = train_fs2.features[shuffled_indices]
    train_fs2_shuf = FeatureSet("f2_shuf",
                                train_fs2_ids_shuf,
                                labels=train_fs2_labels_shuf,
                                features=train_fs2_features_shuf,
                                vectorizer=train_fs2.vectorizer)

    # merge feature set 1 with feature set 2 and with its shuffled version
    merged_fs = train_fs1 + train_fs2
    merged_fs_shuf = train_fs1 + train_fs2_shuf

    # check that the two merged versions are the same
    feature_names = (train_fs1.vectorizer.get_feature_names() +
                     train_fs2.vectorizer.get_feature_names())
    assert_array_equal(merged_fs.vectorizer.get_feature_names(),
                       feature_names)
    assert_array_equal(merged_fs_shuf.vectorizer.get_feature_names(),
                       feature_names)

    assert_array_equal(merged_fs.labels, train_fs1.labels)
    assert_array_equal(merged_fs.labels, train_fs2.labels)
    assert_array_equal(merged_fs.labels, merged_fs_shuf.labels)

    assert_array_equal(merged_fs.ids, train_fs1.ids)
    assert_array_equal(merged_fs.ids, train_fs2.ids)
    assert_array_equal(merged_fs.ids, merged_fs_shuf.ids)

    assert_array_equal(merged_fs.features[:, 0:2].todense(),
                       train_fs1.features.todense())
    assert_array_equal(merged_fs.features[:, 2:4].todense(),
                       train_fs2.features.todense())
    assert_array_equal(merged_fs.features.todense(),
                       merged_fs_shuf.features.todense())

    assert not np.all(merged_fs.features[:, 0:2].todense() ==
                      merged_fs.features[:, 2:4].todense())

def check_invalid_regression_grid_objective(learner, grid_objective):
    """
    Checks whether the grid objective function is valid for this regressor
    """
    (train_fs, _, _) = make_regression_data()
    clf = Learner(learner)
    clf.train(train_fs, grid_objective=grid_objective)

def check_invalid_regression_metric(learner, metric, by_itself=False):
    """
    Checks that invalid metrics raise exceptions
    """
    (train_fs, test_fs, _) = make_regression_data()
    clf = Learner(learner)
    clf.train(train_fs, grid_search=False)
    output_metrics = [metric] if by_itself else ['pearson', metric]
    clf.evaluate(test_fs, output_metrics=output_metrics)

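# A minimal sketch (not part of the original suite) of how the two
# invalid-input helpers above might be wrapped so that the expected failures
# are actually asserted; the learner name, metric name, and the ValueError
# exception type are illustrative assumptions.
from nose.tools import assert_raises  # if not already imported at module top


def test_invalid_regression_grid_objective():
    assert_raises(ValueError,
                  check_invalid_regression_grid_objective,
                  'LinearRegression',
                  'accuracy')


def test_invalid_regression_metric():
    assert_raises(ValueError,
                  check_invalid_regression_metric,
                  'LinearRegression',
                  'f1_score_macro')
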
def check_rescaling(name, grid_search=False):

    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                sd_noise=4,
                                                num_features=3)

    # instantiate the given learner and its rescaled counterpart
    learner = Learner(name)
    rescaled_learner = Learner('Rescaled' + name)

    # train both the regular regressor and the rescaled regressor,
    # with or without grid search as requested
    if grid_search:
        learner.train(train_fs, grid_search=True, grid_objective='pearson')
        rescaled_learner.train(train_fs, grid_search=True,
                               grid_objective='pearson')
    else:
        learner.train(train_fs, grid_search=False)
        rescaled_learner.train(train_fs, grid_search=False)

    # now generate both sets of predictions on the test feature set
    predictions = learner.predict(test_fs)
    rescaled_predictions = rescaled_learner.predict(test_fs)

    # ... and on the training feature set
    train_predictions = learner.predict(train_fs)
    rescaled_train_predictions = rescaled_learner.predict(train_fs)

    # make sure that the regular and rescaled predictions are almost
    # perfectly correlated, since the only difference is that one set
    # has been rescaled
    assert_almost_equal(pearsonr(predictions, rescaled_predictions)[0],
                        1.0,
                        places=3)

    # make sure that the standard deviation of the rescaled test set
    # predictions is higher than the standard deviation of the regular
    # test set predictions
    p_std = np.std(predictions)
    rescaled_p_std = np.std(rescaled_predictions)
    assert_greater(rescaled_p_std, p_std)

    # make sure that the standard deviation of the rescaled predictions
    # on the TRAINING set (not the TEST set) is closer to the standard
    # deviation of the training set labels than the standard deviation
    # of the regular predictions
    train_y_std = np.std(train_fs.labels)
    train_p_std = np.std(train_predictions)
    rescaled_train_p_std = np.std(rescaled_train_predictions)
    assert_less(abs(rescaled_train_p_std - train_y_std),
                abs(train_p_std - train_y_std))

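# A minimal sketch (not part of the original suite) of how check_rescaling
# might be driven both with and without grid search; the learner names are
# illustrative assumptions.
def test_rescaling():
    for name in ['LinearRegression', 'SVR']:
        for grid_search in [False, True]:
            yield check_rescaling, name, grid_search
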
def check_non_linear_models(name,
                            use_feature_hashing=False,
                            use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, weightdict = make_regression_data(
            num_examples=5000,
            num_features=10,
            use_feature_hashing=True,
            feature_bins=5)
    else:
        train_fs, test_fs, weightdict = make_regression_data(num_examples=2000,
                                                             num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created;
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # note that we cannot check the feature weights here
    # since `model_params()` is not defined for non-linear
    # kernels

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)

def test_additional_metrics():
    """
    Test additional metrics in the results file for a regressor
    """
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                num_features=3)

    # train a regression model using the train feature set
    learner = Learner('LinearRegression')
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # evaluate the trained model using the test feature set
    results = learner.evaluate(test_fs,
                               output_metrics=['spearman', 'kendall_tau'])

    # check that the values for the additional metrics are as expected
    additional_scores_dict = results[-1]
    assert_almost_equal(additional_scores_dict['spearman'], 0.9996, places=4)
    assert_almost_equal(additional_scores_dict['kendall_tau'], 0.9847, places=4)

def check_adaboost_regression(base_estimator):
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                sd_noise=4,
                                                num_features=3)

    # train an AdaBoostRegressor on the training data and evaluate it
    # on the testing data
    learner = Learner('AdaBoostRegressor',
                      model_kwargs={'base_estimator': base_estimator})
    learner.train(train_fs, grid_search=False)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)

def check_ransac_regression(base_estimator, pearson_value):
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                sd_noise=4,
                                                num_features=3)

    # train a RANSACRegressor on the training data and evaluate it
    # on the testing data
    model_kwargs = {'base_estimator': base_estimator} if base_estimator else {}
    learner = Learner('RANSACRegressor', model_kwargs=model_kwargs)
    learner.train(train_fs, grid_search=False)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated and the value
    # of the correlation is as expected
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, pearson_value)

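# A minimal sketch (not part of the original suite) of how the AdaBoost and
# RANSAC helpers above might be parameterized over base estimators; the
# estimator values (passed here as names) and the expected minimum pearson
# correlations are illustrative assumptions and may need to be estimator
# instances depending on how model_kwargs is handled.
def test_adaboost_regression():
    for base_estimator_name in ['DecisionTreeRegressor', 'SVR']:
        yield check_adaboost_regression, base_estimator_name


def test_ransac_regression():
    # pairs of (base estimator, minimum expected pearson correlation)
    for base_estimator_name, pearson_value in [(None, 0.95),
                                               ('LinearRegression', 0.95)]:
        yield check_ransac_regression, base_estimator_name, pearson_value
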
def check_mlp_regression(use_rescaling=False):
    train_fs, test_fs, _ = make_regression_data(num_examples=500,
                                                sd_noise=4,
                                                num_features=5)

    # train an MLPRegressor on the training data and evaluate it
    # on the testing data
    name = 'RescaledMLPRegressor' if use_rescaling else 'MLPRegressor'
    learner = Learner(name)

    # we don't want to see any convergence warnings during training
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=ConvergenceWarning)
        learner.train(train_fs, grid_search=False)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.98)

def test_fancy_output():
    """
    Test the descriptive statistics output in the results file for a regressor
    """
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                num_features=3)

    # train a regression model using the train feature set
    learner = Learner('LinearRegression')
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # evaluate the trained model using the test feature set
    resultdict = learner.evaluate(test_fs)
    actual_stats_from_api = dict(resultdict[2]['descriptive']['actual'])
    pred_stats_from_api = dict(resultdict[2]['descriptive']['predicted'])

    # write out the training and test feature sets
    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    train_writer = NDJWriter(join(train_dir, 'fancy_train.jsonlines'),
                             train_fs)
    train_writer.write()
    test_writer = NDJWriter(join(test_dir, 'fancy_test.jsonlines'), test_fs)
    test_writer.write()

    # now get the config file template, fill it in, and run it
    # so that we can get a results file
    config_template_path = join(_my_dir,
                                'configs',
                                'test_regression_fancy_output.template.cfg')
    config_path = fill_in_config_paths_for_fancy_output(config_template_path)

    run_configuration(config_path, quiet=True)

    # read in the results file and get the descriptive statistics
    actual_stats_from_file = {}
    pred_stats_from_file = {}
    with open(join(output_dir, ('regression_fancy_output_train_fancy_train.'
                                'jsonlines_test_fancy_test.jsonlines'
                                '_LinearRegression.results')), 'r') as resultf:

        result_output = resultf.read().strip().split('\n')
        for desc_stat_line in result_output[26:30]:
            desc_stat_line = desc_stat_line.strip()
            if not desc_stat_line:
                continue
            else:
                m = re.search(r'([A-Za-z]+)\s+=\s+(-?[0-9]+.?[0-9]*)\s+'
                              r'\((actual)\),\s+(-?[0-9]+.?[0-9]*)\s+'
                              r'\((predicted)\)', desc_stat_line)
                stat_type, actual_value, _, pred_value, _ = m.groups()
                actual_stats_from_file[stat_type.lower()] = float(actual_value)
                pred_stats_from_file[stat_type.lower()] = float(pred_value)

    # compare the statistics parsed from the results file against
    # the ones returned by the evaluate() API call
    for stat_type in actual_stats_from_api:
        assert_almost_equal(actual_stats_from_file[stat_type],
                            actual_stats_from_api[stat_type],
                            places=4)
        assert_almost_equal(pred_stats_from_file[stat_type],
                            pred_stats_from_api[stat_type],
                            places=4)