def check_tree_models(name, use_feature_hashing=False, use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the feature importances are as expected
    if name.endswith('DecisionTreeRegressor'):
        expected_feature_importances = ([0.37483895, 0.08816508, 0.25379838,
                                         0.18337128, 0.09982631]
                                        if use_feature_hashing else
                                        [0.08926899, 0.15585068, 0.75488033])
        expected_cor_range = [0.5, 0.6] if use_feature_hashing else [0.9, 1.0]
    else:
        expected_feature_importances = ([0.40195798, 0.06702903, 0.25816559,
                                         0.18185518, 0.09099222]
                                        if use_feature_hashing else
                                        [0.07974267, 0.16121895, 0.75903838])
        expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to the actual test
    # FeatureSet labels that we generated using make_regression_data,
    # i.e., that their Pearson correlation with the labels falls within
    # the expected range
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])
def check_tree_models(name, use_feature_hashing=False, use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the feature importances are as expected
    if name.endswith('DecisionTreeRegressor'):
        expected_feature_importances = ([0.37331461, 0.08572699, 0.2543484,
                                         0.1841172, 0.1024928]
                                        if use_feature_hashing else
                                        [0.08931994, 0.15545093, 0.75522913])
        expected_cor_range = [0.5, 0.6] if use_feature_hashing else [0.9, 1.0]
    else:
        if use_feature_hashing:
            expected_feature_importances = [0.40195655, 0.06702161,
                                            0.25814858, 0.18183947,
                                            0.09103379]
        else:
            expected_feature_importances = [0.07975691, 0.16122862,
                                            0.75901447]
        expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    rtol=1e-2)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to the actual test
    # FeatureSet labels that we generated using make_regression_data,
    # i.e., that their Pearson correlation with the labels falls within
    # the expected range
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])
def check_ensemble_models(name, use_feature_hashing=False,
                          use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the feature importances are as expected
    if name.endswith('AdaBoostRegressor'):
        if use_feature_hashing:
            expected_feature_importances = [0.33718443, 0.07810721,
                                            0.25621769, 0.19489766,
                                            0.13359301]
        else:
            expected_feature_importances = [0.10266744, 0.18681777,
                                            0.71051479]
    else:
        expected_feature_importances = ([0.204, 0.172, 0.178, 0.212, 0.234]
                                        if use_feature_hashing else
                                        [0.262, 0.288, 0.45])

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to the actual test
    # FeatureSet labels that we generated using make_regression_data,
    # i.e., that their Pearson correlation with the labels falls within
    # the expected range
    cor, _ = pearsonr(predictions, test_fs.labels)
    expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])
def check_linear_models(name, use_feature_hashing=False, use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        (train_fs, test_fs,
         weightdict) = make_regression_data(num_examples=5000,
                                            num_features=10,
                                            use_feature_hashing=True,
                                            feature_bins=5)
    else:
        train_fs, test_fs, weightdict = make_regression_data(num_examples=2000,
                                                             num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the weights are close to the weights
    # that we got from make_regression_data. Take the
    # ceiling before comparing since just comparing
    # the ceilings should be enough to make sure nothing
    # catastrophic happened. However, sometimes with
    # feature hashing, the ceiling is not exactly identical,
    # so when that fails we want to check that the rounded
    # feature values are the same. One of those two equalities
    # _must_ be satisfied.

    # get the weights for this trained model
    learned_weights = learner.model_params[0]

    for feature_name in learned_weights:
        learned_w_ceil = math.ceil(learned_weights[feature_name])
        given_w_ceil = math.ceil(weightdict[feature_name])
        learned_w_round = round(learned_weights[feature_name], 0)
        given_w_round = round(weightdict[feature_name], 0)
        ceil_equal = learned_w_ceil == given_w_ceil
        round_equal = learned_w_round == given_w_round
        either_equal = ceil_equal or round_equal
        assert either_equal

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)
def check_tree_models(name, use_feature_hashing=False, use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the feature importances are as expected
    if name.endswith('DecisionTreeRegressor'):
        expected_feature_importances = ([0.730811, 0.001834, 0.247603,
                                         0.015241, 0.004511]
                                        if use_feature_hashing else
                                        [0.08926899, 0.15585068, 0.75488033])
    else:
        expected_feature_importances = ([0.733654, 0.002528, 0.245527,
                                         0.013664, 0.004627]
                                        if use_feature_hashing else
                                        [0.07974267, 0.16121895, 0.75903838])

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)
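# A minimal nose-style generator sketch showing how the check_tree_models
# helper above would typically be driven, mirroring the `yield` pattern used
# by test_pipeline_attribute later in this module. The learner names are
# illustrative assumptions, not taken from this file.
def test_tree_models():
    for name in ['DecisionTreeRegressor', 'RandomForestRegressor']:
        for use_feature_hashing in [False, True]:
            for use_rescaling in [False, True]:
                yield (check_tree_models, name, use_feature_hashing,
                       use_rescaling)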
def check_ensemble_models(name, use_feature_hashing=False,
                          use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the feature importances are as expected
    if name.endswith('AdaBoostRegressor'):
        if use_feature_hashing:
            expected_feature_importances = [0.749811, 0.001373, 0.23357,
                                            0.011691, 0.003554]
        else:
            expected_feature_importances = [0.10266744, 0.18681777,
                                            0.71051479]
    else:
        expected_feature_importances = ([0.735756, 0.001034, 0.242734,
                                         0.015836, 0.00464]
                                        if use_feature_hashing else
                                        [0.082621, 0.166652, 0.750726])

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)
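# A minimal generator sketch for the ensemble check above, in the same
# nose-style `yield` pattern; the learner names are illustrative assumptions.
def test_ensemble_models():
    for name in ['AdaBoostRegressor', 'GradientBoostingRegressor']:
        for use_feature_hashing in [False, True]:
            for use_rescaling in [False, True]:
                yield (check_ensemble_models, name, use_feature_hashing,
                       use_rescaling)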
def test_feature_merging_order_invariance():
    """
    Test whether featuresets with different orders of IDs can be merged
    """

    # First, randomly generate two feature sets and then make sure they have
    # the same labels.
    train_fs1, _, _ = make_regression_data()
    train_fs2, _, _ = make_regression_data(start_feature_num=3,
                                           random_state=87654321)
    train_fs2.labels = train_fs1.labels.copy()

    # make a shuffled copy of feature set 2
    shuffled_indices = list(range(len(train_fs2.ids)))
    np.random.seed(123456789)
    np.random.shuffle(shuffled_indices)
    train_fs2_ids_shuf = train_fs2.ids[shuffled_indices]
    train_fs2_labels_shuf = train_fs2.labels[shuffled_indices]
    train_fs2_features_shuf = train_fs2.features[shuffled_indices]
    train_fs2_shuf = FeatureSet("f2_shuf",
                                train_fs2_ids_shuf,
                                labels=train_fs2_labels_shuf,
                                features=train_fs2_features_shuf,
                                vectorizer=train_fs2.vectorizer)

    # merge feature set 1 with feature set 2 and its shuffled version
    merged_fs = train_fs1 + train_fs2
    merged_fs_shuf = train_fs1 + train_fs2_shuf

    # check that the two merged versions are the same
    feature_names = (train_fs1.vectorizer.get_feature_names() +
                     train_fs2.vectorizer.get_feature_names())
    assert_array_equal(merged_fs.vectorizer.get_feature_names(),
                       feature_names)
    assert_array_equal(merged_fs_shuf.vectorizer.get_feature_names(),
                       feature_names)

    assert_array_equal(merged_fs.labels, train_fs1.labels)
    assert_array_equal(merged_fs.labels, train_fs2.labels)
    assert_array_equal(merged_fs.labels, merged_fs_shuf.labels)

    assert_array_equal(merged_fs.ids, train_fs1.ids)
    assert_array_equal(merged_fs.ids, train_fs2.ids)
    assert_array_equal(merged_fs.ids, merged_fs_shuf.ids)

    assert_array_equal(merged_fs.features[:, 0:2].todense(),
                       train_fs1.features.todense())
    assert_array_equal(merged_fs.features[:, 2:4].todense(),
                       train_fs2.features.todense())
    assert_array_equal(merged_fs.features.todense(),
                       merged_fs_shuf.features.todense())

    assert not np.all(merged_fs.features[:, 0:2].todense() ==
                      merged_fs.features[:, 2:4].todense())
def check_linear_models(name, use_feature_hashing=False, use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, weightdict = make_regression_data(
            num_examples=5000,
            num_features=10,
            use_feature_hashing=True,
            feature_bins=5)
    else:
        train_fs, test_fs, weightdict = make_regression_data(num_examples=2000,
                                                             num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the weights are close to the weights
    # that we got from make_regression_data. Take the
    # ceiling before comparing since just comparing
    # the ceilings should be enough to make sure nothing
    # catastrophic happened. Note though that we cannot
    # test feature weights if we are using feature hashing
    # since model_params is not defined with a FeatureHasher.
    if not use_feature_hashing:

        # get the weights for this trained model
        learned_weights = learner.model_params[0]

        for feature_name in learned_weights:
            learned_w = math.ceil(learned_weights[feature_name])
            given_w = math.ceil(weightdict[feature_name])
            eq_(learned_w, given_w)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to the actual test
    # FeatureSet labels that we generated using make_regression_data,
    # i.e., that their Pearson correlation with the labels falls within
    # the expected range
    cor, _ = pearsonr(predictions, test_fs.labels)
    expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])
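# A minimal generator sketch for the linear-model check above; the learner
# names are illustrative assumptions, not taken from this file.
def test_linear_models():
    for name in ['LinearRegression', 'Ridge', 'Lasso']:
        for use_feature_hashing in [False, True]:
            for use_rescaling in [False, True]:
                yield (check_linear_models, name, use_feature_hashing,
                       use_rescaling)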
def check_print_model_weights(task='classification'):

    # create some simple classification or regression data
    if task == 'classification':
        train_fs, _ = make_classification_data(train_test_ratio=0.8)
    else:
        train_fs, _, _ = make_regression_data(num_features=4,
                                              train_test_ratio=0.8)

    # now train the appropriate model
    if task == 'classification':
        learner = Learner('LogisticRegression')
        learner.train(train_fs)
    else:
        learner = Learner('LinearRegression')
        learner.train(train_fs, grid_objective='pearson')

    # now save the model to disk
    model_file = join(_my_dir, 'output', 'test_print_model_weights.model')
    learner.save(model_file)

    # now call print_model_weights main() and capture the output
    print_model_weights_cmd = [model_file]
    err = ''
    try:
        old_stderr = sys.stderr
        old_stdout = sys.stdout
        sys.stderr = mystderr = StringIO()
        sys.stdout = mystdout = StringIO()
        pmw.main(print_model_weights_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
    finally:
        sys.stderr = old_stderr
        sys.stdout = old_stdout
        print(err)

    # now parse the output of the print_model_weights command
    # and get the intercept and the feature values
    if task == 'classification':
        lines_to_parse = [l for l in out.split('\n')[1:] if l]
        intercept = safe_float(lines_to_parse[0].split('\t')[0])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[2], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_[0])
        assert_allclose(learner.model.coef_[0], feature_values)
    else:
        lines_to_parse = [l for l in out.split('\n') if l]
        intercept = safe_float(lines_to_parse[0].split('=')[1])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[1], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)
def check_invalid_regr_grid_obj_func(learner_name, grid_objective_function):
    """
    Checks whether the grid objective function is valid for this regression
    learner
    """
    (train_fs, _, _) = make_regression_data()
    clf = Learner(learner_name)
    clf.train(train_fs, grid_objective=grid_objective_function)
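# A hedged sketch of how check_invalid_regr_grid_obj_func might be driven.
# In practice the call would be expected to raise an exception for invalid
# objectives (e.g. via nose's @raises decorator around the helper); the
# learner/objective pairs below are assumptions, not taken from this file.
def test_invalid_regression_grid_objectives():
    for learner_name in ['LinearRegression', 'DecisionTreeRegressor']:
        for objective in ['accuracy', 'f1_score_micro']:
            yield check_invalid_regr_grid_obj_func, learner_name, objective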
def check_rescaling(name, grid_search=False):

    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                sd_noise=4,
                                                num_features=3)

    # instantiate the given learner and its rescaled counterpart
    learner = Learner(name)
    rescaled_learner = Learner('Rescaled' + name)

    # train both the regular regressor and the rescaled regressor,
    # with and without grid search
    if grid_search:
        learner.train(train_fs, grid_search=True, grid_objective='pearson')
        rescaled_learner.train(train_fs, grid_search=True,
                               grid_objective='pearson')
    else:
        learner.train(train_fs, grid_search=False)
        rescaled_learner.train(train_fs, grid_search=False)

    # now generate both sets of predictions on the test feature set
    predictions = learner.predict(test_fs)
    rescaled_predictions = rescaled_learner.predict(test_fs)

    # ... and on the training feature set
    train_predictions = learner.predict(train_fs)
    rescaled_train_predictions = rescaled_learner.predict(train_fs)

    # make sure that the two sets of test set predictions are almost
    # perfectly correlated, since the only difference is that one set
    # has been rescaled
    assert_almost_equal(pearsonr(predictions, rescaled_predictions)[0],
                        1.0,
                        places=3)

    # make sure that the standard deviation of the rescaled test set
    # predictions is higher than the standard deviation of the regular
    # test set predictions
    p_std = np.std(predictions)
    rescaled_p_std = np.std(rescaled_predictions)
    assert_greater(rescaled_p_std, p_std)

    # make sure that the standard deviation of the rescaled predictions
    # on the TRAINING set (not the TEST set) is closer to the standard
    # deviation of the training set labels than the standard deviation
    # of the regular predictions
    train_y_std = np.std(train_fs.labels)
    train_p_std = np.std(train_predictions)
    rescaled_train_p_std = np.std(rescaled_train_predictions)
    assert_less(abs(rescaled_train_p_std - train_y_std),
                abs(train_p_std - train_y_std))
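# A minimal generator sketch for the rescaling check above; the learner
# names are illustrative assumptions.
def test_rescaling():
    for name in ['Ridge', 'SVR']:
        for grid_search in [False, True]:
            yield check_rescaling, name, grid_search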
def check_non_linear_models(name, use_feature_hashing=False,
                            use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, weightdict = make_regression_data(
            num_examples=5000,
            num_features=10,
            use_feature_hashing=True,
            feature_bins=5)
    else:
        train_fs, test_fs, weightdict = make_regression_data(num_examples=2000,
                                                             num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # Note that we cannot check the feature weights here
    # since `model_params()` is not defined for non-linear
    # kernels.

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to the actual test
    # FeatureSet labels that we generated using make_regression_data,
    # i.e., that their Pearson correlation with the labels falls within
    # the expected range
    cor, _ = pearsonr(predictions, test_fs.labels)
    expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])
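# A minimal generator sketch for the non-linear check above. SVR (with its
# default RBF kernel) is used as an assumed example of a non-linear learner.
def test_non_linear_models():
    for use_feature_hashing in [False, True]:
        for use_rescaling in [False, True]:
            yield (check_non_linear_models, 'SVR', use_feature_hashing,
                   use_rescaling)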
def test_additional_metrics():
    """
    Test additional metrics in the results file for a regressor
    """
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                num_features=3)

    # train a regression model using the train feature set
    learner = Learner('LinearRegression')
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # evaluate the trained model using the test feature set
    results = learner.evaluate(test_fs,
                               output_metrics=['spearman', 'kendall_tau'])

    # check that the values for the additional metrics are as expected
    additional_scores_dict = results[-1]
    assert_almost_equal(additional_scores_dict['spearman'], 0.9996, places=4)
    assert_almost_equal(additional_scores_dict['kendall_tau'], 0.9846,
                        places=4)
def check_adaboost_regression(base_estimator):
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                sd_noise=4,
                                                num_features=3)

    # train an AdaBoostRegressor on the training data and evaluate
    # on the testing data
    learner = Learner('AdaBoostRegressor',
                      model_kwargs={'base_estimator': base_estimator})
    learner.train(train_fs, grid_search=False)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)
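# A hedged sketch of a generator for the AdaBoost check above; the base
# estimator names are assumptions about what the helper would accept here.
def test_adaboost_regression():
    for base_estimator_name in ['DecisionTreeRegressor', 'SVR']:
        yield check_adaboost_regression, base_estimator_name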
def check_ransac_regression(base_estimator, pearson_value):
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                sd_noise=4,
                                                num_features=3)

    # train a RANSACRegressor on the training data and evaluate
    # on the testing data
    model_kwargs = {'base_estimator': base_estimator} if base_estimator else {}
    learner = Learner('RANSACRegressor', model_kwargs=model_kwargs)
    learner.train(train_fs, grid_search=False)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated and the value
    # of the correlation is as expected
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, pearson_value)
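# A hedged sketch of a generator for the RANSAC check above; the base
# estimator names and the correlation thresholds are illustrative
# assumptions, not values taken from this file.
def test_ransac_regression():
    for base_estimator_name, pearson_value in [(None, 0.95),
                                               ('LinearRegression', 0.95)]:
        yield check_ransac_regression, base_estimator_name, pearson_value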
def test_additional_metrics():
    """
    Test additional metrics in the results file for a regressor
    """
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                num_features=3)

    # train a regression model using the train feature set
    learner = Learner('LinearRegression')
    learner.train(train_fs, grid_objective='pearson')

    # evaluate the trained model using the test feature set
    results = learner.evaluate(test_fs,
                               output_metrics=['spearman', 'kendall_tau'])

    # check that the values for the additional metrics are as expected
    additional_scores_dict = results[-1]
    assert_almost_equal(additional_scores_dict['spearman'], 0.9996, places=4)
    assert_almost_equal(additional_scores_dict['kendall_tau'], 0.9846,
                        places=4)
def check_mlp_regression(use_rescaling=False):
    train_fs, test_fs, _ = make_regression_data(num_examples=500,
                                                sd_noise=4,
                                                num_features=5)

    # train an MLPRegressor on the training data and evaluate
    # on the testing data
    name = 'RescaledMLPRegressor' if use_rescaling else 'MLPRegressor'
    learner = Learner(name)

    # we don't want to see any convergence warnings during the grid search
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=ConvergenceWarning)
        learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.98)
def check_mlp_regression(use_rescaling=False):
    train_fs, test_fs, _ = make_regression_data(num_examples=500,
                                                sd_noise=4,
                                                num_features=5)

    # train an MLPRegressor on the training data and evaluate
    # on the testing data
    name = 'RescaledMLPRegressor' if use_rescaling else 'MLPRegressor'
    learner = Learner(name)

    # we don't want to see any convergence warnings during training
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=ConvergenceWarning)
        learner.train(train_fs, grid_search=False)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.98)
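# A minimal generator sketch driving the MLP check above with and without
# rescaling.
def test_mlp_regression():
    for use_rescaling in [False, True]:
        yield check_mlp_regression, use_rescaling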
def check_predict(model, use_feature_hashing=False):
    """
    This tests whether the predict task runs and generates the same
    number of predictions as samples in the test set. The specified
    model indicates whether to generate random regression or
    classification data.
    """

    # create the random data for the given model
    if model._estimator_type == 'regressor':
        train_fs, test_fs, _ = \
            make_regression_data(use_feature_hashing=use_feature_hashing,
                                 feature_bins=5)
    # feature hashing will not work for Naive Bayes since it requires
    # non-negative feature values
    elif model.__name__ == 'MultinomialNB':
        train_fs, test_fs = \
            make_classification_data(use_feature_hashing=False,
                                     non_negative=True)
    else:
        train_fs, test_fs = \
            make_classification_data(use_feature_hashing=use_feature_hashing,
                                     feature_bins=25)

    # create the learner with the specified model
    learner = Learner(model.__name__)

    # now train the learner on the training data; the data was already
    # hashed above when feature hashing was requested (and we are not
    # using a Naive Bayes model)
    learner.train(train_fs, grid_search=False)

    # now make predictions on the test set
    predictions = learner.predict(test_fs)

    # make sure we have the same number of outputs as the
    # number of test set samples
    eq_(len(predictions), test_fs.features.shape[0])
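# A hedged sketch of a generator for check_predict. The helper receives a
# scikit-learn estimator class itself (it reads _estimator_type and
# __name__), so the imports and class choices below are assumptions.
def test_predict():
    from sklearn.linear_model import LinearRegression
    from sklearn.naive_bayes import MultinomialNB
    for model in [LinearRegression, MultinomialNB]:
        for use_feature_hashing in [False, True]:
            yield check_predict, model, use_feature_hashing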
def test_fancy_output():
    """
    Test the descriptive statistics output in the results file for a regressor
    """
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                num_features=3)

    # train a regression model using the train feature set
    learner = Learner('LinearRegression')
    learner.train(train_fs, grid_objective='pearson')

    # evaluate the trained model using the test feature set
    resultdict = learner.evaluate(test_fs)
    actual_stats_from_api = dict(resultdict[2]['descriptive']['actual'])
    pred_stats_from_api = dict(resultdict[2]['descriptive']['predicted'])

    # write out the training and test feature sets
    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')
    train_writer = NDJWriter(join(train_dir, 'fancy_train.jsonlines'),
                             train_fs)
    train_writer.write()
    test_writer = NDJWriter(join(test_dir, 'fancy_test.jsonlines'), test_fs)
    test_writer.write()

    # now get the config file template, fill it in and run it
    # so that we can get a results file
    config_template_path = join(_my_dir, 'configs',
                                'test_regression_fancy_output.template.cfg')
    config_path = fill_in_config_paths_for_fancy_output(config_template_path)

    run_configuration(config_path, quiet=True)

    # read in the results file and get the descriptive statistics
    actual_stats_from_file = {}
    pred_stats_from_file = {}
    with open(join(output_dir, ('regression_fancy_output_train_fancy_train.'
                                'jsonlines_test_fancy_test.jsonlines'
                                '_LinearRegression.results')),
              'r') as resultf:

        result_output = resultf.read().strip().split('\n')
        for desc_stat_line in result_output[27:31]:
            desc_stat_line = desc_stat_line.strip()
            if not desc_stat_line:
                continue
            else:
                m = re.search(r'([A-Za-z]+)\s+=\s+(-?[0-9]+.?[0-9]*)\s+'
                              r'\((actual)\),\s+(-?[0-9]+.?[0-9]*)\s+'
                              r'\((predicted)\)', desc_stat_line)
                stat_type, actual_value, _, pred_value, _ = m.groups()
                actual_stats_from_file[stat_type.lower()] = float(actual_value)
                pred_stats_from_file[stat_type.lower()] = float(pred_value)

    for stat_type in actual_stats_from_api:

        assert_almost_equal(actual_stats_from_file[stat_type],
                            actual_stats_from_api[stat_type],
                            places=4)

        assert_almost_equal(pred_stats_from_file[stat_type],
                            pred_stats_from_api[stat_type],
                            places=4)
def check_print_model_weights(task='classification'):

    # create some simple classification or regression data
    if task == 'classification':
        train_fs, _ = make_classification_data(train_test_ratio=0.8)
    elif task == 'multiclass_classification':
        train_fs, _ = make_classification_data(train_test_ratio=0.8,
                                               num_labels=3)
    else:
        train_fs, _, _ = make_regression_data(num_features=4,
                                              train_test_ratio=0.8)

    # now train the appropriate model
    if task == 'classification' or task == 'multiclass_classification':
        learner = Learner('LogisticRegression')
        learner.train(train_fs, grid_objective='f1_score_micro')
    elif task == 'regression':
        learner = Learner('LinearRegression')
        learner.train(train_fs, grid_objective='pearson')
    else:
        learner = Learner('LinearSVR')
        learner.train(train_fs, grid_objective='pearson')

    # now save the model to disk
    model_file = join(_my_dir, 'output', 'test_print_model_weights.model')
    learner.save(model_file)

    # now call print_model_weights main() and capture the output
    print_model_weights_cmd = [model_file]
    err = ''
    try:
        old_stderr = sys.stderr
        old_stdout = sys.stdout
        sys.stderr = mystderr = StringIO()
        sys.stdout = mystdout = StringIO()
        pmw.main(print_model_weights_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
    finally:
        sys.stderr = old_stderr
        sys.stdout = old_stdout
        print(err)

    # now parse the output of the print_model_weights command
    # and get the intercept and the feature values
    if task == 'classification':
        lines_to_parse = [l for l in out.split('\n')[1:] if l]
        intercept = safe_float(lines_to_parse[0].split('\t')[0])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[2], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_[0])
        assert_allclose(learner.model.coef_[0], feature_values)
    elif task == 'multiclass_classification':
        # for multiple classes we get an intercept for each class
        # as well as a list of weights for each class
        lines_to_parse = [l for l in out.split('\n')[1:] if l]

        intercept = []
        for intercept_string in lines_to_parse[0:3]:
            intercept.append(safe_float(intercept_string.split('\t')[0]))

        feature_values = [[], [], []]
        for ltp in lines_to_parse[3:]:
            fields = ltp.split('\t')
            feature_values[int(fields[1])].append((fields[2],
                                                   safe_float(fields[0])))
        for index, weights in enumerate(feature_values):
            feature_values[index] = [t[1] for t in sorted(weights)]

        for index, weights in enumerate(learner.model.coef_):
            assert_array_almost_equal(weights, feature_values[index])

        assert_array_almost_equal(intercept, learner.model.intercept_)
    elif task == 'regression':
        lines_to_parse = [l for l in out.split('\n') if l]
        intercept = safe_float(lines_to_parse[0].split('=')[1])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[1], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)
    else:
        lines_to_parse = [l for l in out.split('\n') if l]

        intercept_list = ast.literal_eval(
            lines_to_parse[0].split('=')[1].strip())
        intercept = []
        for intercept_string in intercept_list:
            intercept.append(safe_float(intercept_string))

        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[1], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]

        assert_array_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)
def check_print_model_weights(task='classification'):

    # create some simple classification or regression data
    if task == 'classification' or task == 'classification_no_intercept':
        train_fs, _ = make_classification_data(train_test_ratio=0.8)
    elif task == 'multiclass_classification':
        train_fs, _ = make_classification_data(train_test_ratio=0.8,
                                               num_labels=3)
    else:
        train_fs, _, _ = make_regression_data(num_features=4,
                                              train_test_ratio=0.8)

    # now train the appropriate model
    if task == 'classification' or task == 'multiclass_classification':
        learner = Learner('LogisticRegression')
        learner.train(train_fs, grid_objective='f1_score_micro')
    elif task == 'classification_no_intercept':
        learner = Learner('LogisticRegression')
        learner.train(train_fs,
                      grid_objective='f1_score_micro',
                      param_grid=[{'fit_intercept': [False]}])
    elif task == 'regression':
        learner = Learner('LinearRegression')
        learner.train(train_fs, grid_objective='pearson')
    else:
        learner = Learner('LinearSVR')
        learner.train(train_fs, grid_objective='pearson')

    # now save the model to disk
    model_file = join(_my_dir, 'output', 'test_print_model_weights.model')
    learner.save(model_file)

    # now call print_model_weights main() and capture the output
    print_model_weights_cmd = [model_file]
    err = ''
    try:
        old_stderr = sys.stderr
        old_stdout = sys.stdout
        sys.stderr = mystderr = StringIO()
        sys.stdout = mystdout = StringIO()
        pmw.main(print_model_weights_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
    finally:
        sys.stderr = old_stderr
        sys.stdout = old_stdout
        print(err)

    # now parse the output of the print_model_weights command
    # and get the intercept and the feature values
    if task == 'classification':
        lines_to_parse = [l for l in out.split('\n')[1:] if l]
        intercept = safe_float(lines_to_parse[0].split('\t')[0])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[2], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_[0])
        assert_allclose(learner.model.coef_[0], feature_values)
    elif task == 'multiclass_classification':
        # for multiple classes we get an intercept for each class
        # as well as a list of weights for each class
        lines_to_parse = [l for l in out.split('\n')[1:] if l]

        intercept = []
        for intercept_string in lines_to_parse[0:3]:
            intercept.append(safe_float(intercept_string.split('\t')[0]))

        feature_values = [[], [], []]
        for ltp in lines_to_parse[3:]:
            fields = ltp.split('\t')
            feature_values[int(fields[1])].append((fields[2],
                                                   safe_float(fields[0])))
        for index, weights in enumerate(feature_values):
            feature_values[index] = [t[1] for t in sorted(weights)]

        for index, weights in enumerate(learner.model.coef_):
            assert_array_almost_equal(weights, feature_values[index])

        assert_array_almost_equal(intercept, learner.model.intercept_)
    elif task == 'classification_no_intercept':
        lines_to_parse = [l for l in out.split('\n')[0:] if l]
        intercept = safe_float(lines_to_parse[0].split('=')[1])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[2], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_[0], feature_values)
    elif task == 'regression':
        lines_to_parse = [l for l in out.split('\n') if l]
        intercept = safe_float(lines_to_parse[0].split('=')[1])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[1], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)
    else:
        lines_to_parse = [l for l in out.split('\n') if l]

        intercept_list = ast.literal_eval(
            lines_to_parse[0].split('=')[1].strip())
        intercept = []
        for intercept_string in intercept_list:
            intercept.append(safe_float(intercept_string))

        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[1], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]

        assert_array_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)
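# A minimal generator sketch for the print_model_weights check above. The
# four named tasks come from the helper itself; any other string (the label
# used here is an assumption) exercises the final LinearSVR branch.
def test_print_model_weights():
    for task in ['classification', 'classification_no_intercept',
                 'multiclass_classification', 'regression', 'linsvr']:
        yield check_print_model_weights, task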
def test_pipeline_attribute():

    # define the classifier and regressor feature dictionaries and labels
    # that we will test on, along with the respective classes and targets
    cfeature_dicts = [{"f01": -2.87, "f02": 0.713, "f03": 2.86, "f04": 0.385,
                       "f05": -0.989, "f06": 0.380, "f07": -0.365,
                       "f08": -0.224, "f09": 3.45, "f10": 0.622},
                      {"f01": 0.058, "f02": -1.14, "f03": 2.85, "f04": 1.41,
                       "f05": 1.60, "f06": 1.04, "f07": -0.669,
                       "f08": -0.727, "f09": 1.82, "f10": 1.336},
                      {"f01": -1.80, "f02": 3.21, "f03": 0.79, "f04": -0.55,
                       "f05": 0.059, "f06": -5.66, "f07": -3.08,
                       "f08": -0.95, "f09": 0.188, "f10": -1.24},
                      {"f01": 2.270, "f02": 2.271, "f03": 2.285, "f04": 2.951,
                       "f05": 1.018, "f06": -0.59, "f07": 0.432,
                       "f08": 1.614, "f09": -0.69, "f10": -1.27},
                      {"f01": 2.98, "f02": 3.74, "f03": 1.96, "f04": 0.80,
                       "f05": 0.425, "f06": -0.76, "f07": 4.013,
                       "f08": 3.119, "f09": 2.104, "f10": 0.195},
                      {"f01": 2.560, "f02": -2.05, "f03": 1.793, "f04": 0.955,
                       "f05": 2.914, "f06": 2.239, "f07": -1.41,
                       "f08": -1.24, "f09": -4.44, "f10": 0.273},
                      {"f01": 1.86, "f02": -0.017, "f03": 1.337, "f04": -2.14,
                       "f05": 2.255, "f06": -1.21, "f07": -0.24,
                       "f08": -0.66, "f09": -2.51, "f10": -1.06},
                      {"f01": -1.95, "f02": -1.81, "f03": 2.105, "f04": 0.976,
                       "f05": -1.480, "f06": 1.120, "f07": -1.22,
                       "f08": 0.704, "f09": -3.66, "f10": -1.72},
                      {"f01": -1.54, "f02": -2.17, "f03": -4.18, "f04": 1.708,
                       "f05": 0.514, "f06": 0.354, "f07": -3.55,
                       "f08": 2.285, "f09": -3.47, "f10": -0.79},
                      {"f01": 2.162, "f02": -0.71, "f03": -0.448,
                       "f04": 0.326, "f05": 3.384, "f06": -0.455,
                       "f07": 1.253, "f08": 0.998, "f09": 3.193,
                       "f10": 1.342}]
    classes = [1, 1, 0, 2, 1, 2, 0, 1, 2, 1]

    rfeature_dicts = [{'f1': 1.351, 'f2': -0.117, 'f3': 0.570, 'f4': 0.0619,
                       'f5': 1.569, 'f6': 0.805},
                      {'f1': -0.557, 'f2': -1.704, 'f3': 0.0913, 'f4': 0.767,
                       'f5': 1.281, 'f6': -0.803},
                      {'f1': 0.720, 'f2': -0.268, 'f3': 0.760, 'f4': 0.861,
                       'f5': -0.403, 'f6': 0.814},
                      {'f1': 1.737, 'f2': -0.228, 'f3': 1.340, 'f4': 2.031,
                       'f5': 2.170, 'f6': 1.498},
                      {'f1': 0.344, 'f2': 0.340, 'f3': 0.572, 'f4': -1.06,
                       'f5': 1.044, 'f6': 2.065},
                      {'f1': -0.489, 'f2': -0.420, 'f3': 0.428, 'f4': 0.707,
                       'f5': -1.306, 'f6': 0.0081},
                      {'f1': 0.805, 'f2': 0.570, 'f3': 1.351, 'f4': -0.117,
                       'f5': 0.0619, 'f6': 1.569},
                      {'f1': -1.083, 'f2': 0.0369, 'f3': -0.413, 'f4': 1.391,
                       'f5': 1.417, 'f6': -1.118},
                      {'f1': -1.945, 'f2': -0.332, 'f3': -1.393, 'f4': 0.952,
                       'f5': -0.816, 'f6': 1.417},
                      {'f1': 1.976, 'f2': -0.220, 'f3': -1.636, 'f4': 0.795,
                       'f5': -2.34, 'f6': -0.148}]
    targets = [96.057, -176.017, -182.32, -56.46, -50.14, -84.53, 241.71,
               -17.84, -47.09, 77.65]

    # create training featuresets that we will use to train our estimator
    function_args_dict = defaultdict(dict)
    for estimator_type in ['classifier', 'regressor']:
        for do_feature_hashing in [True, False]:
            if estimator_type == 'classifier':
                (train_fs,
                 test_fs) = make_classification_data(
                     num_examples=500,
                     num_features=10,
                     num_labels=3,
                     feature_bins=4,
                     non_negative=True,
                     use_feature_hashing=do_feature_hashing)
                labels = classes
                feature_dicts = cfeature_dicts
            else:
                (train_fs,
                 test_fs,
                 _) = make_regression_data(
                     num_examples=500,
                     num_features=6,
                     feature_bins=4,
                     use_feature_hashing=do_feature_hashing)
                labels = targets
                feature_dicts = rfeature_dicts

            # if we are doing feature hashing, we need to transform our test
            # cases to the same space. If we are not, then we don't need to
            # worry because we have manually ensured that the number of
            # features is the same for the non-hashing case (10 for
            # classification, and 6 for regression)
            test_fs = FeatureSet('test',
                                 ids=list(range(1, 11)),
                                 features=feature_dicts,
                                 labels=labels,
                                 vectorizer=(train_fs.vectorizer
                                             if do_feature_hashing else None))
            function_args_dict[estimator_type][do_feature_hashing] = \
                [train_fs, test_fs, feature_dicts, labels]
    function_args_dict = dict(function_args_dict)

    # now set up the test cases
    learners = ['LinearSVC', 'LogisticRegression', 'MultinomialNB',
                'SVC', 'GradientBoostingClassifier', 'Lars',
                'LinearSVR', 'Ridge', 'SVR', 'GradientBoostingRegressor']
    use_hashing = [True, False]
    min_feature_counts = [1, 2]
    samplers = [None, 'RBFSampler', 'SkewedChi2Sampler']
    scalers = ['none', 'with_mean', 'with_std', 'both']

    for (learner_name,
         do_feature_hashing,
         min_count,
         scaling_type,
         sampler_name) in product(learners,
                                  use_hashing,
                                  min_feature_counts,
                                  scalers,
                                  samplers):

        # skip the case for MultinomialNB with feature hashing
        # or feature sampling since it does not support those
        if learner_name == 'MultinomialNB':
            if do_feature_hashing or sampler_name is not None:
                continue

        # if we are using a SkewedChi2Sampler, we need to set
        # some parameters to make sure it works as expected
        if sampler_name == 'SkewedChi2Sampler':
            sampler_kwargs = {'skewedness': 15, 'n_components': 10}
        else:
            sampler_kwargs = {}

        # create a learner instance with the given parameters
        # and with the pipeline attribute set to True
        learner = Learner(learner_name,
                          min_feature_count=min_count,
                          sampler=sampler_name,
                          sampler_kwargs=sampler_kwargs,
                          feature_scaling=scaling_type,
                          pipeline=True)

        yield (check_pipeline_attribute,
               learner_name,
               do_feature_hashing,
               min_count,
               scaling_type,
               sampler_name,
               learner,
               function_args_dict)