def check_scaling_features(use_feature_hashing=False, use_scaling=False):
    train_fs, test_fs = make_scaling_data(use_feature_hashing=use_feature_hashing)

    # create a Linear SVM with the value of scaling as specified
    feature_scaling = 'both' if use_scaling else 'none'
    learner = Learner('SGDClassifier',
                      feature_scaling=feature_scaling,
                      pos_label_str=1)

    # train the learner on the training set and test on the testing set
    learner.train(train_fs, grid_search=True, grid_objective='f1_score_micro')
    test_output = learner.evaluate(test_fs)
    fmeasures = [test_output[2][0]['F-measure'],
                 test_output[2][1]['F-measure']]

    # these are the expected values of the f-measures, sorted
    if not use_feature_hashing:
        expected_fmeasures = ([0.6699507389162562, 0.6598984771573605]
                              if not use_scaling else
                              [0.7058823529411765, 0.7417840375586855])
    else:
        expected_fmeasures = ([0.5288461538461539, 0.4895833333333333]
                              if not use_scaling else
                              [0.632183908045977, 0.7168141592920354])

    assert_almost_equal(expected_fmeasures, fmeasures)
def test_predict_on_subset_with_existing_model():
    """
    Test generating predictions on subset with existing model
    """
    # Create data files
    make_single_file_featureset_data()

    # train and save a model on the training file
    train_fs = NDJReader.for_path(join(_my_dir, 'train',
                                       'train_single_file.jsonlines')).read()
    learner = Learner('RandomForestClassifier')
    learner.train(train_fs, grid_search=True, grid_objective="accuracy")
    model_filename = join(_my_dir, 'output',
                          ('train_test_single_file_train_train_'
                           'single_file.jsonlines_test_test_single'
                           '_file_subset.jsonlines_RandomForestClassifier'
                           '.model'))
    learner.save(model_filename)

    # Run experiment
    config_path = fill_in_config_paths_for_single_file(
        join(_my_dir, "configs", "test_single_file_saved_subset.template.cfg"),
        join(_my_dir, 'train', 'train_single_file.jsonlines'),
        join(_my_dir, 'test', 'test_single_file_subset.jsonlines'))
    run_configuration(config_path, quiet=True, overwrite=False)

    # Check results
    with open(join(_my_dir, 'output',
                   ('train_test_single_file_train_train_'
                    'single_file.jsonlines_test_test_single'
                    '_file_subset.jsonlines_RandomForestClassifier'
                    '.results.json'))) as f:
        result_dict = json.load(f)[0]
    assert_almost_equal(result_dict['accuracy'], 0.7333333)
def check_scaling_features(use_feature_hashing=False, use_scaling=False):
    train_fs, test_fs = make_scaling_data(use_feature_hashing=use_feature_hashing)

    # create a Linear SVM with the value of scaling as specified
    feature_scaling = 'both' if use_scaling else 'none'
    learner = Learner('SGDClassifier',
                      feature_scaling=feature_scaling,
                      pos_label_str=1)

    # train the learner on the training set and test on the testing set
    learner.train(train_fs)
    test_output = learner.evaluate(test_fs)
    fmeasures = [test_output[2][0]['F-measure'],
                 test_output[2][1]['F-measure']]

    # these are the expected values of the f-measures, sorted
    if not use_feature_hashing:
        expected_fmeasures = ([0.55276381909547745, 0.55721393034825872]
                              if not use_scaling else
                              [0.65217391304347827, 0.70370370370370372])
    else:
        expected_fmeasures = ([0.54255319148936176, 0.59433962264150941]
                              if not use_scaling else
                              [0.69950738916256161, 0.69035532994923865])

    assert_almost_equal(expected_fmeasures, fmeasures)
def check_train_and_score_function(model_type):
    """
    Check that the _train_and_score() function works as expected
    """
    # create train and test data
    (train_fs,
     test_fs) = make_classification_data(num_examples=500,
                                         train_test_ratio=0.7,
                                         num_features=5,
                                         use_feature_hashing=False,
                                         non_negative=True)

    # call _train_and_score() on this data
    estimator_name = 'LogisticRegression' if model_type == 'classifier' else 'Ridge'
    metric = 'accuracy' if model_type == 'classifier' else 'pearson'
    learner1 = Learner(estimator_name)
    train_score1, test_score1 = _train_and_score(learner1, train_fs, test_fs, metric)

    # this should yield identical results when training another instance
    # of the same learner without grid search and shuffling and evaluating
    # that instance on the train and the test set
    learner2 = Learner(estimator_name)
    learner2.train(train_fs, grid_search=False, shuffle=False)
    train_score2 = learner2.evaluate(train_fs, output_metrics=[metric])[-1][metric]
    test_score2 = learner2.evaluate(test_fs, output_metrics=[metric])[-1][metric]

    eq_(train_score1, train_score2)
    eq_(test_score1, test_score2)
def check_scaling_features(use_feature_hashing=False, use_scaling=False):
    train_fs, test_fs = make_scaling_data(use_feature_hashing=use_feature_hashing)

    # create a Linear SVM with the value of scaling as specified
    feature_scaling = 'both' if use_scaling else 'none'
    learner = Learner('SGDClassifier',
                      feature_scaling=feature_scaling,
                      pos_label_str=1)

    # train the learner on the training set and test on the testing set
    learner.train(train_fs)
    test_output = learner.evaluate(test_fs)
    fmeasures = [test_output[2][0]['F-measure'],
                 test_output[2][1]['F-measure']]

    # these are the expected values of the f-measures, sorted
    if not use_feature_hashing:
        expected_fmeasures = ([0.7979797979797979, 0.80198019801980192]
                              if not use_scaling else
                              [0.94883720930232551, 0.94054054054054048])
    else:
        expected_fmeasures = ([0.83962264150943389, 0.81914893617021278]
                              if not use_scaling else
                              [0.88038277511961716, 0.86910994764397898])

    assert_almost_equal(expected_fmeasures, fmeasures)
def check_print_model_weights(task='classification'):

    # create some simple classification or regression data
    if task == 'classification':
        train_fs, _ = make_classification_data(train_test_ratio=0.8)
    else:
        train_fs, _, _ = make_regression_data(num_features=4,
                                              train_test_ratio=0.8)

    # now train the appropriate model
    if task == 'classification':
        learner = Learner('LogisticRegression')
        learner.train(train_fs)
    else:
        learner = Learner('LinearRegression')
        learner.train(train_fs, grid_objective='pearson')

    # now save the model to disk
    model_file = join(_my_dir, 'output', 'test_print_model_weights.model')
    learner.save(model_file)

    # now call print_model_weights main() and capture the output
    print_model_weights_cmd = [model_file]
    err = ''
    try:
        old_stderr = sys.stderr
        old_stdout = sys.stdout
        sys.stderr = mystderr = StringIO()
        sys.stdout = mystdout = StringIO()
        pmw.main(print_model_weights_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
    finally:
        sys.stderr = old_stderr
        sys.stdout = old_stdout
        print(err)

    # now parse the output of the print_model_weights command
    # and get the intercept and the feature values
    if task == 'classification':
        lines_to_parse = [l for l in out.split('\n')[1:] if l]
        intercept = safe_float(lines_to_parse[0].split('\t')[0])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[2], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_[0])
        assert_allclose(learner.model.coef_[0], feature_values)
    else:
        lines_to_parse = [l for l in out.split('\n') if l]
        intercept = safe_float(lines_to_parse[0].split('=')[1])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[1], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)
def check_specified_cv_folds(numeric_ids):
    make_cv_folds_data(numeric_ids)

    # test_cv_folds1.cfg has prespecified folds and should have ~50% accuracy
    # test_cv_folds2.cfg doesn't have prespecified folds and >95% accuracy
    for experiment_name, test_func, grid_size in [('test_cv_folds1',
                                                   lambda x: x < 0.6,
                                                   3),
                                                  ('test_cv_folds2',
                                                   lambda x: x > 0.95,
                                                   10)]:
        config_template_file = '{}.template.cfg'.format(experiment_name)
        config_template_path = os.path.join(_my_dir, 'configs',
                                            config_template_file)
        config_path = os.path.join(_my_dir,
                                   fill_in_config_paths(config_template_path))

        # Modify config file to change ids_to_floats depending on numeric_ids
        # setting
        with open(config_path, 'r+') as config_template_file:
            lines = config_template_file.readlines()
            config_template_file.seek(0)
            config_template_file.truncate()
            for line in lines:
                if line.startswith('ids_to_floats='):
                    if numeric_ids:
                        line = 'ids_to_floats=true\n'
                    else:
                        line = 'ids_to_floats=false\n'
                config_template_file.write(line)

        run_configuration(config_path, quiet=True)
        result_filename = ('{}_test_cv_folds_LogisticRegression.'
                           'results').format(experiment_name)
        with open(os.path.join(_my_dir, 'output', result_filename)) as f:
            # check held out scores
            outstr = f.read()
            score = float(SCORE_OUTPUT_RE.search(outstr).groups()[-1])
            assert test_func(score)

            grid_score_matches = GRID_RE.findall(outstr)
            assert len(grid_score_matches) == grid_size
            for match_str in grid_score_matches:
                assert test_func(float(match_str))

    # try the same tests for just training (and specifying the folds for the
    # grid search)
    dirpath = os.path.join(_my_dir, 'train')
    suffix = '.jsonlines'
    featureset = ['test_cv_folds']
    examples = _load_featureset(dirpath, featureset, suffix, quiet=True)
    clf = Learner('LogisticRegression', probability=True)
    cv_folds = _load_cv_folds(os.path.join(_my_dir, 'train',
                                           'test_cv_folds.csv'))
    grid_search_score = clf.train(examples,
                                  grid_search_folds=cv_folds,
                                  grid_objective='accuracy',
                                  grid_jobs=1)
    assert grid_search_score < 0.6
    grid_search_score = clf.train(examples,
                                  grid_search_folds=5,
                                  grid_objective='accuracy',
                                  grid_jobs=1)
    assert grid_search_score > 0.95
def check_scaling_features(use_feature_hashing=False, use_scaling=False):
    train_fs, test_fs = make_scaling_data(use_feature_hashing=use_feature_hashing)

    # create a Linear SVM with the value of scaling as specified
    feature_scaling = 'both' if use_scaling else 'none'
    learner = Learner('SGDClassifier',
                      feature_scaling=feature_scaling,
                      pos_label_str=1)

    # train the learner on the training set and test on the testing set
    learner.train(train_fs, grid_search=True, grid_objective='f1_score_micro')
    test_output = learner.evaluate(test_fs)
    fmeasures = [test_output[2][0]['F-measure'],
                 test_output[2][1]['F-measure']]

    # these are the expected values of the f-measures, sorted
    if not use_feature_hashing:
        expected_fmeasures = ([0.5333333333333333, 0.4842105263157895]
                              if not use_scaling else
                              [0.7219512195121951, 0.7076923076923077])
    else:
        expected_fmeasures = ([0.5288461538461539, 0.4895833333333333]
                              if not use_scaling else
                              [0.663157894736842, 0.6952380952380952])

    assert_almost_equal(expected_fmeasures, fmeasures)
def check_scaling_features(use_feature_hashing=False, use_scaling=False):
    train_fs, test_fs = make_scaling_data(use_feature_hashing=use_feature_hashing)

    # create a Linear SVM with the value of scaling as specified
    feature_scaling = 'both' if use_scaling else 'none'
    learner = Learner('SGDClassifier',
                      feature_scaling=feature_scaling,
                      pos_label_str=1)

    # train the learner on the training set and test on the testing set
    learner.train(train_fs, grid_search=True, grid_objective='f1_score_micro')
    test_output = learner.evaluate(test_fs)
    fmeasures = [test_output[2][0]['F-measure'],
                 test_output[2][1]['F-measure']]

    # these are the expected values of the f-measures, sorted
    if not use_feature_hashing:
        expected_fmeasures = ([0.55276381909547745, 0.55721393034825872]
                              if not use_scaling else
                              [0.65217391304347827, 0.70370370370370372])
    else:
        expected_fmeasures = ([0.54255319148936176, 0.59433962264150941]
                              if not use_scaling else
                              [0.69950738916256161, 0.69035532994923865])

    assert_almost_equal(expected_fmeasures, fmeasures)
def check_scaling_features(use_feature_hashing=False, use_scaling=False):
    train_fs, test_fs = make_scaling_data(use_feature_hashing=use_feature_hashing)

    # create a Linear SVM with the value of scaling as specified
    feature_scaling = 'both' if use_scaling else 'none'
    learner = Learner('SGDClassifier',
                      feature_scaling=feature_scaling,
                      pos_label_str=1)

    # train the learner on the training set and test on the testing set
    learner.train(train_fs)
    test_output = learner.evaluate(test_fs)
    fmeasures = [test_output[2][0]['F-measure'],
                 test_output[2][1]['F-measure']]

    # these are the expected values of the f-measures, sorted
    if not use_feature_hashing:
        expected_fmeasures = ([0.77319587628865982, 0.78640776699029125]
                              if not use_scaling else
                              [0.94930875576036866, 0.93989071038251359])
    else:
        expected_fmeasures = ([0.42774566473988435, 0.5638766519823788]
                              if not use_scaling else
                              [0.87323943661971837, 0.85561497326203206])

    assert_almost_equal(expected_fmeasures, fmeasures)
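# A minimal nose-style driver for the scaling check defined above; this is a
# sketch that assumes the yield-based generator-test convention used elsewhere
# in this file. The four hashing/scaling combinations are illustrative, not
# necessarily the project's actual parameter grid.
def test_scaling_of_features():
    for use_hashing in [False, True]:
        for use_scaling in [False, True]:
            yield check_scaling_features, use_hashing, use_scaling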
def check_invalid_regression_grid_objective(learner, grid_objective):
    """
    Checks whether the grid objective function is valid for this regressor
    """
    (train_fs, _, _) = make_regression_data()
    clf = Learner(learner)
    clf.train(train_fs, grid_objective=grid_objective)
def test_retrieve_cv_folds():
    """
    Test to make sure that the fold ids get returned correctly after
    cross-validation
    """
    # Setup
    learner = Learner('LogisticRegression')
    num_folds = 5
    cv_fs, custom_cv_folds = make_cv_folds_data(num_examples_per_fold=2,
                                                num_folds=num_folds)

    # Test 1: learner.cross_validate() makes the folds itself.
    expected_fold_ids = {'EXAMPLE_0': '0', 'EXAMPLE_1': '4', 'EXAMPLE_2': '3',
                         'EXAMPLE_3': '1', 'EXAMPLE_4': '2', 'EXAMPLE_5': '2',
                         'EXAMPLE_6': '1', 'EXAMPLE_7': '0', 'EXAMPLE_8': '4',
                         'EXAMPLE_9': '3'}
    _, _, _, skll_fold_ids, _ = learner.cross_validate(cv_fs,
                                                       stratified=True,
                                                       cv_folds=num_folds,
                                                       grid_search=True,
                                                       grid_objective='f1_score_micro',
                                                       shuffle=False,
                                                       save_cv_folds=True)
    assert_equal(skll_fold_ids, expected_fold_ids)

    # Test 2: if we pass in custom fold ids, those are also preserved.
    _, _, _, skll_fold_ids, _ = learner.cross_validate(cv_fs,
                                                       stratified=True,
                                                       cv_folds=custom_cv_folds,
                                                       grid_search=True,
                                                       grid_objective='f1_score_micro',
                                                       shuffle=False,
                                                       save_cv_folds=True)
    assert_equal(skll_fold_ids, custom_cv_folds)

    # Test 3: when learner.cross_validate() makes the folds but stratified=False
    # and grid_search=False, so that KFold is used.
    expected_fold_ids = {'EXAMPLE_0': '0', 'EXAMPLE_1': '0', 'EXAMPLE_2': '1',
                         'EXAMPLE_3': '1', 'EXAMPLE_4': '2', 'EXAMPLE_5': '2',
                         'EXAMPLE_6': '3', 'EXAMPLE_7': '3', 'EXAMPLE_8': '4',
                         'EXAMPLE_9': '4'}
    _, _, _, skll_fold_ids, _ = learner.cross_validate(cv_fs,
                                                       stratified=False,
                                                       cv_folds=num_folds,
                                                       grid_search=False,
                                                       shuffle=False,
                                                       save_cv_folds=True)
    # compare against the KFold-based fold ids computed above
    assert_equal(skll_fold_ids, expected_fold_ids)
def check_tree_models(name,
                      use_feature_hashing=False,
                      use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('DecisionTreeRegressor'):
        expected_feature_importances = ([0.37483895, 0.08816508, 0.25379838,
                                         0.18337128, 0.09982631]
                                        if use_feature_hashing else
                                        [0.08926899, 0.15585068, 0.75488033])
        expected_cor_range = [0.5, 0.6] if use_feature_hashing else [0.9, 1.0]
    else:
        expected_feature_importances = ([0.40195798, 0.06702903, 0.25816559,
                                         0.18185518, 0.09099222]
                                        if use_feature_hashing else
                                        [0.07974267, 0.16121895, 0.75903838])
        expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that the pearson correlation falls in the
    # expected range
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])
def check_tree_models(name,
                      use_feature_hashing=False,
                      use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('DecisionTreeRegressor'):
        expected_feature_importances = ([0.37331461, 0.08572699, 0.2543484,
                                         0.1841172, 0.1024928]
                                        if use_feature_hashing else
                                        [0.08931994, 0.15545093, 0.75522913])
        expected_cor_range = [0.5, 0.6] if use_feature_hashing else [0.9, 1.0]
    else:
        if use_feature_hashing:
            expected_feature_importances = [0.40195655, 0.06702161, 0.25814858,
                                            0.18183947, 0.09103379]
        else:
            expected_feature_importances = [0.07975691, 0.16122862, 0.75901447]
        expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    rtol=1e-2)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that the pearson correlation falls in the
    # expected range
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])
def test_retrieve_cv_folds():
    """
    Test to make sure that the fold ids get returned correctly after
    cross-validation
    """
    # Setup
    learner = Learner('LogisticRegression')
    num_folds = 5
    cv_fs, custom_cv_folds = make_cv_folds_data(num_examples_per_fold=2,
                                                num_folds=num_folds)

    # Test 1: learner.cross_validate() makes the folds itself.
    expected_fold_ids = {'EXAMPLE_0': '0', 'EXAMPLE_1': '4', 'EXAMPLE_2': '3',
                         'EXAMPLE_3': '1', 'EXAMPLE_4': '2', 'EXAMPLE_5': '2',
                         'EXAMPLE_6': '1', 'EXAMPLE_7': '0', 'EXAMPLE_8': '4',
                         'EXAMPLE_9': '3'}
    _, _, _, skll_fold_ids = learner.cross_validate(cv_fs,
                                                    stratified=True,
                                                    cv_folds=num_folds,
                                                    grid_search=True,
                                                    grid_objective='f1_score_micro',
                                                    shuffle=False,
                                                    save_cv_folds=True)
    assert_equal(skll_fold_ids, expected_fold_ids)

    # Test 2: if we pass in custom fold ids, those are also preserved.
    _, _, _, skll_fold_ids = learner.cross_validate(cv_fs,
                                                    stratified=True,
                                                    cv_folds=custom_cv_folds,
                                                    grid_search=True,
                                                    grid_objective='f1_score_micro',
                                                    shuffle=False,
                                                    save_cv_folds=True)
    assert_equal(skll_fold_ids, custom_cv_folds)

    # Test 3: when learner.cross_validate() makes the folds but stratified=False
    # and grid_search=False, so that KFold is used.
    expected_fold_ids = {'EXAMPLE_0': '0', 'EXAMPLE_1': '0', 'EXAMPLE_2': '1',
                         'EXAMPLE_3': '1', 'EXAMPLE_4': '2', 'EXAMPLE_5': '2',
                         'EXAMPLE_6': '3', 'EXAMPLE_7': '3', 'EXAMPLE_8': '4',
                         'EXAMPLE_9': '4'}
    _, _, _, skll_fold_ids = learner.cross_validate(cv_fs,
                                                    stratified=False,
                                                    cv_folds=num_folds,
                                                    grid_search=False,
                                                    shuffle=False,
                                                    save_cv_folds=True)
    # compare against the KFold-based fold ids computed above
    assert_equal(skll_fold_ids, expected_fold_ids)
def check_invalid_regr_grid_obj_func(learner_name, grid_objective_function):
    """
    Checks whether the grid objective function is valid for this regression
    learner
    """
    (train_fs, _, _) = make_regression_data()
    clf = Learner(learner_name)
    clf.train(train_fs, grid_objective=grid_objective_function)
def test_predict_dict_hasher():
    train_file = join(_my_dir, 'other', 'examples_train.jsonlines')
    test_file = join(_my_dir, 'other', 'examples_test.jsonlines')
    train_fs = NDJReader.for_path(train_file).read()
    test_fs = NDJReader.for_path(test_file,
                                 feature_hasher=True,
                                 num_features=3).read()
    learner = Learner('LogisticRegression')
    learner.train(train_fs, grid_search=False)
    _ = learner.predict(test_fs)
def check_ensemble_models(name,
                          use_feature_hashing=False,
                          use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('AdaBoostRegressor'):
        if use_feature_hashing:
            expected_feature_importances = [0.33718443, 0.07810721, 0.25621769,
                                            0.19489766, 0.13359301]
        else:
            expected_feature_importances = [0.10266744, 0.18681777, 0.71051479]
    else:
        expected_feature_importances = ([0.204, 0.172, 0.178, 0.212, 0.234]
                                        if use_feature_hashing else
                                        [0.262, 0.288, 0.45])

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that the pearson correlation falls in the
    # expected range
    cor, _ = pearsonr(predictions, test_fs.labels)
    expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])
def check_linear_models(name,
                        use_feature_hashing=False,
                        use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        (train_fs,
         test_fs,
         weightdict) = make_regression_data(num_examples=5000,
                                            num_features=10,
                                            use_feature_hashing=True,
                                            feature_bins=5)
    else:
        train_fs, test_fs, weightdict = make_regression_data(num_examples=2000,
                                                             num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the weights are close to the weights
    # that we got from make_regression_data. Take the
    # ceiling before comparing since just comparing
    # the ceilings should be enough to make sure nothing
    # catastrophic happened. However, sometimes with
    # feature hashing, the ceiling is not exactly identical
    # so when that fails we want to check that the rounded
    # feature values are the same. One of those two equalities
    # _must_ be satisfied.

    # get the weights for this trained model
    learned_weights = learner.model_params[0]
    for feature_name in learned_weights:
        learned_w_ceil = math.ceil(learned_weights[feature_name])
        given_w_ceil = math.ceil(weightdict[feature_name])
        learned_w_round = round(learned_weights[feature_name], 0)
        given_w_round = round(weightdict[feature_name], 0)
        ceil_equal = learned_w_ceil == given_w_ceil
        round_equal = learned_w_round == given_w_round
        either_equal = ceil_equal or round_equal
        assert either_equal

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)
def check_bad_xval_float_classes(do_stratified_xval):

    float_class_fs = make_float_class_data()
    prediction_prefix = join(_my_dir, 'output', 'float_class')
    learner = Learner('LogisticRegression')
    learner.cross_validate(float_class_fs,
                           stratified=do_stratified_xval,
                           grid_objective='accuracy',
                           prediction_prefix=prediction_prefix)
def test_predict_dict_dict():
    train_file = join(_my_dir, 'other', 'examples_train.jsonlines')
    test_file = join(_my_dir, 'other', 'examples_test.jsonlines')
    train_fs = NDJReader.for_path(train_file).read()
    test_fs = NDJReader.for_path(test_file).read()
    learner = Learner('LogisticRegression')
    learner.train(train_fs, grid_search=False)
    predictions = learner.predict(test_fs)
    eq_(len(predictions), test_fs.features.shape[0])
def test_predict_hasher_hasher_same_bins():
    train_file = join(_my_dir, 'other', 'examples_train.jsonlines')
    test_file = join(_my_dir, 'other', 'examples_test.jsonlines')
    train_fs = NDJReader.for_path(train_file,
                                  feature_hasher=True,
                                  num_features=3).read()
    test_fs = NDJReader.for_path(test_file,
                                 feature_hasher=True,
                                 num_features=3).read()
    learner = Learner('LogisticRegression')
    learner.train(train_fs, grid_search=False)
    predictions = learner.predict(test_fs)
    eq_(len(predictions), test_fs.features.shape[0])
def check_ensemble_models(name,
                          use_feature_hashing=False,
                          use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('AdaBoostRegressor'):
        if use_feature_hashing:
            expected_feature_importances = [0.749811, 0.001373, 0.23357,
                                            0.011691, 0.003554]
        else:
            expected_feature_importances = [0.10266744, 0.18681777, 0.71051479]
    else:
        expected_feature_importances = ([0.735756, 0.001034, 0.242734,
                                         0.015836, 0.00464]
                                        if use_feature_hashing else
                                        [0.082621, 0.166652, 0.750726])

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)
def check_tree_models(name,
                      use_feature_hashing=False,
                      use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('DecisionTreeRegressor'):
        expected_feature_importances = ([0.730811, 0.001834, 0.247603,
                                         0.015241, 0.004511]
                                        if use_feature_hashing else
                                        [0.08926899, 0.15585068, 0.75488033])
    else:
        expected_feature_importances = ([0.733654, 0.002528, 0.245527,
                                         0.013664, 0.004627]
                                        if use_feature_hashing else
                                        [0.07974267, 0.16121895, 0.75903838])

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)
def check_sparse_predict(use_feature_hashing=False):
    train_fs, test_fs = make_sparse_data(use_feature_hashing=use_feature_hashing)

    # train a logistic regression classifier on the training data
    # and evaluate on the testing data
    learner = Learner('LogisticRegression')
    learner.train(train_fs, grid_search=False)
    test_score = learner.evaluate(test_fs)[1]
    expected_score = 0.51 if use_feature_hashing else 0.45
    assert_almost_equal(test_score, expected_score)
def check_adaboost_predict(base_estimator, algorithm, expected_score):
    train_fs, test_fs = make_sparse_data()

    # train an AdaBoostClassifier on the training data and evaluate on the
    # testing data
    learner = Learner('AdaBoostClassifier',
                      model_kwargs={'base_estimator': base_estimator,
                                    'algorithm': algorithm})
    learner.train(train_fs, grid_search=False)
    test_score = learner.evaluate(test_fs)[1]
    assert_almost_equal(test_score, expected_score)
def check_bad_xval_float_classes(do_stratified_xval):

    float_class_fs = make_float_class_data()
    prediction_prefix = join(_my_dir, 'output', 'float_class')
    learner = Learner('LogisticRegression')
    learner.cross_validate(float_class_fs,
                           stratified=do_stratified_xval,
                           grid_search=True,
                           grid_objective='accuracy',
                           prediction_prefix=prediction_prefix)
def check_sparse_predict(learner_name, expected_score, use_feature_hashing=False):
    train_fs, test_fs = make_sparse_data(use_feature_hashing=use_feature_hashing)

    # train the given classifier on the training
    # data and evaluate on the testing data
    learner = Learner(learner_name)
    learner.train(train_fs, grid_search=False)
    test_score = learner.evaluate(test_fs)[1]
    assert_almost_equal(test_score, expected_score)
def check_ensemble_models(name,
                          use_feature_hashing=False,
                          use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('AdaBoostRegressor'):
        if use_feature_hashing:
            expected_feature_importances = [0.33718443, 0.07810721, 0.25621769,
                                            0.19489766, 0.13359301]
        else:
            expected_feature_importances = [0.10266744, 0.18681777, 0.71051479]
    else:
        expected_feature_importances = ([0.204, 0.172, 0.178, 0.212, 0.234]
                                        if use_feature_hashing else
                                        [0.262, 0.288, 0.45])

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that the pearson correlation falls in the
    # expected range
    cor, _ = pearsonr(predictions, test_fs.labels)
    expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])
def check_learner_api_grid_search_no_objective(task='train'):

    (train_fs,
     test_fs) = make_classification_data(num_examples=500,
                                         train_test_ratio=0.7,
                                         num_features=5,
                                         use_feature_hashing=False,
                                         non_negative=True)
    learner = Learner('LogisticRegression')
    if task == 'train':
        _ = learner.train(train_fs)
    else:
        _ = learner.cross_validate(train_fs)
def check_tree_models(name,
                      use_feature_hashing=False,
                      use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('DecisionTreeRegressor'):
        expected_feature_importances = ([0.37331461, 0.08572699, 0.2543484,
                                         0.1841172, 0.1024928]
                                        if use_feature_hashing else
                                        [0.08931994, 0.15545093, 0.75522913])
        expected_cor_range = [0.5, 0.6] if use_feature_hashing else [0.9, 1.0]
    else:
        if use_feature_hashing:
            expected_feature_importances = [0.40195655, 0.06702161, 0.25814858,
                                            0.18183947, 0.09103379]
        else:
            expected_feature_importances = [0.07975691, 0.16122862, 0.75901447]
        expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    rtol=1e-2)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that the pearson correlation falls in the
    # expected range
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])
def test_all_new_labels_in_test():
    """
    Test classification with all labels in test set unseen
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)
    # change all test labels
    test_fs.labels = test_fs.labels + 3

    learner = Learner('SVC')
    learner.train(train_fs, grid_search=False)
    res = learner.evaluate(test_fs)
    yield check_results_with_unseen_labels, res, 6, [3, 4, 5]
    yield assert_almost_equal, res[1], 0
def test_new_labels_in_test_set():
    """
    Test classification experiment with an unseen label in the test set.
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)
    # add new labels to the test set
    test_fs.labels[-3:] = 3

    learner = Learner('SVC')
    learner.train(train_fs, grid_search=False)
    res = learner.evaluate(test_fs)
    yield check_results_with_unseen_labels, res, 4, [3]
    yield assert_almost_equal, res[1], 0.3
def test_all_new_labels_in_test():
    """
    Test classification with all labels in test set unseen
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)
    # change all test labels
    test_fs.labels = test_fs.labels + 3

    learner = Learner('SVC')
    learner.train(train_fs, grid_search=False)
    res = learner.evaluate(test_fs)
    yield check_results_with_unseen_labels, res, 6, [3, 4, 5]
    yield assert_almost_equal, res[1], 0
def check_linear_models(name,
                        use_feature_hashing=False,
                        use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, weightdict = make_regression_data(
            num_examples=5000,
            num_features=10,
            use_feature_hashing=True,
            feature_bins=5)
    else:
        train_fs, test_fs, weightdict = make_regression_data(num_examples=2000,
                                                             num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the weights are close to the weights
    # that we got from make_regression_data. Take the
    # ceiling before comparing since just comparing
    # the ceilings should be enough to make sure nothing
    # catastrophic happened. Note though that we cannot
    # test feature weights if we are using feature hashing
    # since model_params is not defined with a featurehasher.
    if not use_feature_hashing:
        # get the weights for this trained model
        learned_weights = learner.model_params[0]
        for feature_name in learned_weights:
            learned_w = math.ceil(learned_weights[feature_name])
            given_w = math.ceil(weightdict[feature_name])
            eq_(learned_w, given_w)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that the pearson correlation falls in the
    # expected range
    cor, _ = pearsonr(predictions, test_fs.labels)
    expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])
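# Hypothetical nose-style generator test for the linear-model check above.
# The learner names and flag combinations are assumptions for illustration,
# not necessarily the project's actual parameter grid.
def test_linear_models():
    for learner_name in ['LinearRegression', 'Ridge', 'Lasso']:
        for use_hashing in [False, True]:
            yield check_linear_models, learner_name, use_hashing, False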
def check_tree_models(name,
                      use_feature_hashing=False,
                      use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('DecisionTreeRegressor'):
        expected_feature_importances = ([0.37483895, 0.08816508, 0.25379838,
                                         0.18337128, 0.09982631]
                                        if use_feature_hashing else
                                        [0.08926899, 0.15585068, 0.75488033])
        expected_cor_range = [0.5, 0.6] if use_feature_hashing else [0.9, 1.0]
    else:
        expected_feature_importances = ([0.40195798, 0.06702903, 0.25816559,
                                         0.18185518, 0.09099222]
                                        if use_feature_hashing else
                                        [0.07974267, 0.16121895, 0.75903838])
        expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that the pearson correlation falls in the
    # expected range
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])
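# Hypothetical nose-style generator test for the tree-model check above. The
# learner names and the hashing combinations are assumptions for illustration.
def test_tree_models():
    for learner_name in ['DecisionTreeRegressor', 'RandomForestRegressor']:
        for use_hashing in [False, True]:
            yield check_tree_models, learner_name, use_hashing, False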
def test_learning_curve_implementation():
    """
    Test to ensure that the learning curve results match scikit-learn
    """

    # This test is different from the other tests which just use regression data.
    # The reason is that we want this test to fail in case our implementation
    # diverges from the scikit-learn implementation. This test essentially
    # serves as a regression test as well.

    # Load in the digits data set
    digits = load_digits()
    X, y = digits.data, digits.target

    # get the learning curve results from scikit-learn for this data
    cv_folds = 10
    random_state = 123456789
    cv = ShuffleSplit(n_splits=cv_folds, test_size=0.2,
                      random_state=random_state)
    estimator = MultinomialNB()
    train_sizes = np.linspace(.1, 1.0, 5)
    train_sizes1, train_scores1, test_scores1 = learning_curve(estimator,
                                                               X,
                                                               y,
                                                               cv=cv,
                                                               train_sizes=train_sizes,
                                                               scoring='accuracy')

    # get the features from this data into a FeatureSet instance we can use
    # with the SKLL API
    feature_names = ['f{:02}'.format(n) for n in range(X.shape[1])]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))
    fs = FeatureSet('train', features=features, labels=y,
                    ids=list(range(X.shape[0])))

    # we don't want to filter out any features since scikit-learn
    # does not do that either
    learner = Learner('MultinomialNB', min_feature_count=0)
    (train_scores2,
     test_scores2,
     train_sizes2) = learner.learning_curve(fs,
                                            cv_folds=cv_folds,
                                            train_sizes=train_sizes,
                                            metric='accuracy')

    assert np.all(train_sizes1 == train_sizes2)
    assert np.allclose(train_scores1, train_scores2)
    assert np.allclose(test_scores1, test_scores2)
def __init__(self, model_path, threshold=None, positive_label=1,
             logger=None):
    """
    Initialize the predictor.

    Parameters
    ----------
    model_path : str
        Path to use when loading trained model.
    threshold : float, optional
        If the model we're using is generating probabilities of the
        positive label, return 1 if it meets/exceeds the given threshold
        and 0 otherwise.
        Defaults to ``None``.
    positive_label : int, optional
        If the model is only being used to predict the probability of a
        particular class, this specifies the index of the class we're
        predicting. 1 = second class, which is default for binary
        classification.
        Defaults to 1.
    logger : logging object, optional
        A logging object. If ``None`` is passed, get logger from
        ``__name__``.
        Defaults to ``None``.
    """
    # use the passed-in logger, falling back to a module-level logger,
    # as documented above
    self.logger = logger if logger else logging.getLogger(__name__)
    self._learner = Learner.from_file(model_path)
    self._pos_index = positive_label
    self.threshold = threshold
def __init__(self, model_path, threshold=None, positive_label=1):
    """
    Initialize the predictor.

    Parameters
    ----------
    model_path : str
        Path to use when loading trained model.
    threshold : float, optional
        If the model we're using is generating probabilities of the
        positive label, return 1 if it meets/exceeds the given threshold
        and 0 otherwise.
        Defaults to ``None``.
    positive_label : int, optional
        If the model is only being used to predict the probability of a
        particular class, this specifies the index of the class we're
        predicting. 1 = second class, which is default for binary
        classification.
        Defaults to 1.
    """
    self._learner = Learner.from_file(model_path)
    # garyfeng: fix "AttributeError: 'Learner' object has no attribute
    # 'logger'" by assigning the learner a module-level logger
    self._learner.logger = logging.getLogger(__name__)
    self._pos_index = positive_label
    self.threshold = threshold
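# A minimal usage sketch for the predictor wrapper initialized above. The
# enclosing class name (Predictor) and the model path are assumptions for
# illustration; only the constructor signature documented above is relied on.
def example_load_predictor(model_path='/tmp/example.model'):
    predictor = Predictor(model_path, threshold=0.5, positive_label=1)
    return predictor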
def check_dummy_classifier_predict(model_args, train_labels, expected_output):

    # create hard-coded featuresets with known labels
    train_fs = FeatureSet('classification_train',
                          ['TrainExample{}'.format(i) for i in range(20)],
                          labels=train_labels,
                          features=[{"feature": i} for i in range(20)])
    test_fs = FeatureSet('classification_test',
                         ['TestExample{}'.format(i) for i in range(10)],
                         features=[{"feature": i} for i in range(20, 30)])

    # ensure that the predictions are as expected for the given strategy
    learner = Learner('DummyClassifier', model_kwargs=model_args)
    learner.train(train_fs, grid_search=False)
    predictions = learner.predict(test_fs)
    eq_(np.array_equal(expected_output, predictions), True)
def test_new_labels_in_test_set_change_order():
    """
    Test classification with an unseen label in the test set when the new
    label falls between the existing labels
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)
    # change train labels to create a gap
    train_fs.labels = train_fs.labels * 10
    # add new test labels
    test_fs.labels = test_fs.labels * 10
    test_fs.labels[-3:] = 15

    learner = Learner('SVC')
    learner.train(train_fs, grid_search=False)
    res = learner.evaluate(test_fs)
    yield check_results_with_unseen_labels, res, 4, [15]
    yield assert_almost_equal, res[1], 0.3
def test_new_labels_in_test_set_change_order():
    """
    Test classification with an unseen label in the test set when the new
    label falls between the existing labels
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)
    # change train labels to create a gap
    train_fs.labels = train_fs.labels * 10
    # add new test labels
    test_fs.labels = test_fs.labels * 10
    test_fs.labels[-3:] = 15

    learner = Learner('SVC')
    learner.train(train_fs, grid_search=False)
    res = learner.evaluate(test_fs)
    yield check_results_with_unseen_labels, res, 4, [15]
    yield assert_almost_equal, res[1], 0.3
def test_rare_class():
    """
    Test cross-validation when some labels are very rare
    """
    rare_class_fs = make_rare_class_data()
    prediction_prefix = join(_my_dir, 'output', 'rare_class')
    learner = Learner('LogisticRegression')
    learner.cross_validate(rare_class_fs,
                           grid_objective='unweighted_kappa',
                           prediction_prefix=prediction_prefix)

    with open(prediction_prefix + '_predictions.tsv', 'r') as f:
        reader = csv.reader(f, dialect='excel-tab')
        next(reader)
        pred = [row[1] for row in reader]
        eq_(len(pred), 15)
def check_rescaling(name):

    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                sd_noise=4,
                                                num_features=3)

    # instantiate the given learner and its rescaled counterpart
    learner = Learner(name)
    rescaled_learner = Learner('Rescaled' + name)

    # train both the regular regressor and the rescaled regressor
    learner.train(train_fs, grid_objective='pearson')
    rescaled_learner.train(train_fs, grid_objective='pearson')

    # now generate both sets of predictions on the test feature set
    predictions = learner.predict(test_fs)
    rescaled_predictions = rescaled_learner.predict(test_fs)

    # ... and on the training feature set
    train_predictions = learner.predict(train_fs)
    rescaled_train_predictions = rescaled_learner.predict(train_fs)

    # make sure that the two sets of predictions are close to perfectly
    # correlated, since the only thing different is that one set has been
    # rescaled
    assert_almost_equal(pearsonr(predictions, rescaled_predictions)[0],
                        1.0,
                        places=3)

    # make sure that the standard deviation of the rescaled test set
    # predictions is higher than the standard deviation of the regular
    # test set predictions
    p_std = np.std(predictions)
    rescaled_p_std = np.std(rescaled_predictions)
    assert_greater(rescaled_p_std, p_std)

    # make sure that the standard deviation of the rescaled predictions
    # on the TRAINING set (not the TEST set) is closer to the standard
    # deviation of the training set labels than the standard deviation
    # of the regular predictions.
    train_y_std = np.std(train_fs.labels)
    train_p_std = np.std(train_predictions)
    rescaled_train_p_std = np.std(rescaled_train_predictions)
    assert_less(abs(rescaled_train_p_std - train_y_std),
                abs(train_p_std - train_y_std))
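# Hypothetical nose-style driver for the rescaling check above; the regressor
# names are illustrative and assume that a 'Rescaled' variant is registered
# for each of them.
def test_rescaling():
    for regressor_name in ['LinearRegression', 'Ridge', 'SVR']:
        yield check_rescaling, regressor_name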