def test_merge_missing_labels():
    """
    Test to ensure that labels are successfully copied when merging
    """
    # create a feature set
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create a different feature set with no labels specified
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      feature_prefix='g',
                                      empty_labels=True,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # merge the two featuresets in different orders
    fs12 = fs1 + fs2
    fs21 = fs2 + fs1

    # make sure that the labels are the same after merging
    assert_array_equal(fs12.labels, fs1.labels)
    assert_array_equal(fs21.labels, fs1.labels)
def test_subtract():
    """
    Test to ensure that subtraction works
    """
    # create a feature set
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=2,
                                      train_test_ratio=1.0,
                                      random_state=1234)

    # create a different feature set with the same feature names
    # but different feature values
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=2,
                                      num_labels=2,
                                      train_test_ratio=1.0,
                                      random_state=5678)

    # subtract fs2 from fs1, i.e., the features in fs2
    # should be removed from fs1 but nothing else should change
    fs = fs1 - fs2

    # ensure that the labels are the same in fs and fs1
    assert_array_equal(fs.labels, fs1.labels)

    # ensure that there are only two features left
    eq_(fs.features.shape[1], 2)

    # and that they are 'f03' and 'f04'
    assert_array_equal(np.array(fs.vectorizer.feature_names_), ['f03', 'f04'])
def check_filter_labels(inverse=False):
    # create a feature set
    fs, _ = make_classification_data(num_examples=1000,
                                     num_features=4,
                                     num_labels=5,
                                     train_test_ratio=1.0)

    # keep just the instances with labels 0, 1 and 2
    labels_to_filter = [0, 1, 2]

    # do the actual filtering
    fs.filter(labels=labels_to_filter, inverse=inverse)

    # make sure that we removed the right things
    if inverse:
        ids_kept = fs.ids[np.where(np.logical_not(np.in1d(fs.labels,
                                                          labels_to_filter)))]
    else:
        ids_kept = fs.ids[np.where(np.in1d(fs.labels, labels_to_filter))]
    assert_array_equal(fs.ids, np.array(ids_kept))

    # make sure that number of ids, labels and features are the same
    eq_(fs.ids.shape[0], fs.labels.shape[0])
    eq_(fs.labels.shape[0], fs.features.shape[0])
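# A minimal nose-style driver for the check above, following the yield-based
# generator pattern used elsewhere in this suite; the test name
# `test_filter_labels` is an assumption, not taken from the original.
def test_filter_labels():
    yield check_filter_labels, False
    yield check_filter_labels, True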
def test_learner_api_load_into_existing_instance():
    """
    Check that `Learner.load()` works as expected
    """
    # create a LinearSVC instance and train it on some data
    learner1 = Learner('LinearSVC')
    (train_fs, test_fs) = make_classification_data(num_examples=200,
                                                   num_features=5,
                                                   use_feature_hashing=False,
                                                   non_negative=True)
    learner1.train(train_fs, grid_search=False)

    # now use `load()` to replace the existing instance with a
    # different saved learner
    other_model_file = join(_my_dir, 'other',
                            'test_load_saved_model.{}.model'.format(sys.version_info[0]))
    learner1.load(other_model_file)

    # now load the saved model into another instance using the class method
    # `from_file()`
    learner2 = Learner.from_file(other_model_file)

    # check that the two instances are now basically the same
    eq_(learner1.model_type, learner2.model_type)
    eq_(learner1.model_params, learner2.model_params)
    eq_(learner1.model_kwargs, learner2.model_kwargs)
def test_string_feature():
    """
    Test to make sure that string-valued features are properly
    encoded as binary features
    """
    # create a featureset that is derived from an original
    # set of features containing 3 numeric features and
    # one string-valued feature that can take six possible
    # values between 'a' and 'f'. This means that the
    # featureset will have 3 numeric + 6 binary features.
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     one_string_feature=True,
                                     num_string_values=6,
                                     train_test_ratio=1.0)

    # confirm that the number of features is as expected
    eq_(fs.features.shape, (100, 9))

    # confirm the feature names
    eq_(fs.vectorizer.feature_names_, ['f01', 'f02', 'f03', 'f04=a', 'f04=b',
                                       'f04=c', 'f04=d', 'f04=e', 'f04=f'])

    # confirm that the final six features are binary
    assert_array_equal(fs.features[:, [3, 4, 5, 6, 7, 8]].data, 1)
def check_train_and_score_function(model_type):
    """
    Check that the _train_and_score() function works as expected
    """
    # create train and test data
    (train_fs, test_fs) = make_classification_data(num_examples=500,
                                                   train_test_ratio=0.7,
                                                   num_features=5,
                                                   use_feature_hashing=False,
                                                   non_negative=True)

    # call _train_and_score() on this data
    estimator_name = 'LogisticRegression' if model_type == 'classifier' else 'Ridge'
    metric = 'accuracy' if model_type == 'classifier' else 'pearson'
    learner1 = Learner(estimator_name)
    train_score1, test_score1 = _train_and_score(learner1, train_fs, test_fs, metric)

    # this should yield identical results when training another instance
    # of the same learner without grid search and shuffling and evaluating
    # that instance on the train and the test set
    learner2 = Learner(estimator_name)
    learner2.train(train_fs, grid_search=False, shuffle=False)
    train_score2 = learner2.evaluate(train_fs, output_metrics=[metric])[-1][metric]
    test_score2 = learner2.evaluate(test_fs, output_metrics=[metric])[-1][metric]

    eq_(train_score1, train_score2)
    eq_(test_score1, test_score2)
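# A minimal nose-style driver for the check above, exercising both branches
# of the model_type switch; the test name is an assumption.
def test_train_and_score_function():
    yield check_train_and_score_function, 'classifier'
    yield check_train_and_score_function, 'regressor'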
def make_single_file_featureset_data():
    """
    Write a training file and a test file for tests that check whether
    specifying train_file and test_file actually works.
    """
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=2,
                                                 num_features=3,
                                                 non_negative=False)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train', 'train_single_file.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test', 'test_single_file.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    # Also write another test feature set that has fewer features
    # than the training set
    test_fs.filter(features=['f01', 'f02'])
    test_path = join(_my_dir, 'test', 'test_single_file_subset.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()
def test_skll_convert_libsvm_map():
    """
    Test to check whether the --reuse_libsvm_map option works for skll_convert
    """
    # create some simple classification data
    orig_fs, _ = make_classification_data(train_test_ratio=1.0,
                                          one_string_feature=True)

    # now write out this feature set as a libsvm file
    orig_libsvm_file = join(_my_dir, 'other',
                            'test_skll_convert_libsvm_map.libsvm')
    writer = LibSVMWriter(orig_libsvm_file, orig_fs, quiet=True)
    writer.write()

    # now make a copy of the dataset
    swapped_fs = copy.deepcopy(orig_fs)

    # now modify this new featureset to swap the first two columns
    del swapped_fs.vectorizer.vocabulary_['f01']
    del swapped_fs.vectorizer.vocabulary_['f02']
    swapped_fs.vectorizer.vocabulary_['f01'] = 1
    swapped_fs.vectorizer.vocabulary_['f02'] = 0
    tmp = swapped_fs.features[:, 0]
    swapped_fs.features[:, 0] = swapped_fs.features[:, 1]
    swapped_fs.features[:, 1] = tmp

    # now write out this new feature set as a MegaM file
    swapped_megam_file = join(_my_dir, 'other',
                              'test_skll_convert_libsvm_map.megam')
    writer = MegaMWriter(swapped_megam_file, swapped_fs, quiet=True)
    writer.write()

    # now run skll_convert to convert the swapped MegaM file into a libsvm
    # file, using the mapping specified in the first libsvm file
    converted_libsvm_file = join(_my_dir, 'other',
                                 'test_skll_convert_libsvm_map2.libsvm')

    # now call skll_convert's main function; note that the original listing
    # passed orig_libsvm_file as the input, which contradicts the comment
    # above and leaves swapped_megam_file unused, so the input is fixed here
    skll_convert_cmd = ['--reuse_libsvm_map', orig_libsvm_file,
                        '--quiet', swapped_megam_file,
                        converted_libsvm_file]
    err = ''
    try:
        old_stderr = sys.stderr
        sys.stderr = mystderr = StringIO()
        sk.main(skll_convert_cmd)
        err = mystderr.getvalue()
    finally:
        sys.stderr = old_stderr
        print(err)

    # now read the converted libsvm file into a featureset
    reader = LibSVMReader(converted_libsvm_file, quiet=True)
    converted_fs = reader.read()

    # now ensure that this new featureset and the original
    # featureset are the same
    eq_(orig_fs, converted_fs)
def check_print_model_weights(task='classification'):
    # create some simple classification or regression data
    if task == 'classification':
        train_fs, _ = make_classification_data(train_test_ratio=0.8)
    else:
        train_fs, _, _ = make_regression_data(num_features=4,
                                              train_test_ratio=0.8)

    # now train the appropriate model
    if task == 'classification':
        learner = Learner('LogisticRegression')
        learner.train(train_fs)
    else:
        learner = Learner('LinearRegression')
        learner.train(train_fs, grid_objective='pearson')

    # now save the model to disk
    model_file = join(_my_dir, 'output', 'test_print_model_weights.model')
    learner.save(model_file)

    # now call print_model_weights main() and capture the output
    print_model_weights_cmd = [model_file]
    err = ''
    try:
        old_stderr = sys.stderr
        old_stdout = sys.stdout
        sys.stderr = mystderr = StringIO()
        sys.stdout = mystdout = StringIO()
        pmw.main(print_model_weights_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
    finally:
        sys.stderr = old_stderr
        sys.stdout = old_stdout
        print(err)

    # now parse the output of the print_model_weights command
    # and get the intercept and the feature values
    if task == 'classification':
        lines_to_parse = [l for l in out.split('\n')[1:] if l]
        intercept = safe_float(lines_to_parse[0].split('\t')[0])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[2], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_[0])
        assert_allclose(learner.model.coef_[0], feature_values)
    else:
        lines_to_parse = [l for l in out.split('\n') if l]
        intercept = safe_float(lines_to_parse[0].split('=')[1])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[1], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)
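# A minimal nose-style driver for the check above, covering both the
# classification and regression parsing branches; the test name is assumed.
def test_print_model_weights():
    yield check_print_model_weights, 'classification'
    yield check_print_model_weights, 'regression'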
def check_generate_predictions_console(use_threshold=False):
    # create some simple classification data without feature hashing
    train_fs, test_fs = make_classification_data(num_examples=1000,
                                                 num_features=5)

    # save the test feature set to an NDJ file
    input_file = join(_my_dir, 'test', 'test_generate_predictions.jsonlines')
    writer = NDJWriter(input_file, test_fs)
    writer.write()

    # create a learner that uses an SGD classifier
    learner = Learner('SGDClassifier', probability=use_threshold)

    # train the learner with grid search
    learner.train(train_fs, grid_search=True)

    # get the predictions on the test featureset
    predictions = learner.predict(test_fs)

    # if we asked for probabilities, then use the threshold
    # to convert them into binary predictions
    if use_threshold:
        threshold = 0.6
        predictions = [int(p[1] >= threshold) for p in predictions]
    else:
        predictions = predictions.tolist()
        threshold = None

    # save the learner to a file
    model_file = join(_my_dir, 'output',
                      'test_generate_predictions_console.model')
    learner.save(model_file)

    # now call main() from generate_predictions.py
    generate_cmd = []
    if use_threshold:
        generate_cmd.append('-t {}'.format(threshold))
    generate_cmd.extend([model_file, input_file])

    # we need to capture stdout since that's what main() writes to
    err = ''
    try:
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = mystdout = StringIO()
        sys.stderr = mystderr = StringIO()
        gp.main(generate_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
        predictions_after_saving = [int(x) for x in out.strip().split('\n')]
        eq_(predictions, predictions_after_saving)
    finally:
        sys.stdout = old_stdout
        sys.stderr = old_stderr
        print(err)
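# A minimal nose-style driver for the check above, run once without and once
# with a probability threshold; the test name is an assumption.
def test_generate_predictions_console():
    yield check_generate_predictions_console, False
    yield check_generate_predictions_console, True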
def test_custom_learner_model_loading():
    num_labels = 10
    class_weights = [(0.5 / (num_labels - 1))
                     for x in range(num_labels - 1)] + [0.5]
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train', 'test_model_custom_learner.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test', 'test_model_custom_learner.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    # run the configuration that trains the custom model and saves it
    cfgfile = 'test_model_save_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)
    run_configuration(config_path, quiet=True)

    # save the predictions from disk into memory
    # and delete the predictions file
    outprefix = 'test_model_custom_learner'
    pred_file = join(_my_dir, 'output',
                     '{}_{}_CustomLogisticRegressionWrapper'
                     '.predictions'.format(outprefix, outprefix))
    preds1 = read_predictions(pred_file)
    os.unlink(pred_file)

    # run the configuration that loads the saved model
    # and generates the predictions again
    cfgfile = 'test_model_load_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)
    run_configuration(config_path, overwrite=False, quiet=True)

    # load the newly generated predictions
    preds2 = read_predictions(pred_file)

    # make sure that they are the same as before
    assert_array_equal(preds1, preds2)
@raises(ValueError)
def test_merge_different_vectorizers():
    """
    Test to ensure rejection of merging featuresets with different vectorizers
    """
    # create a featureset with the default DictVectorizer
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create another featureset using hashing
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      feature_prefix='g',
                                      num_labels=3,
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True)
    # This should raise a ValueError
    fs1 + fs2
def test_length():
    """
    Test whether len() returns the number of instances
    """
    # create a featureset
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     train_test_ratio=1.0)

    eq_(len(fs), 100)
def test_empty_labels():
    """
    Test to check behaviour when labels is None
    """
    # create a feature set with empty labels
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     empty_labels=True,
                                     train_test_ratio=1.0)
    assert np.isnan(fs.labels).all()
def check_learner_api_grid_search_no_objective(task='train'):
    (train_fs, test_fs) = make_classification_data(num_examples=500,
                                                   train_test_ratio=0.7,
                                                   num_features=5,
                                                   use_feature_hashing=False,
                                                   non_negative=True)
    learner = Learner('LogisticRegression')
    if task == 'train':
        _ = learner.train(train_fs)
    else:
        _ = learner.cross_validate(train_fs)
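# A minimal nose-style driver for the check above. Since the check never
# specifies a grid objective, it is assumed here to fail with a ValueError;
# both the test name and the expected error type are assumptions, not taken
# from the original.
def test_learner_api_grid_search_no_objective():
    for task in ['train', 'cross_validate']:
        yield assert_raises, ValueError, \
            check_learner_api_grid_search_no_objective, task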
# the writer is expected to reject hashed featuresets; a ValueError is
# assumed here since the original does not state the error type
@raises(ValueError)
def test_write_hashed_featureset():
    """
    Test to check that hashed featuresets cannot be written out
    """
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     use_feature_hashing=True,
                                     feature_bins=2,
                                     random_state=1234)
    output_dir = join(_my_dir, 'output')
    writer = NDJWriter(join(output_dir, 'foo.jsonlines'), fs)
    writer.write()
def test_new_labels_in_test_set():
    """
    Test classification experiment with an unseen label in the test set.
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)
    # add new labels to the test set
    test_fs.labels[-3:] = 3

    learner = Learner('SVC')
    learner.train(train_fs, grid_search=False)
    res = learner.evaluate(test_fs)
    yield check_results_with_unseen_labels, res, 4, [3]
    yield assert_almost_equal, res[1], 0.3
def test_all_new_labels_in_test():
    """
    Test classification with all labels in the test set unseen
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)
    # change all test labels
    test_fs.labels = test_fs.labels + 3

    learner = Learner('SVC')
    learner.train(train_fs, grid_search=False)
    res = learner.evaluate(test_fs)
    yield check_results_with_unseen_labels, res, 6, [3, 4, 5]
    yield assert_almost_equal, res[1], 0
@raises(ValueError)
def test_merge_different_hashers():
    """
    Test to ensure rejection of merging featuresets with different
    FeatureHashers
    """
    # create a feature set with 4 feature hashing bins
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=10,
                                      num_labels=3,
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True,
                                      feature_bins=4)

    # create a second feature set with 3 feature hashing bins
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=10,
                                      num_labels=3,
                                      feature_prefix='g',
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True,
                                      feature_bins=3)
    # This should raise a ValueError
    fs1 + fs2
def check_predict(model, use_feature_hashing=False):
    """
    This tests whether the predict task runs and generates the same
    number of predictions as samples in the test set. The specified
    model indicates whether to generate random regression or
    classification data.
    """
    # create the random data for the given model
    if model._estimator_type == 'regressor':
        train_fs, test_fs, _ = \
            make_regression_data(use_feature_hashing=use_feature_hashing,
                                 feature_bins=5)
    # feature hashing will not work for Naive Bayes since it requires
    # non-negative feature values
    elif model.__name__ == 'MultinomialNB':
        train_fs, test_fs = \
            make_classification_data(use_feature_hashing=False,
                                     non_negative=True)
    else:
        train_fs, test_fs = \
            make_classification_data(use_feature_hashing=use_feature_hashing,
                                     feature_bins=25)

    # create the learner with the specified model
    learner = Learner(model.__name__)

    # now train the learner on the training data
    learner.train(train_fs, grid_search=False)

    # now make predictions on the test set
    predictions = learner.predict(test_fs)

    # make sure we have the same number of outputs as the
    # number of test set samples
    eq_(len(predictions), test_fs.features.shape[0])
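# A minimal nose-style driver for the check above. The estimator classes
# listed here are an illustrative subset, not the suite's actual
# parametrization, and the sklearn imports are added for self-containment.
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

def test_predict():
    for model in [LogisticRegression, LinearSVC, MultinomialNB]:
        yield check_predict, model, False
        yield check_predict, model, True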
@raises(ValueError)
def test_merge_different_labels_same_ids():
    """
    Test to ensure rejection of merging featuresets that have
    conflicting labels
    """
    # create a feature set
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create a different feature set that has everything
    # the same but has different labels for the same IDs
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      feature_prefix='g',
                                      train_test_ratio=1.0)

    # artificially modify the class labels
    fs2.labels = fs2.labels + 1

    # This should raise a ValueError
    fs1 + fs2
# filtering by features is expected to be rejected when feature hashing is
# used; a ValueError is assumed here since the original does not state the
# error type
@raises(ValueError)
def test_filter_with_hashing():
    """
    Test to ensure rejection of filtering by features when using hashing
    """
    # create a feature set
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=5,
                                     num_labels=3,
                                     train_test_ratio=1.0,
                                     use_feature_hashing=True,
                                     feature_bins=2)
    # attempt to filter by features; this should be rejected
    fs.filter(features=['f1', 'f4'])
@raises(ValueError)
def test_iteration_without_dictvectorizer():
    """
    Test that iteration is only allowed when the vectorizer is a
    DictVectorizer
    """
    # create a feature set
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     train_test_ratio=1.0,
                                     use_feature_hashing=True,
                                     feature_bins=2)
    # This should raise a ValueError
    for _ in fs:
        pass
def make_summary_data():
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=2,
                                                 num_features=3,
                                                 non_negative=True)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train', 'test_summary.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test', 'test_summary.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()
def test_new_labels_in_test_set_change_order():
    """
    Test classification with an unseen label in the test set when the
    new label falls between the existing labels
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)
    # change the train labels to create a gap
    train_fs.labels = train_fs.labels * 10
    # add new test labels
    test_fs.labels = test_fs.labels * 10
    test_fs.labels[-3:] = 15

    learner = Learner('SVC')
    learner.train(train_fs, grid_search=False)
    res = learner.evaluate(test_fs)
    yield check_results_with_unseen_labels, res, 4, [15]
    yield assert_almost_equal, res[1], 0.3
def check_generate_predictions(use_feature_hashing=False,
                               use_threshold=False,
                               test_on_subset=False):
    # create some simple classification feature sets for training and testing
    train_fs, test_fs = make_classification_data(
        num_examples=1000,
        num_features=5,
        use_feature_hashing=use_feature_hashing,
        feature_bins=4)

    # create a learner that uses an SGD classifier
    learner = Learner('SGDClassifier', probability=use_threshold)

    # train the learner with grid search
    learner.train(train_fs, grid_search=True)

    # if we are asked to use only a subset, then filter out
    # one of the features if we are not using feature hashing;
    # do nothing if we are using feature hashing
    if test_on_subset and not use_feature_hashing:
        test_fs.filter(features=['f01', 'f02', 'f03', 'f04'])

    # get the predictions on the test featureset
    predictions = learner.predict(test_fs)

    # if we asked for probabilities, then use the threshold
    # to convert them into binary predictions
    if use_threshold:
        threshold = 0.6
        predictions = [int(p[1] >= threshold) for p in predictions]
    else:
        predictions = predictions.tolist()
        threshold = None

    # save the learner to a file
    model_file = join(_my_dir, 'output', 'test_generate_predictions.model')
    learner.save(model_file)

    # now use Predictor to generate the predictions and make
    # sure that they are the same as before saving the model
    p = gp.Predictor(model_file, threshold=threshold)
    predictions_after_saving = p.predict(test_fs)
    eq_(predictions, predictions_after_saving)
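# A minimal nose-style driver for the check above, walking the full grid of
# the three boolean flags; the test name is an assumption.
def test_generate_predictions():
    for use_feature_hashing in [False, True]:
        for use_threshold in [False, True]:
            for test_on_subset in [False, True]:
                yield (check_generate_predictions, use_feature_hashing,
                       use_threshold, test_on_subset)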
def test_logistic_custom_learner():
    num_labels = 10
    class_weights = [(0.5 / (num_labels - 1))
                     for x in range(num_labels - 1)] + [0.5]
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train',
                      'test_logistic_custom_learner.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test',
                     'test_logistic_custom_learner.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    cfgfile = 'test_logistic_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)
    run_configuration(config_path, quiet=True)

    outprefix = 'test_logistic_custom_learner'
    preds = read_predictions(join(_my_dir, 'output',
                                  ('{}_{}_CustomLogisticRegressionWrapper'
                                   '.predictions'.format(outprefix,
                                                         outprefix))))
    expected = read_predictions(join(_my_dir, 'output',
                                     ('{}_{}_LogisticRegression.predictions'
                                      .format(outprefix, outprefix))))
    assert_array_equal(preds, expected)
def check_filter_ids(inverse=False):
    # create a feature set
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     train_test_ratio=1.0)

    # keep just the IDs after EXAMPLE_50, or do the inverse
    ids_to_filter = ['EXAMPLE_{}'.format(i) for i in range(51, 101)]
    if inverse:
        ids_kept = ['EXAMPLE_{}'.format(i) for i in range(1, 51)]
    else:
        ids_kept = ids_to_filter
    fs.filter(ids=ids_to_filter, inverse=inverse)

    # make sure that we removed the right things
    assert_array_equal(fs.ids, np.array(ids_kept))

    # make sure that number of ids, labels and features are the same
    eq_(fs.ids.shape[0], fs.labels.shape[0])
    eq_(fs.labels.shape[0], fs.features.shape[0])
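# A minimal nose-style driver for the check above; the test name is assumed.
def test_filter_ids():
    yield check_filter_ids, False
    yield check_filter_ids, True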
def test_mlp_classification():
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=3,
                                                 num_features=5)

    # train an MLPClassifier on the training data and evaluate on the
    # testing data
    learner = Learner('MLPClassifier')
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=ConvergenceWarning)
        learner.train(train_fs, grid_search=False)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_classification_data, i.e., that the
    # classification accuracy is as expected
    accuracy = accuracy_score(predictions, test_fs.labels)
    assert_almost_equal(accuracy, 0.858, places=3)
def test_majority_class_custom_learner():
    num_labels = 10

    # This will make data where the last class happens about 50% of the time.
    class_weights = [(0.5 / (num_labels - 1))
                     for x in range(num_labels - 1)] + [0.5]
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train',
                      'test_majority_class_custom_learner.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test',
                     'test_majority_class_custom_learner.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    cfgfile = 'test_majority_class_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)
    run_configuration(config_path, quiet=True)

    outprefix = 'test_majority_class_custom_learner'
    preds = read_predictions(join(_my_dir, 'output',
                                  ('{}_{}_MajorityClassLearner_predictions.tsv'
                                   .format(outprefix, outprefix))))
    expected = np.array([float(num_labels - 1) for x in preds])
    assert_array_equal(preds, expected)
def check_filter_features(inverse=False):
    # create a feature set
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=5,
                                     num_labels=3,
                                     train_test_ratio=1.0)

    # store the features in a separate matrix before filtering
    X = fs.features.todense()

    # filter features 'f01' and 'f04', or their inverse
    fs.filter(features=['f01', 'f04'], inverse=inverse)

    # make sure that we have the right number of feature columns
    # depending on whether we are inverting
    feature_shape = (100, 3) if inverse else (100, 2)
    eq_(fs.features.shape, feature_shape)

    # and that they are the first and fourth columns
    # of X that we generated, if not inverting, and
    # the second, third and fifth, if inverting
    if inverse:
        feature_columns = X[:, [1, 2, 4]]
    else:
        feature_columns = X[:, [0, 3]]
    assert (fs.features.todense() == feature_columns).all()

    # make sure that the feature names that we kept are also correct
    feature_names = ['f02', 'f03', 'f05'] if inverse else ['f01', 'f04']
    assert_array_equal(np.array(fs.vectorizer.feature_names_), feature_names)

    # make sure that number of ids, labels and features are the same
    eq_(fs.ids.shape[0], fs.labels.shape[0])
    eq_(fs.labels.shape[0], fs.features.shape[0])
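# A minimal nose-style driver for the check above; the test name is assumed.
def test_filter_features():
    yield check_filter_features, False
    yield check_filter_features, True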