def test_featureset_creation_from_dataframe_with_string_labels():
    """
    Build a featureset whose ids and (string) labels come from a pandas
    data frame, write it to an NDJ file, read that file back in, and
    check that the two featuresets compare equal.
    """
    # the data frame supplies the ids (its index) and the string labels
    frame = pd.DataFrame({"id": [1, 2],
                          "score": ['yes', 'no'],
                          "text": ["a b", "b c"]})
    frame.set_index("id", inplace=True)

    # vectorize the feature dictionaries
    vectorizer = DictVectorizer()
    feature_matrix = vectorizer.fit_transform([{'a': 1.0, 'b': 1.0},
                                               {'b': 1.0, 'c': 1.0}])

    original_fs = FeatureSet('test',
                             ids=frame.index.values,
                             labels=frame['score'].values,
                             features=feature_matrix,
                             vectorizer=vectorizer)

    # write the featureset out to disk
    ndj_path = join(_my_dir, "other", "test_string_labels_df.jsonlines")
    NDJWriter(ndj_path, original_fs).write()

    # load the file we just wrote and compare with what we started from
    reloaded_fs = NDJReader.for_path(ndj_path, ids_to_floats=True).read()
    assert original_fs == reloaded_fs
def check_generate_predictions_console(use_threshold=False):
    """
    Check that ``gp.main()`` prints the same predictions that the learner
    API produces for the same trained model and the same test data.

    Parameters
    ----------
    use_threshold : bool, optional
        If true, the learner is created with ``probability=True``, the API
        predictions are binarized at a threshold of 0.6, and the same
        threshold is passed to ``gp.main()`` via ``-t``.
        Defaults to ``False``.
    """
    # create some simple classification data without feature hashing
    train_fs, test_fs = make_classification_data(num_examples=1000,
                                                 num_features=5)

    # save the test feature set to an NDJ file
    input_file = join(_my_dir, 'test', 'test_generate_predictions.jsonlines')
    writer = NDJWriter(input_file, test_fs)
    writer.write()

    # create a learner that uses an SGD classifier
    learner = Learner('SGDClassifier', probability=use_threshold)

    # train the learner with grid search
    learner.train(train_fs, grid_search=True)

    # get the predictions on the test featureset
    predictions = learner.predict(test_fs)

    # if we asked for probabilities, then use the threshold
    # to convert them into binary predictions
    if use_threshold:
        threshold = 0.6
        predictions = [int(p[1] >= threshold) for p in predictions]
    else:
        predictions = predictions.tolist()
        threshold = None

    # save the learner to a file
    model_file = join(_my_dir, 'output',
                      'test_generate_predictions_console.model')
    learner.save(model_file)

    # now call main() from generate_predictions.py
    generate_cmd = []
    if use_threshold:
        generate_cmd.append('-t {}'.format(threshold))
    generate_cmd.extend([model_file, input_file])

    # we need to capture stdout since that's what main() writes to;
    # the real streams and the capture buffers are bound BEFORE the
    # `try` so that the `finally` clause can always restore the streams
    # and always has a stderr buffer to report from
    old_stdout = sys.stdout
    old_stderr = sys.stderr
    mystdout = StringIO()
    mystderr = StringIO()
    try:
        sys.stdout = mystdout
        sys.stderr = mystderr
        gp.main(generate_cmd)
        out = mystdout.getvalue()
        predictions_after_saving = [int(x) for x in out.strip().split('\n')]
        eq_(predictions, predictions_after_saving)
    finally:
        sys.stdout = old_stdout
        sys.stderr = old_stderr
        # read the buffer here (not inside the `try`) so that whatever
        # main() wrote to stderr is still shown when main() itself raises
        print(mystderr.getvalue())
def test_write_hashed_featureset():
    """
    Test to check that hashed featuresets cannot be written out
    """
    # NOTE(review): nothing in this function asserts that the write fails;
    # presumably the expected exception is checked by a decorator or by
    # the caller, which is not visible here -- confirm.
    hashed_fs, _ = make_classification_data(num_examples=100,
                                            num_features=4,
                                            use_feature_hashing=True,
                                            feature_bins=2,
                                            random_state=1234)

    # attempt to write the feature-hashed data to an NDJ file
    target_dir = join(_my_dir, 'output')
    target_path = join(target_dir, 'foo.jsonlines')
    NDJWriter(target_path, hashed_fs).write()
def make_learning_curve_data():
    """
    Write two NDJ training files built from the digits data set: one
    using every feature and one using every feature except the last.
    """
    # Load in the digits data set
    digits = load_digits()
    X, y = digits.data, digits.target

    # features are named f00, f01, ... in column order
    feature_names = ['f{:02}'.format(n) for n in range(X.shape[1])]

    # first featureset: all of the features
    all_features = [dict(zip(feature_names, row)) for row in X]
    fs1 = FeatureSet('train1',
                     features=all_features,
                     labels=y,
                     ids=list(range(X.shape[0])))
    NDJWriter(join(_my_dir, 'train', 'test_learning_curve1.jsonlines'),
              fs1).write()

    # second featureset: zip() stops at the shorter sequence, so pairing
    # each row with the truncated name list drops the last feature
    reduced_names = feature_names[:-1]
    reduced_features = [dict(zip(reduced_names, row)) for row in X]
    fs2 = FeatureSet('train2',
                     features=reduced_features,
                     labels=y,
                     ids=list(range(X.shape[0])))
    NDJWriter(join(_my_dir, 'train', 'test_learning_curve2.jsonlines'),
              fs2).write()
def create_jsonlines_feature_files(path):
    """
    Create the NDJ feature files ``f0.jsonlines`` ... ``f5.jsonlines``
    under ``path``, unless all six of them already exist.

    Files ``f0`` to ``f4`` each hold a single feature for the same 1000
    examples; ``f5`` holds the same feature as ``f4`` plus two extra
    examples that have no features.

    Parameters
    ----------
    path : str
        Directory in which the feature files are created.
    """
    # nothing to do if every file is already there
    expected_files = [join(path, 'f{}.jsonlines'.format(i))
                      for i in range(6)]
    if all(exists(ff) for ff in expected_files):
        return

    num_examples = 1000
    np.random.seed(1234567890)

    # Create lists we will write files from
    ids, features, labels = [], [], []
    for j in range(num_examples):
        label = "cat" if j % 2 else "dog"
        ids.append("{}{}".format(label, j))
        labels.append(label)
        values = {"f{}".format(feat_num): np.random.randint(0, 4)
                  for feat_num in range(5)}
        features.append(OrderedDict(sorted(values.items(),
                                           key=lambda t: t[0])))

    # one file per feature, each containing only that feature
    for i in range(5):
        feat_name = "f{}".format(i)
        sub_features = [{feat_name: example[feat_name]}
                        for example in features]
        fs = FeatureSet('ablation_cv',
                        ids,
                        features=sub_features,
                        labels=labels)
        NDJWriter(join(path, 'f{}.jsonlines'.format(i)), fs).write()

    # now write out the last file which is basically
    # identical to the last featureset we wrote
    # except that it has two extra instances
    extra_ids = ids + ['cat{}'.format(num_examples),
                       'dog{}'.format(num_examples + 1)]
    extra_fs = FeatureSet('extra',
                          extra_ids,
                          features=sub_features + [{}, {}],
                          labels=labels + ['cat', 'dog'])
    NDJWriter(join(path, 'f5.jsonlines'), extra_fs).write()
def create_jsonlines_feature_files(path):
    """
    Create the NDJ feature files ``f0.jsonlines`` ... ``f4.jsonlines``
    under ``path``, unless all five of them already exist.

    Each file holds a single feature for the same 1000 examples.

    Parameters
    ----------
    path : str
        Directory in which the feature files are created.
    """
    # bail out early when every file is already present
    wanted_files = [join(path, 'f{}.jsonlines'.format(i)) for i in range(5)]
    if all(exists(ff) for ff in wanted_files):
        return

    num_examples = 1000
    np.random.seed(1234567890)

    # Create lists we will write files from
    ids = []
    labels = []
    features = []
    for j in range(num_examples):
        label = "cat" if j % 2 else "dog"
        ids.append("{}{}".format(label, j))
        labels.append(label)
        drawn = {"f{}".format(feat_num): np.random.randint(0, 4)
                 for feat_num in range(5)}
        features.append(OrderedDict(sorted(drawn.items(),
                                           key=lambda t: t[0])))

    # write one single-feature file per feature
    for i in range(5):
        feat_name = "f{}".format(i)
        sub_features = [{feat_name: example[feat_name]}
                        for example in features]
        fs = FeatureSet('ablation_cv',
                        ids,
                        features=sub_features,
                        labels=labels)
        NDJWriter(join(path, 'f{}.jsonlines'.format(i)), fs).write()
def test_writing_ndj_featureset_with_string_ids():
    """
    Write a featureset with string ids to an NDJ file, read the file
    back in, and check that the two featuresets compare equal.
    """
    # vectorize the feature dictionaries
    vectorizer = DictVectorizer()
    feature_matrix = vectorizer.fit_transform([{'a': 1.0, 'b': 1.0},
                                               {'b': 1.0, 'c': 1.0}])

    original_fs = FeatureSet('test',
                             ids=['1', '2'],
                             labels=[1, 2],
                             features=feature_matrix,
                             vectorizer=vectorizer)

    # write the featureset out to disk
    ndj_path = join(_my_dir, "other", "test_string_ids.jsonlines")
    NDJWriter(ndj_path, original_fs).write()

    # load the file we just wrote and compare with what we started from
    reloaded_fs = NDJReader.for_path(ndj_path).read()
    assert original_fs == reloaded_fs
def test_custom_learner_model_loading():
    """
    Check that the predictions produced when a custom learner model is
    trained and saved match the predictions produced after that saved
    model is loaded again by a second configuration.
    """
    num_labels = 10

    # every label but the last shares half of the weight equally;
    # the last label gets the other half
    rare_weight = 0.5 / (num_labels - 1)
    class_weights = [rare_weight for _ in range(num_labels - 1)] + [0.5]

    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train', 'test_model_custom_learner.jsonlines')
    NDJWriter(train_path, train_fs).write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test', 'test_model_custom_learner.jsonlines')
    NDJWriter(test_path, test_fs).write()

    # run the configuration that trains the custom model and saves it
    save_template = join(_my_dir, 'configs',
                         'test_model_save_custom_learner.template.cfg')
    run_configuration(fill_in_config_paths(save_template), quiet=True)

    # save the predictions from disk into memory
    # and delete the predictions file
    outprefix = 'test_model_custom_learner'
    pred_name = '{}_{}_CustomLogisticRegressionWrapper.predictions'.format(
        outprefix, outprefix)
    pred_file = join(_my_dir, 'output', pred_name)
    preds1 = read_predictions(pred_file)
    os.unlink(pred_file)

    # run the configuration that loads the saved model
    # and generates the predictions again
    load_template = join(_my_dir, 'configs',
                         'test_model_load_custom_learner.template.cfg')
    run_configuration(fill_in_config_paths(load_template),
                      overwrite=False,
                      quiet=True)

    # load the newly generated predictions
    preds2 = read_predictions(pred_file)

    # make sure that they are the same as before
    assert_array_equal(preds1, preds2)
def make_ablation_data():
    """
    Remove stale ablation results from the output directory and write the
    five single-feature NDJ training files ``f0.jsonlines`` ...
    ``f4.jsonlines`` into the train directory.
    """
    # Remove old CV data
    stale_pattern = join(_my_dir, 'output', 'ablation_cv_*.results')
    for old_file in glob.glob(stale_pattern):
        os.remove(old_file)

    num_examples = 1000
    np.random.seed(1234567890)

    # Create lists we will write files from
    ids, features, labels = [], [], []
    for j in range(num_examples):
        label = "cat" if j % 2 else "dog"
        ids.append("{}{}".format(label, j))
        labels.append(label)
        values = {"f{}".format(feat_num): np.random.randint(0, 4)
                  for feat_num in range(5)}
        features.append(OrderedDict(sorted(values.items(),
                                           key=lambda t: t[0])))

    # one training file per feature, each containing only that feature
    for i in range(5):
        feat_name = "f{}".format(i)
        sub_features = [{feat_name: example[feat_name]}
                        for example in features]
        train_fs = FeatureSet('ablation_cv',
                              ids,
                              features=sub_features,
                              labels=labels)
        train_path = join(_my_dir, 'train', 'f{}.jsonlines'.format(i))
        NDJWriter(train_path, train_fs).write()
def make_summary_data():
    """
    Generate a two-label classification data set and write its training
    and test partitions to ``test_summary.jsonlines`` in the train and
    test directories respectively.
    """
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=2,
                                                 num_features=3,
                                                 non_negative=True)

    # write each partition into its own directory, training data first
    NDJWriter(join(_my_dir, 'train', 'test_summary.jsonlines'),
              train_fs).write()
    NDJWriter(join(_my_dir, 'test', 'test_summary.jsonlines'),
              test_fs).write()
def test_featureset_creation_from_dataframe_with_string_labels():
    """
    Round-trip a featureset through an NDJ file when its ids and string
    labels are taken from a pandas data frame, and check that what is
    read back compares equal to what was written.
    """
    # ids come from the frame's index, labels from its "score" column
    source_df = pd.DataFrame({"id": [1, 2],
                              "score": ['yes', 'no'],
                              "text": ["a b", "b c"]})
    source_df.set_index("id", inplace=True)

    # turn the feature dictionaries into a feature matrix
    feat_dicts = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}]
    dict_vectorizer = DictVectorizer()
    matrix = dict_vectorizer.fit_transform(feat_dicts)

    written_fs = FeatureSet('test',
                            ids=source_df.index.values,
                            labels=source_df['score'].values,
                            features=matrix,
                            vectorizer=dict_vectorizer)

    out_path = join(_my_dir, "other", "test_string_labels_df.jsonlines")
    NDJWriter(out_path, written_fs).write()

    # read in the written file into a featureset and confirm that the
    # two featuresets are equal
    read_fs = NDJReader.for_path(out_path, ids_to_floats=True).read()
    assert written_fs == read_fs
def make_ablation_data():
    """
    Delete old ablation result files from the output directory, then
    write one NDJ training file per feature (``f0.jsonlines`` ...
    ``f4.jsonlines``) into the train directory.
    """
    # Remove old CV data
    for stale in glob.glob(join(_my_dir, 'output', 'ablation_cv_*.results')):
        os.remove(stale)

    num_examples = 1000
    np.random.seed(1234567890)

    # Create lists we will write files from
    ids = []
    labels = []
    features = []
    for j in range(num_examples):
        label = "cat" if j % 2 else "dog"
        ids.append("{}{}".format(label, j))
        labels.append(label)
        drawn = {"f{}".format(feat_num): np.random.randint(0, 4)
                 for feat_num in range(5)}
        features.append(OrderedDict(sorted(drawn.items(),
                                           key=lambda t: t[0])))

    # each output file keeps exactly one of the five features
    for i in range(5):
        key = "f{}".format(i)
        sub_features = [{key: example[key]} for example in features]
        train_fs = FeatureSet('ablation_cv',
                              ids,
                              features=sub_features,
                              labels=labels)
        NDJWriter(join(_my_dir, 'train', 'f{}.jsonlines'.format(i)),
                  train_fs).write()
def make_class_map_data():
    """
    Write the NDJ training and test files used for the class map tests.

    The training examples have features f2, f3 and f4 only; the test
    examples have features f1, f2, f3 and f5 only.
    """
    class_names = ['beagle', 'cat', 'dachsund', 'cat']

    # Create training file
    train_path = join(_my_dir, 'train', 'test_class_map.jsonlines')
    train_numbers = range(1, 101)
    train_labels = [class_names[i % 4] for i in train_numbers]
    train_ids = ["{}{}".format(y, i)
                 for y, i in zip(train_labels, train_numbers)]
    # note that f1 and f5 are missing in all instances but f4 is not
    train_features = [{"f2": i + 1, "f3": i + 2, "f4": i + 5}
                      for i in train_numbers]
    train_fs = FeatureSet('train_class_map',
                          train_ids,
                          features=train_features,
                          labels=train_labels)
    NDJWriter(train_path, train_fs).write()

    # Create test file
    test_path = join(_my_dir, 'test', 'test_class_map.jsonlines')
    test_numbers = range(1, 51)
    test_labels = [class_names[i % 4] for i in test_numbers]
    test_ids = ["{}{}".format(y, i)
                for y, i in zip(test_labels, test_numbers)]
    # f1 and f5 are not missing in any instances here but f4 is
    test_features = [{"f1": i, "f2": i + 2, "f3": i % 10, "f5": i * 2}
                     for i in test_numbers]
    test_fs = FeatureSet('test_class_map',
                         test_ids,
                         features=test_features,
                         labels=test_labels)
    NDJWriter(test_path, test_fs).write()
def test_logistic_custom_learner():
    """
    Run the logistic custom learner configuration and check that the
    predictions written for ``CustomLogisticRegressionWrapper`` equal
    those written for ``LogisticRegression``.
    """
    num_labels = 10

    # every label but the last shares half of the weight equally;
    # the last label gets the other half
    rare_weight = 0.5 / (num_labels - 1)
    class_weights = [rare_weight for _ in range(num_labels - 1)] + [0.5]

    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train',
                      'test_logistic_custom_learner.jsonlines')
    NDJWriter(train_path, train_fs).write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test',
                     'test_logistic_custom_learner.jsonlines')
    NDJWriter(test_path, test_fs).write()

    # fill in the configuration template and run the experiment
    config_template_path = join(_my_dir, 'configs',
                                'test_logistic_custom_learner.template.cfg')
    run_configuration(fill_in_config_paths(config_template_path), quiet=True)

    # compare the predictions of the two learners
    outprefix = 'test_logistic_custom_learner'
    custom_name = '{}_{}_CustomLogisticRegressionWrapper.predictions'.format(
        outprefix, outprefix)
    builtin_name = '{}_{}_LogisticRegression.predictions'.format(
        outprefix, outprefix)
    preds = read_predictions(join(_my_dir, 'output', custom_name))
    expected = read_predictions(join(_my_dir, 'output', builtin_name))
    assert_array_equal(preds, expected)
def test_majority_class_custom_learner():
    """
    Run the majority class custom learner configuration and check that
    every prediction it wrote equals the last label, ``num_labels - 1``.
    """
    num_labels = 10

    # This will make data where the last class happens about 50% of the time.
    rare_weight = 0.5 / (num_labels - 1)
    class_weights = [rare_weight for _ in range(num_labels - 1)] + [0.5]

    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train',
                      'test_majority_class_custom_learner.jsonlines')
    NDJWriter(train_path, train_fs).write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test',
                     'test_majority_class_custom_learner.jsonlines')
    NDJWriter(test_path, test_fs).write()

    # fill in the configuration template and run the experiment
    config_template_path = join(
        _my_dir, 'configs', 'test_majority_class_custom_learner.template.cfg')
    run_configuration(fill_in_config_paths(config_template_path), quiet=True)

    outprefix = 'test_majority_class_custom_learner'
    pred_name = '{}_{}_MajorityClassLearner_predictions.tsv'.format(
        outprefix, outprefix)
    preds = read_predictions(join(_my_dir, 'output', pred_name))

    # one copy of the last label per prediction that was read
    majority_label = float(num_labels - 1)
    expected = np.array([majority_label for _ in preds])
    assert_array_equal(preds, expected)
def test_logistic_custom_learner():
    """
    Run the logistic custom learner configuration and check that the
    ``_predictions.tsv`` file written for
    ``CustomLogisticRegressionWrapper`` holds the same predictions as the
    one written for ``LogisticRegression``.
    """
    num_labels = 10

    # the last label takes half of the weight; the remaining labels
    # split the other half evenly
    minority_weight = 0.5 / (num_labels - 1)
    class_weights = [minority_weight for _ in range(num_labels - 1)] + [0.5]

    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # Write training feature set to a file
    NDJWriter(join(_my_dir, 'train',
                   'test_logistic_custom_learner.jsonlines'),
              train_fs).write()

    # Write test feature set to a file
    NDJWriter(join(_my_dir, 'test',
                   'test_logistic_custom_learner.jsonlines'),
              test_fs).write()

    # fill in the configuration template and run the experiment
    template_path = join(_my_dir, 'configs',
                         'test_logistic_custom_learner.template.cfg')
    config_path = fill_in_config_paths(template_path)
    run_configuration(config_path, quiet=True)

    # compare the predictions of the two learners
    outprefix = 'test_logistic_custom_learner'
    output_dir_parts = (_my_dir, 'output')
    custom_file = join(
        *output_dir_parts,
        '{}_{}_CustomLogisticRegressionWrapper_predictions.tsv'.format(
            outprefix, outprefix))
    builtin_file = join(
        *output_dir_parts,
        '{}_{}_LogisticRegression_predictions.tsv'.format(
            outprefix, outprefix))
    preds = read_predictions(custom_file)
    expected = read_predictions(builtin_file)
    assert_array_equal(preds, expected)
def test_majority_class_custom_learner():
    """
    Run the majority class custom learner configuration and verify that
    all of the predictions it wrote are the last label, i.e.
    ``num_labels - 1``.
    """
    num_labels = 10

    # This will make data where the last class happens about 50% of the time.
    minority_weight = 0.5 / (num_labels - 1)
    class_weights = [minority_weight for _ in range(num_labels - 1)] + [0.5]

    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # Write training feature set to a file
    NDJWriter(join(_my_dir, 'train',
                   'test_majority_class_custom_learner.jsonlines'),
              train_fs).write()

    # Write test feature set to a file
    NDJWriter(join(_my_dir, 'test',
                   'test_majority_class_custom_learner.jsonlines'),
              test_fs).write()

    # fill in the configuration template and run the experiment
    template_path = join(_my_dir, 'configs',
                         'test_majority_class_custom_learner.template.cfg')
    config_path = fill_in_config_paths(template_path)
    run_configuration(config_path, quiet=True)

    outprefix = 'test_majority_class_custom_learner'
    pred_file = join(_my_dir, 'output',
                     '{}_{}_MajorityClassLearner_predictions.tsv'.format(
                         outprefix, outprefix))
    preds = read_predictions(pred_file)

    # build an array with the last label repeated once per prediction
    last_label = float(num_labels - 1)
    expected = np.array([last_label for _ in preds])
    assert_array_equal(preds, expected)
def test_fancy_output():
    """
    Test the descriptive statistics output in the results file for a
    regressor.

    The statistics returned by ``Learner.evaluate()`` are compared, to
    four decimal places, with the ones parsed out of the results file
    that is written when the equivalent configuration is run.
    """
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                num_features=3)

    # train a regression model using the train feature set
    learner = Learner('LinearRegression')
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # evaluate the trained model using the test feature set
    resultdict = learner.evaluate(test_fs)
    actual_stats_from_api = dict(resultdict[2]['descriptive']['actual'])
    pred_stats_from_api = dict(resultdict[2]['descriptive']['predicted'])

    # write out the training and test feature set
    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    train_writer = NDJWriter(join(train_dir, 'fancy_train.jsonlines'),
                             train_fs)
    train_writer.write()
    test_writer = NDJWriter(join(test_dir, 'fancy_test.jsonlines'), test_fs)
    test_writer.write()

    # now get the config file template, fill it in and run it
    # so that we can get a results file
    config_template_path = join(_my_dir, 'configs',
                                'test_regression_fancy_output.template.cfg')
    config_path = fill_in_config_paths_for_fancy_output(config_template_path)

    run_configuration(config_path, quiet=True)

    # read in the results file and get the descriptive statistics
    actual_stats_from_file = {}
    pred_stats_from_file = {}
    results_path = join(output_dir,
                        ('regression_fancy_output_train_fancy_train.'
                         'jsonlines_test_fancy_test.jsonlines'
                         '_LinearRegression.results'))
    with open(results_path, 'r') as resultf:
        result_output = resultf.read().strip().split('\n')

    # the decimal point is escaped so that it only matches a literal '.'
    # (an unescaped '.' would match any character); the pattern is
    # compiled once rather than on every loop iteration
    stat_pattern = re.compile(r'([A-Za-z]+)\s+=\s+(-?[0-9]+\.?[0-9]*)\s+'
                              r'\((actual)\),\s+(-?[0-9]+\.?[0-9]*)\s+'
                              r'\((predicted)\)')

    # the slice below hard-codes where the descriptive statistics are
    # expected to sit in the results file
    for desc_stat_line in result_output[26:30]:
        desc_stat_line = desc_stat_line.strip()
        if not desc_stat_line:
            continue
        m = stat_pattern.search(desc_stat_line)
        stat_type, actual_value, _, pred_value, _ = m.groups()
        actual_stats_from_file[stat_type.lower()] = float(actual_value)
        pred_stats_from_file[stat_type.lower()] = float(pred_value)

    # every statistic reported by the API must agree with the file
    for stat_type in actual_stats_from_api:
        assert_almost_equal(actual_stats_from_file[stat_type],
                            actual_stats_from_api[stat_type],
                            places=4)

        assert_almost_equal(pred_stats_from_file[stat_type],
                            pred_stats_from_api[stat_type],
                            places=4)
def test_fancy_output():
    """
    Test the descriptive statistics output in the results file for a
    regressor.

    The statistics returned by ``Learner.evaluate()`` are compared, to
    four decimal places, with the ones parsed out of the results file
    that is written when the equivalent configuration is run.
    """
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                num_features=3)

    # train a regression model using the train feature set
    learner = Learner('LinearRegression')
    learner.train(train_fs, grid_objective='pearson')

    # evaluate the trained model using the test feature set
    resultdict = learner.evaluate(test_fs)
    actual_stats_from_api = dict(resultdict[2]['descriptive']['actual'])
    pred_stats_from_api = dict(resultdict[2]['descriptive']['predicted'])

    # write out the training and test feature set
    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    train_writer = NDJWriter(join(train_dir, 'fancy_train.jsonlines'),
                             train_fs)
    train_writer.write()
    test_writer = NDJWriter(join(test_dir, 'fancy_test.jsonlines'), test_fs)
    test_writer.write()

    # now get the config file template, fill it in and run it
    # so that we can get a results file
    config_template_path = join(_my_dir, 'configs',
                                'test_regression_fancy_output.template.cfg')
    config_path = fill_in_config_paths_for_fancy_output(config_template_path)

    run_configuration(config_path, quiet=True)

    # read in the results file and get the descriptive statistics
    actual_stats_from_file = {}
    pred_stats_from_file = {}
    results_path = join(output_dir,
                        ('regression_fancy_output_train_fancy_train.'
                         'jsonlines_test_fancy_test.jsonlines'
                         '_LinearRegression.results'))
    with open(results_path, 'r') as resultf:
        result_output = resultf.read().strip().split('\n')

    # the decimal point is escaped so that it only matches a literal '.'
    # (an unescaped '.' would match any character); the pattern is
    # compiled once rather than on every loop iteration
    stat_pattern = re.compile(r'([A-Za-z]+)\s+=\s+(-?[0-9]+\.?[0-9]*)\s+'
                              r'\((actual)\),\s+(-?[0-9]+\.?[0-9]*)\s+'
                              r'\((predicted)\)')

    # the slice below hard-codes where the descriptive statistics are
    # expected to sit in the results file
    for desc_stat_line in result_output[27:31]:
        desc_stat_line = desc_stat_line.strip()
        if not desc_stat_line:
            continue
        m = stat_pattern.search(desc_stat_line)
        stat_type, actual_value, _, pred_value, _ = m.groups()
        actual_stats_from_file[stat_type.lower()] = float(actual_value)
        pred_stats_from_file[stat_type.lower()] = float(pred_value)

    # every statistic reported by the API must agree with the file
    for stat_type in actual_stats_from_api:
        assert_almost_equal(actual_stats_from_file[stat_type],
                            actual_stats_from_api[stat_type],
                            places=4)

        assert_almost_equal(pred_stats_from_file[stat_type],
                            pred_stats_from_api[stat_type],
                            places=4)