def featureset_creation_from_dataframe_helper(with_labels, use_feature_hasher):
    """
    Helper function for the two unit tests for FeatureSet.from_data_frame().
    Since labels are optional, run two tests, one with, one without.

    Parameters
    ----------
    with_labels : bool
        Whether to attach a label column to the DataFrame/FeatureSet.
    use_feature_hasher : bool
        Whether to use a ``FeatureHasher`` vectorizer instead of the default.

    Returns
    -------
    tuple
        ``(expected, current)`` — the directly-constructed FeatureSet and the
        one created via ``FeatureSet.from_data_frame()``.
    """
    import pandas

    # First, setup the test data.
    # get a 100 instances with 4 features each
    X, y = make_classification(n_samples=100,
                               n_features=4,
                               n_informative=4,
                               n_redundant=0,
                               n_classes=3,
                               random_state=1234567890)

    # Not using 0 - 100 here because that would be pandas' default index names anyway.
    # So let's make sure pandas is using the ids we supply.
    ids = list(range(100, 200))

    featureset_name = 'test'

    # if use_feature_hashing, run these tests with a vectorizer
    feature_bins = 4
    vectorizer = (FeatureHasher(n_features=feature_bins)
                  if use_feature_hasher else None)

    # convert the features into a list of dictionaries
    feature_names = ['f{}'.format(n) for n in range(1, 5)]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))

    # Now, create a FeatureSet object.
    if with_labels:
        expected = FeatureSet(featureset_name, ids, features=features,
                              labels=y, vectorizer=vectorizer)
    else:
        expected = FeatureSet(featureset_name, ids, features=features,
                              vectorizer=vectorizer)

    # Also create a DataFrame and then create a FeatureSet from it.
    df = pandas.DataFrame(features, index=ids)
    if with_labels:
        df['y'] = y
        current = FeatureSet.from_data_frame(df, featureset_name,
                                             labels_column='y',
                                             vectorizer=vectorizer)
    else:
        current = FeatureSet.from_data_frame(df, featureset_name,
                                             vectorizer=vectorizer)

    return (expected, current)
def make_scaling_data(use_feature_hashing=False):
    """
    Build train/test FeatureSets for the feature-scaling tests.

    Each of the five features is multiplied by a different power of ten so
    that the columns have wildly different scales.  The first 800 of the
    1000 generated examples go into the training set, the rest into the
    test set.

    Parameters
    ----------
    use_feature_hashing : bool, optional
        If True, attach a 4-bin ``FeatureHasher`` vectorizer to both sets.

    Returns
    -------
    tuple
        ``(train_fs, test_fs)``.
    """
    X, y = make_classification(n_samples=1000,
                               n_classes=2,
                               n_features=5,
                               n_informative=5,
                               n_redundant=0,
                               random_state=1234567890)

    # scale each feature column by a different power of ten
    X = X * np.array([1, 10, 100, 1000, 10000])

    # SKLL FeatureSets need explicit example IDs
    ids = ['EXAMPLE_{}'.format(idx) for idx in range(1, 1001)]

    # one {feature name: value} dictionary per row
    names = ['f{}'.format(idx) for idx in range(1, 6)]
    features = [dict(zip(names, row)) for row in X]

    # first 800 examples train, remaining 200 test
    split = 800
    vectorizer = FeatureHasher(n_features=4) if use_feature_hashing else None
    train_fs = FeatureSet('train_scaling',
                          ids[:split],
                          features=features[:split],
                          labels=y[:split],
                          vectorizer=vectorizer)
    test_fs = FeatureSet('test_scaling',
                         ids[split:],
                         features=features[split:],
                         labels=y[split:],
                         vectorizer=vectorizer)
    return (train_fs, test_fs)
def make_learning_curve_data():
    """
    Create and write out the two jsonlines featuresets used by the
    learning-curve tests: one with all of the digits features and one
    that drops the last feature.
    """
    # Load in the digits data set
    digits = load_digits()
    X, y = digits.data, digits.target

    # create featureset with all features
    feature_names = ['f{:02}'.format(n) for n in range(X.shape[1])]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))
    fs1 = FeatureSet('train1', features=features, labels=y,
                     ids=list(range(X.shape[0])))

    # Write this feature set to file
    train_path = join(_my_dir, 'train', 'test_learning_curve1.jsonlines')
    writer = NDJWriter(train_path, fs1)
    writer.write()

    # create featureset with all except the last feature
    # (zip() stops at the shorter sequence, so the last column is dropped)
    feature_names = ['f{:02}'.format(n) for n in range(X.shape[1])]
    features = []
    for row in X:
        features.append(dict(zip(feature_names[:-1], row)))
    fs2 = FeatureSet('train2', features=features, labels=y,
                     ids=list(range(X.shape[0])))

    # Write this feature set to file
    train_path = join(_my_dir, 'train', 'test_learning_curve2.jsonlines')
    writer = NDJWriter(train_path, fs2)
    writer.write()
def create_jsonlines_feature_files(path):
    """
    Create the jsonlines feature files used by the ablation tests.

    Six files are written under ``path``: ``f0``-``f4`` each hold a single
    feature per example, and ``f5`` repeats the last subset with two extra
    empty-feature instances.  Nothing is done if all six already exist.
    """
    # we only need to create the feature files if they
    # don't already exist under the given path
    feature_files_to_create = [
        join(path, 'f{}.jsonlines'.format(i)) for i in range(6)
    ]

    if all([exists(ff) for ff in feature_files_to_create]):
        return
    else:
        num_examples = 1000
        np.random.seed(1234567890)

        # Create lists we will write files from
        ids = []
        features = []
        labels = []
        for j in range(num_examples):
            # alternate between the two classes
            y = "dog" if j % 2 == 0 else "cat"
            ex_id = "{}{}".format(y, j)
            # five random integer-valued features per example
            x = {
                "f{}".format(feat_num): np.random.randint(0, 4)
                for feat_num in range(5)
            }
            x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
            ids.append(ex_id)
            labels.append(y)
            features.append(x)

        # write one file per feature, each containing only that feature
        for i in range(5):
            file_path = join(path, 'f{}.jsonlines'.format(i))
            sub_features = []
            for example_num in range(num_examples):
                feat_num = i
                x = {
                    "f{}".format(feat_num):
                    features[example_num]["f{}".format(feat_num)]
                }
                sub_features.append(x)
            fs = FeatureSet('ablation_cv', ids,
                            features=sub_features, labels=labels)
            writer = NDJWriter(file_path, fs)
            writer.write()

        # now write out the last file which is basically
        # identical to the last featureset we wrote
        # except that it has two extra instances
        fs = FeatureSet(
            'extra',
            ids + ['cat{}'.format(num_examples),
                   'dog{}'.format(num_examples + 1)],
            features=sub_features + [{}, {}],
            labels=labels + ['cat', 'dog'])
        file_path = join(path, 'f5.jsonlines')
        writer = NDJWriter(file_path, fs)
        writer.write()
def make_sparse_data(use_feature_hashing=False): """ Function to create sparse data with two features always zero in the training set and a different one always zero in the test set """ # Create training data X, y = make_classification(n_samples=500, n_features=3, n_informative=3, n_redundant=0, n_classes=2, random_state=1234567890) # we need features to be non-negative since we will be # using naive bayes laster X = np.abs(X) # make sure that none of the features are zero X[np.where(X == 0)] += 1 # since we want to use SKLL's FeatureSet class, we need to # create a list of IDs ids = ['EXAMPLE_{}'.format(n) for n in range(1, 501)] # create a list of dictionaries as the features # with f1 and f5 always 0 feature_names = ['f{}'.format(n) for n in range(1, 6)] features = [] for row in X: row = [0] + row.tolist() + [0] features.append(dict(zip(feature_names, row))) # use a FeatureHasher if we are asked to do feature hashing vectorizer = FeatureHasher(n_features=4) if use_feature_hashing else None train_fs = FeatureSet('train_sparse', ids, features=features, labels=y, vectorizer=vectorizer) # now create the test set with f4 always 0 but nothing else X, y = make_classification(n_samples=100, n_features=4, n_informative=4, n_redundant=0, n_classes=2, random_state=1234567890) X = np.abs(X) X[np.where(X == 0)] += 1 ids = ['EXAMPLE_{}'.format(n) for n in range(1, 101)] # create a list of dictionaries as the features # with f4 always 0 feature_names = ['f{}'.format(n) for n in range(1, 6)] features = [] for row in X: row = row.tolist() row = row[:3] + [0] + row[3:] features.append(dict(zip(feature_names, row))) test_fs = FeatureSet('test_sparse', ids, features=features, labels=y, vectorizer=vectorizer) return train_fs, test_fs
def make_regression_data(num_examples=100,
                         train_test_ratio=0.5,
                         num_features=2,
                         sd_noise=1.0,
                         use_feature_hashing=False,
                         feature_bins=4,
                         start_feature_num=1,
                         random_state=1234567890):
    """
    Create train/test regression FeatureSets via sklearn's
    ``make_regression()``.

    Returns
    -------
    tuple
        ``(train_fs, test_fs, weightdict)`` where ``weightdict`` maps
        feature names to the true regression coefficients.
    """
    # use sklearn's make_regression to generate the data for us
    X, y, weights = make_regression(n_samples=num_examples,
                                    n_features=num_features,
                                    noise=sd_noise,
                                    random_state=random_state,
                                    coef=True)

    # since we want to use SKLL's FeatureSet class, we need to
    # create a list of IDs
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, num_examples + 1)]

    # create a list of dictionaries as the features
    feature_names = [
        'f{:02d}'.format(n)
        for n in range(start_feature_num,
                       start_feature_num + num_features)
    ]
    features = [dict(zip(feature_names, row)) for row in X]

    # convert the weights array into a dictionary for convenience
    weightdict = dict(zip(feature_names, weights))

    # split everything into training and testing portions
    num_train_examples = int(round(train_test_ratio * num_examples))
    train_features, test_features = (features[:num_train_examples],
                                     features[num_train_examples:])
    train_y, test_y = y[:num_train_examples], y[num_train_examples:]
    train_ids, test_ids = ids[:num_train_examples], ids[num_train_examples:]

    # create a FeatureHasher if we are asked to use feature hashing
    # with the specified number of feature bins
    vectorizer = (FeatureHasher(n_features=feature_bins)
                  if use_feature_hashing else None)
    train_fs = FeatureSet('regression_train',
                          train_ids,
                          labels=train_y,
                          features=train_features,
                          vectorizer=vectorizer)
    test_fs = FeatureSet('regression_test',
                         test_ids,
                         labels=test_y,
                         features=test_features,
                         vectorizer=vectorizer)

    return (train_fs, test_fs, weightdict)
def test_featureset_creation_from_dataframe_with_string_labels():
    """
    Test that a FeatureSet with string labels can be written to an NDJ
    file and read back as an equal FeatureSet.
    """
    dftest = pd.DataFrame({
        "id": [1, 2],
        "score": ['yes', 'no'],
        "text": ["a b", "b c"]
    })
    dftest.set_index("id", inplace=True)
    test_feat_dict_list = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}]
    test_dict_vectorizer = DictVectorizer()
    Xtest = test_dict_vectorizer.fit_transform(test_feat_dict_list)
    fs_test = FeatureSet('test',
                         ids=dftest.index.values,
                         labels=dftest['score'].values,
                         features=Xtest,
                         vectorizer=test_dict_vectorizer)
    output_path = join(_my_dir, "other", "test_string_labels_df.jsonlines")
    test_writer = NDJWriter(output_path, fs_test)
    test_writer.write()

    # read in the written file into a featureset and confirm that the
    # two featuresets are equal
    fs_test2 = NDJReader.for_path(output_path, ids_to_floats=True).read()

    assert fs_test == fs_test2
def test_mismatch_labels_features(): """ Test to catch mistmatch between the shape of the labels vector and the feature matrix """ # get a 100 instances with 4 features but ignore the labels we # get from here X, y = make_classification(n_samples=100, n_features=4, n_informative=4, n_redundant=0, n_classes=3, random_state=1234567890) # double-stack y to ensure we don't match the number of feature rows y2 = np.hstack([y, y]) # convert the features into a list of dictionaries feature_names = ['f{}'.format(n) for n in range(1, 5)] features = [] for row in X: features.append(dict(zip(feature_names, row))) # get 100 ids ids = ['EXAMPLE_{}'.format(i) for i in range(100)] # This should raise a ValueError FeatureSet('test', ids, features=features, labels=y2)
def check_dummy_classifier_predict(model_args, train_labels, expected_output):
    """
    Train a DummyClassifier with the given keyword arguments and training
    labels, then check that its predictions on a fixed 10-example test set
    equal ``expected_output``.
    """
    # create hard-coded featuresets based with known labels
    train_fs = FeatureSet('classification_train',
                          ['TrainExample{}'.format(i) for i in range(20)],
                          labels=train_labels,
                          features=[{"feature": i} for i in range(20)])
    test_fs = FeatureSet('classification_test',
                         ['TestExample{}'.format(i) for i in range(10)],
                         features=[{"feature": i} for i in range(20, 30)])

    # Ensure predictions are as expected for the given strategy
    learner = Learner('DummyClassifier', model_kwargs=model_args)
    learner.train(train_fs, grid_search=False)
    predictions = learner.predict(test_fs)
    eq_(np.array_equal(expected_output, predictions), True)
def make_cv_folds_data(num_examples_per_fold=100,
                       num_folds=3,
                       use_feature_hashing=False):
    """
    Create data for pre-specified CV folds tests
    with or without feature hashing
    """
    num_total_examples = num_examples_per_fold * num_folds

    # create the numeric features and the binary labels
    # (labels are a deterministic alternating 0/1 sequence;
    # the labels from make_classification are discarded)
    X, _ = make_classification(n_samples=num_total_examples,
                               n_features=3,
                               n_informative=3,
                               n_redundant=0,
                               n_classes=2,
                               random_state=1234567890)
    y = np.array([0, 1] * int(num_total_examples / 2))

    # the folds mapping: the first num_examples_per_fold examples
    # are in fold 1 the second num_examples_per_fold are in
    # fold 2 and so on
    foldgen = ([str(i)] * num_examples_per_fold for i in range(num_folds))
    folds = list(itertools.chain(*foldgen))

    # now create the list of feature dictionaries
    # and add the binary features that depend on
    # the class and fold number
    feature_names = ['f{}'.format(i) for i in range(1, 4)]
    features = []
    for row, classid, foldnum in zip(X, y, folds):
        string_feature_name = 'is_{}_{}'.format(classid, foldnum)
        string_feature_value = 1
        feat_dict = dict(zip(feature_names, row))
        feat_dict.update({string_feature_name: string_feature_value})
        features.append(feat_dict)

    # create the example IDs
    ids = [
        'EXAMPLE_{}'.format(num_examples_per_fold * k + i)
        for k in range(num_folds) for i in range(num_examples_per_fold)
    ]

    # create the cross-validation feature set with or without feature hashing
    vectorizer = FeatureHasher(n_features=4) if use_feature_hashing else None
    cv_fs = FeatureSet('cv_folds',
                       ids,
                       features=features,
                       labels=y,
                       vectorizer=vectorizer)

    # make the custom cv folds dictionary
    custom_cv_folds = dict(zip(ids, folds))

    return (cv_fs, custom_cv_folds)
def test_feature_merging_order_invariance(): """ Test whether featuresets with different orders of IDs can be merged """ # First, randomly generate two feature sets and then make sure they have # the same labels. train_fs1, _, _ = make_regression_data() train_fs2, _, _ = make_regression_data(start_feature_num=3, random_state=87654321) train_fs2.labels = train_fs1.labels.copy() # make a reversed copy of feature set 2 shuffled_indices = list(range(len(train_fs2.ids))) np.random.seed(123456789) np.random.shuffle(shuffled_indices) train_fs2_ids_shuf = train_fs2.ids[shuffled_indices] train_fs2_labels_shuf = train_fs2.labels[shuffled_indices] train_fs2_features_shuf = train_fs2.features[shuffled_indices] train_fs2_shuf = FeatureSet("f2_shuf", train_fs2_ids_shuf, labels=train_fs2_labels_shuf, features=train_fs2_features_shuf, vectorizer=train_fs2.vectorizer) # merge feature set 1 with feature set 2 and its reversed version merged_fs = train_fs1 + train_fs2 merged_fs_shuf = train_fs1 + train_fs2_shuf # check that the two merged versions are the same feature_names = (train_fs1.vectorizer.get_feature_names() + train_fs2.vectorizer.get_feature_names()) assert_array_equal(merged_fs.vectorizer.get_feature_names(), feature_names) assert_array_equal(merged_fs_shuf.vectorizer.get_feature_names(), feature_names) assert_array_equal(merged_fs.labels, train_fs1.labels) assert_array_equal(merged_fs.labels, train_fs2.labels) assert_array_equal(merged_fs.labels, merged_fs_shuf.labels) assert_array_equal(merged_fs.ids, train_fs1.ids) assert_array_equal(merged_fs.ids, train_fs2.ids) assert_array_equal(merged_fs.ids, merged_fs_shuf.ids) assert_array_equal(merged_fs.features[:, 0:2].todense(), train_fs1.features.todense()) assert_array_equal(merged_fs.features[:, 2:4].todense(), train_fs2.features.todense()) assert_array_equal(merged_fs.features.todense(), merged_fs_shuf.features.todense()) assert not np.all( merged_fs.features[:, 0:2].todense() == merged_fs.features[:, 2:4].todense())
def make_class_map_data():
    """
    Create and write jsonlines train/test files for the class-map tests,
    with partially disjoint feature sets between train and test.
    """
    # Create training file
    train_path = join(_my_dir, 'train', 'test_class_map.jsonlines')
    ids = []
    labels = []
    features = []
    # 'cat' appears twice, so it occurs twice as often as each dog breed
    class_names = ['beagle', 'cat', 'dachsund', 'cat']
    for i in range(1, 101):
        y = class_names[i % 4]
        ex_id = "{}{}".format(y, i)
        # note that f1 and f5 are missing in all instances but f4 is not
        x = {"f2": i + 1, "f3": i + 2, "f4": i + 5}
        ids.append(ex_id)
        labels.append(y)
        features.append(x)
    train_fs = FeatureSet('train_class_map',
                          ids,
                          features=features,
                          labels=labels)
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Create test file
    test_path = join(_my_dir, 'test', 'test_class_map.jsonlines')
    ids = []
    labels = []
    features = []
    for i in range(1, 51):
        y = class_names[i % 4]
        ex_id = "{}{}".format(y, i)
        # f1 and f5 are not missing in any instances here but f4 is
        x = {"f1": i, "f2": i + 2, "f3": i % 10, "f5": i * 2}
        ids.append(ex_id)
        labels.append(y)
        features.append(x)
    test_fs = FeatureSet('test_class_map',
                         ids,
                         features=features,
                         labels=labels)
    writer = NDJWriter(test_path, test_fs)
    writer.write()
def read(self):
    """
    Read examples from a list of dictionaries.

    Returns
    -------
    feature_set : skll.FeatureSet
        FeatureSet representing the list of dictionaries we read in.

    Raises
    ------
    ValueError
        If ``ids_to_floats`` is True but an ID cannot be converted.
    """
    ids = []
    labels = []
    feat_dicts = []
    for example_num, example in enumerate(self.path_or_list):
        curr_id = str(example.get("id", "EXAMPLE_{}".format(example_num)))
        # FIX: the ID-to-float conversion used to be performed twice per
        # example (a second, redundant try/except followed below); the
        # duplicate has been removed — one conversion is sufficient.
        if self.ids_to_floats:
            try:
                curr_id = float(curr_id)
            except ValueError:
                raise ValueError(('You set ids_to_floats to true,' +
                                  ' but ID {} could not be ' +
                                  'converted to float in ' +
                                  '{}').format(curr_id, example))
        # the label is optional; map it through the class map if present
        class_name = (safe_float(example['y'], replace_dict=self.class_map)
                      if 'y' in example else None)
        example = example['x']

        # Update lists of IDs, labels, and feature dictionaries
        ids.append(curr_id)
        labels.append(class_name)
        feat_dicts.append(example)

        # Print out status
        if example_num % 100 == 0:
            self._print_progress(example_num)

    # Convert lists to numpy arrays
    ids = np.array(ids)
    labels = np.array(labels)
    features = self.vectorizer.fit_transform(feat_dicts)

    return FeatureSet('converted',
                      ids,
                      labels=labels,
                      features=features,
                      vectorizer=self.vectorizer)
def test_dummy_classifier_predict():
    """
    Check DummyClassifier predictions on a hard-coded dataset for the
    'stratified', 'most_frequent', and 'constant' strategies.
    """
    # hard-code dataset
    train_fs = FeatureSet('classification_train',
                          ['TrainExample{}'.format(i) for i in range(20)],
                          labels=([0] * 14) + ([1] * 6),
                          features=[{
                              "feature": i
                          } for i in range(20)])
    test_fs = FeatureSet('classification_test',
                         ['TestExample{}'.format(i) for i in range(10)],
                         features=[{
                             "feature": i
                         } for i in range(20, 30)])

    # (model kwargs, expected prediction vector) pairs, one per strategy
    toy_data = ([{
        "strategy": "stratified",
        "random_state": 12345
    }, np.array([1, 0, 0, 0, 0, 0, 1, 0, 1, 0])], [{
        "strategy": "most_frequent"
    }, np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])], [{
        "strategy": "constant",
        "constant": 1
    }, np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])])

    # Ensure predictions are correct for all strategies.
    correct = []
    for model_args, expected_output in toy_data:
        learner = Learner('DummyClassifier', model_kwargs=model_args)
        learner.train(train_fs)
        predictions = learner.predict(test_fs)
        correct.append(np.array_equal(expected_output, predictions))
    eq_(correct, [True, True, True])
def make_float_class_data():
    """
    We want to create data that has labels that look like
    floats to make sure they are preserved correctly

    Returns
    -------
    skll.FeatureSet
        75 examples (25 per float label 1.2/1.5/1.8), where each example
        fires exactly one of 25 binary features.
    """
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, 76)]
    y = [1.2] * 25 + [1.5] * 25 + [1.8] * 25
    X = np.vstack([np.identity(25), np.identity(25), np.identity(25)])
    # FIX: one feature name per column of X. The original generated only
    # five names (range(1, 6)), so zip() silently dropped the remaining
    # 20 columns of each 25-column identity row.
    feature_names = ['f{}'.format(i) for i in range(1, 26)]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))

    return FeatureSet('float-classes', ids, features=features, labels=y)
def test_learning_curve_implementation(): """ Test to ensure that the learning curve results match scikit-learn """ # This test is different from the other tests which just use regression data. # The reason is that we want this test to fail in case our implementation # diverges from the scikit-learn implementation. This test essentially # serves as a regression test as well. # Load in the digits data set digits = load_digits() X, y = digits.data, digits.target # get the learning curve results from scikit-learn for this data cv_folds = 10 random_state = 123456789 cv = ShuffleSplit(n_splits=cv_folds, test_size=0.2, random_state=random_state) estimator = MultinomialNB() train_sizes = np.linspace(.1, 1.0, 5) train_sizes1, train_scores1, test_scores1 = learning_curve(estimator, X, y, cv=cv, train_sizes=train_sizes, scoring='accuracy') # get the features from this data into a FeatureSet instance we can use # with the SKLL API feature_names = ['f{:02}'.format(n) for n in range(X.shape[1])] features = [] for row in X: features.append(dict(zip(feature_names, row))) fs = FeatureSet('train', features=features, labels=y, ids=list(range(X.shape[0]))) # we don't want to filter out any features since scikit-learn # does not do that either learner = Learner('MultinomialNB', min_feature_count=0) (train_scores2, test_scores2, train_sizes2) = learner.learning_curve(fs, cv_folds=cv_folds, train_sizes=train_sizes, metric='accuracy') assert np.all(train_sizes1 == train_sizes2) assert np.allclose(train_scores1, train_scores2) assert np.allclose(test_scores1, test_scores2)
def make_rare_class_data():
    """
    Build a 15-example FeatureSet for rare-class tests: five instances per
    class for three labels (0, 1, 2), where each instance within a group of
    five fires exactly one of the five binary features.
    """
    num_labels = 3
    group_size = 5

    # IDs EXAMPLE_1 .. EXAMPLE_15
    ids = ['EXAMPLE_{}'.format(i)
           for i in range(1, num_labels * group_size + 1)]

    # labels [0]*5 + [1]*5 + [2]*5
    y = [label for label in range(num_labels) for _ in range(group_size)]

    # three stacked 5x5 identity matrices: one feature fires per row
    X = np.vstack([np.identity(group_size)] * num_labels)

    names = ['f{}'.format(i) for i in range(1, group_size + 1)]
    features = [dict(zip(names, row)) for row in X]

    return FeatureSet('rare-class', ids, features=features, labels=y)
def read(self):
    """
    Loads examples in the `.arff`, `.csv`, `.jsonlines`, `.libsvm`,
    `.ndj`, or `.tsv` formats.

    Returns
    -------
    feature_set : skll.FeatureSet
        ``FeatureSet`` instance representing the input file.

    Raises
    ------
    ValueError
        If ``ids_to_floats`` is True, but IDs cannot be converted.
    ValueError
        If no features are found.
    ValueError
        If the example IDs are not unique.
    """
    self.logger.debug('Path: %s', self.path_or_list)

    if not self.quiet:
        self._progress_msg = "Loading {}...".format(self.path_or_list)
        print(self._progress_msg, end="\r", file=sys.stderr)
        sys.stderr.flush()

    # delegate parsing to the pandas-based or the row-based reader
    if self._use_pandas:
        ids, labels, features = self._sub_read(self.path_or_list)
    else:
        ids, labels, features = self._sub_read_rows(self.path_or_list)

    # Convert everything to numpy arrays
    features = self.vectorizer.fit_transform(features)

    # Report that loading is complete
    self._print_progress("done", end="\n")

    # Make sure we have the same number of ids, labels, and features
    # NOTE(review): `assert` is stripped under `python -O`; consider an
    # explicit raise if this invariant must always be enforced
    assert ids.shape[0] == labels.shape[0] == features.shape[0]

    if ids.shape[0] != len(set(ids)):
        raise ValueError('The example IDs are not unique in %s.' %
                         self.path_or_list)

    return FeatureSet(self.path_or_list,
                      ids,
                      labels=labels,
                      features=features,
                      vectorizer=self.vectorizer)
def create_jsonlines_feature_files(path):
    """
    Create the five per-feature jsonlines files (``f0``-``f4``) used by
    the ablation tests under ``path``, unless they all already exist.
    """
    # we only need to create the feature files if they
    # don't already exist under the given path
    feature_files_to_create = [
        join(path, 'f{}.jsonlines'.format(i)) for i in range(5)
    ]

    if all([exists(ff) for ff in feature_files_to_create]):
        return
    else:
        num_examples = 1000
        np.random.seed(1234567890)

        # Create lists we will write files from
        ids = []
        features = []
        labels = []
        for j in range(num_examples):
            # alternate between the two classes
            y = "dog" if j % 2 == 0 else "cat"
            ex_id = "{}{}".format(y, j)
            # five random integer-valued features per example
            x = {
                "f{}".format(feat_num): np.random.randint(0, 4)
                for feat_num in range(5)
            }
            x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
            ids.append(ex_id)
            labels.append(y)
            features.append(x)

        # write one file per feature, each containing only that feature
        for i in range(5):
            file_path = join(path, 'f{}.jsonlines'.format(i))
            sub_features = []
            for example_num in range(num_examples):
                feat_num = i
                x = {
                    "f{}".format(feat_num):
                    features[example_num]["f{}".format(feat_num)]
                }
                sub_features.append(x)
            fs = FeatureSet('ablation_cv', ids,
                            features=sub_features, labels=labels)
            writer = NDJWriter(file_path, fs)
            writer.write()
def test_writing_ndj_featureset_with_string_ids():
    """
    Test that a FeatureSet with string IDs can be written to an NDJ file
    and read back as an equal FeatureSet.
    """
    test_dict_vectorizer = DictVectorizer()
    test_feat_dict_list = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}]
    Xtest = test_dict_vectorizer.fit_transform(test_feat_dict_list)
    fs_test = FeatureSet('test',
                         ids=['1', '2'],
                         labels=[1, 2],
                         features=Xtest,
                         vectorizer=test_dict_vectorizer)
    output_path = join(_my_dir, "other", "test_string_ids.jsonlines")
    test_writer = NDJWriter(output_path, fs_test)
    test_writer.write()

    # read in the written file into a featureset and confirm that the
    # two featuresets are equal
    fs_test2 = NDJReader.for_path(output_path).read()

    assert fs_test == fs_test2
def test_empty_ids():
    """
    Test to ensure that an error is raised if ids is None
    """
    # generate 100 examples with 4 informative features and 3 classes
    X, y = make_classification(n_samples=100,
                               n_features=4,
                               n_informative=4,
                               n_redundant=0,
                               n_classes=3,
                               random_state=1234567890)

    # turn each row into a {feature name: value} dictionary
    names = ['f{}'.format(idx) for idx in range(1, 5)]
    features = [dict(zip(names, row)) for row in X]

    # constructing a FeatureSet without IDs should raise a ValueError
    FeatureSet('test', None, features=features, labels=y)
def make_ablation_data():
    """
    Create the five per-feature jsonlines training files (``f0``-``f4``)
    used by the ablation tests, removing any stale ablation CV result
    files first.
    """
    # Remove old CV data
    for old_file in glob.glob(join(_my_dir, 'output',
                                   'ablation_cv_*.results')):
        os.remove(old_file)

    num_examples = 1000

    np.random.seed(1234567890)

    # Create lists we will write files from
    ids = []
    features = []
    labels = []
    for j in range(num_examples):
        # alternate between the two classes
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        # five random integer-valued features per example
        x = {
            "f{}".format(feat_num): np.random.randint(0, 4)
            for feat_num in range(5)
        }
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        labels.append(y)
        features.append(x)

    # write one training file per feature
    for i in range(5):
        train_path = join(_my_dir, 'train', 'f{}.jsonlines'.format(i))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i
            x = {
                "f{}".format(feat_num):
                features[example_num]["f{}".format(feat_num)]
            }
            sub_features.append(x)
        train_fs = FeatureSet('ablation_cv', ids,
                              features=sub_features, labels=labels)
        writer = NDJWriter(train_path, train_fs)
        writer.write()
def test_mismatch_ids_features(): """ Test to catch mistmatch between the shape of the ids vector and the feature matrix """ # get a 100 instances with 4 features each X, y = make_classification(n_samples=100, n_features=4, n_informative=4, n_redundant=0, n_classes=3, random_state=1234567890) # convert the features into a list of dictionaries feature_names = ['f{}'.format(n) for n in range(1, 5)] features = [] for row in X: features.append(dict(zip(feature_names, row))) # get 200 ids since we don't want to match the number of feature rows ids = ['EXAMPLE_{}'.format(i) for i in range(200)] # This should raise a ValueError FeatureSet('test', ids, features=features, labels=y)
def make_merging_data(num_feat_files, suffix, numeric_ids):
    """
    Create and write the featureset files used by the merging tests:
    one file per feature subset plus a single merged file containing
    all features.

    Parameters
    ----------
    num_feat_files : int
        Number of feature-subset files to write (17 features each).
    suffix : str
        Filename suffix (determines the output format via Writer.for_path).
    numeric_ids : bool
        If True, use plain integer example IDs instead of strings.
    """
    num_examples = 500
    num_feats_per_file = 17

    np.random.seed(1234567890)

    merge_dir = join(_my_dir, 'train', 'test_merging')
    if not exists(merge_dir):
        os.makedirs(merge_dir)

    # Create lists we will write files from
    ids = []
    features = []
    labels = []
    for j in range(num_examples):
        # alternate between the two classes
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j) if not numeric_ids else j
        # random integer features, enough for all subset files
        x = {
            "f{:03d}".format(feat_num): np.random.randint(0, 4)
            for feat_num in range(num_feat_files * num_feats_per_file)
        }
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        labels.append(y)
        features.append(x)

    # Unmerged: one contiguous block of 17 features per subset
    subset_dict = {}
    for i in range(num_feat_files):
        feat_num = i * num_feats_per_file
        subset_dict['{}'.format(i)] = [
            "f{:03d}".format(feat_num + j)
            for j in range(num_feats_per_file)
        ]
    train_path = join(merge_dir, suffix)
    train_fs = FeatureSet('train', ids, labels=labels, features=features)
    Writer.for_path(train_path, train_fs, subsets=subset_dict).write()

    # Merged: all features in one file
    train_path = join(merge_dir, 'all{}'.format(suffix))
    Writer.for_path(train_path, train_fs).write()
def make_regression_data(num_examples=100,
                         train_test_ratio=0.5,
                         num_features=2,
                         sd_noise=1.0,
                         use_feature_hashing=False,
                         feature_bins=4,
                         start_feature_num=1,
                         random_state=1234567890):
    """
    Create train/test regression FeatureSets via sklearn's
    ``make_regression()``, optionally with feature hashing.

    Returns
    -------
    tuple
        ``(train_fs, test_fs, weightdict)`` where ``weightdict`` maps
        feature names (hashed names when hashing) to coefficients.
    """
    # if we are doing feature hashing and we have asked for more
    # feature bins than number of total features, we need to
    # handle that because `make_regression()` doesn't know
    # about hashing
    if use_feature_hashing and num_features < feature_bins:
        num_features = feature_bins

    # use sklearn's make_regression to generate the data for us
    X, y, weights = make_regression(n_samples=num_examples,
                                    n_features=num_features,
                                    noise=sd_noise,
                                    random_state=random_state,
                                    coef=True)

    # since we want to use SKLL's FeatureSet class, we need to
    # create a list of IDs
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, num_examples + 1)]

    # create a list of dictionaries as the features
    # (feature indices are zero-padded so names sort lexicographically)
    index_width_for_feature_name = int(floor(log10(num_features))) + 1
    feature_names = []
    for n in range(start_feature_num, start_feature_num + num_features):
        index_str = str(n).zfill(index_width_for_feature_name)
        feature_name = 'f{}'.format(index_str)
        feature_names.append(feature_name)
    features = [dict(zip(feature_names, row)) for row in X]

    # At this point the labels are generated using unhashed features
    # even if we want to do feature hashing. `make_regression()` from
    # sklearn doesn't know anything about feature hashing, so we need
    # a hack here to compute the updated labels ourselves
    # using the same command that sklearn uses inside `make_regression()`
    # which is to generate the X and the weights and then compute the
    # y as the dot product of the two. This y will then be used as our
    # labels instead of the original y we got from `make_regression()`.
    # Note that we only want to use the number of weights that are
    # equal to the number of feature bins for the hashing
    if use_feature_hashing:
        feature_hasher = FeatureHasher(n_features=feature_bins)
        hashed_X = feature_hasher.fit_transform(features)
        y = hashed_X.dot(weights[:feature_bins])

    # convert the weights array into a dictionary for convenience
    # if we are using feature hashing, we need to use the names
    # that would be output by `model_params()` instead of the
    # original names since that's what we would get from SKLL
    if use_feature_hashing:
        index_width_for_feature_name = int(floor(log10(feature_bins))) + 1
        hashed_feature_names = []
        for i in range(feature_bins):
            index_str = str(i + 1).zfill(index_width_for_feature_name)
            feature_name = 'hashed_feature_{}'.format(index_str)
            hashed_feature_names.append(feature_name)
        weightdict = dict(zip(hashed_feature_names,
                              weights[:feature_bins]))
    else:
        weightdict = dict(zip(feature_names, weights))

    # split everything into training and testing portions
    num_train_examples = int(round(train_test_ratio * num_examples))
    train_features, test_features = (features[:num_train_examples],
                                     features[num_train_examples:])
    train_y, test_y = y[:num_train_examples], y[num_train_examples:]
    train_ids, test_ids = ids[:num_train_examples], ids[num_train_examples:]

    # create a FeatureHasher if we are asked to use feature hashing
    # with the specified number of feature bins
    vectorizer = (FeatureHasher(n_features=feature_bins)
                  if use_feature_hashing else None)
    train_fs = FeatureSet('regression_train',
                          train_ids,
                          labels=train_y,
                          features=train_features,
                          vectorizer=vectorizer)
    test_fs = FeatureSet('regression_test',
                         test_ids,
                         labels=test_y,
                         features=test_features,
                         vectorizer=vectorizer)

    return (train_fs, test_fs, weightdict)
def make_classification_data(num_examples=100, train_test_ratio=0.5,
                             num_features=10, use_feature_hashing=False,
                             feature_bins=4, num_labels=2,
                             empty_labels=False, string_label_list=None,
                             feature_prefix='f', id_type='string',
                             class_weights=None, non_negative=False,
                             one_string_feature=False, num_string_values=4,
                             random_state=1234567890):
    """
    Create train/test ``FeatureSet`` instances containing synthetic
    classification data generated via
    ``sklearn.datasets.make_classification()``.

    Returns a 2-tuple ``(train_fs, test_fs)``; ``test_fs`` is ``None``
    when ``train_test_ratio`` >= 1.0. When ``empty_labels`` is True the
    featuresets carry no labels; ``string_label_list`` replaces the
    integer classes with the given strings; ``one_string_feature``
    appends one categorical (string-valued) feature.
    """
    # one feature slot is reserved for the optional string-valued feature
    num_numeric_features = (num_features - 1 if one_string_feature
                            else num_features)
    X, y = make_classification(n_samples=num_examples,
                               n_features=num_numeric_features,
                               n_informative=num_numeric_features,
                               n_redundant=0,
                               n_classes=num_labels,
                               weights=class_weights,
                               random_state=random_state)

    # optionally map the integer class labels to the given strings
    if string_label_list:
        assert (len(string_label_list) == num_labels)
        label_to_string = np.vectorize(lambda n: string_label_list[n])
        y = label_to_string(y)

    # if we were told to only generate non-negative features, then
    # we can simply take the absolute values of the generated features
    if non_negative:
        X = abs(X)

    # since we want to use SKLL's FeatureSet class, we need to
    # create a list of IDs; they can be strings, numeric strings,
    # floats, or integers
    if id_type == 'string':
        ids = ['EXAMPLE_{}'.format(n) for n in range(1, num_examples + 1)]
    elif id_type == 'integer_string':
        ids = ['{}'.format(n) for n in range(1, num_examples + 1)]
    elif id_type == 'float':
        ids = [float(n) for n in range(1, num_examples + 1)]
    elif id_type == 'integer':
        ids = list(range(1, num_examples + 1))
    else:
        # fail fast with a clear message instead of an
        # UnboundLocalError further down
        raise ValueError('Unknown value for id_type: {}'.format(id_type))

    # create a string feature whose values are drawn from
    # 'a', 'b', 'c', ... and append it to X as the last column
    if one_string_feature:
        prng = RandomState(random_state)
        # `randint(0, k)` draws the same stream as the deprecated
        # `random_integers(0, k - 1)` that was used previously
        random_indices = prng.randint(0, num_string_values, num_examples)
        possible_values = [chr(x) for x in range(97, 97 + num_string_values)]
        string_feature_values = [possible_values[i] for i in random_indices]
        # bug fix: reshape to `num_examples`, not a hard-coded 100,
        # so this works for any requested number of examples
        string_feature_column = np.array(string_feature_values,
                                         dtype=object).reshape(num_examples, 1)
        X = np.append(X, string_feature_column, 1)

    # create a list of dictionaries as the features
    feature_names = ['{}{:02d}'.format(feature_prefix, n)
                     for n in range(1, num_features + 1)]
    features = [dict(zip(feature_names, row)) for row in X]

    # split everything into training and testing portions
    num_train_examples = int(round(train_test_ratio * num_examples))
    train_features, test_features = (features[:num_train_examples],
                                     features[num_train_examples:])
    train_y, test_y = y[:num_train_examples], y[num_train_examples:]
    train_ids, test_ids = ids[:num_train_examples], ids[num_train_examples:]

    # are we told to generate empty labels
    train_labels = None if empty_labels else train_y
    test_labels = None if empty_labels else test_y

    # create a FeatureHasher if we are asked to use feature hashing
    # with the specified number of feature bins
    vectorizer = (FeatureHasher(n_features=feature_bins)
                  if use_feature_hashing else None)
    train_fs = FeatureSet('classification_train', train_ids,
                          labels=train_labels, features=train_features,
                          vectorizer=vectorizer)
    if train_test_ratio < 1.0:
        test_fs = FeatureSet('classification_test', test_ids,
                             labels=test_labels, features=test_features,
                             vectorizer=vectorizer)
    else:
        test_fs = None

    return (train_fs, test_fs)
def test_pipeline_attribute():
    """
    Nose-style test generator: yields one ``check_pipeline_attribute``
    call per combination of learner, feature hashing, minimum feature
    count, feature scaling, and sampler.
    """
    # define the classifier and regressor feature dictionaries and labels
    # that we will test on, and also the classes and targets respectively
    cfeature_dicts = [{"f01": -2.87, "f02": 0.713, "f03": 2.86, "f04": 0.385,
                       "f05": -0.989, "f06": 0.380, "f07": -0.365,
                       "f08": -0.224, "f09": 3.45, "f10": 0.622},
                      {"f01": 0.058, "f02": -1.14, "f03": 2.85, "f04": 1.41,
                       "f05": 1.60, "f06": 1.04, "f07": -0.669,
                       "f08": -0.727, "f09": 1.82, "f10": 1.336},
                      {"f01": -1.80, "f02": 3.21, "f03": 0.79, "f04": -0.55,
                       "f05": 0.059, "f06": -5.66, "f07": -3.08,
                       "f08": -0.95, "f09": 0.188, "f10": -1.24},
                      {"f01": 2.270, "f02": 2.271, "f03": 2.285, "f04": 2.951,
                       "f05": 1.018, "f06": -0.59, "f07": 0.432,
                       "f08": 1.614, "f09": -0.69, "f10": -1.27},
                      {"f01": 2.98, "f02": 3.74, "f03": 1.96, "f04": 0.80,
                       "f05": 0.425, "f06": -0.76, "f07": 4.013,
                       "f08": 3.119, "f09": 2.104, "f10": 0.195},
                      {"f01": 2.560, "f02": -2.05, "f03": 1.793, "f04": 0.955,
                       "f05": 2.914, "f06": 2.239, "f07": -1.41,
                       "f08": -1.24, "f09": -4.44, "f10": 0.273},
                      {"f01": 1.86, "f02": -0.017, "f03": 1.337, "f04": -2.14,
                       "f05": 2.255, "f06": -1.21, "f07": -0.24,
                       "f08": -0.66, "f09": -2.51, "f10": -1.06},
                      {"f01": -1.95, "f02": -1.81, "f03": 2.105, "f04": 0.976,
                       "f05": -1.480, "f06": 1.120, "f07": -1.22,
                       "f08": 0.704, "f09": -3.66, "f10": -1.72},
                      {"f01": -1.54, "f02": -2.17, "f03": -4.18, "f04": 1.708,
                       "f05": 0.514, "f06": 0.354, "f07": -3.55,
                       "f08": 2.285, "f09": -3.47, "f10": -0.79},
                      {"f01": 2.162, "f02": -0.71, "f03": -0.448, "f04": 0.326,
                       "f05": 3.384, "f06": -0.455, "f07": 1.253,
                       "f08": 0.998, "f09": 3.193, "f10": 1.342}]
    classes = [1, 1, 0, 2, 1, 2, 0, 1, 2, 1]
    rfeature_dicts = [{'f1': 1.351, 'f2': -0.117, 'f3': 0.570, 'f4': 0.0619,
                       'f5': 1.569, 'f6': 0.805},
                      {'f1': -0.557, 'f2': -1.704, 'f3': 0.0913, 'f4': 0.767,
                       'f5': 1.281, 'f6': -0.803},
                      {'f1': 0.720, 'f2': -0.268, 'f3': 0.760, 'f4': 0.861,
                       'f5': -0.403, 'f6': 0.814},
                      {'f1': 1.737, 'f2': -0.228, 'f3': 1.340, 'f4': 2.031,
                       'f5': 2.170, 'f6': 1.498},
                      {'f1': 0.344, 'f2': 0.340, 'f3': 0.572, 'f4': -1.06,
                       'f5': 1.044, 'f6': 2.065},
                      {'f1': -0.489, 'f2': -0.420, 'f3': 0.428, 'f4': 0.707,
                       'f5': -1.306, 'f6': 0.0081},
                      {'f1': 0.805, 'f2': 0.570, 'f3': 1.351, 'f4': -0.117,
                       'f5': 0.0619, 'f6': 1.569},
                      {'f1': -1.083, 'f2': 0.0369, 'f3': -0.413, 'f4': 1.391,
                       'f5': 1.417, 'f6': -1.118},
                      {'f1': -1.945, 'f2': -0.332, 'f3': -1.393, 'f4': 0.952,
                       'f5': -0.816, 'f6': 1.417},
                      {'f1': 1.976, 'f2': -0.220, 'f3': -1.636, 'f4': 0.795,
                       'f5': -2.34, 'f6': -0.148}]
    targets = [96.057, -176.017, -182.32, -56.46, -50.14, -84.53, 241.71,
               -17.84, -47.09, 77.65]

    # create training featuresets that we will use to train our estimator
    function_args_dict = defaultdict(dict)
    for estimator_type in ['classifier', 'regressor']:
        for do_feature_hashing in [True, False]:
            if estimator_type == 'classifier':
                (train_fs,
                 test_fs) = make_classification_data(num_examples=500,
                                                     num_features=10,
                                                     num_labels=3,
                                                     feature_bins=4,
                                                     non_negative=True,
                                                     use_feature_hashing=do_feature_hashing)
                labels = classes
                feature_dicts = cfeature_dicts
            else:
                (train_fs,
                 test_fs,
                 _) = make_regression_data(num_examples=500,
                                           num_features=6,
                                           feature_bins=4,
                                           use_feature_hashing=do_feature_hashing)
                labels = targets
                feature_dicts = rfeature_dicts

            # if we are doing feature hashing, we need to transform our test
            # cases to the same space. If we are not, then we don't need to
            # worry because we have manually ensured that the number of
            # features are the same for the non-hashing case (10 for
            # classification, and 6 for regression)
            test_fs = FeatureSet('test',
                                 ids=list(range(1, 11)),
                                 features=feature_dicts,
                                 labels=labels,
                                 vectorizer=train_fs.vectorizer if do_feature_hashing else None)
            function_args_dict[estimator_type][do_feature_hashing] = [train_fs,
                                                                      test_fs,
                                                                      feature_dicts,
                                                                      labels]
    function_args_dict = dict(function_args_dict)

    # now set up the test cases: every learner crossed with every
    # hashing / min-count / scaling / sampler combination
    learners = ['LinearSVC', 'LogisticRegression', 'MultinomialNB', 'SVC',
                'GradientBoostingClassifier', 'Lars', 'LinearSVR', 'Ridge',
                'SVR', 'GradientBoostingRegressor']
    use_hashing = [True, False]
    min_feature_counts = [1, 2]
    samplers = [None, 'RBFSampler', 'SkewedChi2Sampler']
    scalers = ['none', 'with_mean', 'with_std', 'both']
    for (learner_name,
         do_feature_hashing,
         min_count,
         scaling_type,
         sampler_name) in product(learners,
                                  use_hashing,
                                  min_feature_counts,
                                  scalers,
                                  samplers):
        # skip the case for MultinomialNB with feature hashing
        # or feature sampling since it does not support those
        if learner_name == 'MultinomialNB':
            if do_feature_hashing or sampler_name is not None:
                continue

        # if we are using a SkewedChi2Sampler, we need to set
        # some parameters to make sure it works as expected
        if sampler_name == 'SkewedChi2Sampler':
            sampler_kwargs = {'skewedness': 15, 'n_components': 10}
        else:
            sampler_kwargs = {}

        # create a learner instance with the given parameters
        # and with pipeline attribute set to True
        learner = Learner(learner_name,
                          min_feature_count=min_count,
                          sampler=sampler_name,
                          sampler_kwargs=sampler_kwargs,
                          feature_scaling=scaling_type,
                          pipeline=True)
        yield (check_pipeline_attribute,
               learner_name,
               do_feature_hashing,
               min_count,
               scaling_type,
               sampler_name,
               learner,
               function_args_dict)
def make_conversion_data(num_feat_files, from_suffix, to_suffix):
    """
    Create and write out featureset files for testing file-format
    conversion: several unmerged files in the ``from_suffix`` format plus
    one merged file in the ``to_suffix`` format.
    """
    # NOTE(review): this definition is dead code — it is shadowed by the
    # later re-definition of `make_conversion_data` (the one that takes a
    # `with_labels` keyword) further down in this file, so only that later
    # version is ever callable. Consider deleting this copy.
    num_examples = 500
    num_feats_per_file = 7
    np.random.seed(1234567890)
    convert_dir = join(_my_dir, 'train', 'test_conversion')
    if not exists(convert_dir):
        os.makedirs(convert_dir)

    # Create lists we will write files from: alternating dog/cat labels
    # with random integer feature values in [0, 4)
    ids = []
    features = []
    labels = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {"f{:03d}".format(feat_num): np.random.randint(0, 4) for
             feat_num in range(num_feat_files * num_feats_per_file)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        labels.append(y)
        features.append(x)

    # Create vectorizers/maps for libsvm subset writing
    feat_vectorizer = DictVectorizer()
    feat_vectorizer.fit(features)
    label_map = {label: num for num, label in
                 enumerate(sorted({label for label in labels if
                                   not isinstance(label, (int, float))}))}
    # Add fake item to vectorizer for None
    label_map[None] = '00000'

    # get the feature name prefix
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # Write out unmerged features in the `from_suffix` file format,
    # one file per block of `num_feats_per_file` consecutive features
    for i in range(num_feat_files):
        train_path = join(convert_dir, '{}_{}{}'.format(feature_name_prefix,
                                                        i, from_suffix))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i * num_feats_per_file
            x = {"f{:03d}".format(feat_num + j):
                 features[example_num]["f{:03d}".format(feat_num + j)] for j in
                 range(num_feats_per_file)}
            sub_features.append(x)
        train_fs = FeatureSet('sub_train', ids, labels=labels,
                              features=sub_features,
                              vectorizer=feat_vectorizer)
        # libsvm needs the explicit label map; other formats do not
        if from_suffix == '.libsvm':
            Writer.for_path(train_path, train_fs,
                            label_map=label_map).write()
        else:
            Writer.for_path(train_path, train_fs).write()

    # Write out the merged features in the `to_suffix` file format
    train_path = join(convert_dir, '{}_all{}'.format(feature_name_prefix,
                                                     to_suffix))
    train_fs = FeatureSet('train', ids, labels=labels, features=features,
                          vectorizer=feat_vectorizer)
    if to_suffix == '.libsvm':
        Writer.for_path(train_path, train_fs,
                        label_map=label_map).write()
    else:
        Writer.for_path(train_path, train_fs).write()
def make_conversion_data(num_feat_files, from_suffix, to_suffix,
                         with_labels=True):
    """
    Create and write out featureset files for testing file-format
    conversion: several unmerged files in the ``from_suffix`` format plus
    one merged file in the ``to_suffix`` format. When ``with_labels`` is
    False, unlabeled variants are written with ``_unlabeled`` in the
    file names.
    """
    num_examples = 500
    num_feats_per_file = 7
    np.random.seed(1234567890)

    convert_dir = join(_my_dir, 'train', 'test_conversion')
    if not exists(convert_dir):
        os.makedirs(convert_dir)

    # if we are not using labels, we do not want zero-valued features
    # because it may be the case that some subset of features end up
    # being all 0 and if this subset ends up being written out to a file
    # below, then for some formats (e.g., megam) nothing will get written
    # out which can cause issues when reading this file
    min_feature_value = 0 if with_labels else 1

    # build the ID / label / feature lists we will write files from
    ids = []
    features = []
    labels = [] if with_labels else None
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ids.append("{}{}".format(y, j))
        if with_labels:
            labels.append(y)
        row = {"f{:03d}".format(feat_num):
               np.random.randint(min_feature_value, 4 + min_feature_value)
               for feat_num in range(num_feat_files * num_feats_per_file)}
        features.append(OrderedDict(sorted(row.items(), key=lambda t: t[0])))

    # Create vectorizers/maps for libsvm subset writing
    feat_vectorizer = DictVectorizer()
    feat_vectorizer.fit(features)
    if with_labels:
        string_labels = sorted({label for label in labels
                                if not isinstance(label, (int, float))})
        label_map = {label: num for num, label in enumerate(string_labels)}
        # Add fake item to vectorizer for None
        label_map[None] = '00000'
    else:
        label_map = None

    def write_out(path, fs, suffix):
        # pick the Writer invocation appropriate for the target format
        if suffix == '.libsvm':
            Writer.for_path(path, fs, label_map=label_map).write()
        elif suffix in ['.arff', '.csv', '.tsv']:
            Writer.for_path(path, fs,
                            label_col='y' if with_labels else None).write()
        else:
            Writer.for_path(path, fs).write()

    # get the feature name prefix
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # use '_unlabeled' as part of any file names when not using labels
    with_labels_part = '' if with_labels else '_unlabeled'

    # Write out unmerged features in the `from_suffix` file format,
    # one file per block of `num_feats_per_file` consecutive features
    for i in range(num_feat_files):
        first_feat = i * num_feats_per_file
        sub_features = [{"f{:03d}".format(first_feat + j):
                         row["f{:03d}".format(first_feat + j)]
                         for j in range(num_feats_per_file)}
                        for row in features]
        sub_path = join(convert_dir,
                        '{}_{}{}{}'.format(feature_name_prefix, i,
                                           with_labels_part, from_suffix))
        sub_fs = FeatureSet('sub_train', ids, labels=labels,
                            features=sub_features,
                            vectorizer=feat_vectorizer)
        write_out(sub_path, sub_fs, from_suffix)

    # Write out the merged features in the `to_suffix` file format
    merged_path = join(convert_dir,
                       '{}{}_all{}'.format(feature_name_prefix,
                                           with_labels_part, to_suffix))
    merged_fs = FeatureSet('train', ids, labels=labels, features=features,
                           vectorizer=feat_vectorizer)

    # we need to do this to get around the FeatureSet using NaNs
    # instead of None when there are no labels which causes problems
    # later when comparing featuresets
    if not with_labels:
        merged_fs.labels = [None] * len(merged_fs.labels)

    write_out(merged_path, merged_fs, to_suffix)
def read(self):
    """
    Loads examples in the `.arff`, `.csv`, `.jsonlines`, `.libsvm`,
    `.megam`, `.ndj`, or `.tsv` formats.

    Returns
    -------
    feature_set : skll.FeatureSet
        ``FeatureSet`` instance representing the input file.

    Raises
    ------
    ValueError
        If ``ids_to_floats`` is True, but IDs cannot be converted.
    ValueError
        If no features are found.
    ValueError
        If the example IDs are not unique.
    """
    self.logger.debug('Path: %s', self.path_or_list)

    if not self.quiet:
        self._progress_msg = "Loading {}...".format(self.path_or_list)
        print(self._progress_msg, end="\r", file=sys.stderr)
        sys.stderr.flush()

    # First pass over the file: collect labels and IDs only (the
    # feature dicts are re-read in a second pass below)
    ids = []
    labels = []
    ex_num = 0
    with open(self.path_or_list, 'r' if PY3 else 'rb') as f:
        for ex_num, (id_, class_, _) in enumerate(self._sub_read(f), start=1):

            # Update lists of IDs and classes
            if self.ids_to_floats:
                try:
                    id_ = float(id_)
                except ValueError:
                    raise ValueError(('You set ids_to_floats to true,'
                                      ' but ID {} could not be '
                                      'converted to float in '
                                      '{}').format(id_,
                                                   self.path_or_list))
            ids.append(id_)
            labels.append(class_)
            if ex_num % 100 == 0:
                self._print_progress(ex_num)
        self._print_progress(ex_num)

    # Remember total number of examples for percentage progress meter
    total = ex_num
    if total == 0:
        raise ValueError("No features found in possibly "
                         "empty file '{}'.".format(self.path_or_list))

    # Convert everything to numpy arrays
    ids = np.array(ids)
    labels = np.array(labels)

    def feat_dict_generator():
        # Second pass: re-open and re-parse the file, yielding only the
        # feature dicts — presumably so the vectorizer can consume them
        # as a stream instead of holding all dicts in memory at once
        # (TODO confirm)
        with open(self.path_or_list, 'r' if PY3 else 'rb') as f:
            for ex_num, (_, _, feat_dict) in enumerate(self._sub_read(f)):
                yield feat_dict
                if ex_num % 100 == 0:
                    self._print_progress('{:.8}%'.format(
                        100 * ((ex_num / total))))
            self._print_progress("100%")

    # Vectorize the streamed feature dicts into a feature matrix
    features = self.vectorizer.fit_transform(feat_dict_generator())

    # Report that loading is complete
    self._print_progress("done", end="\n")

    # Make sure we have the same number of ids, labels, and features
    assert ids.shape[0] == labels.shape[0] == features.shape[0]

    if ids.shape[0] != len(set(ids)):
        raise ValueError('The example IDs are not unique in %s.' %
                         self.path_or_list)

    return FeatureSet(self.path_or_list, ids, labels=labels,
                      features=features, vectorizer=self.vectorizer)