def make_learning_curve_data():
    """
    Write the two learning-curve training files built from the digits data.

    The first file holds every feature of the digits data set, the second
    one holds all features except the last. Both are written as
    ``.jsonlines`` files into the ``train`` directory under ``_my_dir``.
    """
    # load the digits data set
    digits = load_digits()
    X, y = digits.data, digits.target

    # one zero-padded name per column of X
    all_names = ['f{:02}'.format(n) for n in range(X.shape[1])]

    # first featureset: every feature
    full_rows = [dict(zip(all_names, row)) for row in X]
    fs1 = FeatureSet('train1',
                     features=full_rows,
                     labels=y,
                     ids=list(range(X.shape[0])))
    first_path = join(_my_dir, 'train', 'test_learning_curve1.jsonlines')
    NDJWriter(first_path, fs1).write()

    # second featureset: zipping against the shortened name list drops
    # the last column of each row
    shortened_names = all_names[:-1]
    partial_rows = [dict(zip(shortened_names, row)) for row in X]
    fs2 = FeatureSet('train2',
                     features=partial_rows,
                     labels=y,
                     ids=list(range(X.shape[0])))
    second_path = join(_my_dir, 'train', 'test_learning_curve2.jsonlines')
    NDJWriter(second_path, fs2).write()
def featureset_creation_from_dataframe_helper(with_labels, use_feature_hasher):
    """
    Build the featureset pair compared by the ``FeatureSet.from_data_frame()``
    unit tests.

    Parameters
    ----------
    with_labels : bool
        Whether the featuresets should carry labels (labels are optional).
    use_feature_hasher : bool
        Whether to build the featuresets with a 4-bin ``FeatureHasher``.

    Returns
    -------
    (expected, current) : tuple
        ``expected`` is constructed directly via ``FeatureSet`` and
        ``current`` via ``FeatureSet.from_data_frame()`` from the same data.
    """
    import pandas

    # 100 instances with 4 features each
    X, y = make_classification(n_samples=100,
                               n_features=4,
                               n_informative=4,
                               n_redundant=0,
                               n_classes=3,
                               random_state=1234567890)

    # deliberately not 0-99: that is pandas' default index, and we want
    # to be sure the ids we supply are the ones being used
    ids = list(range(100, 200))
    featureset_name = 'test'

    # only attach a vectorizer when hashing was requested
    feature_bins = 4
    vectorizer = None
    if use_feature_hasher:
        vectorizer = FeatureHasher(n_features=feature_bins)

    # one feature dictionary per row
    feature_names = ['f{}'.format(n) for n in range(1, 5)]
    features = [dict(zip(feature_names, row)) for row in X]

    # the reference featureset, built directly
    direct_kwargs = {'features': features, 'vectorizer': vectorizer}
    if with_labels:
        direct_kwargs['labels'] = y
    expected = FeatureSet(featureset_name, ids, **direct_kwargs)

    # the same data routed through a DataFrame
    df = pandas.DataFrame(features, index=ids)
    frame_kwargs = {'vectorizer': vectorizer}
    if with_labels:
        df['y'] = y
        frame_kwargs['labels_column'] = 'y'
    current = FeatureSet.from_data_frame(df, featureset_name, **frame_kwargs)

    return (expected, current)
def make_scaling_data(use_feature_hashing=False):
    """
    Create train/test featuresets whose features are on very different scales.

    Parameters
    ----------
    use_feature_hashing : bool, optional
        If true, both featuresets share a 4-bin ``FeatureHasher``.

    Returns
    -------
    (train_fs, test_fs) : tuple
        The first 800 examples and the remaining 200 examples.
    """
    X, y = make_classification(n_samples=1000,
                               n_classes=2,
                               n_features=5,
                               n_informative=5,
                               n_redundant=0,
                               random_state=1234567890)

    # multiply each column by a different power of ten so that the
    # features are arbitrarily scaled
    X = X * np.array([1, 10, 100, 1000, 10000])

    # SKLL's FeatureSet needs IDs and a list of feature dictionaries
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, 1001)]
    feature_names = ['f{}'.format(n) for n in range(1, 6)]
    features = [dict(zip(feature_names, row)) for row in X]

    if use_feature_hashing:
        vectorizer = FeatureHasher(n_features=4)
    else:
        vectorizer = None

    # training portion comes first, testing portion is the rest
    train_fs = FeatureSet('train_scaling',
                          ids[:800],
                          features=features[:800],
                          labels=y[:800],
                          vectorizer=vectorizer)
    test_fs = FeatureSet('test_scaling',
                         ids[800:],
                         features=features[800:],
                         labels=y[800:],
                         vectorizer=vectorizer)

    return (train_fs, test_fs)
def create_jsonlines_feature_files(path):
    """
    Create the six single-feature ``.jsonlines`` files under ``path``.

    Nothing is written if ``f0.jsonlines`` through ``f5.jsonlines`` all
    exist already. Files ``f0`` to ``f4`` each hold one feature for 1000
    examples; ``f5`` repeats the ``f4`` data plus two extra instances
    that have no features.

    Parameters
    ----------
    path : str
        Directory in which the feature files are created.
    """
    wanted_files = [join(path, 'f{}.jsonlines'.format(i)) for i in range(6)]
    if all(exists(wanted) for wanted in wanted_files):
        return

    num_examples = 1000
    np.random.seed(1234567890)

    # generate the full data once; the files are slices of it
    ids, features, labels = [], [], []
    for j in range(num_examples):
        y = "cat" if j % 2 else "dog"
        values = {"f{}".format(feat_num): np.random.randint(0, 4)
                  for feat_num in range(5)}
        ids.append("{}{}".format(y, j))
        labels.append(y)
        features.append(OrderedDict(sorted(values.items(),
                                           key=lambda pair: pair[0])))

    # one file per feature
    for i in range(5):
        feat_name = "f{}".format(i)
        sub_features = [{feat_name: features[example_num][feat_name]}
                        for example_num in range(num_examples)]
        fs = FeatureSet('ablation_cv',
                        ids,
                        features=sub_features,
                        labels=labels)
        NDJWriter(join(path, 'f{}.jsonlines'.format(i)), fs).write()

    # the last file is basically identical to the last featureset we
    # wrote, except that it has two extra instances
    extra_ids = ['cat{}'.format(num_examples),
                 'dog{}'.format(num_examples + 1)]
    fs = FeatureSet('extra',
                    ids + extra_ids,
                    features=sub_features + [{}, {}],
                    labels=labels + ['cat', 'dog'])
    NDJWriter(join(path, 'f5.jsonlines'), fs).write()
def make_sparse_data(use_feature_hashing=False):
    """
    Create sparse train/test featuresets whose always-zero features differ.

    In the training set ``f1`` and ``f5`` are zero for every example; in
    the test set it is ``f4`` that is always zero.

    Parameters
    ----------
    use_feature_hashing : bool, optional
        If true, both featuresets share a 4-bin ``FeatureHasher``.

    Returns
    -------
    (train_fs, test_fs) : tuple
        500 training examples and 100 test examples.
    """
    feature_names = ['f{}'.format(n) for n in range(1, 6)]

    # --- training data: three real features padded with zeros ---
    X, y = make_classification(n_samples=500,
                               n_features=3,
                               n_informative=3,
                               n_redundant=0,
                               n_classes=2,
                               random_state=1234567890)

    # features must be non-negative since naive bayes is used later,
    # and none of the real features may be zero
    X = np.abs(X)
    X[np.where(X == 0)] += 1

    train_ids = ['EXAMPLE_{}'.format(n) for n in range(1, 501)]

    # f1 and f5 are always 0
    train_features = [dict(zip(feature_names, [0] + row.tolist() + [0]))
                      for row in X]

    # use a FeatureHasher if we are asked to do feature hashing
    if use_feature_hashing:
        vectorizer = FeatureHasher(n_features=4)
    else:
        vectorizer = None

    train_fs = FeatureSet('train_sparse',
                          train_ids,
                          features=train_features,
                          labels=y,
                          vectorizer=vectorizer)

    # --- test data: four real features with a zero inserted as f4 ---
    X, y = make_classification(n_samples=100,
                               n_features=4,
                               n_informative=4,
                               n_redundant=0,
                               n_classes=2,
                               random_state=1234567890)
    X = np.abs(X)
    X[np.where(X == 0)] += 1

    test_ids = ['EXAMPLE_{}'.format(n) for n in range(1, 101)]

    test_features = []
    for row in X:
        values = row.tolist()
        padded = values[:3] + [0] + values[3:]
        test_features.append(dict(zip(feature_names, padded)))

    test_fs = FeatureSet('test_sparse',
                         test_ids,
                         features=test_features,
                         labels=y,
                         vectorizer=vectorizer)

    return train_fs, test_fs
def make_regression_data(num_examples=100,
                         train_test_ratio=0.5,
                         num_features=2,
                         sd_noise=1.0,
                         use_feature_hashing=False,
                         feature_bins=4,
                         start_feature_num=1,
                         random_state=1234567890):
    """
    Generate regression train/test featuresets along with the true weights.

    Parameters
    ----------
    num_examples : int, optional
        Total number of examples to generate.
    train_test_ratio : float, optional
        Fraction of the examples that goes into the training set.
    num_features : int, optional
        Number of features per example.
    sd_noise : float, optional
        Passed to ``make_regression`` as ``noise``.
    use_feature_hashing : bool, optional
        If true, both featuresets share a ``FeatureHasher``.
    feature_bins : int, optional
        Number of bins for the ``FeatureHasher``.
    start_feature_num : int, optional
        Number used in the name of the first feature.
    random_state : int, optional
        Passed to ``make_regression`` as ``random_state``.

    Returns
    -------
    (train_fs, test_fs, weightdict) : tuple
        The two featuresets and a mapping from feature name to the
        coefficient returned by ``make_regression``.
    """
    # let sklearn generate the data, including the coefficients
    X, y, weights = make_regression(n_samples=num_examples,
                                    n_features=num_features,
                                    noise=sd_noise,
                                    random_state=random_state,
                                    coef=True)

    # SKLL's FeatureSet needs IDs and a list of feature dictionaries
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, num_examples + 1)]
    last_feature_num = start_feature_num + num_features
    feature_names = ['f{:02d}'.format(n)
                     for n in range(start_feature_num, last_feature_num)]
    features = [dict(zip(feature_names, row)) for row in X]

    # keep the weights addressable by feature name
    weightdict = dict(zip(feature_names, weights))

    # everything before the cut is training data, the rest is test data
    cut = int(round(train_test_ratio * num_examples))

    # hash into the requested number of bins if asked to
    if use_feature_hashing:
        vectorizer = FeatureHasher(n_features=feature_bins)
    else:
        vectorizer = None

    train_fs = FeatureSet('regression_train',
                          ids[:cut],
                          labels=y[:cut],
                          features=features[:cut],
                          vectorizer=vectorizer)
    test_fs = FeatureSet('regression_test',
                         ids[cut:],
                         labels=y[cut:],
                         features=features[cut:],
                         vectorizer=vectorizer)

    return (train_fs, test_fs, weightdict)
def featureset_creation_from_dataframe_helper(with_labels, use_feature_hasher):
    """
    Create the two featuresets that the ``FeatureSet.from_data_frame()``
    unit tests compare against each other.

    Since labels are optional, the tests call this once with labels and
    once without.

    Parameters
    ----------
    with_labels : bool
        Whether to include labels in both featuresets.
    use_feature_hasher : bool
        Whether to pass a 4-bin ``FeatureHasher`` as the vectorizer.

    Returns
    -------
    (expected, current) : tuple
        The directly constructed featureset and the one created from a
        pandas DataFrame holding the same data.
    """
    import pandas

    # set up the test data: 100 instances with 4 features each
    X, y = make_classification(n_samples=100,
                               n_features=4,
                               n_informative=4,
                               n_redundant=0,
                               n_classes=3,
                               random_state=1234567890)

    # 100-199 rather than 0-99, which pandas would use by default, so
    # that the test proves our own ids are being used
    ids = list(range(100, 200))
    featureset_name = 'test'

    feature_bins = 4
    if use_feature_hasher:
        vectorizer = FeatureHasher(n_features=feature_bins)
    else:
        vectorizer = None

    feature_names = ['f{}'.format(n) for n in range(1, 5)]
    features = [dict(zip(feature_names, row)) for row in X]

    # both objects are needed in either case, so build the DataFrame first
    # and branch only once on whether labels are wanted
    df = pandas.DataFrame(features, index=ids)

    if not with_labels:
        expected = FeatureSet(featureset_name,
                              ids,
                              features=features,
                              vectorizer=vectorizer)
        current = FeatureSet.from_data_frame(df,
                                             featureset_name,
                                             vectorizer=vectorizer)
        return (expected, current)

    expected = FeatureSet(featureset_name,
                          ids,
                          features=features,
                          labels=y,
                          vectorizer=vectorizer)
    df['y'] = y
    current = FeatureSet.from_data_frame(df,
                                         featureset_name,
                                         labels_column='y',
                                         vectorizer=vectorizer)
    return (expected, current)
def test_featureset_creation_from_dataframe_with_string_labels():
    """
    Check that a featureset with string labels taken from a DataFrame
    survives being written to and read back from a ``.jsonlines`` file.
    """
    frame = pd.DataFrame({"id": [1, 2],
                          "score": ['yes', 'no'],
                          "text": ["a b", "b c"]}).set_index("id")

    # vectorize a hand-written list of feature dictionaries
    feature_dicts = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}]
    dict_vectorizer = DictVectorizer()
    matrix = dict_vectorizer.fit_transform(feature_dicts)

    original_fs = FeatureSet('test',
                             ids=frame.index.values,
                             labels=frame['score'].values,
                             features=matrix,
                             vectorizer=dict_vectorizer)

    # write the featureset out ...
    output_path = join(_my_dir, "other", "test_string_labels_df.jsonlines")
    NDJWriter(output_path, original_fs).write()

    # ... then read it back in and confirm the two featuresets are equal
    reloaded_fs = NDJReader.for_path(output_path, ids_to_floats=True).read()
    assert original_fs == reloaded_fs
def test_reading_csv_and_tsv_with_drop_blanks():
    """
    Check that the CSV and TSV readers drop rows containing blanks when
    ``drop_blanks`` is set.
    """
    # the same content with blanks, once comma- and once tab-separated
    test_csv = '1,1,6\n2,,2\n3,9,3\n,,\n,5,\n,,\n2,7,7'
    test_tsv = test_csv.replace(',', '\t')

    # the input has no header row, so name the columns explicitly
    kwargs = {'header': None, 'names': ['A', 'B', 'C']}

    # only the three fully populated rows should remain
    surviving_rows = pd.DataFrame({'A': [1, 3, 2],
                                   'B': [1, 9, 7],
                                   'C': [6, 3, 7],
                                   'L': [None, None, None]},
                                  index=['EXAMPLE_0',
                                         'EXAMPLE_1',
                                         'EXAMPLE_2'])
    fs_expected = FeatureSet.from_data_frame(surviving_rows,
                                             'test',
                                             labels_column='L')

    # read both inputs first, then compare both against the expectation
    loaded = []
    for reader_class, content in ((CSVReader, test_csv),
                                  (TSVReader, test_tsv)):
        fs = reader_class(StringIO(content),
                          drop_blanks=True,
                          pandas_kwargs=kwargs).read()
        fs.name = 'test'
        loaded.append(fs)

    for fs in loaded:
        eq_(fs, fs_expected)
def test_mismatch_labels_features():
    """
    Test to catch a mismatch between the shape of the labels vector
    and the feature matrix
    """
    # 100 instances with 4 features each
    X, y = make_classification(n_samples=100,
                               n_features=4,
                               n_informative=4,
                               n_redundant=0,
                               n_classes=3,
                               random_state=1234567890)

    # stacking y onto itself gives twice as many labels as feature rows
    doubled_labels = np.hstack((y, y))

    # one feature dictionary per row
    names = ['f{}'.format(n) for n in range(1, 5)]
    feature_dicts = [dict(zip(names, row)) for row in X]

    # 100 ids, matching the number of feature rows
    example_ids = ['EXAMPLE_{}'.format(i) for i in range(100)]

    # This should raise a ValueError
    FeatureSet('test',
               example_ids,
               features=feature_dicts,
               labels=doubled_labels)
def setup_cv_split_iterator(cv_folds, examples):
    """
    Set up a cross-validation split iterator over the given ``FeatureSet``.

    Parameters
    ----------
    cv_folds : int
        Handed to ``ShuffleSplit`` as ``n_splits``.
    examples : skll.FeatureSet
        The featureset to be split.

    Returns
    -------
    res : a 2-tuple
        A lazy iterator over the results of ``FeatureSet.split_by_ids`` for
        every train/test split, and the number of training indices in the
        first split.
    """
    # a fixed seed keeps the splits replicable
    rng = np.random.RandomState(123456789)

    # 20% of the data is always reserved for testing
    splitter = ShuffleSplit(n_splits=cv_folds,
                            test_size=0.2,
                            random_state=rng)

    # materialize the index pairs so that they can be both measured
    # and iterated over
    index_pairs = list(splitter.split(examples.features,
                                      examples.labels,
                                      None))
    n_max_training_samples = len(index_pairs[0][0])

    # the featuresets themselves are only built on demand
    featureset_iter = (FeatureSet.split_by_ids(examples, train_idx, test_idx)
                       for train_idx, test_idx in index_pairs)

    return featureset_iter, n_max_training_samples
def check_dummy_classifier_predict(model_args, train_labels, expected_output):
    """
    Train a ``DummyClassifier`` learner on hard-coded data and check its
    predictions.

    Parameters
    ----------
    model_args : dict
        Passed to ``Learner`` as ``model_kwargs``.
    train_labels : array-like
        Labels for the 20 training examples.
    expected_output : array-like
        Predictions expected for the 10 test examples.
    """
    # 20 labeled training examples with a single feature each
    train_ids = ['TrainExample{}'.format(i) for i in range(20)]
    train_rows = [{"feature": i} for i in range(20)]
    train_fs = FeatureSet('classification_train',
                          train_ids,
                          labels=train_labels,
                          features=train_rows)

    # 10 unlabeled test examples continuing the same feature sequence
    test_ids = ['TestExample{}'.format(i) for i in range(10)]
    test_rows = [{"feature": i} for i in range(20, 30)]
    test_fs = FeatureSet('classification_test',
                         test_ids,
                         features=test_rows)

    # make sure the predictions are as expected for the given strategy
    learner = Learner('DummyClassifier', model_kwargs=model_args)
    learner.train(train_fs, grid_search=False)
    predicted = learner.predict(test_fs)
    eq_(np.array_equal(expected_output, predicted), True)
def test_feature_merging_order_invariance():
    """
    Test whether featuresets with different orders of IDs can be merged
    """
    # generate two featuresets with different features and give the
    # second one the labels of the first
    train_fs1, _, _ = make_regression_data()
    train_fs2, _, _ = make_regression_data(start_feature_num=3,
                                           random_state=87654321)
    train_fs2.labels = train_fs1.labels.copy()

    # build a copy of featureset 2 with its rows in shuffled order
    order = list(range(len(train_fs2.ids)))
    np.random.seed(123456789)
    np.random.shuffle(order)
    train_fs2_shuf = FeatureSet("f2_shuf",
                                train_fs2.ids[order],
                                labels=train_fs2.labels[order],
                                features=train_fs2.features[order],
                                vectorizer=train_fs2.vectorizer)

    # merge featureset 1 with featureset 2 and with its shuffled version
    merged_fs = train_fs1 + train_fs2
    merged_fs_shuf = train_fs1 + train_fs2_shuf

    # both merges must expose the features of both inputs, in order
    feature_names = (train_fs1.vectorizer.get_feature_names() +
                     train_fs2.vectorizer.get_feature_names())
    for merged in (merged_fs, merged_fs_shuf):
        assert_array_equal(merged.vectorizer.get_feature_names(),
                           feature_names)

    # labels and ids must agree with both inputs and with the other merge
    for reference in (train_fs1, train_fs2, merged_fs_shuf):
        assert_array_equal(merged_fs.labels, reference.labels)
    for reference in (train_fs1, train_fs2, merged_fs_shuf):
        assert_array_equal(merged_fs.ids, reference.ids)

    # the first two columns come from featureset 1, the next two from
    # featureset 2, and shuffling must not change the merged matrix
    first_half = merged_fs.features[:, 0:2].todense()
    second_half = merged_fs.features[:, 2:4].todense()
    assert_array_equal(first_half, train_fs1.features.todense())
    assert_array_equal(second_half, train_fs2.features.todense())
    assert_array_equal(merged_fs.features.todense(),
                       merged_fs_shuf.features.todense())

    # the two halves must not be identical to each other
    assert not np.all(merged_fs.features[:, 0:2].todense() ==
                      merged_fs.features[:, 2:4].todense())
def make_cv_folds_data(num_examples_per_fold=100,
                       num_folds=3,
                       use_feature_hashing=False):
    """
    Create data for pre-specified CV folds tests with or without
    feature hashing

    Parameters
    ----------
    num_examples_per_fold : int, optional
        Number of examples assigned to each fold.
    num_folds : int, optional
        Number of folds.
    use_feature_hashing : bool, optional
        If true, the featureset uses a 4-bin ``FeatureHasher``.

    Returns
    -------
    (cv_fs, custom_cv_folds) : tuple
        The featureset and a dictionary mapping each example ID to its
        fold (as a string).
    """
    num_total_examples = num_examples_per_fold * num_folds

    # numeric features come from sklearn; the binary labels simply
    # alternate between 0 and 1
    X, _ = make_classification(n_samples=num_total_examples,
                               n_features=3,
                               n_informative=3,
                               n_redundant=0,
                               n_classes=2,
                               random_state=1234567890)
    y = np.array([0, 1] * int(num_total_examples / 2))

    # the first num_examples_per_fold examples are in fold "0", the
    # next num_examples_per_fold in fold "1", and so on
    per_fold_labels = ([str(fold)] * num_examples_per_fold
                       for fold in range(num_folds))
    folds = list(itertools.chain.from_iterable(per_fold_labels))

    # each example gets the numeric features plus one binary feature
    # whose name depends on its class and its fold
    feature_names = ['f{}'.format(i) for i in range(1, 4)]
    features = []
    for row, classid, foldnum in zip(X, y, folds):
        feat_dict = dict(zip(feature_names, row))
        feat_dict['is_{}_{}'.format(classid, foldnum)] = 1
        features.append(feat_dict)

    # example IDs are numbered consecutively across the folds
    ids = ['EXAMPLE_{}'.format(num_examples_per_fold * fold + offset)
           for fold in range(num_folds)
           for offset in range(num_examples_per_fold)]

    if use_feature_hashing:
        vectorizer = FeatureHasher(n_features=4)
    else:
        vectorizer = None
    cv_fs = FeatureSet('cv_folds',
                       ids,
                       features=features,
                       labels=y,
                       vectorizer=vectorizer)

    # the custom folds dictionary maps example ID -> fold
    custom_cv_folds = dict(zip(ids, folds))

    return (cv_fs, custom_cv_folds)
def make_class_map_data():
    """
    Write the training and test ``.jsonlines`` files used by the class map
    tests.

    The training file has 100 examples with features ``f2``, ``f3`` and
    ``f4``; the test file has 50 examples with features ``f1``, ``f2``,
    ``f3`` and ``f5``. Labels cycle through the class names.
    """
    class_names = ['beagle', 'cat', 'dachsund', 'cat']

    # --- training file ---
    train_path = join(_my_dir, 'train', 'test_class_map.jsonlines')
    ids, labels, features = [], [], []
    for i in range(1, 101):
        label = class_names[i % 4]
        ids.append("{}{}".format(label, i))
        labels.append(label)
        # f1 and f5 are missing in all instances but f4 is not
        features.append({"f2": i + 1, "f3": i + 2, "f4": i + 5})
    train_fs = FeatureSet('train_class_map',
                          ids,
                          features=features,
                          labels=labels)
    NDJWriter(train_path, train_fs).write()

    # --- test file ---
    test_path = join(_my_dir, 'test', 'test_class_map.jsonlines')
    ids, labels, features = [], [], []
    for i in range(1, 51):
        label = class_names[i % 4]
        ids.append("{}{}".format(label, i))
        labels.append(label)
        # f1 and f5 are not missing in any instances here but f4 is
        features.append({"f1": i, "f2": i + 2, "f3": i % 10, "f5": i * 2})
    test_fs = FeatureSet('test_class_map',
                         ids,
                         features=features,
                         labels=labels)
    NDJWriter(test_path, test_fs).write()
def read(self):
    """
    Read examples from list of dictionaries.

    Each example is a dictionary with the feature dictionary under ``"x"``,
    optionally a label under ``"y"`` and optionally an ID under ``"id"``.
    Examples without an ID are named ``EXAMPLE_<position>``.

    Returns
    -------
    feature_set : skll.FeatureSet
        FeatureSet representing the list of dictionaries we read in.

    Raises
    ------
    ValueError
        If ``ids_to_floats`` is set but an ID cannot be converted to float.
    """
    ids = []
    labels = []
    feat_dicts = []
    for example_num, example in enumerate(self.path_or_list):
        curr_id = str(example.get("id", "EXAMPLE_{}".format(example_num)))

        # Convert the ID exactly once. A second conversion later in the
        # loop would be dead code, since the ID is already a float then.
        if self.ids_to_floats:
            try:
                curr_id = float(curr_id)
            except ValueError as err:
                raise ValueError(('You set ids_to_floats to true,'
                                  ' but ID {} could not be '
                                  'converted to float in '
                                  '{}').format(curr_id, example)) from err

        # examples without a label get None
        class_name = (safe_float(example['y'], replace_dict=self.class_map)
                      if 'y' in example else None)
        example = example['x']

        # Update lists of IDs, labels, and feature dictionaries
        ids.append(curr_id)
        labels.append(class_name)
        feat_dicts.append(example)

        # Print out status
        if example_num % 100 == 0:
            self._print_progress(example_num)

    # Convert lists to numpy arrays
    ids = np.array(ids)
    labels = np.array(labels)
    features = self.vectorizer.fit_transform(feat_dicts)

    return FeatureSet('converted',
                      ids,
                      labels=labels,
                      features=features,
                      vectorizer=self.vectorizer)
def test_dummy_classifier_predict():
    """
    Check the predictions of the ``DummyClassifier`` learner for the
    ``stratified``, ``most_frequent`` and ``constant`` strategies.
    """
    # hard-coded dataset: 14 training examples of class 0, 6 of class 1
    train_fs = FeatureSet('classification_train',
                          ['TrainExample{}'.format(i) for i in range(20)],
                          labels=([0] * 14) + ([1] * 6),
                          features=[{"feature": i} for i in range(20)])
    test_fs = FeatureSet('classification_test',
                         ['TestExample{}'.format(i) for i in range(10)],
                         features=[{"feature": i} for i in range(20, 30)])

    # model arguments paired with the predictions they should produce
    stratified = [{"strategy": "stratified", "random_state": 12345},
                  np.array([1, 0, 0, 0, 0, 0, 1, 0, 1, 0])]
    most_frequent = [{"strategy": "most_frequent"},
                     np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])]
    constant = [{"strategy": "constant", "constant": 1},
                np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])]
    toy_data = (stratified, most_frequent, constant)

    def _predictions_match(model_args, expected_output):
        # train a fresh learner for the strategy and compare its output
        learner = Learner('DummyClassifier', model_kwargs=model_args)
        learner.train(train_fs)
        return np.array_equal(expected_output, learner.predict(test_fs))

    # make sure the predictions are correct for all strategies
    correct = [_predictions_match(model_args, expected_output)
               for model_args, expected_output in toy_data]
    eq_(correct, [True, True, True])
def make_float_class_data():
    """
    Create a featureset whose labels look like floats, so that tests can
    make sure such labels are preserved correctly.

    Returns
    -------
    skll.FeatureSet
        75 examples labeled 1.2, 1.5 and 1.8 (25 each).
    """
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, 76)]
    y = [1.2] * 25 + [1.5] * 25 + [1.8] * 25

    # three identity matrices stacked on top of each other
    X = np.vstack([np.identity(25)] * 3)

    # zipping against five names keeps the first five values of each row
    feature_names = ['f{}'.format(i) for i in range(1, 6)]
    features = [dict(zip(feature_names, row)) for row in X]

    return FeatureSet('float-classes', ids, features=features, labels=y)
def test_learning_curve_implementation():
    """
    Test to ensure that the learning curve results match scikit-learn
    """
    # Unlike the other tests, which just use regression data, this one
    # is meant to fail if our implementation ever diverges from the
    # scikit-learn implementation, so it doubles as a regression test.

    # load the digits data set
    digits = load_digits()
    X, y = digits.data, digits.target

    # settings shared by both implementations
    cv_folds = 10
    random_state = 123456789
    train_sizes = np.linspace(.1, 1.0, 5)

    # reference results straight from scikit-learn
    cv = ShuffleSplit(n_splits=cv_folds,
                      test_size=0.2,
                      random_state=random_state)
    sklearn_results = learning_curve(MultinomialNB(),
                                     X,
                                     y,
                                     cv=cv,
                                     train_sizes=train_sizes,
                                     scoring='accuracy')
    train_sizes1, train_scores1, test_scores1 = sklearn_results

    # wrap the same data in a FeatureSet for use with the SKLL API
    feature_names = ['f{:02}'.format(n) for n in range(X.shape[1])]
    features = [dict(zip(feature_names, row)) for row in X]
    fs = FeatureSet('train',
                    features=features,
                    labels=y,
                    ids=list(range(X.shape[0])))

    # do not filter out any features since scikit-learn does not
    # do that either
    learner = Learner('MultinomialNB', min_feature_count=0)
    skll_results = learner.learning_curve(fs,
                                          cv_folds=cv_folds,
                                          train_sizes=train_sizes,
                                          metric='accuracy')
    (train_scores2, test_scores2, train_sizes2) = skll_results

    assert np.all(train_sizes1 == train_sizes2)
    assert np.allclose(train_scores1, train_scores2)
    assert np.allclose(test_scores1, test_scores2)
def make_rare_class_data():
    """
    Create a featureset with five instances for each of three labels where,
    within each group of five, every instance has a single feature firing.

    Returns
    -------
    skll.FeatureSet
        15 examples labeled 0, 1 and 2 (five each).
    """
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, 16)]
    y = [0] * 5 + [1] * 5 + [2] * 5

    # one identity matrix per label, stacked on top of each other
    X = np.vstack([np.identity(5)] * 3)

    feature_names = ['f{}'.format(i) for i in range(1, 6)]
    features = [dict(zip(feature_names, row)) for row in X]

    return FeatureSet('rare-class', ids, features=features, labels=y)
def read(self):
    """
    Loads examples in the `.arff`, `.csv`, `.jsonlines`, `.libsvm`,
    `.ndj`, or `.tsv` formats.

    Returns
    -------
    feature_set : skll.FeatureSet
        ``FeatureSet`` instance representing the input file.

    Raises
    ------
    ValueError
        If ``ids_to_floats`` is True, but IDs cannot be converted.
    ValueError
        If no features are found.
    ValueError
        If the example IDs are not unique.
    """
    self.logger.debug('Path: %s', self.path_or_list)

    # announce on stderr what is being loaded, unless asked to be quiet
    if not self.quiet:
        self._progress_msg = "Loading {}...".format(self.path_or_list)
        print(self._progress_msg, end="\r", file=sys.stderr)
        sys.stderr.flush()

    # pick the pandas-based or the row-based reader
    sub_reader = self._sub_read if self._use_pandas else self._sub_read_rows
    ids, labels, features = sub_reader(self.path_or_list)

    # turn the features into a matrix via the vectorizer
    features = self.vectorizer.fit_transform(features)

    # Report that loading is complete
    self._print_progress("done", end="\n")

    # ids, labels and features must all describe the same examples
    assert ids.shape[0] == labels.shape[0] == features.shape[0]

    # fewer distinct ids than ids means at least one is repeated
    if len(set(ids)) != ids.shape[0]:
        raise ValueError('The example IDs are not unique in %s.' %
                         self.path_or_list)

    return FeatureSet(self.path_or_list,
                      ids,
                      labels=labels,
                      features=features,
                      vectorizer=self.vectorizer)
def create_jsonlines_feature_files(path):
    """
    Create the five single-feature ``.jsonlines`` files under ``path``.

    Nothing is written if ``f0.jsonlines`` through ``f4.jsonlines`` all
    exist already. Each file holds one feature for the same 1000 examples.

    Parameters
    ----------
    path : str
        Directory in which the feature files are created.
    """
    # the files only need to be created if they do not all exist yet
    target_files = [join(path, 'f{}.jsonlines'.format(i)) for i in range(5)]
    if all(exists(target) for target in target_files):
        return

    num_examples = 1000
    np.random.seed(1234567890)

    # generate all of the data up front; every file is a slice of it
    ids = []
    features = []
    labels = []
    for j in range(num_examples):
        y = "cat" if j % 2 else "dog"
        drawn = {"f{}".format(feat_num): np.random.randint(0, 4)
                 for feat_num in range(5)}
        ids.append("{}{}".format(y, j))
        labels.append(y)
        features.append(OrderedDict(sorted(drawn.items(),
                                           key=lambda item: item[0])))

    # write one file per feature
    for i in range(5):
        feature_key = "f{}".format(i)
        single_feature = [{feature_key: features[example_num][feature_key]}
                          for example_num in range(num_examples)]
        fs = FeatureSet('ablation_cv',
                        ids,
                        features=single_feature,
                        labels=labels)
        NDJWriter(join(path, 'f{}.jsonlines'.format(i)), fs).write()
def test_writing_ndj_featureset_with_string_ids():
    """
    Check that a featureset with string IDs survives being written to and
    read back from a ``.jsonlines`` file.
    """
    # vectorize a hand-written list of feature dictionaries
    dict_vectorizer = DictVectorizer()
    feature_dicts = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}]
    matrix = dict_vectorizer.fit_transform(feature_dicts)

    original_fs = FeatureSet('test',
                             ids=['1', '2'],
                             labels=[1, 2],
                             features=matrix,
                             vectorizer=dict_vectorizer)

    # write the featureset out ...
    output_path = join(_my_dir, "other", "test_string_ids.jsonlines")
    NDJWriter(output_path, original_fs).write()

    # ... then read it back in and confirm the two featuresets are equal
    reloaded_fs = NDJReader.for_path(output_path).read()
    assert original_fs == reloaded_fs
def test_empty_ids():
    """
    Test to ensure that an error is raised if ids is None
    """
    # 100 instances with 4 features each
    X, y = make_classification(n_samples=100,
                               n_features=4,
                               n_informative=4,
                               n_redundant=0,
                               n_classes=3,
                               random_state=1234567890)

    # one feature dictionary per row
    names = ['f{}'.format(n) for n in range(1, 5)]
    feature_dicts = [dict(zip(names, row)) for row in X]

    # creating a featureset with ids set to None should raise a ValueError
    FeatureSet('test', None, features=feature_dicts, labels=y)
def make_ablation_data():
    """
    Remove old ablation CV results and write the five single-feature
    training files ``f0.jsonlines`` to ``f4.jsonlines``.

    Each file holds one feature for the same 1000 examples and is written
    into the ``train`` directory under ``_my_dir``.
    """
    # remove old CV data
    stale_pattern = join(_my_dir, 'output', 'ablation_cv_*.results')
    for stale_file in glob.glob(stale_pattern):
        os.remove(stale_file)

    num_examples = 1000
    np.random.seed(1234567890)

    # generate all of the data up front; every file is a slice of it
    ids, features, labels = [], [], []
    for j in range(num_examples):
        y = "cat" if j % 2 else "dog"
        drawn = {"f{}".format(feat_num): np.random.randint(0, 4)
                 for feat_num in range(5)}
        ids.append("{}{}".format(y, j))
        labels.append(y)
        features.append(OrderedDict(sorted(drawn.items(),
                                           key=lambda item: item[0])))

    # write one training file per feature
    for i in range(5):
        feature_key = "f{}".format(i)
        single_feature = [{feature_key: features[example_num][feature_key]}
                          for example_num in range(num_examples)]
        train_fs = FeatureSet('ablation_cv',
                              ids,
                              features=single_feature,
                              labels=labels)
        train_path = join(_my_dir, 'train', 'f{}.jsonlines'.format(i))
        NDJWriter(train_path, train_fs).write()
def test_mismatch_ids_features():
    """
    Test to catch a mismatch between the shape of the ids vector
    and the feature matrix
    """
    # 100 instances with 4 features each
    X, y = make_classification(n_samples=100,
                               n_features=4,
                               n_informative=4,
                               n_redundant=0,
                               n_classes=3,
                               random_state=1234567890)

    # one feature dictionary per row
    names = ['f{}'.format(n) for n in range(1, 5)]
    feature_dicts = [dict(zip(names, row)) for row in X]

    # 200 ids: deliberately twice the number of feature rows
    too_many_ids = ['EXAMPLE_{}'.format(i) for i in range(200)]

    # This should raise a ValueError
    FeatureSet('test', too_many_ids, features=feature_dicts, labels=y)
def make_merging_data(num_feat_files, suffix, numeric_ids):
    """
    Write the merged and unmerged feature files used by the merging tests.

    Parameters
    ----------
    num_feat_files : int
        Number of feature subsets; each one holds 17 features.
    suffix : str
        File name of the unmerged output; the merged output is named
        ``all`` followed by this suffix.
    numeric_ids : bool
        Whether to use the example number itself as the ID instead of
        the label followed by the example number.
    """
    num_examples = 500
    num_feats_per_file = 17

    np.random.seed(1234567890)

    # make sure the output directory exists
    merge_dir = join(_my_dir, 'train', 'test_merging')
    if not exists(merge_dir):
        os.makedirs(merge_dir)

    # generate the full data that the files are written from
    total_feats = num_feat_files * num_feats_per_file
    ids, features, labels = [], [], []
    for j in range(num_examples):
        y = "cat" if j % 2 else "dog"
        ex_id = j if numeric_ids else "{}{}".format(y, j)
        drawn = {"f{:03d}".format(feat_num): np.random.randint(0, 4)
                 for feat_num in range(total_feats)}
        ids.append(ex_id)
        labels.append(y)
        features.append(OrderedDict(sorted(drawn.items(),
                                           key=lambda item: item[0])))

    # Unmerged: subset i covers the i-th run of consecutive features
    subset_dict = {}
    for i in range(num_feat_files):
        first_feat = i * num_feats_per_file
        subset_dict['{}'.format(i)] = ["f{:03d}".format(first_feat + j)
                                       for j in range(num_feats_per_file)]
    train_fs = FeatureSet('train', ids, labels=labels, features=features)
    Writer.for_path(join(merge_dir, suffix),
                    train_fs,
                    subsets=subset_dict).write()

    # Merged: everything in a single file
    merged_path = join(merge_dir, 'all{}'.format(suffix))
    Writer.for_path(merged_path, train_fs).write()
def test_reading_csv_and_tsv_with_fill_blanks_with_dictionary():
    """
    Check that the CSV and TSV readers fill blanks with the per-column
    values given as a dictionary via ``replace_blanks_with``.
    """
    # the same content with blanks, once comma- and once tab-separated
    test_csv = '1,1,6\n2,,2\n3,9,3\n,,\n,5,\n,,\n2,7,7'
    test_tsv = test_csv.replace(',', '\t')

    # the input has no header row, so name the columns explicitly
    kwargs = {'header': None, 'names': ['A', 'B', 'C']}

    # every blank is replaced by the value given for its column
    replacement_dict = {'A': 4.5, 'B': 2.5, 'C': 1}
    filled_rows = pd.DataFrame({'A': [1, 2, 3, 4.5, 4.5, 4.5, 2],
                                'B': [1, 2.5, 9, 2.5, 5, 2.5, 7],
                                'C': [6, 2, 3, 1, 1, 1, 7],
                                'L': [None] * 7},
                               index=['EXAMPLE_{}'.format(i)
                                      for i in range(7)])
    fs_expected = FeatureSet.from_data_frame(filled_rows,
                                             'test',
                                             labels_column='L')

    # read both inputs first, then compare both against the expectation
    loaded = []
    for reader_class, content in ((CSVReader, test_csv),
                                  (TSVReader, test_tsv)):
        fs = reader_class(StringIO(content),
                          replace_blanks_with=replacement_dict,
                          pandas_kwargs=kwargs).read()
        fs.name = 'test'
        loaded.append(fs)

    for fs in loaded:
        eq_(fs, fs_expected)
def make_regression_data(num_examples=100,
                         train_test_ratio=0.5,
                         num_features=2,
                         sd_noise=1.0,
                         use_feature_hashing=False,
                         feature_bins=4,
                         start_feature_num=1,
                         random_state=1234567890):
    """
    Generate train/test regression feature sets plus the true weights.

    Parameters
    ----------
    num_examples : int
        Total number of examples to generate.
    train_test_ratio : float
        Fraction of the examples placed in the training set.
    num_features : int
        Number of features; raised to ``feature_bins`` when feature
        hashing is requested with more bins than features.
    sd_noise : float
        Passed to ``make_regression`` as ``noise``.
    use_feature_hashing : bool
        Whether to attach a ``FeatureHasher`` to the feature sets and
        recompute the labels from the hashed features.
    feature_bins : int
        Number of bins for the ``FeatureHasher``.
    start_feature_num : int
        Number used in the name of the first feature.
    random_state : int
        Seed passed to ``make_regression``.

    Returns
    -------
    tuple
        ``(train_fs, test_fs, weightdict)`` where ``weightdict`` maps
        feature names (hashed names when hashing) to their weights.
    """
    # `make_regression()` knows nothing about hashing, so when more bins
    # than features are requested we generate one feature per bin
    if use_feature_hashing and num_features < feature_bins:
        num_features = feature_bins

    # let sklearn generate the data and the underlying coefficients
    X, y, weights = make_regression(n_samples=num_examples,
                                    n_features=num_features,
                                    noise=sd_noise,
                                    random_state=random_state,
                                    coef=True)

    # IDs for SKLL's FeatureSet class
    ids = ['EXAMPLE_{}'.format(num) for num in range(1, num_examples + 1)]

    # zero-padded feature names; the padding width is derived from the
    # number of features
    name_width = int(floor(log10(num_features))) + 1
    last_feature_num = start_feature_num + num_features
    feature_names = ['f{}'.format(str(num).zfill(name_width))
                     for num in range(start_feature_num, last_feature_num)]
    features = [dict(zip(feature_names, row)) for row in X]

    if use_feature_hashing:
        # The labels from `make_regression()` were computed on the unhashed
        # features. Recompute them the way sklearn does internally -- as
        # the dot product of the features and the weights -- but on the
        # hashed features, using only as many weights as there are bins.
        hasher = FeatureHasher(n_features=feature_bins)
        hashed_X = hasher.fit_transform(features)
        bin_weights = weights[:feature_bins]
        y = hashed_X.dot(bin_weights)

        # use the names that `model_params()` would output for hashed
        # features, since that is what we would get from SKLL
        hashed_width = int(floor(log10(feature_bins))) + 1
        hashed_feature_names = [
            'hashed_feature_{}'.format(str(bin_num + 1).zfill(hashed_width))
            for bin_num in range(feature_bins)
        ]
        weightdict = dict(zip(hashed_feature_names, weights[:feature_bins]))
    else:
        weightdict = dict(zip(feature_names, weights))

    # split everything into training and testing portions
    split_at = int(round(train_test_ratio * num_examples))
    train_features = features[:split_at]
    test_features = features[split_at:]
    train_y = y[:split_at]
    test_y = y[split_at:]
    train_ids = ids[:split_at]
    test_ids = ids[split_at:]

    # a single vectorizer (or None) is shared by both feature sets
    if use_feature_hashing:
        vectorizer = FeatureHasher(n_features=feature_bins)
    else:
        vectorizer = None

    train_fs = FeatureSet('regression_train',
                          train_ids,
                          labels=train_y,
                          features=train_features,
                          vectorizer=vectorizer)
    test_fs = FeatureSet('regression_test',
                         test_ids,
                         labels=test_y,
                         features=test_features,
                         vectorizer=vectorizer)

    return (train_fs, test_fs, weightdict)
def make_classification_data(num_examples=100,
                             train_test_ratio=0.5,
                             num_features=10,
                             use_feature_hashing=False,
                             feature_bins=4,
                             num_labels=2,
                             empty_labels=False,
                             string_label_list=None,
                             feature_prefix='f',
                             id_type='string',
                             class_weights=None,
                             non_negative=False,
                             one_string_feature=False,
                             num_string_values=4,
                             random_state=1234567890):
    """
    Generate train/test classification feature sets.

    Parameters
    ----------
    num_examples : int
        Total number of examples to generate.
    train_test_ratio : float
        Fraction of the examples placed in the training set.
    num_features : int
        Total number of features, including the string feature if any.
    use_feature_hashing : bool
        Whether to attach a ``FeatureHasher`` to the feature sets.
    feature_bins : int
        Number of bins for the ``FeatureHasher``.
    num_labels : int
        Number of classes, passed to ``make_classification``.
    empty_labels : bool
        If true, the feature sets are created with ``labels=None``.
    string_label_list : list of str, optional
        If given, the integer labels are replaced by the strings at the
        corresponding positions; must contain ``num_labels`` items.
    feature_prefix : str
        Prefix for the generated feature names.
    id_type : str
        One of ``'string'``, ``'integer_string'``, ``'float'`` or
        ``'integer'``; controls the type/format of the example IDs.
    class_weights : list of float, optional
        Passed to ``make_classification`` as ``weights``.
    non_negative : bool
        If true, the absolute values of the generated features are used.
    one_string_feature : bool
        If true, the last feature is a randomly chosen string value
        instead of a number.
    num_string_values : int
        Number of distinct values the string feature can take.
    random_state : int
        Seed for ``make_classification`` and for the string feature.

    Returns
    -------
    tuple
        ``(train_fs, test_fs)``; ``test_fs`` is ``None`` unless
        ``train_test_ratio`` is less than 1.0.

    Raises
    ------
    ValueError
        If ``id_type`` is not one of the supported values.
    """
    # use sklearn's make_classification to generate the data for us;
    # one column is reserved for the string feature if it was requested
    num_numeric_features = (num_features - 1 if one_string_feature
                            else num_features)
    X, y = make_classification(n_samples=num_examples,
                               n_features=num_numeric_features,
                               n_informative=num_numeric_features,
                               n_redundant=0,
                               n_classes=num_labels,
                               weights=class_weights,
                               random_state=random_state)

    # map the integer labels to the given strings, if any
    if string_label_list:
        assert (len(string_label_list) == num_labels)
        label_to_string = np.vectorize(lambda n: string_label_list[n])
        y = label_to_string(y)

    # if we were told to only generate non-negative features, then
    # we can simply take the absolute values of the generated features
    if non_negative:
        X = abs(X)

    # since we want to use SKLL's FeatureSet class, we need to
    # create a list of IDs; we create IDs that either can also
    # be numbers or pure strings
    if id_type == 'string':
        ids = ['EXAMPLE_{}'.format(n) for n in range(1, num_examples + 1)]
    elif id_type == 'integer_string':
        ids = ['{}'.format(n) for n in range(1, num_examples + 1)]
    elif id_type == 'float':
        ids = [float(n) for n in range(1, num_examples + 1)]
    elif id_type == 'integer':
        ids = list(range(1, num_examples + 1))
    else:
        # fail here with a clear message instead of hitting an unbound
        # `ids` variable further down
        raise ValueError("Unsupported id_type: {!r}; expected one of "
                         "'string', 'integer_string', 'float' or "
                         "'integer'.".format(id_type))

    # create a string feature that has `num_string_values` possible
    # values ('a', 'b', 'c', ...) and add it to X at the end
    if one_string_feature:
        prng = RandomState(random_state)
        # NOTE(review): `random_integers` is believed to be deprecated in
        # newer NumPy releases -- confirm before upgrading NumPy.
        random_indices = prng.random_integers(0,
                                              num_string_values - 1,
                                              num_examples)
        possible_values = [chr(x) for x in range(97, 97 + num_string_values)]
        string_feature_values = [possible_values[i] for i in random_indices]
        # one row per example: the column height must follow
        # `num_examples` rather than being fixed at 100
        string_feature_column = np.array(string_feature_values,
                                         dtype=object).reshape(num_examples,
                                                               1)
        X = np.append(X, string_feature_column, 1)

    # create a list of dictionaries as the features
    feature_names = [
        '{}{:02d}'.format(feature_prefix, n)
        for n in range(1, num_features + 1)
    ]
    features = [dict(zip(feature_names, row)) for row in X]

    # split everything into training and testing portions
    num_train_examples = int(round(train_test_ratio * num_examples))
    train_features, test_features = (features[:num_train_examples],
                                     features[num_train_examples:])
    train_y, test_y = y[:num_train_examples], y[num_train_examples:]
    train_ids, test_ids = ids[:num_train_examples], ids[num_train_examples:]

    # are we told to generate empty labels
    train_labels = None if empty_labels else train_y
    test_labels = None if empty_labels else test_y

    # create a FeatureHasher if we are asked to use feature hashing
    # with the specified number of feature bins
    vectorizer = (FeatureHasher(n_features=feature_bins)
                  if use_feature_hashing else None)
    train_fs = FeatureSet('classification_train',
                          train_ids,
                          labels=train_labels,
                          features=train_features,
                          vectorizer=vectorizer)

    # only build a test set if some examples were left for it
    if train_test_ratio < 1.0:
        test_fs = FeatureSet('classification_test',
                             test_ids,
                             labels=test_labels,
                             features=test_features,
                             vectorizer=vectorizer)
    else:
        test_fs = None

    return (train_fs, test_fs)
def make_conversion_data(num_feat_files,
                         from_suffix,
                         to_suffix,
                         with_labels=True):
    """
    Write the feature files used for testing file-format conversion.

    Generates 500 examples (IDs ``dog0``, ``cat1``, ...) with
    ``num_feat_files * 7`` seeded random integer features each. Under
    ``<_my_dir>/train/test_conversion`` it then writes ``num_feat_files``
    files in the ``from_suffix`` format, each holding a block of 7
    consecutive features, and one file with all features in the
    ``to_suffix`` format.

    NOTE(review): a function with this same name is defined again later
    in this module; the later definition is the one callers will get.

    Parameters
    ----------
    num_feat_files : int
        Number of unmerged feature files to write.
    from_suffix : str
        File suffix (e.g. ``.libsvm``) for the unmerged files.
    to_suffix : str
        File suffix for the merged file.
    with_labels : bool
        Whether the examples are labeled ("dog"/"cat"). When false, file
        names contain ``_unlabeled`` and feature values start at 1.
    """
    num_examples = 500
    num_feats_per_file = 7
    np.random.seed(1234567890)

    convert_dir = join(_my_dir, 'train', 'test_conversion')
    if not exists(convert_dir):
        os.makedirs(convert_dir)

    def _write_featureset(path, featureset, suffix):
        """Write ``featureset`` to ``path`` with the options ``suffix`` needs."""
        if suffix == '.libsvm':
            writer = Writer.for_path(path, featureset, label_map=label_map)
        elif suffix in ['.arff', '.csv', '.tsv']:
            label_col = 'y' if with_labels else None
            writer = Writer.for_path(path, featureset, label_col=label_col)
        else:
            writer = Writer.for_path(path, featureset)
        writer.write()

    # if we are not using labels, we do not want zero-valued features
    # because it may be the case that some subset of features end up
    # being all 0 and if this subset ends up being written out to a file
    # below, then for some formats (e.g., megam) nothing will get written
    # out which can cause issues when reading this file
    lowest_feature_value = 0 if with_labels else 1
    highest_feature_value = 4 + lowest_feature_value
    total_num_feats = num_feat_files * num_feats_per_file

    # Create lists we will write files from
    ids = []
    features = []
    labels = [] if with_labels else None
    for example_num in range(num_examples):
        animal = "cat" if example_num % 2 else "dog"
        drawn = {
            "f{:03d}".format(feat_num): np.random.randint(
                lowest_feature_value, highest_feature_value)
            for feat_num in range(total_num_feats)
        }
        ids.append("{}{}".format(animal, example_num))
        if with_labels:
            labels.append(animal)
        features.append(OrderedDict(sorted(drawn.items(),
                                           key=lambda pair: pair[0])))

    # Create vectorizers/maps for libsvm subset writing
    feat_vectorizer = DictVectorizer()
    feat_vectorizer.fit(features)
    label_map = None
    if with_labels:
        non_numeric_labels = {label for label in labels
                              if not isinstance(label, (int, float))}
        label_map = {label: num
                     for num, label in enumerate(sorted(non_numeric_labels))}
        # Add fake item to vectorizer for None
        label_map[None] = '00000'

    # get the feature name prefix
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # use '_unlabeled' as part of any file names when not using labels
    with_labels_part = '' if with_labels else '_unlabeled'

    # Write out unmerged features in the `from_suffix` file format
    for file_num in range(num_feat_files):
        first_feat_num = file_num * num_feats_per_file
        sub_names = ["f{:03d}".format(first_feat_num + offset)
                     for offset in range(num_feats_per_file)]
        sub_features = [{name: example[name] for name in sub_names}
                        for example in features]
        sub_fs = FeatureSet('sub_train',
                            ids,
                            labels=labels,
                            features=sub_features,
                            vectorizer=feat_vectorizer)
        sub_path = join(convert_dir,
                        '{}_{}{}{}'.format(feature_name_prefix,
                                           file_num,
                                           with_labels_part,
                                           from_suffix))
        _write_featureset(sub_path, sub_fs, from_suffix)

    # Write out the merged features in the `to_suffix` file format
    merged_path = join(convert_dir,
                       '{}{}_all{}'.format(feature_name_prefix,
                                           with_labels_part,
                                           to_suffix))
    merged_fs = FeatureSet('train',
                           ids,
                           labels=labels,
                           features=features,
                           vectorizer=feat_vectorizer)

    # we need to do this to get around the FeatureSet using NaNs
    # instead of None when there are no labels which causes problems
    # later when comparing featuresets
    if not with_labels:
        merged_fs.labels = [None] * len(merged_fs.labels)

    _write_featureset(merged_path, merged_fs, to_suffix)
def make_conversion_data(num_feat_files,
                         from_suffix,
                         to_suffix,
                         with_labels=True):
    """
    Create the input and expected-output files for the conversion tests.

    500 examples are generated with IDs made of the label ("dog" for even
    indices, "cat" for odd ones) followed by the index. Every example has
    ``num_feat_files * 7`` features named ``f000``, ``f001``, ... whose
    values come from a seeded ``np.random.randint``. The features are
    written to ``<_my_dir>/train/test_conversion`` as ``num_feat_files``
    files in the ``from_suffix`` format (7 consecutive features per file)
    and as a single file with all features in the ``to_suffix`` format.

    Parameters
    ----------
    num_feat_files : int
        How many unmerged files to produce.
    from_suffix : str
        Suffix, including the leading dot, of the unmerged files.
    to_suffix : str
        Suffix, including the leading dot, of the merged file.
    with_labels : bool
        If false, no labels are attached, ``_unlabeled`` is added to the
        file names, and the feature values are drawn from 1-4 rather
        than 0-3.
    """
    total_examples = 500
    feats_per_file = 7
    np.random.seed(1234567890)

    out_dir = join(_my_dir, 'train', 'test_conversion')
    if not exists(out_dir):
        os.makedirs(out_dir)

    # Without labels we avoid zero-valued features: a subset of features
    # that is all 0 may result in nothing being written for some formats
    # (e.g., megam), which can cause issues when reading the file back.
    min_value = 0 if with_labels else 1
    all_feat_names = ["f{:03d}".format(num)
                      for num in range(num_feat_files * feats_per_file)]

    # Create lists we will write files from
    ids = []
    features = []
    labels = [] if with_labels else None
    for index in range(total_examples):
        label = "dog" if index % 2 == 0 else "cat"
        ids.append("{}{}".format(label, index))
        if with_labels:
            labels.append(label)
        unsorted_values = {name: np.random.randint(min_value, 4 + min_value)
                           for name in all_feat_names}
        by_name = sorted(unsorted_values.items(), key=lambda kv: kv[0])
        features.append(OrderedDict(by_name))

    # Create vectorizers/maps for libsvm subset writing
    feat_vectorizer = DictVectorizer()
    feat_vectorizer.fit(features)
    if with_labels:
        string_labels = sorted({
            label for label in labels
            if not isinstance(label, (int, float))
        })
        label_map = {label: position
                     for position, label in enumerate(string_labels)}
        # Add fake item to vectorizer for None
        label_map[None] = '00000'
    else:
        label_map = None

    def writer_options(file_suffix):
        """Return the extra ``Writer.for_path`` keyword arguments for a suffix."""
        if file_suffix == '.libsvm':
            return {'label_map': label_map}
        if file_suffix in ['.arff', '.csv', '.tsv']:
            return {'label_col': 'y' if with_labels else None}
        return {}

    # pieces shared by all output file names
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))
    # use '_unlabeled' as part of any file names when not using labels
    with_labels_part = '' if with_labels else '_unlabeled'

    # Write out unmerged features in the `from_suffix` file format
    from_options = writer_options(from_suffix)
    for file_index in range(num_feat_files):
        start = file_index * feats_per_file
        chunk_names = all_feat_names[start:start + feats_per_file]

        sub_features = []
        for example_features in features:
            sub_features.append({name: example_features[name]
                                 for name in chunk_names})

        file_name = '{}_{}{}{}'.format(feature_name_prefix,
                                       file_index,
                                       with_labels_part,
                                       from_suffix)
        train_fs = FeatureSet('sub_train',
                              ids,
                              labels=labels,
                              features=sub_features,
                              vectorizer=feat_vectorizer)
        Writer.for_path(join(out_dir, file_name),
                        train_fs,
                        **from_options).write()

    # Write out the merged features in the `to_suffix` file format
    merged_name = '{}{}_all{}'.format(feature_name_prefix,
                                      with_labels_part,
                                      to_suffix)
    train_fs = FeatureSet('train',
                          ids,
                          labels=labels,
                          features=features,
                          vectorizer=feat_vectorizer)

    # we need to do this to get around the FeatureSet using NaNs
    # instead of None when there are no labels which causes problems
    # later when comparing featuresets
    if not with_labels:
        train_fs.labels = [None] * len(train_fs.labels)

    Writer.for_path(join(out_dir, merged_name),
                    train_fs,
                    **writer_options(to_suffix)).write()