def featureset_creation_from_dataframe_helper(with_labels, use_feature_hasher):
    """
    Build the pair of FeatureSets compared by the unit tests for
    ``FeatureSet.from_data_frame()``.

    The first FeatureSet is constructed directly from a list of feature
    dictionaries; the second is constructed from a pandas DataFrame that
    holds the same data. Labels are optional, so callers can request the
    pair with or without them.

    Parameters
    ----------
    with_labels
        If true, both FeatureSets carry the generated labels.
    use_feature_hasher
        If true, both FeatureSets use a ``FeatureHasher`` with 4 bins as
        their vectorizer; otherwise no vectorizer is passed.

    Returns
    -------
    tuple
        ``(expected, current)`` -- the directly constructed FeatureSet and
        the one produced by ``FeatureSet.from_data_frame()``.
    """
    import pandas

    # Test data: 100 instances with 4 features each, spread over 3 classes.
    matrix, labels = make_classification(n_samples=100,
                                         n_features=4,
                                         n_informative=4,
                                         n_redundant=0,
                                         n_classes=3,
                                         random_state=1234567890)

    # 0-99 would coincide with pandas' default index, so use 100-199 to
    # make sure the ids we supply are the ones that actually get used.
    example_ids = list(range(100, 200))
    name = 'test'

    # Only run with a vectorizer when feature hashing was requested.
    hasher = None
    if use_feature_hasher:
        hasher = FeatureHasher(n_features=4)

    # One {feature name: value} dictionary per instance.
    columns = ['f{}'.format(idx) for idx in range(1, 5)]
    feature_dicts = [dict(zip(columns, values)) for values in matrix]

    # FeatureSet built directly from the dictionaries.
    direct_kwargs = {'features': feature_dicts, 'vectorizer': hasher}
    if with_labels:
        direct_kwargs['labels'] = labels
    expected = FeatureSet(name, example_ids, **direct_kwargs)

    # FeatureSet built by way of a DataFrame holding the same data.
    frame = pandas.DataFrame(feature_dicts, index=example_ids)
    frame_kwargs = {'vectorizer': hasher}
    if with_labels:
        frame['y'] = labels
        frame_kwargs['labels_column'] = 'y'
    current = FeatureSet.from_data_frame(frame, name, **frame_kwargs)

    return expected, current
def featureset_creation_from_dataframe_helper(with_labels, use_feature_hasher):
    """
    Helper for the unit tests of ``FeatureSet.from_data_frame()``.

    Creates the same data twice -- once as a FeatureSet built directly from
    feature dictionaries and once as a FeatureSet built from a pandas
    DataFrame -- and returns both so the caller can compare them. Since
    labels are optional, ``with_labels`` selects whether they are attached;
    ``use_feature_hasher`` selects whether a ``FeatureHasher`` is used as
    the vectorizer.

    Returns the tuple ``(expected, current)``.
    """
    import pandas

    # NOTE(review): a definition of this helper with the same name and body
    # also appears directly before this one in the file; confirm whether
    # the duplicate is intentional.

    # First, set up the test data: 100 instances with 4 features each.
    X, y = make_classification(n_samples=100,
                               n_features=4,
                               n_informative=4,
                               n_redundant=0,
                               n_classes=3,
                               random_state=1234567890)

    # Not 0-99, because those would be pandas' default index names anyway;
    # this way we can tell that pandas is using the ids we supply.
    ids = list(range(100, 200))
    featureset_name = 'test'

    # With feature hashing enabled, run with a 4-bin vectorizer.
    feature_bins = 4
    if use_feature_hasher:
        vectorizer = FeatureHasher(n_features=feature_bins)
    else:
        vectorizer = None

    # Convert the feature matrix into a list of dictionaries.
    feature_names = ['f{}'.format(n) for n in range(1, 5)]
    features = [dict(zip(feature_names, row)) for row in X]

    # Unlabeled case: neither FeatureSet gets labels.
    if not with_labels:
        expected = FeatureSet(featureset_name,
                              ids,
                              features=features,
                              vectorizer=vectorizer)
        df = pandas.DataFrame(features, index=ids)
        current = FeatureSet.from_data_frame(df,
                                             featureset_name,
                                             vectorizer=vectorizer)
        return (expected, current)

    # Labeled case: pass the labels directly, and as column 'y' of the frame.
    expected = FeatureSet(featureset_name,
                          ids,
                          features=features,
                          labels=y,
                          vectorizer=vectorizer)
    df = pandas.DataFrame(features, index=ids)
    df['y'] = y
    current = FeatureSet.from_data_frame(df,
                                         featureset_name,
                                         labels_column='y',
                                         vectorizer=vectorizer)
    return (expected, current)
def test_reading_csv_and_tsv_with_drop_blanks():
    """
    Read CSV and TSV text containing blank cells with ``drop_blanks=True``
    and compare each result against a FeatureSet holding only the three
    fully populated rows.
    """
    # The same table in both formats; several cells are blank.
    csv_text = '1,1,6\n2,,2\n3,9,3\n,,\n,5,\n,,\n2,7,7'
    tsv_text = csv_text.replace(',', '\t')

    # pandas_kwargs shared by both readers: no header row, explicit names.
    reader_kwargs = {'header': None, 'names': ['A', 'B', 'C']}

    # Expected content: the rows without any blank cell, and no labels.
    complete_rows = pd.DataFrame({'A': [1, 3, 2],
                                  'B': [1, 9, 7],
                                  'C': [6, 3, 7],
                                  'L': [None] * 3},
                                 index=['EXAMPLE_{}'.format(i) for i in range(3)])
    fs_expected = FeatureSet.from_data_frame(complete_rows,
                                             'test',
                                             labels_column='L')

    # Read with both readers first, then compare, as two separate passes.
    parsed = []
    for reader_cls, text in ((CSVReader, csv_text), (TSVReader, tsv_text)):
        fs = reader_cls(StringIO(text),
                        drop_blanks=True,
                        pandas_kwargs=reader_kwargs).read()
        # Align the name with the expected FeatureSet before comparing.
        fs.name = 'test'
        parsed.append(fs)

    for fs in parsed:
        eq_(fs, fs_expected)
def test_reading_csv_and_tsv_with_fill_blanks_with_dictionary():
    """
    Read CSV and TSV text containing blank cells while passing a
    per-column dictionary as ``replace_blanks_with``, and compare each
    result against a FeatureSet in which every blank holds its column's
    replacement value.
    """
    # The same table in both formats; several cells are blank.
    csv_text = '1,1,6\n2,,2\n3,9,3\n,,\n,5,\n,,\n2,7,7'
    tsv_text = csv_text.replace(',', '\t')

    # pandas_kwargs shared by both readers: no header row, explicit names.
    reader_kwargs = {'header': None, 'names': ['A', 'B', 'C']}

    # Replacement value for blanks, keyed by column name.
    replacement_dict = {'A': 4.5, 'B': 2.5, 'C': 1}

    # Expected content: all seven rows, blanks filled in, and no labels.
    n_rows = 7
    filled = pd.DataFrame({'A': [1, 2, 3, 4.5, 4.5, 4.5, 2],
                           'B': [1, 2.5, 9, 2.5, 5, 2.5, 7],
                           'C': [6, 2, 3, 1, 1, 1, 7],
                           'L': [None] * n_rows},
                          index=['EXAMPLE_{}'.format(i) for i in range(n_rows)])
    fs_expected = FeatureSet.from_data_frame(filled,
                                             'test',
                                             labels_column='L')

    # Read with both readers first, then compare, as two separate passes.
    parsed = []
    for reader_cls, text in ((CSVReader, csv_text), (TSVReader, tsv_text)):
        fs = reader_cls(StringIO(text),
                        replace_blanks_with=replacement_dict,
                        pandas_kwargs=reader_kwargs).read()
        # Align the name with the expected FeatureSet before comparing.
        fs.name = 'test'
        parsed.append(fs)

    for fs in parsed:
        eq_(fs, fs_expected)