示例#1
0
def featureset_creation_from_dataframe_helper(with_labels, use_feature_hasher):
    """
    Build the pair of FeatureSets compared by the ``FeatureSet.from_data_frame()``
    unit tests.

    One FeatureSet is constructed directly from a list of feature dictionaries;
    the other is constructed from a pandas DataFrame holding the same data.
    Labels are optional, so callers run this once with and once without them.

    Parameters
    ----------
    with_labels : bool
        If true, labels are passed to both FeatureSets (via the ``'y'`` column
        for the DataFrame-based one).
    use_feature_hasher : bool
        If true, both FeatureSets share a ``FeatureHasher`` with 4 features as
        their vectorizer; otherwise the vectorizer is ``None``.

    Returns
    -------
    tuple
        ``(expected, current)``: the directly-built FeatureSet and the one
        built from the DataFrame.
    """
    import pandas

    # Generate the raw data: 100 instances, 4 features each, 3 classes.
    X, y = make_classification(
        n_samples=100,
        n_features=4,
        n_informative=4,
        n_redundant=0,
        n_classes=3,
        random_state=1234567890,
    )

    # Deliberately avoid 0-99: that is pandas' default index, so using
    # 100-199 verifies that the IDs we supply are the ones being used.
    example_ids = list(range(100, 200))
    fs_name = 'test'

    # Only create a vectorizer when use_feature_hasher is set.
    num_bins = 4
    vectorizer = None
    if use_feature_hasher:
        vectorizer = FeatureHasher(n_features=num_bins)

    # Turn each row of X into a {feature name: value} dictionary.
    columns = ['f{}'.format(idx) for idx in range(1, 5)]
    features = [dict(zip(columns, values)) for values in X]

    # Build the reference FeatureSet; `labels` is passed only when requested.
    direct_kwargs = {'features': features, 'vectorizer': vectorizer}
    if with_labels:
        direct_kwargs['labels'] = y
    expected = FeatureSet(fs_name, example_ids, **direct_kwargs)

    # Build the same data as a DataFrame and convert it into a FeatureSet.
    df = pandas.DataFrame(features, index=example_ids)
    frame_kwargs = {'vectorizer': vectorizer}
    if with_labels:
        df['y'] = y
        frame_kwargs['labels_column'] = 'y'
    current = FeatureSet.from_data_frame(df, fs_name, **frame_kwargs)

    return (expected, current)
示例#2
0
def featureset_creation_from_dataframe_helper(with_labels, use_feature_hasher):
    """
    Create two FeatureSets from the same synthetic data for the
    ``FeatureSet.from_data_frame()`` unit tests.

    The first is built straight from feature dictionaries, the second from an
    equivalent pandas DataFrame. Since labels are optional, the tests call
    this helper both with and without them.

    Parameters
    ----------
    with_labels : bool
        Whether to supply labels to both FeatureSets.
    use_feature_hasher : bool
        Whether to use a 4-feature ``FeatureHasher`` as the vectorizer
        (``None`` is used otherwise).

    Returns
    -------
    tuple
        ``(expected, current)``, i.e. the FeatureSet built directly and the
        one built via the DataFrame.
    """
    import pandas

    # Synthetic data set: 100 instances with 4 features each.
    data_matrix, targets = make_classification(
        n_samples=100, n_features=4, n_informative=4,
        n_redundant=0, n_classes=3, random_state=1234567890,
    )

    # 0-99 would coincide with pandas' default index, so use 100-199 to make
    # sure the supplied IDs are really the ones that end up in the index.
    instance_ids = list(range(100, 200))
    name = 'test'

    # Optional hashing vectorizer, shared by both FeatureSets.
    n_bins = 4
    hasher = FeatureHasher(n_features=n_bins) if use_feature_hasher else None

    # One dictionary per instance, mapping feature name to value.
    names = ['f{}'.format(number) for number in range(1, 5)]
    feature_dicts = [dict(zip(names, instance)) for instance in data_matrix]

    # FeatureSet built directly; labels are included only if requested.
    label_args = {'labels': targets} if with_labels else {}
    expected = FeatureSet(name, instance_ids, features=feature_dicts,
                          vectorizer=hasher, **label_args)

    # FeatureSet built from a DataFrame holding the same features (and labels).
    frame = pandas.DataFrame(feature_dicts, index=instance_ids)
    column_args = {}
    if with_labels:
        frame['y'] = targets
        column_args['labels_column'] = 'y'
    current = FeatureSet.from_data_frame(frame, name, vectorizer=hasher,
                                         **column_args)

    return (expected, current)
示例#3
0
def test_reading_csv_and_tsv_with_drop_blanks():
    """
    Check CSVReader and TSVReader output when ``drop_blanks=True``.

    The input has seven rows, four of which contain at least one blank. The
    expected FeatureSet holds only the three fully populated rows, with IDs
    ``EXAMPLE_0`` through ``EXAMPLE_2`` and no labels.
    """
    # The same content in CSV and TSV form, including rows with blanks.
    test_csv = '1,1,6\n2,,2\n3,9,3\n,,\n,5,\n,,\n2,7,7'
    test_tsv = test_csv.replace(',', '\t')

    # Options forwarded to the readers as pandas_kwargs: no header row, fixed names.
    kwargs = {'header': None, 'names': ['A', 'B', 'C']}

    # Reference FeatureSet built from the rows that have no blanks.
    expected_columns = {
        'A': [1, 3, 2],
        'B': [1, 9, 7],
        'C': [6, 3, 7],
        'L': [None, None, None],
    }
    expected_ids = ['EXAMPLE_0', 'EXAMPLE_1', 'EXAMPLE_2']
    expected = pd.DataFrame(expected_columns, index=expected_ids)
    fs_expected = FeatureSet.from_data_frame(expected, 'test', labels_column='L')

    # Read both formats first, naming each result to match the reference ...
    parsed = []
    for reader_class, contents in ((CSVReader, test_csv), (TSVReader, test_tsv)):
        fs = reader_class(StringIO(contents),
                          drop_blanks=True,
                          pandas_kwargs=kwargs).read()
        fs.name = 'test'
        parsed.append(fs)

    # ... and only then compare each one against the reference.
    for fs in parsed:
        eq_(fs, fs_expected)
示例#4
0
def test_reading_csv_and_tsv_with_fill_blanks_with_dictionary():
    """
    Check CSVReader and TSVReader output when ``replace_blanks_with`` is a
    dictionary.

    The dictionary maps column names to fill values (A: 4.5, B: 2.5, C: 1).
    The expected FeatureSet keeps all seven rows, with each blank replaced by
    the value given for its column, IDs ``EXAMPLE_0`` through ``EXAMPLE_6``
    and no labels.
    """
    # Per-column values used in place of blanks.
    replacement_dict = {'A': 4.5, 'B': 2.5, 'C': 1}

    # The same content in CSV and TSV form, including rows with blanks.
    test_csv = '1,1,6\n2,,2\n3,9,3\n,,\n,5,\n,,\n2,7,7'
    test_tsv = test_csv.replace(',', '\t')

    # Options forwarded to the readers as pandas_kwargs: no header row, fixed names.
    kwargs = {'header': None, 'names': ['A', 'B', 'C']}

    # Reference FeatureSet with every blank already filled in.
    expected_columns = {
        'A': [1, 2, 3, 4.5, 4.5, 4.5, 2],
        'B': [1, 2.5, 9, 2.5, 5, 2.5, 7],
        'C': [6, 2, 3, 1, 1, 1, 7],
        'L': [None, None, None, None, None, None, None],
    }
    expected_ids = [
        'EXAMPLE_0', 'EXAMPLE_1', 'EXAMPLE_2', 'EXAMPLE_3', 'EXAMPLE_4',
        'EXAMPLE_5', 'EXAMPLE_6',
    ]
    expected = pd.DataFrame(expected_columns, index=expected_ids)
    fs_expected = FeatureSet.from_data_frame(expected, 'test', labels_column='L')

    # Read both formats first, naming each result to match the reference ...
    parsed = []
    for reader_class, contents in ((CSVReader, test_csv), (TSVReader, test_tsv)):
        fs = reader_class(StringIO(contents),
                          replace_blanks_with=replacement_dict,
                          pandas_kwargs=kwargs).read()
        fs.name = 'test'
        parsed.append(fs)

    # ... and only then compare each one against the reference.
    for fs in parsed:
        eq_(fs, fs_expected)