Exemplo n.º 1
0
    def test_filter(self):
        """Filter drops rows with a missing value in any listed column.

        The frame is written to a CSV with missing values encoded as
        ``'?'`` and read back through a FileDataStream with an explicit
        schema. Two of the four rows contain a missing value, so two rows
        and all four columns are expected to remain.
        """
        with_nans = pd.DataFrame(
            data=dict(Sepal_Length=[2.5, np.nan, 2.1, 1.0],
                      Sepal_Width=[.75, .9, .8, .76],
                      Petal_Length=[np.nan, 2.5, 2.6, 2.4],
                      Petal_Width=[.8, .7, .9, 0.7]))

        tmpfile = 'tmpfile_with_nans.csv'
        with_nans.to_csv(tmpfile, index=False, na_rep='?')

        # From here on the file exists: remove it even when an assertion
        # fails, so a failing run does not leave the CSV behind.
        try:
            file_schema = 'sep=, col=Petal_Length:R4:0 col=Petal_Width:R4:1 ' \
                          'col=Sepal_Length:R4:2 col=Sepal_Width:R4:3 header+'
            data = FileDataStream(tmpfile, schema=file_schema)

            xf = Filter(columns=[
                'Petal_Length', 'Petal_Width', 'Sepal_Length', 'Sepal_Width'
            ])

            features = xf.fit_transform(data)

            assert features.shape == (2, 4)
            print(features.columns)
            # columns ordering changed between 0.22 and 0.23, so compare
            # the names as a set rather than as an ordered list
            assert set(features.columns) == {
                'Petal_Length', 'Petal_Width', 'Sepal_Length', 'Sepal_Width'
            }
        finally:
            os.remove(tmpfile)
    def test_inf(self):
        """Rows holding +inf or -inf are kept by Filter.

        After filtering on ``f0`` the infinite entries are still found at
        their original index labels in both columns.
        """
        # np.inf is used instead of the np.Infinity alias, which NumPy 2.0
        # removed; the value is the same.
        data = DataFrame(data=dict(f0=[np.inf, 1, 2, 3, 4, 5, 6],
                                   f1=[1, 2, -np.inf, 3, 4, 5, 6]))

        xf = Filter(columns=['f0'])
        filtered = xf.fit_transform(data)
        assert_equal(filtered['f0'][0], np.inf)
        assert_equal(filtered['f1'][2], -np.inf)
    def test_missing(self):
        """Filtering on one column keeps exactly its non-missing rows.

        For each column in turn, the number of rows returned by Filter is
        compared with the number of entries in that column that are either
        strings or not NaN.
        """
        frame = DataFrame(data=dict(f0=[np.nan, 1, 2, 3, 4, 5, 6],
                                    f1=[1, 2, np.nan, 3, 4, 5, 6],
                                    f2=[np.nan, 1, np.nan, 2, 3, np.nan, 4]))

        for name in frame.columns:
            kept = Filter(columns=[name]).fit_transform(frame)
            # strings are tested first so isnan is never called on a str
            expected_rows = sum(
                1 for value in frame[name]
                if isinstance(value, str) or not isnan(value))
            assert_equal(kept.shape[0], expected_rows)
Exemplo n.º 4
0
    def test_check_estimator_filter(self):
        """Smoke test: Filter bound to two columns returns a non-empty result."""
        train_frame = pd.DataFrame(data={
            'Sepal_Length': [2.5, np.nan, 2.1, 1.0],
            'Sepal_Width': [.75, .9, .8, .76],
            'Petal_Length': [np.nan, 2.5, 2.6, 2.4],
            'Petal_Width': [.8, .7, .9, 0.7],
            'Species': ["setosa", "virginica", "", 'versicolor'],
        })

        # columns are bound with the << operator instead of columns=...
        na_filter = Filter() << ["Sepal_Length", "Petal_Length"]
        transformed = na_filter.fit_transform(train_frame)

        assert transformed is not None
        assert len(transformed) > 0
Exemplo n.º 5
0
    def test_filter_no_renaming(self):
        """Check the error message when Filter is given a dict of columns.

        Two dictionaries are tried: one mapping a column to itself and one
        mapping a new name to an existing column. Whenever fitting raises
        a TypeError, its message must state that dictionaries are not
        allowed to specify input columns.
        """
        # Local import so the cleanup below works regardless of what the
        # enclosing module imports.
        import os

        with_nans = pd.DataFrame(
            data=dict(Sepal_Length=[2.5, np.nan, 2.1, 1.0],
                      Sepal_Width=[.75, .9, .8, .76],
                      Petal_Length=[np.nan, 2.5, 2.6, 2.4],
                      Petal_Width=[.8, .7, .9, 0.7],
                      Species=["setosa", "viginica", "", 'versicolor']))

        tmpfile = 'tmpfile_with_nans.csv'
        with_nans.to_csv(tmpfile, index=False)

        expected_message = 'Dictionaries are not allowed to specify input ' \
                           'columns.'
        column_mappings = (
            {'Petal_Length': 'Petal_Length'},
            {'Petal_Length2': 'Petal_Length'},
        )

        # Remove the temporary CSV even if a check below fails.
        try:
            file_schema = 'sep=, col=Petal_Length:R4:0 col=Petal_Width:R4:1 ' \
                          'col=Sepal_Length:R4:2 col=Sepal_Width:R4:3 ' \
                          'col=Species:TX:4 header+'
            data = FileDataStream(tmpfile, schema=file_schema)

            for mapping in column_mappings:
                # NOTE(review): this passes vacuously when no TypeError is
                # raised -- confirm whether the raise itself should be
                # required.
                try:
                    xf = Filter(columns=mapping)
                    xf.fit(data)
                except TypeError as e:
                    assert expected_message in str(e)
        finally:
            os.remove(tmpfile)
    def test_input_conversion_to_float_retains_other_column_types(self):
        """Columns a transform is not applied to keep their original dtype.

        Indicator, Filter and Handler are each applied to a single column
        of a frame holding an int32, a string and a float64 column; the
        dtypes of all columns in the result are then checked.
        """
        data = {
            'f0': [0, 1, 2, 3],
            'f1': ['2', '3', '4', '5'],
            'f2': [4, 5, np.nan, 9]
        }

        data = DataFrame(data).astype({
            'f0': np.int32,
            'f1': str,
            'f2': np.float64
        })

        # The builtins `object` and `bool` are used below instead of the
        # np.object / np.bool aliases: those aliases were plain references
        # to the builtins and were removed in NumPy 1.24.

        # Check Indicator
        xf = Indicator(columns={'f2.ind': 'f2'})
        result = xf.fit_transform(data)
        assert_equal(result.dtypes['f0'], np.int32)
        assert_equal(result.dtypes['f1'], object)
        assert_equal(result.dtypes['f2'], np.float64)
        assert_equal(result.dtypes['f2.ind'], bool)
        # row 2 is the one holding NaN in f2
        assert_equal(result.loc[2, 'f2.ind'], True)
        assert_equal(len(result), 4)

        # Check Filter
        xf = Filter(columns=['f2'])
        result = xf.fit_transform(data)
        assert_equal(len(result), 3)
        assert_equal(result.loc[2, 'f2'], 9.0)
        assert_equal(result.dtypes['f0'], np.int32)
        assert_equal(result.dtypes['f1'], object)
        # the filtered column itself comes back as float32
        assert_equal(result.dtypes['f2'], np.float32)

        xf = Filter(columns=['f1'])
        result = xf.fit_transform(data)
        assert_equal(len(result), 4)
        assert_equal(result.loc[3, 'f2'], 9.0)
        assert_equal(result.dtypes['f0'], np.int32)
        assert_equal(result.dtypes['f1'], np.float32)
        assert_equal(result.dtypes['f2'], np.float64)

        # Check Handler
        xf = Handler(columns=['f2'], replace_with='Mean')
        result = xf.fit_transform(data)
        assert_equal(len(result), 4)
        # mean of the non-missing f2 values 4, 5 and 9
        assert_equal(result.loc[2, 'f2.f2'], 6.0)
        assert_equal(result.dtypes['f0'], np.int32)
        assert_equal(result.dtypes['f1'], object)
        assert_equal(result.dtypes['f2.f2'], np.float32)
Exemplo n.º 7
0
    def test_get_fit_info_fastl(self):
        """get_fit_info reports the start node and the Filter node.

        Only the first two entries of the first element returned by
        ``get_fit_info`` are compared, after any 'operator' key has been
        dropped from every entry.
        """
        train_path = get_dataset("airquality").as_filepath()
        stream = FileDataStream(train_path,
                                DataSchema.read_schema(train_path))

        pipeline = Pipeline([
            Filter(columns=['Ozone']),
            FastLinearRegressor(feature=['Solar_R', 'Temp'], label='Ozone')
        ])

        all_columns = [
            'Unnamed0', 'Ozone', 'Solar_R', 'Wind', 'Temp', 'Month', 'Day'
        ]
        expected = [
            {
                'name': None,
                'outputs': list(all_columns),
                'schema_after': list(all_columns),
                'type': 'start',
            },
            {
                'inputs': ['Ozone'],
                'name': 'Filter',
                'outputs': ['Ozone'],
                'schema_after': list(all_columns),
                'type': 'transform',
            },
        ]

        steps = pipeline.get_fit_info(stream)[0]
        # 'operator' entries are removed before the comparison
        for step in steps:
            step.pop('operator', None)

        self.assertEqual(expected, steps[:2])
    def test_input_conversion_to_float(self):
        """Indicator, Filter and Handler accept a frame of mixed dtypes.

        The frame holds int8/int16/int32/int64 columns, a string column and
        a float64 column with a single NaN in row 2. Each transform is
        created without an explicit column list.
        """
        raw = {
            'f0': [0, 1, 2, 3],
            'f1': [1, 2, 3, 4],
            'f2': [1, 2, 3, 4],
            'f3': [1, 2, 3, 4],
            'f4': ['2', '3', '4', '5'],
            'f5': [4, 5, np.nan, 9]
        }
        column_types = {
            'f0': np.int8,
            'f1': np.int16,
            'f2': np.int32,
            'f3': np.int64,
            'f4': str,
            'f5': np.float64
        }
        frame = DataFrame(raw).astype(column_types)

        # Check Indicator: only the NaN cell is flagged. Clear that flag,
        # negate the frame, and every column must then be all-True.
        flags = Indicator().fit_transform(frame)
        assert_equal(flags.loc[2, 'f5'], True)
        flags.loc[2, 'f5'] = False
        not_flagged = ~flags
        for column_ok in not_flagged.all().tolist():
            self.assertTrue(column_ok)

        # Check Filter: the row holding the NaN is dropped
        kept = Filter().fit_transform(frame)
        assert_equal(len(kept), 3)
        assert_equal(kept.loc[2, 'f5'], 9.0)

        # Check Handler: the NaN is replaced by the mean of 4, 5 and 9
        handled = Handler(replace_with='Mean').fit_transform(frame)
        assert_equal(len(handled), 4)
        assert_equal(handled.loc[2, 'f5.f5'], 6.0)
        assert_equal(handled.loc[2, 'f5.IsMissing.f5'], 1.0)
Exemplo n.º 9
0
 'ColumnConcatenator': ColumnConcatenator(columns={'Features': [
     'Sepal_Length',
     'Sepal_Width',
     'Petal_Length',
     'Petal_Width',
     'Setosa']}),
 'ColumnSelector': ColumnSelector(columns=['Sepal_Width', 'Sepal_Length']),
 'ColumnDuplicator': ColumnDuplicator(columns={'dup': 'Sepal_Width'}),
 'CountSelector': CountSelector(count=5, columns=['Sepal_Width']),
 'DateTimeSplitter': DateTimeSplitter(prefix='dt'),
 'FastForestBinaryClassifier': FastForestBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'],
                                                          label='Setosa'),
 'FastLinearBinaryClassifier': FastLinearBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'],
                                                          label='Setosa'),
 'FastTreesTweedieRegressor': FastTreesTweedieRegressor(label='Ozone'),
 'Filter': Filter(columns=[ 'Petal_Length', 'Petal_Width']),
 'FromKey': Pipeline([
     ToKey(columns=['Sepal_Length']),
     FromKey(columns=['Sepal_Length'])
 ]),
 # GlobalContrastRowScaler currently requires a vector input to work
 'GlobalContrastRowScaler': Pipeline([
     ColumnConcatenator() << {
         'concated_columns': [
             'Petal_Length',
             'Sepal_Width',
             'Sepal_Length']},
     GlobalContrastRowScaler(columns={'normed_columns': 'concated_columns'})
 ]),
 'Handler': Handler(replace_with='Mean', columns={'NewVals': 'Petal_Length'}),
 'IidSpikeDetector': IidSpikeDetector(columns=['Sepal_Length']),
###############################################################################
# OrdinaryLeastSquaresRegressor
from nimbusml import Pipeline, FileDataStream, Role
from nimbusml.datasets import get_dataset
from nimbusml.linear_model import OrdinaryLeastSquaresRegressor
from nimbusml.preprocessing.missing_values import Filter

# use the built-in data set 'airquality' to create test and train data
#    Unnamed: 0  Ozone  Solar_R  Wind  Temp  Month  Day
# 0           1   41.0    190.0   7.4    67      5    1
# 1           2   36.0    118.0   8.0    72      5    2

train_file = get_dataset("airquality").as_filepath()

# explicit schema: seven R4 columns, comma separated, with a header row
schema = (
    "col=none:R4:0 col=ozone:R4:1 col=solar:R4:2 col=wind:R4:3 "
    "col=temp:R4:4 col=month:R4:5 col=day:R4:6 sep=, header=+"
)

fds = FileDataStream(train_file, schema=schema)

# set up pipeline: a Filter on the label column, then the regressor
label_filter = Filter() << ['ozone']
regressor = OrdinaryLeastSquaresRegressor() << {
    Role.Label: 'ozone',
    Role.Feature: ['solar', 'wind', 'temp', 'month', 'day']
}
pipe = Pipeline([label_filter, regressor])

# train the model, then evaluate it on the same data stream
fitted = pipe.fit(fds)
metrics, scores = fitted.test(fds, "ozone", output_scores=True)
print(metrics)
Exemplo n.º 11
0
###############################################################################
# Filter
import numpy as np
import pandas as pd
from nimbusml import FileDataStream
from nimbusml.preprocessing.missing_values import Filter

# small frame with one NaN in Sepal_Length and one in Petal_Length
with_nans = pd.DataFrame(
    data=dict(Sepal_Length=[2.5, np.nan, 2.1, 1.0],
              Sepal_Width=[.75, .9, .8, .76],
              Petal_Length=[np.nan, 2.5, 2.6, 2.4],
              Petal_Width=[.8, .7, .9, 0.7],
              Species=["setosa", "viginica", "", 'versicolor']))

# write NaNs to file to see how transforms work
tmpfile = 'tmpfile_with_nans.csv'
with_nans.to_csv(tmpfile, index=False)

# Example of an explicit schema for reading directly from text files.
# It is not passed to read_csv below; the printed schema is the one held
# by the resulting data stream. The first line ends with a space so the
# two literals do not fuse into "...R4:1col=Sepal_Length...".
schema = 'sep=, col=Petal_Length:R4:0 col=Petal_Width:R4:1 ' \
         'col=Sepal_Length:R4:2 col=Sepal_Width:R4:3 col=Species:TX:4 header+'
data = FileDataStream.read_csv(tmpfile)
print(data.schema)

# Filter is bound to both Sepal_Length and Petal_Length
nafilter = Filter() << ['Sepal_Length', 'Petal_Length']

print(with_nans)
print('NAFilter\n', nafilter.fit_transform(data))
Exemplo n.º 12
0
###############################################################################
# Filter
import numpy as np
import pandas as pd
from nimbusml import FileDataStream
from nimbusml.preprocessing.missing_values import Filter

# four measurements; two rows contain a NaN
measurements = {
    'Sepal_Length': [2.5, np.nan, 2.1, 1.0],
    'Sepal_Width': [.75, .9, .8, .76],
    'Petal_Length': [np.nan, 2.5, 2.6, 2.4],
    'Petal_Width': [.8, .7, .9, 0.7],
}
with_nans = pd.DataFrame(data=measurements)

# write the frame, NaNs included, to a file to show how this transform works
tmpfile = 'tmpfile_with_nans.csv'
with_nans.to_csv(tmpfile, index=False)

# read the file back as a data stream with float32 numeric columns
data = FileDataStream.read_csv(tmpfile, sep=',', numeric_dtype=np.float32)

# transform usage: apply the filter to all four columns
filtered_columns = [
    'Petal_Length', 'Petal_Width', 'Sepal_Length', 'Sepal_Width'
]
xf = Filter(columns=filtered_columns)

# fit and transform
features = xf.fit_transform(data)

# print features
print(features.head())
#    Petal_Length  Petal_Width  Sepal_Length  Sepal_Width
# 0           2.4          0.7           1.0         0.76