示例#1
0
 def test_object_clone(self):
     """A cloned scaler reports the same parameters as its source.

     The column mapping must be exposed under the public ``columns`` key
     and never under the private ``_columns`` name.
     """
     source = MeanVarianceScaler() << {'new_y': 'yy'}
     duplicate = source.clone()
     params = source.get_params()
     assert params == duplicate.get_params()
     # Only the public spelling of the column mapping may show up.
     assert '_columns' not in params
     assert 'columns' in params
     assert source._columns is not None
     assert params['columns'] is not None
示例#2
0
    def test_pipeline_info(self):
        """Compare ``get_fit_info`` node descriptions with a fixed reference.

        Only the first element of the ``get_fit_info`` result is checked,
        and every ``'operator'`` entry is removed before the comparison.
        """
        df = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'yy': [1.1, 2.2, 1.24, 3.4, 3.4],
        })

        scaler = MeanVarianceScaler() << {'new_y': 'yy'}
        encoder = OneHotVectorizer() << ['workclass', 'education']
        dropper = Drop() << 'yy'
        regressor = FastLinearRegressor() << {
            'Feature': ['workclass', 'education'],
            Role.Label: 'new_y',
        }
        pipeline = Pipeline([scaler, encoder, dropper, regressor])

        infos = pipeline.get_fit_info(df)[0]
        for node_info in infos:
            # The operator objects themselves are not part of the reference.
            node_info.pop('operator', None)

        expected = [
            {
                'name': None,
                'type': 'start',
                'outputs': ['education', 'workclass', 'yy'],
                'schema_after': ['education', 'workclass', 'yy'],
            },
            {
                'name': 'TypeConverter',
                'type': 'transform',
                'inputs': ['yy'],
                'outputs': ['new_y'],
                'schema_after': ['education', 'workclass', 'yy', 'new_y'],
            },
            {
                'name': 'MeanVarianceScaler',
                'type': 'transform',
                'inputs': ['new_y'],
                'outputs': ['new_y'],
                'schema_after': ['education', 'workclass', 'yy', 'new_y'],
            },
            {
                'name': 'OneHotVectorizer',
                'type': 'transform',
                'inputs': ['workclass', 'education'],
                'outputs': ['workclass', 'education'],
                'schema_after': ['education', 'workclass', 'yy', 'new_y'],
            },
            {
                'name': 'ColumnDropper',
                'type': 'transform',
                'inputs': ['education', 'workclass', 'yy', 'new_y'],
                'outputs': ['education', 'workclass', 'new_y'],
                'schema_after': ['education', 'workclass', 'new_y'],
            },
            {
                'name': 'FastLinearRegressor',
                'type': 'regressor',
                'inputs': ['Feature:education,workclass', 'Label:new_y'],
                'outputs': ['Score'],
                'schema_after': ['Score'],
            },
        ]
        if infos != expected:
            # Raise with the actual value so the mismatch is visible.
            raise Exception(infos)
示例#3
0
    def test_syntax8_label(self):
        """Fit a pipeline whose label is a scaled copy of ``yy``, then predict.

        Checks the feature/label column names recorded on the final node
        and that the predicted scores stay within [0.4, 2.0].
        """
        df = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'yy': [1.1, 2.2, 1.24, 3.4, 3.4],
        })
        features = df.drop('yy', axis=1)

        scaler = MeanVarianceScaler() << {'new_y': 'yy'}
        encoder = OneHotVectorizer() << ['workclass', 'education']
        dropper = Drop() << 'yy'
        regressor = FastLinearRegressor() << {
            'Feature': ['workclass', 'education'],
            Role.Label: 'new_y',
        }
        pipeline = Pipeline([scaler, encoder, dropper, regressor])
        pipeline.fit(df, verbose=0)

        assert pipeline.nodes[-1].feature_column_ == 'Features'
        assert pipeline.nodes[-1].label_column_ == 'new_y'

        # The pipeline requires it now as it is transformed all along.
        features['yy'] = 0.0
        prediction = pipeline.predict(features, verbose=0)
        assert isinstance(prediction, pandas.DataFrame)
        assert list(prediction.columns) == ['Score']
        assert prediction.shape == (5, 1)

        scores = prediction['Score']
        if scores.min() < 0.4 or scores.max() > 2.00:
            raise Exception(prediction)
示例#4
0
    def test_ensemble_supports_get_fit_info(self):
        """``get_fit_info`` describes a trailing ``VotingRegressor`` node.

        Inspects the last node description of the first element returned
        by ``get_fit_info`` for a pipeline that ends with an ensemble.
        """
        df = pd.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'yy': [1.1, 2.2, 1.24, 3.4, 3.4],
        })

        col_info = {'Feature': ['workclass', 'education'], Role.Label: 'new_y'}

        # All ensemble members share the same column roles and settings.
        member_types = (OrdinaryLeastSquaresRegressor,
                        OnlineGradientDescentRegressor,
                        LightGbmRegressor)
        members = [member_type(normalize="Yes") << col_info
                   for member_type in member_types]

        steps = [
            MeanVarianceScaler() << {'new_y': 'yy'},
            OneHotVectorizer() << ['workclass', 'education'],
            ColumnDropper() << 'yy',
            VotingRegressor(estimators=members, combiner='Average'),
        ]
        pipeline = Pipeline(steps)

        last_info_node = pipeline.get_fit_info(df)[0][-1]

        self.assertEqual(last_info_node['inputs'],
                         ['Feature:education,workclass', 'Label:new_y'])
        self.assertEqual(last_info_node['name'], 'VotingRegressor')
        self.assertTrue(isinstance(last_info_node['operator'], VotingRegressor))
        self.assertEqual(last_info_node['outputs'], ['Score'])
        self.assertEqual(last_info_node['schema_after'], ['Score'])
        self.assertEqual(last_info_node['type'], 'regressor')
    def test_pipeline_exports(self):
        """Render the pipeline graph to disk and check the file is written.

        Rendering happens inside a private temporary directory that is
        removed in every case (success, failed assertion, missing Graphviz
        executable), so no artifact is left behind in the working
        directory. A missing Graphviz executable only triggers a warning.
        """
        import shutil

        import graphviz.backend
        df = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 yy=[1.1, 2.2, 1.24, 3.4, 3.4]))

        exp = Pipeline([
            MeanVarianceScaler() << {
                'new_y': 'yy'
            },
            OneHotVectorizer() << ['workclass', 'education'],
            Drop() << 'yy',
            FastLinearRegressor() << {
                'Feature': ['workclass', 'education'],
                Role.Label: 'new_y'
            }
        ])

        gr = img_export_pipeline(exp, df)
        # A dedicated directory replaces the private
        # tempfile._get_candidate_names() helper and lets one rmtree call
        # remove every file the renderer produces, not only ``name``.
        workdir = tempfile.mkdtemp()
        name = os.path.join(workdir, 'pipeline')
        try:
            gr.render(name)
            assert os.path.exists(name)
        except graphviz.backend.ExecutableNotFound:
            warnings.warn('Graphviz is not installed.')
        finally:
            shutil.rmtree(workdir, ignore_errors=True)
示例#6
0
 def test_object_parameters(self):
     """``get_params`` reports columns and defaults for both syntaxes.

     Columns attached with ``<<`` must yield the same parameters as
     columns passed through the constructor; for a learner, the
     ``Feature``/``Label`` roles appear as ``feature``/``label``.
     """
     via_operator = MeanVarianceScaler() << {'new_y': 'yy'}
     assert via_operator._columns is not None
     via_constructor = MeanVarianceScaler(columns={'new_y': 'yy'})

     scaler_params = {
         'columns': {'new_y': 'yy'},
         'fix_zero': True,
         'max_training_examples': 1000000000,
         'use_cdf': False,
     }
     assert via_operator.get_params() == scaler_params
     assert via_operator.get_params() == via_constructor.get_params()

     learner = FastLinearRegressor() << {
         'Feature': ['workclass', 'education'],
         Role.Label: 'new_y',
     }
     learner_params = {
         'bias_learning_rate': 1.0,
         'caching': 'Auto',
         'check_frequency': None,
         'convergence_tolerance': 0.01,
         'feature': ['workclass', 'education'],
         'l1_threshold': None,
         'l2_weight': None,
         'label': 'new_y',
         'loss': 'squared',
         'max_iterations': None,
         'normalize': 'Auto',
         'shuffle': True,
         'train_threads': None,
     }
     assert learner.get_params() == learner_params
示例#7
0
    def test_transform_int(self):
        """Scale two integer columns and check shape and last-row values."""
        in_df = pandas.DataFrame(
            data={'xpetal': [-1, -2, -3], 'ipetal': [1, 2, 3]})

        scaler = MeanVarianceScaler() << ['xpetal', 'ipetal']
        out_df = Pipeline([scaler]).fit_transform(in_df, verbose=0)

        assert_equal(out_df.shape, (3, 2))
        # The two columns mirror each other, so row 2 differs only in sign.
        assert_almost_equal(out_df.loc[2, 'xpetal'], -1.3887302, decimal=3)
        assert_almost_equal(out_df.loc[2, 'ipetal'], 1.38873, decimal=3)
示例#8
0
 def test_lr_named_steps_iris(self):
     """Fit and predict with a pipeline built from named ``(name, step)`` pairs."""
     iris = load_iris()
     # we only take the first two features.
     df = pd.DataFrame(iris.data[:, :2], columns=['X1', 'X2'])
     df['Label'] = iris.target

     named_steps = [
         ('norm', MeanVarianceScaler() << ['X1', 'X2']),
         ('lr', LogisticRegressionClassifier() << ['X1', 'X2']),
     ]
     pipe = nimbusmlPipeline(named_steps)
     pipe.fit(df)

     first_rows = pipe.predict(df).head()
     assert len(first_rows) == 5
示例#9
0
# numpy is needed for the ``numpy.float32`` dtype passed to read_csv below;
# without this import the example stops with a NameError.
import numpy
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.normalization import MeanVarianceScaler

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(
    path, sep=',', numeric_dtype=numpy.float32)  # Error with integer input
print(data.head())
#    age  case education  induced  parity  pooled.stratum  row_num  ...
# 0  26.0   1.0    0-5yrs      1.0     6.0             3.0      1.0  ...
# 1  42.0   1.0    0-5yrs      1.0     1.0             1.0      2.0  ...
# 2  39.0   1.0    0-5yrs      2.0     6.0             4.0      3.0  ...
# 3  34.0   1.0    0-5yrs      2.0     4.0             2.0      4.0  ...
# 4  35.0   1.0   6-11yrs      1.0     3.0            32.0      5.0  ...

# transform usage: each key is the output column, each value its source column
xf = MeanVarianceScaler(columns={'in': 'induced', 'sp': 'spontaneous'})

# fit and transform
features = xf.fit_transform(data)

# print features
print(features.head())
#    age  case education        in  ... pooled.stratum  row_num  ...
# 0  26.0   1.0    0-5yrs  1.071517 ...            3.0      1.0  ...
# 1  42.0   1.0    0-5yrs  1.071517 ...            1.0      2.0  ...
# 2  39.0   1.0    0-5yrs  2.143034 ...            4.0      3.0  ...
# 3  34.0   1.0    0-5yrs  2.143034 ...            2.0      4.0  ...
# 4  35.0   1.0   6-11yrs  1.071517  ...          32.0      5.0  ...
示例#10
0
    def test_pipeline_exports(self):
        """Export the pipeline as a dot graph and compare with a reference.

        Spaces and newlines are ignored in the comparison, so only the
        sequence of dot tokens matters.
        """
        df = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'yy': [1.1, 2.2, 1.24, 3.4, 3.4],
        })

        scaler = MeanVarianceScaler() << {'new_y': 'yy'}
        encoder = OneHotVectorizer() << ['workclass', 'education']
        dropper = Drop() << 'yy'
        regressor = FastLinearRegressor() << {
            'Feature': ['workclass', 'education'],
            Role.Label: 'new_y',
        }
        pipeline = Pipeline([scaler, encoder, dropper, regressor])

        # Any node that exposes a label column must point at the scaled label.
        for node in pipeline.nodes:
            if hasattr(node, 'label_column'):
                assert node.label_column == 'new_y'
        assert pipeline.nodes[-1].label_column == 'new_y'

        res = dot_export_pipeline(pipeline, df).strip("\n\r ")
        expected = """
                digraph{
                  orientation=portrait;
                  sch0[label="<f0> education|<f1> workclass|<f2> yy",
                  shape=record,fontsize=8];

                  node1[label="TypeConverter",shape=box,style="filled,
                  rounded",color=cyan,fontsize=12];
                  sch0:f2 -> node1;
                  sch1[label="<f0> new_y",shape=record,fontsize=8];
                  node1 -> sch1:f0;

                  node2[label="MeanVarianceScaler",shape=box,
                  style="filled,rounded",color=cyan,fontsize=12];
                  sch1:f0 -> node2;
                  sch2[label="<f0> new_y",shape=record,fontsize=8];
                  node2 -> sch2:f0;

                  node3[label="OneHotVectorizer",shape=box,
                  style="filled,rounded",color=cyan,fontsize=12];
                  sch0:f1 -> node3;
                  sch0:f0 -> node3;
                  sch3[label="<f0> workclass|<f1> education",
                  shape=record,fontsize=8];
                  node3 -> sch3:f0;
                  node3 -> sch3:f1;

                  node5[label="FastLinearRegressor",shape=box,
                  style="filled,rounded",color=yellow,fontsize=12];
                  sch3:f1 -> node5 [label="Feature",fontsize=8];
                  sch3:f0 -> node5 [label="Feature",fontsize=8];
                  sch2:f0 -> node5 [label="Label",fontsize=8];
                  sch5[label="<f0> Score",shape=record,fontsize=8];
                  node5 -> sch5:f0;
                }
                """.replace("                ", "").strip("\n\r ")

        def squash(text):
            # Drop newlines and spaces so layout differences are ignored.
            return text.replace("\n", "").replace(" ", "")

        if squash(res) != squash(expected):
            raise Exception(res)
示例#11
0
###############################################################################
# MeanVarianceScaler
import pandas as pd
from nimbusml.preprocessing.normalization import MeanVarianceScaler

# Four sample rows: three numeric measurements plus a species label.
in_df = pd.DataFrame({
    'Sepal_Length': [2.5, 1, 2.1, 1.0],
    'Sepal_Width': [.75, .9, .8, .76],
    'Petal_Length': [0, 2.5, 2.6, 2.4],
    'Species': ["setosa", "viginica", "setosa", 'versicolor'],
})

# generate two new Columns - Petal_Normed and Sepal_Normed
# (each key is the new column name, each value is its source column)
normed = MeanVarianceScaler() << {'Petal_Normed': 'Petal_Length',
                                  'Sepal_Normed': 'Sepal_Width'}
out_df = normed.fit_transform(in_df)

print('MeanVarianceScaler\n', out_df)
示例#12
0
###############################################################################
# Pipeline
import numpy as np
import pandas as pd
from nimbusml import Pipeline, FileDataStream
from nimbusml.linear_model import FastLinearRegressor
from nimbusml.preprocessing.normalization import MeanVarianceScaler

# Three samples with two features each, and one target value per sample.
X = np.array([[1, 2.0], [2, 4], [3, 0.7]])
Y = np.array([2, 3, 1.5])

# The same data as a frame; it is only used to write the CSV file below.
df = pd.DataFrame({'y': Y, 'x1': X[:, 0], 'x2': X[:, 1]})

# fit with the in-memory numpy arrays
pipe = Pipeline([MeanVarianceScaler(), FastLinearRegressor()])
pipe.fit(X, Y)

# Fit with FileDataStream: write the frame to disk, then stream it back
df.to_csv('data.csv', index=False)
ds = FileDataStream.read_csv('data.csv', sep=',')

# A fresh pipeline is built for the second fit; 'y' names the label column.
pipe = Pipeline([MeanVarianceScaler(), FastLinearRegressor()])
pipe.fit(ds, 'y')
print(pipe.summary())