def test_object_clone(self):
    """Check that a cloned scaler reports the same parameters as its source."""
    scaler = MeanVarianceScaler() << {'new_y': 'yy'}
    duplicate = scaler.clone()
    params = scaler.get_params()

    assert params == duplicate.get_params()
    # The column mapping is published as 'columns', never as '_columns'.
    assert '_columns' not in params
    assert 'columns' in params
    assert scaler._columns is not None
    assert params['columns'] is not None
def test_pipeline_info(self):
    """Compare the per-node fit info of a small pipeline to a fixed listing.

    The 'operator' entry of each info record is discarded before the
    comparison; the remaining records must match the expected list exactly.
    """
    df = pandas.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        yy=[1.1, 2.2, 1.24, 3.4, 3.4],
    ))

    pipeline = Pipeline([
        MeanVarianceScaler() << {'new_y': 'yy'},
        OneHotVectorizer() << ['workclass', 'education'],
        Drop() << 'yy',
        FastLinearRegressor() << {
            'Feature': ['workclass', 'education'],
            Role.Label: 'new_y',
        },
    ])

    infos = pipeline.get_fit_info(df)[0]
    for record in infos:
        # Drop the 'operator' entry so only the remaining keys are compared.
        record.pop('operator', None)

    expected = [
        {
            'name': None,
            'schema_after': ['education', 'workclass', 'yy'],
            'type': 'start',
            'outputs': ['education', 'workclass', 'yy'],
        },
        {
            'name': 'TypeConverter',
            'inputs': ['yy'],
            'outputs': ['new_y'],
            'schema_after': ['education', 'workclass', 'yy', 'new_y'],
            'type': 'transform',
        },
        {
            'name': 'MeanVarianceScaler',
            'inputs': ['new_y'],
            'type': 'transform',
            'outputs': ['new_y'],
            'schema_after': ['education', 'workclass', 'yy', 'new_y'],
        },
        {
            'name': 'OneHotVectorizer',
            'inputs': ['workclass', 'education'],
            'type': 'transform',
            'outputs': ['workclass', 'education'],
            'schema_after': ['education', 'workclass', 'yy', 'new_y'],
        },
        {
            'name': 'ColumnDropper',
            'type': 'transform',
            'schema_after': ['education', 'workclass', 'new_y'],
            'inputs': ['education', 'workclass', 'yy', 'new_y'],
            'outputs': ['education', 'workclass', 'new_y'],
        },
        {
            'name': 'FastLinearRegressor',
            'inputs': ['Feature:education,workclass', 'Label:new_y'],
            'type': 'regressor',
            'outputs': ['Score'],
            'schema_after': ['Score'],
        },
    ]

    if infos != expected:
        raise Exception(infos)
def test_syntax8_label(self):
    """Fit a pipeline whose label is a scaled copy of 'yy', then predict.

    Checks the resolved feature/label column names on the final node and
    that the predicted scores stay within [0.4, 2.0].
    """
    df = pandas.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        yy=[1.1, 2.2, 1.24, 3.4, 3.4],
    ))
    X = df.drop('yy', axis=1)

    pipeline = Pipeline([
        MeanVarianceScaler() << {'new_y': 'yy'},
        OneHotVectorizer() << ['workclass', 'education'],
        Drop() << 'yy',
        FastLinearRegressor() << {
            'Feature': ['workclass', 'education'],
            Role.Label: 'new_y',
        },
    ])
    pipeline.fit(df, verbose=0)

    learner = pipeline.nodes[-1]
    assert learner.feature_column_ == 'Features'
    assert learner.label_column_ == 'new_y'

    # 'yy' has to be present at prediction time as well, because the
    # pipeline transforms it all along.
    X['yy'] = 0.0
    prediction = pipeline.predict(X, verbose=0)

    assert isinstance(prediction, pandas.DataFrame)
    assert list(prediction.columns) == ['Score']
    assert prediction.shape == (5, 1)

    scores = prediction['Score']
    if scores.min() < 0.4:
        raise Exception(prediction)
    if scores.max() > 2.00:
        raise Exception(prediction)
def test_ensemble_supports_get_fit_info(self):
    """Check the fit info reported for a VotingRegressor as the last node.

    Builds a pipeline ending in an averaging ensemble of three regressors
    and verifies every field of the final record returned by
    ``get_fit_info``.
    """
    df = pd.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        yy=[1.1, 2.2, 1.24, 3.4, 3.4],
    ))

    col_info = {'Feature': ['workclass', 'education'], Role.Label: 'new_y'}

    r1 = OrdinaryLeastSquaresRegressor(normalize="Yes") << col_info
    r2 = OnlineGradientDescentRegressor(normalize="Yes") << col_info
    r3 = LightGbmRegressor(normalize="Yes") << col_info

    pipeline = Pipeline([
        MeanVarianceScaler() << {'new_y': 'yy'},
        OneHotVectorizer() << ['workclass', 'education'],
        ColumnDropper() << 'yy',
        VotingRegressor(estimators=[r1, r2, r3], combiner='Average'),
    ])

    info = pipeline.get_fit_info(df)
    last_info_node = info[0][-1]

    self.assertEqual(last_info_node['inputs'],
                     ['Feature:education,workclass', 'Label:new_y'])
    self.assertEqual(last_info_node['name'], 'VotingRegressor')
    # assertIsInstance reports the offending object and its type on failure,
    # unlike assertTrue(isinstance(...)), which only says "False is not true".
    self.assertIsInstance(last_info_node['operator'], VotingRegressor)
    self.assertEqual(last_info_node['outputs'], ['Score'])
    self.assertEqual(last_info_node['schema_after'], ['Score'])
    self.assertEqual(last_info_node['type'], 'regressor')
def test_pipeline_exports(self):
    """Render the pipeline graph to a file and check the file was written.

    A missing Graphviz executable is reported as a warning rather than a
    failure. Files produced by the render call are removed whether the
    test passes, fails, or Graphviz is unavailable.
    """
    import graphviz.backend

    df = pandas.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        yy=[1.1, 2.2, 1.24, 3.4, 3.4],
    ))

    exp = Pipeline([
        MeanVarianceScaler() << {'new_y': 'yy'},
        OneHotVectorizer() << ['workclass', 'education'],
        Drop() << 'yy',
        FastLinearRegressor() << {
            'Feature': ['workclass', 'education'],
            Role.Label: 'new_y',
        },
    ])

    gr = img_export_pipeline(exp, df)
    # NOTE(review): _get_candidate_names is a private tempfile helper; it is
    # kept here so the output still lands in the current working directory.
    name = next(tempfile._get_candidate_names())
    rendered = None
    try:
        # presumably render() returns the path of the rendered output, as
        # graphviz documents — confirm for the object img_export_pipeline
        # returns.
        rendered = gr.render(name)
        assert os.path.exists(name)
    except graphviz.backend.ExecutableNotFound:
        warnings.warn('Graphviz is not installed.')
    finally:
        # Clean up even when the assertion above fails, and remove the
        # rendered output as well as the source file.
        for path in (name, rendered):
            if path and os.path.exists(path):
                os.remove(path)
def test_object_parameters(self):
    """Check get_params() for a scaler and a regressor configured via ``<<``.

    The scaler built with ``<<`` must report the same parameters as one
    built with the ``columns`` keyword, and the regressor must expose its
    feature/label roles next to the listed learner settings.
    """
    shifted = MeanVarianceScaler() << {'new_y': 'yy'}
    assert shifted._columns is not None
    keyword = MeanVarianceScaler(columns={'new_y': 'yy'})

    expected_scaler = {
        'columns': {'new_y': 'yy'},
        'fix_zero': True,
        'max_training_examples': 1000000000,
        'use_cdf': False,
    }
    assert shifted.get_params() == expected_scaler
    assert shifted.get_params() == keyword.get_params()

    regressor = FastLinearRegressor() << {
        'Feature': ['workclass', 'education'],
        Role.Label: 'new_y',
    }
    expected_regressor = {
        'bias_learning_rate': 1.0,
        'caching': 'Auto',
        'check_frequency': None,
        'convergence_tolerance': 0.01,
        'feature': ['workclass', 'education'],
        'l1_threshold': None,
        'l2_weight': None,
        'label': 'new_y',
        'loss': 'squared',
        'max_iterations': None,
        'normalize': 'Auto',
        'shuffle': True,
        'train_threads': None,
    }
    assert regressor.get_params() == expected_regressor
def test_transform_int(self):
    """Scale two integer columns and check the output shape and last row."""
    source = pandas.DataFrame(
        data=dict(xpetal=[-1, -2, -3], ipetal=[1, 2, 3]))

    scaler = MeanVarianceScaler() << ['xpetal', 'ipetal']
    result = Pipeline([scaler]).fit_transform(source, verbose=0)

    assert_equal(result.shape, (3, 2))
    assert_almost_equal(result.loc[2, 'xpetal'], -1.3887302, decimal=3)
    assert_almost_equal(result.loc[2, 'ipetal'], 1.38873, decimal=3)
def test_lr_named_steps_iris(self):
    """Fit a pipeline built from (name, step) pairs on two iris features."""
    iris = load_iris()
    # Only the first two features are kept.
    features = iris.data[:, :2]
    target = iris.target

    df = pd.DataFrame(features, columns=['X1', 'X2'])
    df['Label'] = target

    named_steps = [
        ('norm', MeanVarianceScaler() << ['X1', 'X2']),
        ('lr', LogisticRegressionClassifier() << ['X1', 'X2']),
    ]
    pipe = nimbusmlPipeline(named_steps)
    pipe.fit(df)

    head = pipe.predict(df).head()
    assert len(head) == 5
# Example: apply MeanVarianceScaler to two columns of the 'infert' dataset,
# reading the data through a FileDataStream.
# `numpy` is imported here because `numpy.float32` is used below and no
# import of it is visible in this script.
import numpy
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.normalization import MeanVarianceScaler

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()

# Numeric columns are read as float32 (original note: error with integer
# input).
data = FileDataStream.read_csv(
    path, sep=',', numeric_dtype=numpy.float32)
print(data.head())
#     age  case education  induced  parity  pooled.stratum  row_num  ...
# 0  26.0   1.0    0-5yrs      1.0     6.0             3.0      1.0  ...
# 1  42.0   1.0    0-5yrs      1.0     1.0             1.0      2.0  ...
# 2  39.0   1.0    0-5yrs      2.0     6.0             4.0      3.0  ...
# 3  34.0   1.0    0-5yrs      2.0     4.0             2.0      4.0  ...
# 4  35.0   1.0   6-11yrs      1.0     3.0            32.0      5.0  ...

# transform usage: the scaled values are written to new columns 'in' and
# 'sp', computed from 'induced' and 'spontaneous'
xf = MeanVarianceScaler(columns={'in': 'induced', 'sp': 'spontaneous'})

# fit and transform
features = xf.fit_transform(data)

# print features
print(features.head())
#     age  case education        in  ...  pooled.stratum  row_num  ...
# 0  26.0   1.0    0-5yrs  1.071517  ...             3.0      1.0  ...
# 1  42.0   1.0    0-5yrs  1.071517  ...             1.0      2.0  ...
# 2  39.0   1.0    0-5yrs  2.143034  ...             4.0      3.0  ...
# 3  34.0   1.0    0-5yrs  2.143034  ...             2.0      4.0  ...
# 4  35.0   1.0   6-11yrs  1.071517  ...            32.0      5.0  ...
def test_pipeline_exports(self):
    """Compare the dot export of a small pipeline with a reference graph.

    Spaces and newlines are removed from both texts before comparing, so
    the layout of the reference below is irrelevant.
    """
    df = pandas.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        yy=[1.1, 2.2, 1.24, 3.4, 3.4],
    ))

    pipeline = Pipeline([
        MeanVarianceScaler() << {'new_y': 'yy'},
        OneHotVectorizer() << ['workclass', 'education'],
        Drop() << 'yy',
        FastLinearRegressor() << {
            'Feature': ['workclass', 'education'],
            Role.Label: 'new_y',
        },
    ])

    # Every node that exposes a label column must point at 'new_y'.
    for step in pipeline.nodes:
        if hasattr(step, 'label_column'):
            assert step.label_column == 'new_y'
    assert pipeline.nodes[-1].label_column == 'new_y'

    res = dot_export_pipeline(pipeline, df).strip("\n\r ")
    expected = """
    digraph{
      orientation=portrait;
      sch0[label="<f0> education|<f1> workclass|<f2> yy",
      shape=record,fontsize=8];

      node1[label="TypeConverter",shape=box,style="filled,
      rounded",color=cyan,fontsize=12];
      sch0:f2 -> node1;
      sch1[label="<f0> new_y",shape=record,fontsize=8];
      node1 -> sch1:f0;

      node2[label="MeanVarianceScaler",shape=box,
      style="filled,rounded",color=cyan,fontsize=12];
      sch1:f0 -> node2;
      sch2[label="<f0> new_y",shape=record,fontsize=8];
      node2 -> sch2:f0;

      node3[label="OneHotVectorizer",shape=box,
      style="filled,rounded",color=cyan,fontsize=12];
      sch0:f1 -> node3;
      sch0:f0 -> node3;
      sch3[label="<f0> workclass|<f1> education",
      shape=record,fontsize=8];
      node3 -> sch3:f0;
      node3 -> sch3:f1;

      node5[label="FastLinearRegressor",shape=box,
      style="filled,rounded",color=yellow,fontsize=12];
      sch3:f1 -> node5 [label="Feature",fontsize=8];
      sch3:f0 -> node5 [label="Feature",fontsize=8];
      sch2:f0 -> node5 [label="Label",fontsize=8];
      sch5[label="<f0> Score",shape=record,fontsize=8];
      node5 -> sch5:f0;
    }
    """.replace(" ", "").strip("\n\r ")

    def squash(text):
        # Remove newlines and spaces so only the graph tokens are compared.
        return text.replace("\n", "").replace(" ", "")

    if squash(res) != squash(expected):
        raise Exception(res)
###############################################################################
# MeanVarianceScaler
import pandas as pd
from nimbusml.preprocessing.normalization import MeanVarianceScaler

# Small in-memory frame with three numeric columns and one text column.
in_df = pd.DataFrame(data={
    'Sepal_Length': [2.5, 1, 2.1, 1.0],
    'Sepal_Width': [.75, .9, .8, .76],
    'Petal_Length': [0, 2.5, 2.6, 2.4],
    'Species': ["setosa", "viginica", "setosa", 'versicolor'],
})

# Two new columns are produced: Petal_Normed (from Petal_Length) and
# Sepal_Normed (from Sepal_Width).
normed = MeanVarianceScaler() << {
    'Petal_Normed': 'Petal_Length',
    'Sepal_Normed': 'Sepal_Width',
}
out_df = normed.fit_transform(in_df)

print('MeanVarianceScaler\n', out_df)
###############################################################################
# Pipeline
import numpy as np
import pandas as pd
from nimbusml import Pipeline, FileDataStream
from nimbusml.linear_model import FastLinearRegressor
from nimbusml.preprocessing.normalization import MeanVarianceScaler

# Three samples with two features each, plus the regression target.
X = np.array([[1, 2.0], [2, 4], [3, 0.7]])
Y = np.array([2, 3, 1.5])
df = pd.DataFrame({'y': Y, 'x1': X[:, 0], 'x2': X[:, 1]})

# First fit: directly on the in-memory arrays X and Y.
pipe = Pipeline([MeanVarianceScaler(), FastLinearRegressor()])
pipe.fit(X, Y)

# Second fit: the same data written to CSV and read back as a
# FileDataStream, with the label given by column name.
df.to_csv('data.csv', index=False)
ds = FileDataStream.read_csv('data.csv', sep=',')

pipe = Pipeline([MeanVarianceScaler(), FastLinearRegressor()])
pipe.fit(ds, 'y')
print(pipe.summary())