예제 #1
0
    def setUp(self):
        """Build a scaler -> GaussianNB PipeGraph over random binary data.

        Stores on the test instance: ``size``, the feature frame ``X``
        (single column ``'X'``), the 0/1 label frame ``y`` (single column
        ``'y'``), both estimators (``sc``, ``nb``) and the unfitted graph
        (``model``).
        """
        self.size = 100
        self.X = pd.DataFrame({'X': np.random.rand(self.size)})
        # Thresholding uniform noise at 0.5 yields integer 0/1 labels.
        labels = (np.random.rand(self.size) > 0.5).astype(int)
        self.y = pd.DataFrame({'y': labels})

        self.sc = MinMaxScaler()
        self.nb = GaussianNB()

        graph_steps = [('scaler', self.sc), ('model', self.nb)]
        graph_connections = dict(
            scaler=dict(X='X'),
            model=dict(X=('scaler', 'predict'), y='y'),
        )
        # Steps and connections are passed positionally, as in the original.
        self.model = PipeGraph(graph_steps, graph_connections)
예제 #2
0
    def test_Pipegraph__example_1_no_connections(self):
        """Check connections of a PipeGraph built from steps only.

        Before fitting, both ``fit_connections`` and ``predict_connections``
        must be ``None``. After ``fit``, both must equal the linear
        scaler -> linear_model wiring asserted below, and ``predict`` must
        return as many rows as ``y`` has.
        """
        import numpy as np
        from sklearn.preprocessing import MinMaxScaler
        from sklearn.linear_model import LinearRegression
        from pipegraph import PipeGraph

        X = np.random.rand(100, 1)
        y = 4 * X + 0.5 * np.random.randn(100, 1)

        scaler = MinMaxScaler()
        linear_model = LinearRegression()
        steps = [('scaler', scaler), ('linear_model', linear_model)]

        pgraph = PipeGraph(steps=steps)
        # assertIsNone reports the unexpected value on failure, whereas
        # assertTrue(x is None) would only say "False is not true".
        self.assertIsNone(pgraph.fit_connections)
        self.assertIsNone(pgraph.predict_connections)

        pgraph.fit(X, y)
        y_pred = pgraph.predict(X)
        self.assertEqual(y_pred.shape[0], y.shape[0])

        # The same wiring is expected for both fit and predict, so the
        # literal is written once and compared twice.
        expected_connections = {
            'scaler': {'X': 'X'},
            'linear_model': {
                'X': ('scaler', 'predict'),
                'y': 'y',
            },
        }
        self.assertEqual(pgraph.fit_connections, expected_connections)
        self.assertEqual(pgraph.predict_connections, expected_connections)
예제 #3
0
    def setUp(self):
        """Build a four-step PipeGraph over three noisy linear segments.

        ``self.X`` / ``self.y`` are single-column frames made of three
        concatenated 100-sample segments, each with its own X offset and
        slope. ``self.pgraph`` wires scaler -> classifier -> models ->
        neutral through ``fit_connections``.
        """
        # (X offset, slope) per segment; draw order (rand, then randn) per
        # segment matches the sequential original.
        segment_specs = ((0, 4), (3, -4), (6, 2))
        x_segments = []
        y_segments = []
        for x_offset, slope in segment_specs:
            x_part = pd.Series(np.random.rand(100) + x_offset)
            y_part = pd.Series(slope * x_part + 0.5 * np.random.randn(100))
            x_segments.append(x_part)
            y_segments.append(y_part)

        self.X = pd.concat(x_segments, axis=0).to_frame()
        self.y = pd.concat(y_segments, axis=0).to_frame()

        graph_steps = [
            ('scaler', MinMaxScaler()),
            ('classifier', GaussianMixture(n_components=3)),
            ('models',
             RegressorsWithParametrizedNumberOfReplicas(
                 number_of_replicas=3, regressor=LinearRegression())),
            ('neutral', NeutralRegressor()),
        ]

        graph_connections = dict(
            scaler=dict(X='X'),
            classifier=dict(X='scaler'),
            models=dict(X='scaler', y='y', selection='classifier'),
            neutral=dict(X='models'),
        )
        self.pgraph = PipeGraph(steps=graph_steps,
                                fit_connections=graph_connections)
예제 #4
0
 def setUp(self):
     """Create a one-step PipeGraph (LinearRegression) with y = 2 * X.

     Stores ``size``, ``X``, ``y``, the estimator ``lm``, the ``steps``
     list and the unfitted ``pgraph`` (built without connections).
     """
     self.size = 100
     self.X = np.random.rand(self.size, 1)
     self.y = 2 * self.X

     self.lm = LinearRegression()
     self.steps = [('linear_model', self.lm)]
     self.pgraph = PipeGraph(steps=self.steps)
예제 #5
0
    def setUp(self):
        """Build and fit a five-step clustering/regression PipeGraph.

        Also prepares ``steps_external`` / ``connections_external``, a
        variant in which the first step is named ``'_External'``; those are
        only stored, not used to build a graph here.
        """
        self.size = 1000
        self.X = pd.DataFrame({'X': np.random.rand(self.size)})
        self.y = pd.DataFrame({'y': np.random.rand(self.size)})

        concatenator = Concatenator()
        downstream_steps = [
            ('Gaussian_Mixture', GaussianMixture(n_components=3)),
            ('Dbscan', DBSCAN(eps=0.5)),
            ('Combine_Clustering', CustomCombination()),
            ('Regressor', LinearRegression()),
        ]

        def make_connections(first_node):
            """Return a fresh connections dict keyed by ``first_node`` first."""
            return {
                first_node: {'df1': 'X', 'df2': 'y'},
                'Gaussian_Mixture': {'X': ('Concatenate_Xy', 'predict')},
                'Dbscan': {'X': ('Concatenate_Xy', 'predict')},
                'Combine_Clustering': {
                    'dominant': ('Dbscan', 'predict'),
                    'other': ('Gaussian_Mixture', 'predict'),
                },
                'Regressor': {'X': 'X', 'y': 'y'},
            }

        # Both step lists share the same estimator instances.
        self.steps = [('Concatenate_Xy', concatenator)] + downstream_steps
        self.steps_external = [('_External', concatenator)] + downstream_steps

        self.connections = make_connections('Concatenate_Xy')
        # NOTE(review): the external variant still points its clustering
        # nodes at 'Concatenate_Xy', which is not a step in steps_external.
        # Presumably irrelevant to the ValueError the consumer test expects;
        # confirm.
        self.connections_external = make_connections('_External')

        self.pgraph = PipeGraph(steps=self.steps,
                                fit_connections=self.connections)
        self.pgraph.fit(self.X, self.y)
예제 #6
0
    def test_Pipegraph__filter_nodes_predict(self):
        """Predict nodes are limited to those named in predict_connections.

        With predict connections that mention only ``'Regressor'``, the
        fitted graph's ``_filter_predict_nodes()`` must yield exactly that
        node.
        """
        regressor_only = {'Regressor': {'X': 'X', 'y': 'y'}}

        graph = PipeGraph(
            steps=self.steps,
            fit_connections=self.connections,
            predict_connections=regressor_only,
        )
        graph.fit(self.X, self.y)

        self.assertEqual(list(graph._filter_predict_nodes()), ['Regressor'])
예제 #7
0
    def test_compositable__isinstance(self):
        """A PipeGraph can be used as the single step of another PipeGraph.

        The wrapping graph must expose the inner graph through
        ``named_steps`` and, once fitted, produce a ``'predict'`` output
        with the same number of rows as the inner graph's own prediction.
        """
        wrapper = PipeGraph(steps=[('pgraph', self.pgraph)])
        self.assertEqual(wrapper.named_steps, {'pgraph': self.pgraph})

        wrapper.fit(self.X, self.y)
        nested_prediction = wrapper.predict(self.X)['predict']
        direct_prediction = self.pgraph.predict(self.X)['predict']
        self.assertEqual(nested_prediction.shape[0],
                         direct_prediction.shape[0])
예제 #8
0
    def setUp(self):
        """Build two PipeGraphs sharing a scaler and a LinearRegression.

        ``self.model`` is scaler -> model; ``self.model_custom`` appends a
        ``'neutral'`` NeutralRegressor step fed by ``'model'``. Both graphs
        reuse the same ``sc`` and ``lm`` estimator instances.
        """
        self.size = 100
        self.X = pd.DataFrame({'X': np.random.rand(self.size)})
        self.y = pd.DataFrame({'y': np.random.rand(self.size)})

        self.sc = MinMaxScaler()
        self.lm = LinearRegression()
        neutral = NeutralRegressor()

        def base_connections():
            """Return a fresh scaler -> model connections dict."""
            return {
                'scaler': {'X': 'X'},
                'model': {'X': ('scaler', 'predict'), 'y': 'y'},
            }

        base_steps = [('scaler', self.sc), ('model', self.lm)]
        self.model = PipeGraph(base_steps, base_connections())

        extended_steps = [('scaler', self.sc), ('model', self.lm),
                          ('neutral', neutral)]
        extended_connections = base_connections()
        extended_connections['neutral'] = {'X': 'model'}
        self.model_custom = PipeGraph(extended_steps, extended_connections)
예제 #9
0
 def test_Pipegraph__predict_connections(self):
     """Without explicit predict connections, every step is a predict node.

     Compares the sorted output of ``_filter_predict_nodes()`` on a fitted
     graph against the sorted list of all five step names.
     """
     graph = PipeGraph(self.steps, self.connections)
     graph.fit(self.X, self.y)

     expected_nodes = [
         'Concatenate_Xy',
         'Gaussian_Mixture',
         'Dbscan',
         'Combine_Clustering',
         'Regressor',
     ]
     found_nodes = list(graph._filter_predict_nodes())
     self.assertEqual(sorted(found_nodes), sorted(expected_nodes))
예제 #10
0
    def setUp(self):
        """Create a one-step PipeGraph plus a parameter grid for it.

        Stores ``size``, ``X``, ``y = 2 * X``, the estimator ``lm``,
        ``steps``, ``connections``, the unfitted ``pgraph`` and
        ``param_grid`` (keys prefixed with the ``linear_model`` step name).
        """
        self.size = 100
        self.X = np.random.rand(self.size, 1)
        self.y = 2 * self.X

        self.lm = LinearRegression()
        self.steps = [('linear_model', self.lm)]
        self.connections = {'linear_model': {'X': 'X', 'y': 'y'}}
        self.pgraph = PipeGraph(steps=self.steps,
                                fit_connections=self.connections)

        # NOTE(review): recent scikit-learn releases no longer accept a
        # `normalize` parameter on LinearRegression -- confirm the pinned
        # version before relying on this grid.
        self.param_grid = {
            'linear_model__fit_intercept': [False, True],
            'linear_model__normalize': [True, False],
        }
예제 #11
0
    def setUp(self):
        """Build a six-step PipeGraph that feeds Paella output to a regressor.

        The graph concatenates X and y, clusters the result with both a
        Gaussian mixture and DBSCAN, combines the two labelings, passes the
        combination to Paella as ``classification`` and uses Paella's output
        as ``sample_weight`` for the final LinearRegression. The graph is
        stored unfitted in ``self.pgraph``.
        """
        self.size = 100
        self.X = pd.DataFrame({'X': np.random.rand(self.size)})
        self.y = pd.DataFrame({'y': np.random.rand(self.size)})

        concat_step = Concatenator()
        mixture_step = GaussianMixture(n_components=3)
        dbscan_step = DBSCAN(eps=0.5)
        combine_step = CustomCombination()
        # The regressor is handed over as a class, not an instance.
        paella_settings = dict(
            regressor=LinearRegression,
            noise_label=None,
            max_it=10,
            regular_size=100,
            minimum_size=30,
            width_r=0.95,
            power=10,
            random_state=42,
        )
        paella_step = Paella(**paella_settings)
        regressor_step = LinearRegression()

        self.steps = [
            ('Concatenate_Xy', concat_step),
            ('Gaussian_Mixture', mixture_step),
            ('Dbscan', dbscan_step),
            ('Combine_Clustering', combine_step),
            ('Paella', paella_step),
            ('Regressor', regressor_step),
        ]

        concatenated = ('Concatenate_Xy', 'predict')
        self.connections = {
            'Concatenate_Xy': {'df1': 'X', 'df2': 'y'},
            'Gaussian_Mixture': {'X': concatenated},
            'Dbscan': {'X': concatenated},
            'Combine_Clustering': {
                'dominant': ('Dbscan', 'predict'),
                'other': ('Gaussian_Mixture', 'predict'),
            },
            'Paella': {
                'X': 'X',
                'y': 'y',
                'classification': ('Combine_Clustering', 'predict'),
            },
            'Regressor': {
                'X': 'X',
                'y': 'y',
                'sample_weight': ('Paella', 'predict'),
            },
        }
        self.pgraph = PipeGraph(steps=self.steps,
                                fit_connections=self.connections)
예제 #12
0
    def test_Pipegraph__some_predict_connections(self):
        """Only the nodes listed in predict_connections are predict nodes.

        Supplies predict connections for three of the steps and checks that
        ``_filter_predict_nodes()`` on the fitted graph returns exactly
        those three names (order-insensitive).
        """
        concatenated = ('Concatenate_Xy', 'predict')
        partial_connections = {
            'Concatenate_Xy': {'df1': 'X', 'df2': 'y'},
            'Gaussian_Mixture': {'X': concatenated},
            'Dbscan': {'X': concatenated},
        }

        graph = PipeGraph(
            steps=self.steps,
            fit_connections=self.connections,
            predict_connections=partial_connections,
        )
        graph.fit(self.X, self.y)

        expected_nodes = ['Concatenate_Xy', 'Gaussian_Mixture', 'Dbscan']
        found_nodes = list(graph._filter_predict_nodes())
        self.assertEqual(sorted(found_nodes), sorted(expected_nodes))
예제 #13
0
 def setUp(self):
     """Create a scaler -> linear_model PipeGraph and a parameter grid.

     Stores ``size``, ``X``, ``y = X * 2``, both estimators (``sc``,
     ``lm``), ``steps``, ``connections``, the unfitted ``pgraph`` and
     ``param_grid`` for the ``linear_model`` step.
     """
     self.size = 1000
     self.X = np.random.rand(self.size, 1)
     self.y = self.X * 2

     self.sc = MinMaxScaler(feature_range=(0, 1))
     self.lm = LinearRegression()
     self.steps = [('scaler', self.sc), ('linear_model', self.lm)]
     self.connections = {
         'scaler': {'X': 'X'},
         'linear_model': {'X': ('scaler', 'predict'), 'y': 'y'},
     }
     self.pgraph = PipeGraph(steps=self.steps,
                             fit_connections=self.connections)
     self.param_grid = {
         'linear_model__fit_intercept': [False, True],
         'linear_model__normalize': [True, False],
     }
예제 #14
0
    def setUp(self):
        """Prepare a graph whose fit_connections themselves are searchable.

        Three steps are declared (scaler, gaussian, kmeans) together with
        two alternative connection dicts: ``connections_1`` feeds
        ``'scaler'`` into ``'gaussian'`` and ``connections_2`` feeds it into
        ``'kmeans'``. ``pgraph`` starts with ``connections_1`` and
        ``param_grid`` lists both alternatives under ``fit_connections``.
        """
        self.size = 100
        self.X = np.random.rand(self.size, 1)
        self.y = 2 * self.X

        self.sc = MinMaxScaler()
        self.gm = GaussianMixture(n_components=3)
        self.km = KMeans(n_clusters=4)

        self.steps = [
            ('scaler', self.sc),
            ('gaussian', self.gm),
            ('kmeans', self.km),
        ]
        self.connections_1 = {'scaler': {'X': 'X'}, 'gaussian': 'scaler'}
        self.connections_2 = {'scaler': {'X': 'X'}, 'kmeans': 'scaler'}

        self.pgraph = PipeGraph(steps=self.steps,
                                fit_connections=self.connections_1)
        self.param_grid = {
            'fit_connections': [self.connections_1, self.connections_2],
        }
예제 #15
0
    def setUp(self):
        """Build a clustering -> classification PipeGraph wired via inject.

        Blob data is generated with ``datasets.make_blobs``. Both steps
        receive the external ``X``; the classifier's ``y`` is the
        ``'predict'`` output of the clustering step.
        """
        self.X, self.y = datasets.make_blobs(n_samples=10000,
                                             n_features=5,
                                             centers=10)

        graph = PipeGraph(steps=[
            ('clustering', KMeans(n_clusters=10)),
            ('classification', LinearDiscriminantAnalysis()),
        ])

        # (sink, sink_var, source, source_var), injected in this order.
        wiring = (
            ('clustering', 'X', '_External', 'X'),
            ('classification', 'X', '_External', 'X'),
            ('classification', 'y', 'clustering', 'predict'),
        )
        for sink, sink_var, source, source_var in wiring:
            graph.inject(sink=sink,
                         sink_var=sink_var,
                         source=source,
                         source_var=source_var)

        self.pgraph = graph
예제 #16
0
# Wiring: scaler -> bundle -> neutral; 'bundle' and 'neutral' also receive y.
connections = dict(
    scaler=dict(X='X'),
    bundle=dict(X='scaler', y='y'),
    neutral=dict(X='bundle', y='y'),
)

pgraph = PipeGraph(steps=steps, fit_connections=connections)

##############################################################################################################
# Using GridSearchCV to find the best ``n_components`` value for the
# classifier nested inside the ``bundle`` step

from sklearn.model_selection import GridSearchCV

param_grid = {'bundle__classifier__n_components': range(3, 10)}
gs = GridSearchCV(
    estimator=pgraph,
    param_grid=param_grid,
    refit=True,
)
gs.fit(X_train, y_train)
y_pred = gs.predict(X_train)

# Overlay the predictions on the training data.
plt.scatter(X_train, y_train)
plt.scatter(X_train, y_pred)

print("Score:", gs.score(X_test, y_test))
print(
    "bundle__classifier__n_components:",
    gs.best_estimator_.get_params()['bundle__classifier__n_components'],
)
        'y': ('demux', 'y_1')
    },
    'lm_2': {
        'X': ('demux', 'X_2'),
        'y': ('demux', 'y_2')
    },
    'mux': {
        '0': 'lm_0',
        '1': 'lm_1',
        '2': 'lm_2',
        'selection': 'selection'
    }
}

# Wrap the steps and connections defined above into a single PipeGraph.
three_multiplexed_models = PipeGraph(
    fit_connections=three_multiplexed_models_connections,
    steps=three_multiplexed_models_steps,
)

#########################################################################################################
# Now we can treat this PipeGraph as a reusable component and use it as a unitary step in another PipeGraph:
scaler = MinMaxScaler()
gaussian_mixture = GaussianMixture(n_components=3)
models = three_multiplexed_models

# `models` is the very same object as `three_multiplexed_models`.
steps = [
    ('scaler', scaler),
    ('classifier', gaussian_mixture),
    ('models', models),
]

connections = {
예제 #18
0
 def test_Pipegraph__External_step_name(self):
     """Fitting a graph built from the '_External' fixtures raises ValueError.

     The graph is built from ``steps_external`` / ``connections_external``,
     whose first step is named ``'_External'``.
     """
     graph = PipeGraph(steps=self.steps_external,
                       fit_connections=self.connections_external)
     with self.assertRaises(ValueError):
         graph.fit(self.X, self.y)
예제 #19
0
    def test_Pipegraph__ex_3_inject(self):
        """Check that chained ``inject`` calls build the expected connections.

        A five-step graph is created without connections (both connection
        attributes start as ``None``). The wiring is then added through a
        fluent chain of ``inject`` calls, after which both
        ``fit_connections`` and ``predict_connections`` must be set and,
        once the graph is fitted, equal to the fully expanded
        ``(source, variable)`` form asserted below.
        """
        import numpy as np
        import pandas as pd
        from sklearn.preprocessing import MinMaxScaler
        from sklearn.preprocessing import PolynomialFeatures
        from sklearn.linear_model import LinearRegression
        from pipegraph.base import PipeGraph
        from pipegraph.demo_blocks import CustomPower

        X = pd.DataFrame(
            dict(X=np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
                 sample_weight=np.array([
                     0.01, 0.95, 0.10, 0.95, 0.95, 0.10, 0.10, 0.95, 0.95,
                     0.95, 0.01
                 ])))
        y = np.array([10, 4, 20, 16, 25, -60, 85, 64, 81, 100, 150])

        scaler = MinMaxScaler()
        polynomial_features = PolynomialFeatures()
        linear_model = LinearRegression()
        custom_power = CustomPower()
        # ColumnSelector is not imported in this method; it is expected to
        # be available from the enclosing module.
        selector = ColumnSelector(mapping={
            'X': slice(0, 1),
            'sample_weight': slice(1, 2)
        })

        steps = [('selector', selector), ('custom_power', custom_power),
                 ('scaler', scaler),
                 ('polynomial_features', polynomial_features),
                 ('linear_model', linear_model)]

        pgraph = PipeGraph(steps=steps)  # PipeGraphRegressor

        # assertIsNone / assertIsNotNone show the offending value on
        # failure, unlike assertTrue(x is None).
        self.assertIsNone(pgraph.fit_connections)
        self.assertIsNone(pgraph.predict_connections)

        # The chain relies on inject() returning the graph itself. Omitted
        # arguments are left to inject's defaults; the expected dict below
        # shows how they are expanded.
        (pgraph
         .inject(sink='selector', sink_var='X',
                 source='_External', source_var='X')
         .inject('custom_power', 'X', 'selector', 'sample_weight')
         .inject('scaler', 'X', 'selector', 'X')
         .inject('polynomial_features', 'X', 'scaler')
         .inject('linear_model', 'X', 'polynomial_features')
         .inject('linear_model', 'y', source_var='y')
         .inject('linear_model', 'sample_weight', 'custom_power'))

        self.assertIsNotNone(pgraph.fit_connections)
        self.assertIsNotNone(pgraph.predict_connections)
        pgraph.fit(X, y)

        # Fit and predict connections are expected to be identical, so the
        # literal is written once and compared twice.
        expected_connections = {
            'selector': {
                'X': ('_External', 'X')
            },
            'custom_power': {
                'X': ('selector', 'sample_weight')
            },
            'scaler': {
                'X': ('selector', 'X')
            },
            'polynomial_features': {
                'X': ('scaler', 'predict')
            },
            'linear_model': {
                'X': ('polynomial_features', 'predict'),
                'y': ('_External', 'y'),
                'sample_weight': ('custom_power', 'predict')
            }
        }
        self.assertEqual(pgraph.fit_connections, expected_connections)
        self.assertEqual(pgraph.predict_connections, expected_connections)
# Next we define the steps, build a :class:`PipeGraph` from them and wire its
# connections with a chain of ``inject`` calls.

scaler = MinMaxScaler()
polynomial_features = PolynomialFeatures()
linear_model = LinearRegression()
custom_power = CustomPower()
selector = ColumnSelector(
    mapping=dict(X=slice(0, 1), sample_weight=slice(1, 2)))

steps = [
    ('selector', selector),
    ('custom_power', custom_power),
    ('scaler', scaler),
    ('polynomial_features', polynomial_features),
    ('linear_model', linear_model),
]

pgraph = PipeGraph(steps=steps)

# One inject call per line; the chain relies on inject() returning the graph.
(pgraph
 .inject(sink='selector', sink_var='X', source='_External', source_var='X')
 .inject('custom_power', 'X', 'selector', 'sample_weight')
 .inject('scaler', 'X', 'selector', 'X')
 .inject('polynomial_features', 'X', 'scaler')
 .inject('linear_model', 'X', 'polynomial_features')
 .inject('linear_model', 'y', source_var='y')
 .inject('linear_model', 'sample_weight', 'custom_power'))

###############################################################################