class Pipe2(Pipe):
    """
    Standardizes the list ``data['x']`` with a mean and a variance computed in ``fit``.
    """
    requirements = {'a2'}

    # both fit() and transform() read 'x' as a list of floats
    fit_requires = {'x': types.List(float)}
    transform_requires = {'x': types.List(float)}

    # accepted by fit(); this pipe never reads it
    fit_parameters = {'unused': float}

    # stored on the pipe by fit()
    fitted_parameters = {'mean': float, 'var': float}

    # transform() overwrites 'x'
    transform_modifies = {'x': types.List(float)}

    def fit(self, data: dict, parameters: dict = None):
        """
        Stores the mean and the variance of ``data['x']`` as ``'mean'`` and ``'var'``.
        """
        values = data['x']
        count = len(values)
        self['mean'] = sum(values) / count
        # variance written as E[x^2] - E[x]^2
        mean_of_squares = sum(value**2 for value in values) / count
        self['var'] = mean_of_squares - self['mean']**2

    def transform(self, data: dict):
        """
        Replaces every element of ``data['x']`` by ``(element - mean) / var**0.5``
        and returns ``data``.
        """
        data['x'] = [
            (value - self['mean']) / self['var']**0.5
            for value in data['x']
        ]
        return data
class Pipe(pipe.Pipe):
    """
    Example pipe: ``fit`` trains a Lasso regression and ``transform`` exposes the
    trained model under the key ``'model'``.
    """
    requirements = {'sklearn'}

    # data read by fit() (supervised learning):
    # 'x' is a float array of (any number of items, any number of features)
    fit_requires = {
        'x': types.Array(np.float64, shape=(None, None)),
        'y': types.List(float),
    }

    # data read by transform()
    transform_requires = {'x': types.List(float)}

    # value that fit() reads from its ``parameters`` argument
    fit_parameters = {'alpha': float}

    # value that fit() stores on the pipe
    fitted_parameters = {'model': object}

    # key written by transform(), with its type
    transform_modifies = {'model': object}

    def fit(self, data, parameters=None):
        """
        Builds a Lasso model with ``parameters['alpha']``, fits it on
        ``data['x']`` and ``data['y']`` and stores it as ``'model'``.
        """
        # imported inside the method rather than at module level
        from sklearn.linear_model import Lasso

        self['model'] = Lasso(alpha=parameters['alpha'])
        self['model'].fit(data['x'], data['y'])

    def transform(self, data):
        """
        Adds the fitted model to ``data`` under ``'model'`` and returns ``data``.
        """
        data['model'] = self['model']
        return data
class Pipe4(Pipe):
    """
    Schema-only pipe whose fit and transform declare different keys:
    fit declares 'x1', transform declares 'x'.
    """
    # declared as read by transform
    transform_requires = {'x': types.List(float)}

    # declared as read by fit
    fit_requires = {'x1': types.List(float)}

    # declared as stored by fit
    fitted_parameters = {'mean': float}
def test_transform_schema(self):
    """
    ``transform_schema`` returns the input schema extended with ``'model'``,
    and raises ``WrongSchema`` for a schema that only contains ``'y'``.
    """
    pipe_under_test = Pipe()

    input_schema = {'x': types.List(float)}
    expected_schema = {'x': types.List(float), 'model': object}
    self.assertEqual(
        pipe_under_test.transform_schema(input_schema), expected_schema)

    # the error message must say that the failure happened in transform
    with self.assertRaises(exceptions.WrongSchema) as e:
        pipe_under_test.transform_schema({'y': types.List(float)})
    self.assertIn('in transform', str(e.exception))
def test_two_fit_schema(self):
    """
    The pipeline's ``fit_requires`` combines keys that different pipes need for fitting.
    """
    # Pipe4 fits on 'x1' and Pipe2 fits on 'x' (float), so the pipeline's
    # fit_requires must contain both keys, each with the type expected
    # where the key is first needed
    pipeline = Pipeline([Pipe1(), Pipe4(), Pipe2()])

    expected_transform_requires = {'x': types.List(str)}
    expected_fit_requires = {
        'x': types.List(str),
        'x1': types.List(float),
    }
    expected_transform_modifies = {'x': types.List(float)}

    self.assertEqual(pipeline.transform_requires, expected_transform_requires)
    self.assertEqual(pipeline.fit_requires, expected_fit_requires)
    self.assertEqual(pipeline.transform_modifies, expected_transform_modifies)
class Pipe1(Pipe):
    """
    Converts the list of strings ``data['x']`` into a list of floats.
    """
    requirements = {'a1'}

    # 'x' arrives as strings ...
    transform_requires = {'x': types.List(str)}

    # ... and leaves as floats
    transform_modifies = {'x': types.List(float)}

    def transform(self, data: dict):
        """
        Replaces ``data['x']`` by its elements converted with ``float`` and returns ``data``.
        """
        data['x'] = list(map(float, data['x']))
        return data
def test_two_transform_data(self):
    """
    Keys required by different pipes are merged into the pipeline's requirements.
    """
    # Pipe1 needs 'x' and Pipe3 additionally needs 'x1'
    pipeline = Pipeline([Pipe1(), Pipe3(), Pipe2()])

    expected_requires = {
        'x': types.List(str),
        'x1': types.List(str),
    }
    self.assertEqual(pipeline.transform_requires, expected_requires)
    self.assertEqual(pipeline.fit_requires, {
        'x': types.List(str),
        'x1': types.List(str),
    })
    self.assertEqual(pipeline.transform_modifies, {'x': types.List(float)})
class PipeWrongTransform(Pipe):
    """
    Pipe with a deliberately wrong declaration: its schema says ``'x'`` becomes
    a list of floats, but ``transform`` produces ints.
    """
    requirements = {'a1'}

    transform_requires = {'x': types.List(str)}

    # declared output type; intentionally not what transform() produces
    transform_modifies = {'x': types.List(float)}

    def transform(self, data: dict):
        """
        Replaces ``data['x']`` by its elements converted with ``int`` and returns ``data``.
        """
        data['x'] = list(map(int, data['x']))
        return data
def test_basic(self):
    """
    A pipeline of ``Pipe1`` and ``Pipe2`` exposes the expected schemas and
    fitted parameters, and ``fit`` followed by ``transform`` standardizes ``'x'``.
    """
    p = Pipeline([Pipe1(), Pipe2()])

    # fitted parameters are reported per pipe, under the keys '0' and '1'
    self.assertEqual(p.fitted_parameters, {
        '0': {},
        '1': {
            'mean': float,
            'var': float
        }
    })
    self.assertEqual(p.transform_requires, {'x': types.List(str)})
    self.assertEqual(p.fit_requires, {'x': types.List(str)})

    p.fit({'x': ['1', '2', '3']}, {'1': {'unused': 1.0}})
    result = p.transform({'x': ['1', '2', '3']})

    # std([1,2,3]) == 0.816496580927726
    expected = [-1.2247448713915887, 0.0, 1.2247448713915887]

    # Compare with a tolerance instead of exact equality: the last digits of
    # the result depend on the order of the floating-point operations used
    # to compute the variance, which is not what this test is about.
    self.assertEqual(len(result['x']), len(expected))
    for actual, wanted in zip(result['x'], expected):
        self.assertAlmostEqual(actual, wanted)
class FillNaN(Pipe):
    """
    Fills missing values: columns of ``x`` with the column mean and columns of
    ``x_categorical`` with the first most frequent value of the column, both
    computed in ``fit``.
    """
    # the same two data frames are read by fit, read by transform and
    # written by transform
    fit_requires = transform_modifies = transform_requires = {
        'x': sf_types.PandasDataFrame(schema={}),
        'x_categorical': sf_types.PandasDataFrame(schema={})
    }

    # NOTE(review): fit() stores the results of ``DataFrame.mean`` and
    # ``DataFrame.mode`` (pandas objects), while lists are declared here --
    # confirm that this is what the schema check expects.
    fitted_parameters = {
        'means': sf_types.List(float),
        'most_frequent': sf_types.List(str)
    }

    def fit(self, data: dict, parameters: dict = None):
        """
        Stores the column-wise mean of ``data['x']`` as ``'means'`` and the
        column-wise mode of ``data['x_categorical']`` as ``'most_frequent'``.
        """
        self['means'] = data['x'].mean(axis=0)
        self['most_frequent'] = data['x_categorical'].mode(axis=0)

    def transform(self, data: dict):
        """
        Returns ``data`` with the missing values of ``'x'`` and
        ``'x_categorical'`` filled.

        Both entries are replaced by new data frames; the frames passed in by
        the caller are left untouched.
        """
        data['x'] = data['x'].fillna(self['means'])

        # Work on a copy so that the caller's frame is not modified in place,
        # consistently with how 'x' is handled above.
        categorical = data['x_categorical'].copy()
        for column in categorical.columns:
            # label 0 of the mode table holds the first mode of the column
            categorical.loc[categorical[column].isnull(), column] = \
                self['most_frequent'][column][0]
        data['x_categorical'] = categorical
        return data
class SplitNumericCategorical(Pipe):
    """
    Splits the data frame ``x`` in two: the numeric columns stay in ``x`` and
    the remaining columns go to ``x_categorical``.
    """
    fit_requires = transform_requires = {
        'x': sf_types.PandasDataFrame(schema={})
    }

    # transform() writes both frames
    transform_modifies = {
        'x_categorical': sf_types.PandasDataFrame(schema={}),
        'x': sf_types.PandasDataFrame(schema={})
    }

    fitted_parameters = {'numeric_columns': sf_types.List(str)}

    def fit(self, data: dict, parameters: dict = None):
        """
        Stores the names of the numeric columns of ``data['x']`` as ``'numeric_columns'``.
        """
        numeric_part = data['x'].select_dtypes(include=[np.number])
        self['numeric_columns'] = list(numeric_part.columns)

    def transform(self, data: dict):
        """
        Moves the non-numeric columns of ``data['x']`` to ``data['x_categorical']``,
        keeps only the numeric ones in ``data['x']`` and returns ``data``.
        """
        frame = data['x']
        numeric_columns = self['numeric_columns']

        data['x_categorical'] = frame.drop(numeric_columns, axis=1)
        data['x'] = frame.loc[:, numeric_columns]
        return data
def test_transform_schema(self):
    """
    ``transform_schema`` of a pipeline propagates the schema through every pipe
    and reports which pipe rejected an invalid schema.
    """
    # Pipe1 needs 'x' and Pipe3 additionally needs 'x1'
    pipeline = Pipeline([Pipe1(), Pipe3(), Pipe2()])

    # 'x1' is not modified by any pipe, so it comes out unchanged
    valid_schema = {
        'x': types.List(str),
        'x1': types.List(str),
    }
    expected_schema = {
        'x': types.List(float),
        'x1': types.List(str),
    }
    self.assertEqual(pipeline.transform_schema(valid_schema), expected_schema)

    # a schema with only 'y' is rejected by the first pipe
    with self.assertRaises(exceptions.WrongSchema) as e:
        pipeline.transform_schema({'y': types.List(str)})
    self.assertIn('in transform of pipe \'0\' of Pipeline', str(e.exception))

    # a schema with 'x' but without 'x1' is rejected by the second pipe
    with self.assertRaises(exceptions.WrongSchema) as e:
        pipeline.transform_schema({'x': types.List(str)})
    self.assertIn('in transform of pipe \'1\' of Pipeline', str(e.exception))
class Pipe3(Pipe):
    """
    Schema-only pipe that declares two keys for transform:
    'x' as a list of floats and 'x1' as a list of strings.
    """
    transform_requires = {
        'x': types.List(float),
        'x1': types.List(str),
    }