class JoinCategoricalAsOneHot(Pipe):
    """One-hot encodes ``x_categorical`` and joins the result into ``x``.

    Categories unseen during ``fit`` are replaced by the column's
    transform-time mode before encoding, so ``transform`` never fails on
    new labels.
    """
    fit_requires = {'x_categorical': sf_types.PandasDataFrame(schema={})}
    transform_requires = {
        'x': sf_types.PandasDataFrame(schema={}),
        'x_categorical': sf_types.PandasDataFrame(schema={})
    }
    transform_modifies = {
        'x': sf_types.PandasDataFrame(schema={}),
        'x_categorical': sf_ops.Drop(),
    }
    fitted_parameters = {'label': object, 'one_hot': object}

    def fit(self, data: dict, parameters: dict = None):
        """Fit one LabelEncoder per categorical column and one OneHotEncoder
        over the label-encoded frame."""
        df = data['x_categorical'].copy()
        self['label'] = {column: LabelEncoder() for column in df.columns}
        # BUG FIX: this was stored under the key 'transformer', which
        # contradicted the 'one_hot' key declared in ``fitted_parameters``.
        self['one_hot'] = OneHotEncoder()
        for column in self['label']:
            df.loc[:, column] = self['label'][column].fit_transform(
                df.loc[:, column])
        self['one_hot'].fit(df.values)

    def transform(self, data: dict):
        """Encode ``x_categorical``, join the dense result into ``x`` and
        drop ``x_categorical`` from ``data``."""
        index = data['x_categorical'].index
        for column in self['label']:
            # Unknown labels fall back to the transform-time mode.
            mode = data['x_categorical'].loc[:, column].mode()[0]

            # Bind loop variables as defaults to avoid any late-binding
            # surprises if the closure outlives the iteration.
            def f(x, column=column, mode=mode):
                return x if x in self['label'][column].classes_ else mode

            data['x_categorical'].loc[:, column] = data[
                'x_categorical'].loc[:, column].apply(f)
            data['x_categorical'].loc[:, column] = self['label'][
                column].transform(data['x_categorical'].loc[:, column])
        data['x_categorical'] = self['one_hot'].transform(
            data['x_categorical'])
        # The encoder output is sparse; densify and restore the original index
        # so the join aligns rows correctly.
        df = pd.DataFrame(data['x_categorical'].toarray(), index=index)
        data['x'] = data['x'].join(df)
        del data['x_categorical']
        return data
class SplitNumericCategorical(Pipe):
    """Splits ``x`` into a numeric-only ``x`` and a categorical rest."""
    fit_requires = transform_requires = {
        'x': sf_types.PandasDataFrame(schema={})
    }
    transform_modifies = {
        'x_categorical': sf_types.PandasDataFrame(schema={}),
        'x': sf_types.PandasDataFrame(schema={})
    }
    fitted_parameters = {'numeric_columns': sf_types.List(str)}

    def fit(self, data: dict, parameters: dict = None):
        """Remember which columns of ``x`` hold numeric dtypes."""
        numeric = data['x'].select_dtypes(include=[np.number]).columns
        self['numeric_columns'] = list(numeric)

    def transform(self, data: dict):
        """Move every non-numeric column of ``x`` into ``x_categorical``."""
        keep = self['numeric_columns']
        data['x_categorical'] = data['x'].drop(keep, axis=1)
        data['x'] = data['x'].loc[:, keep]
        return data
class LogLassoModel(Pipe):
    """LassoCV fitted against ``log(y)``; predictions are exponentiated back."""
    transform_requires = {'x': sf_types.PandasDataFrame(schema={})}
    fit_requires = {
        'x': sf_types.PandasDataFrame(schema={}),
        'y': sf_types.Array(float)
    }
    transform_modifies = {
        'y_pred': sf_types.Array(np.float64),
        'x': sf_ops.Drop()
    }
    fitted_parameters = {'model': LassoCV}

    def fit(self, data: dict, parameters: dict = None):
        """Fit a normalized LassoCV on the log of the target."""
        model = LassoCV(normalize=True)
        model.fit(data['x'], np.log(data['y']))
        self['model'] = model

    def transform(self, data: dict):
        """Predict in log-space, undo the log, and drop the features key."""
        log_prediction = self['model'].predict(data['x'])
        data['y_pred'] = np.exp(log_prediction)
        del data['x']
        return data
class Pipe2(Pipe):
    """Removes column ``'a'`` from the frame stored under ``'x'``."""
    transform_requires = {
        'x': types.PandasDataFrame(schema={
            'a': np.float64,
            'b': np.float64
        }),
    }
    transform_modifies = {'x': ops.ModifyDataFrame({'a': ops.Drop()})}

    def transform(self, data: dict):
        # Equivalent to drop('a', axis=1): drop a single column by name.
        data['x'] = data['x'].drop(columns=['a'])
        return data
class FillNaN(Pipe):
    """Imputes NaNs: column means for ``x``, column modes for ``x_categorical``."""
    fit_requires = transform_modifies = transform_requires = {
        'x': sf_types.PandasDataFrame(schema={}),
        'x_categorical': sf_types.PandasDataFrame(schema={})
    }
    fitted_parameters = {
        'means': sf_types.List(float),
        'most_frequent': sf_types.List(str)
    }

    def fit(self, data: dict, parameters: dict = None):
        """Store per-column means (numeric) and modes (categorical)."""
        self['means'] = data['x'].mean(axis=0)
        self['most_frequent'] = data['x_categorical'].mode(axis=0)

    def transform(self, data: dict):
        """Replace missing values using the statistics captured by ``fit``."""
        data['x'] = data['x'].fillna(self['means'])
        categorical = data['x_categorical']
        for column in categorical.columns:
            # mode(axis=0) yields a frame; row 0 holds the most frequent value.
            fill_value = self['most_frequent'][column][0]
            missing = categorical[column].isnull()
            categorical.loc[missing, column] = fill_value
        return data
class BaselineModel(Pipe):
    """Predicts the constant training-target mean for every row of ``x``."""
    # BUG FIX: ``fit`` reads ``data['y']``, but the previous declaration
    # (fit_requires = transform_requires = {'x': ...}) never listed 'y'.
    # Split the two schemas so fit declares everything it actually consumes.
    fit_requires = {
        'x': sf_types.PandasDataFrame({}),
        'y': sf_types.Array(np.float64)
    }
    transform_requires = {'x': sf_types.PandasDataFrame({})}
    transform_modifies = {'y_pred_baseline': sf_types.Array(np.float64)}
    fitted_parameters = {'mean': np.float64}

    def fit(self, data: dict, parameters: dict = None):
        """Store the mean of the training target."""
        self['mean'] = np.mean(data['y'])

    def transform(self, data: dict):
        """Emit one baseline prediction (the stored mean) per row of ``x``."""
        data['y_pred_baseline'] = np.full(data['x'].shape[0], self['mean'])
        return data
def test_combine(self):
    """Pipe1 then Pipe2: 'a * b' is added, afterwards 'a' is dropped."""
    pipeline = Pipeline([Pipe1(), Pipe2()])

    out = pipeline.transform({'x': pd.DataFrame({'a': [2.0], 'b': [2.0]})})
    self.assertEqual(len(out['x'].columns), 2)

    # The combined pipeline accumulates each pipe's modifications in order.
    expected_modifies = {
        'x': [Pipe1.transform_modifies['x'], Pipe2.transform_modifies['x']]
    }
    self.assertEqual(pipeline.transform_modifies, expected_modifies)

    input_schema = {
        'x': types.PandasDataFrame({
            'a': np.float64,
            'b': np.float64
        })
    }
    out_schema = pipeline.transform_schema(input_schema)
    self.assertEqual(
        out_schema['x'],
        types.PandasDataFrame({
            'b': np.float64,
            'a * b': np.float64
        }))
class Pipe1(Pipe):
    """Adds the product column ``'a * b'`` to the frame under ``'x'``."""
    transform_requires = {
        'x': types.PandasDataFrame(schema={
            'a': np.float64,
            'b': np.float64
        }),
    }
    transform_modifies = {
        'x': ops.ModifyDataFrame({'a * b': ops.Set(np.float64)})
    }

    def transform(self, data: dict):
        frame = data['x']
        frame['a * b'] = frame['a'] * frame['b']
        return data
import logging
import sys

# Route schemaflow's log records to stdout at INFO level.
logger = logging.getLogger('schemaflow')
# BUG FIX: the logger itself must be lowered to INFO as well — a fresh
# logger inherits the root's WARNING level, so INFO records were filtered
# out before they ever reached the handler below.
logger.setLevel(logging.INFO)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.INFO)
ch.setFormatter(
    logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger.addHandler(ch)

# this pipeline is very generic: it does not make any assumptions about the data's format.
predict_pipeline.check_fit(
    {
        'x': sf_types.PandasDataFrame({}),
        'y': sf_types.Array(np.float64)
    },
    raise_=True)
predict_pipeline.check_transform({'x': sf_types.PandasDataFrame({})},
                                 raise_=True)

print('expected fit schema: ', predict_pipeline.fit_requires)
print('fitted parameters: ', predict_pipeline.fitted_parameters)
print('expected transform schema: ', predict_pipeline.transform_requires)
print(
    'expected transformed schema: ',
    predict_pipeline.transform_schema(predict_pipeline.transform_requires))

# execution of the pipeline