def test_drop(self):
    """A single Pipe2 drops one of the two input columns, and the
    pipeline's transform_modifies mirrors the pipe's declaration."""
    pipeline = Pipeline([Pipe2()])
    frame = pd.DataFrame({'a': [2.0], 'b': [2.0]})
    transformed = pipeline.transform({'x': frame})
    self.assertEqual(len(transformed['x'].columns), 1)
    self.assertEqual(pipeline.transform_modifies, Pipe2.transform_modifies)
def test_set(self):
    """Pipe1 adds a derived column 'a * b' with the product of the inputs."""
    pipeline = Pipeline([Pipe1()])
    frame = pd.DataFrame({'a': [2.0], 'b': [2.0]})
    transformed = pipeline.transform({'x': frame})
    # 2.0 * 2.0 == 4.0
    self.assertEqual(transformed['x'].loc[:, 'a * b'].values, [4.0])
    self.assertEqual(pipeline.transform_modifies, Pipe1.transform_modifies)
def test_custom_parameters(self):
    """Fit parameters can be routed to a specific named pipe ('2')."""
    pipeline = Pipeline([('1', Pipe1()), ('2', Pipe2())])
    pipeline.fit({'x': ['1', '2', '3']}, {'2': {'unused': 1.0}})
    transformed = pipeline.transform({'x': ['1', '2', '3']})
    # std([1,2,3]) == 0.816496580927726
    self.assertEqual(
        transformed['x'],
        [-1.2247448713915887, 0.0, 1.2247448713915887])
def test_custom_init(self):
    """Pipeline accepts an OrderedDict of named pipes; non-Pipe entries
    and a bare Pipe (not wrapped in a container) raise TypeError."""
    named_pipes = collections.OrderedDict([('1', Pipe1()), ('2', Pipe2())])
    pipeline = Pipeline(named_pipes)
    self.assertEqual(pipeline.pipes, named_pipes)
    # a tuple whose value is not a Pipe is rejected
    with self.assertRaises(TypeError):
        Pipeline([('1', 1)])
    # a single Pipe outside a list/dict is rejected
    with self.assertRaises(TypeError):
        Pipeline(Pipe1())
def test_logged(self):
    """logged_fit/logged_transform emit only info messages on valid data
    (8 for fit, 4 more for transform) and still produce the right result."""
    pipeline = Pipeline([Pipe1(), Pipe2()])
    pipeline.logged_fit({'x': ['1', '2', '3']}, {'1': {'unused': 1.0}})
    self.assertEqual(self._handler.messages['error'], [])
    self.assertEqual(len(self._handler.messages['info']), 8)
    transformed = pipeline.logged_transform({'x': ['1', '2', '3']})
    self.assertEqual(self._handler.messages['error'], [])
    self.assertEqual(len(self._handler.messages['info']), 8 + 4)
    # std([1,2,3]) == 0.816496580927726
    self.assertEqual(
        transformed['x'],
        [-1.2247448713915887, 0.0, 1.2247448713915887])
def test_basic(self):
    """Schema properties of a two-pipe pipeline plus an end-to-end
    fit/transform round trip."""
    pipeline = Pipeline([Pipe1(), Pipe2()])
    self.assertEqual(
        pipeline.fitted_parameters,
        {'0': {}, '1': {'mean': float, 'var': float}})
    self.assertEqual(pipeline.transform_requires, {'x': types.List(str)})
    self.assertEqual(pipeline.fit_requires, {'x': types.List(str)})
    pipeline.fit({'x': ['1', '2', '3']}, {'1': {'unused': 1.0}})
    transformed = pipeline.transform({'x': ['1', '2', '3']})
    # std([1,2,3]) == 0.816496580927726
    self.assertEqual(
        transformed['x'],
        [-1.2247448713915887, 0.0, 1.2247448713915887])
def test_two_fit_schema(self):
    # P4 fit-needs 'x1', P2 fit-needs 'x' (float) => fit_requires needs
    # both keys at their first type-occurrence in the chain
    pipeline = Pipeline([Pipe1(), Pipe4(), Pipe2()])
    self.assertEqual(pipeline.transform_requires, {'x': types.List(str)})
    self.assertEqual(
        pipeline.fit_requires,
        {'x': types.List(str), 'x1': types.List(float)})
    self.assertEqual(pipeline.transform_modifies, {'x': types.List(float)})
def test_combine(self):
    """When two pipes modify the same key, the pipeline reports both
    modifications and transform_schema composes them correctly."""
    pipeline = Pipeline([Pipe1(), Pipe2()])
    frame = pd.DataFrame({'a': [2.0], 'b': [2.0]})
    transformed = pipeline.transform({'x': frame})
    self.assertEqual(len(transformed['x'].columns), 2)
    self.assertEqual(
        pipeline.transform_modifies,
        {'x': [Pipe1.transform_modifies['x'], Pipe2.transform_modifies['x']]})
    input_schema = {
        'x': types.PandasDataFrame({'a': np.float64, 'b': np.float64})
    }
    schema = pipeline.transform_schema(input_schema)
    self.assertEqual(
        schema['x'],
        types.PandasDataFrame({'b': np.float64, 'a * b': np.float64}))
def test_two_transform_data(self):
    # P1 needs 'x', P2 needs 'x1' => the pipeline requires both keys
    pipeline = Pipeline([Pipe1(), Pipe3(), Pipe2()])
    expected_requires = {
        'x': types.List(str),
        'x1': types.List(str),
    }
    self.assertEqual(pipeline.transform_requires, expected_requires)
    self.assertEqual(pipeline.fit_requires, expected_requires)
    self.assertEqual(pipeline.transform_modifies, {'x': types.List(float)})
def test_check_transform(self):
    """check_transform returns a list of problems; with raise_=True it
    raises on the first type mismatch instead."""
    pipeline = Pipeline([Pipe1(), Pipe2()])
    # ok
    self.assertEqual(pipeline.check_transform({'x': ['1']}), [])
    # ok with raise
    self.assertEqual(pipeline.check_transform({'x': ['1']}, True), [])
    # not ok
    self.assertEqual(len(pipeline.check_transform({'x': 1})), 1)
    # not ok with raise
    with self.assertRaises(exceptions.WrongType):
        pipeline.check_transform({'x': [1]}, True)
def test_transform_schema(self):
    # P1 needs 'x', P2 needs 'x1'
    pipeline = Pipeline([Pipe1(), Pipe3(), Pipe2()])
    # 'x1' is passed along without modification
    result_schema = pipeline.transform_schema({
        'x': types.List(str),
        'x1': types.List(str),
    })
    self.assertEqual(
        result_schema,
        {'x': types.List(float), 'x1': types.List(str)})
    # a missing key is reported against the pipe that first needs it
    with self.assertRaises(exceptions.WrongSchema) as e:
        pipeline.transform_schema({'y': types.List(str)})
    self.assertIn('in transform of pipe \'0\' of Pipeline', str(e.exception))
    with self.assertRaises(exceptions.WrongSchema) as e:
        pipeline.transform_schema({'x': types.List(str)})
    self.assertIn('in transform of pipe \'1\' of Pipeline', str(e.exception))
def test_check_fit(self):
    """check_fit mirrors check_transform: returns problems as a list, or
    raises with raise_=True; missing fit parameters are also an error."""
    pipeline = Pipeline([Pipe1(), Pipe2()])
    fit_params = {'1': {'unused': 1.0}}
    # ok
    self.assertEqual(pipeline.check_fit({'x': ['1']}, fit_params), [])
    # ok with raise
    self.assertEqual(pipeline.check_fit({'x': ['1']}, fit_params, True), [])
    # not ok
    self.assertEqual(len(pipeline.check_fit({'x': 1}, fit_params)), 1)
    # not ok with raise
    with self.assertRaises(exceptions.WrongType):
        pipeline.check_fit({'x': [1]}, fit_params, True)
    # a lone Pipe2 fit-requires floats, so strings raise
    pipeline = Pipeline([Pipe2()])
    with self.assertRaises(exceptions.WrongType):
        pipeline.check_fit({'x': ['1']}, fit_params, True)
def test_logged_transform(self):
    """logged_transform logs type errors but keeps running, so later
    pipes still execute and their errors are recorded too."""
    pipeline = Pipeline([Pipe1(), Pipe2()])
    pipeline.fit({'x': ['1', '2', '3']}, {'1': {'unused': 1.0}})
    pipeline.logged_transform({'x': ['1', '2', '3']})
    self.assertEqual(self._handler.messages['error'], [])
    self.assertEqual(len(self._handler.messages['info']), 4)
    self._handler.reset()
    # wrong input type: one error, info messages unchanged
    pipeline.logged_transform({'x': [1, 2, 3]})
    self.assertEqual(len(self._handler.messages['error']), 1)
    self.assertEqual(len(self._handler.messages['info']), 4)
    self._handler.reset()
    pipeline = Pipeline([PipeWrongTransform(), Pipe2()])
    pipeline.fit({'x': ['1', '2', '3']}, {'1': {'unused': 1.0}})
    pipeline.logged_transform({'x': ['1', '2', '3']})
    # 2 errors: 1 transform_modify and 1 for wrong input to second pipe
    self.assertEqual(len(self._handler.messages['error']), 2)
    self.assertEqual(
        self._handler.messages['error'][0],
        "Wrong type in result 'x' of modified data from transform in 0:\nRequired type: List(float)\nPassed type: List(int)"
    )
    self.assertEqual(
        self._handler.messages['error'][1],
        "Wrong type in argument 'x' of transform in 1:\nRequired type: List(float)\nPassed type: List(int)"
    )
    self.assertEqual(len(self._handler.messages['info']), 4)
def test_requirements(self):
    """The pipeline aggregates the package requirements of all its pipes."""
    pipeline = Pipeline([Pipe1(), Pipe2()])
    self.assertEqual(pipeline.requirements, {'a1', 'a2'})
    self.assertEqual(len(pipeline.check_requirements), 2)
# NOTE(review): this chunk looks scrambled by extraction — the five
# module-level statements below reference `predict_pipeline`, `df`,
# `target_column` and `parameters`, which are only defined later (inside
# the __main__ guard) or outside this chunk entirely. In the original
# script they presumably ran *after* the pipeline construction below —
# confirm against the full file before relying on this ordering.
# Fit on the train split, then predict on the test CSV and write a
# submission file.
x, y = x_y_split(df, target_column)
predict_pipeline.fit({'x': x, 'y': y}, parameters)
df = pd.read_csv('examples/all/test.csv', index_col='Id')
result = predict_pipeline.transform({'x': df})['y_pred']
pd.Series(result, name=target_column, index=df.index).to_csv(
    'examples/submission.txt', header=True)
if __name__ == '__main__':
    # Build the full prediction pipeline: preprocessing pipes followed by
    # a baseline and a lasso model.
    predict_pipeline = Pipeline([
        SplitNumericCategorical(),
        FillNaN(),
        JoinCategoricalAsOneHot(),
        ('baseline', BaselineModel()),
        ('model', LogLassoModel())
    ])
    # Route schemaflow's logger to stdout at INFO level so pipeline
    # checks are visible when running the example.
    import logging
    import sys
    logger = logging.getLogger('schemaflow')
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.INFO)
    ch.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(ch)
    # this pipeline is very generic: it does not make any assumptions about the data's format.