Пример #1
0
    def test_drop(self):
        """A pipeline holding only Pipe2 leaves a single column in 'x'."""
        pipeline = Pipeline([Pipe2()])
        frame = pd.DataFrame({'a': [2.0], 'b': [2.0]})

        transformed = pipeline.transform({'x': frame})
        remaining_columns = transformed['x'].columns
        # The input frame has two columns ('a' and 'b'); one must be left.
        self.assertEqual(len(remaining_columns), 1)

        # With a single pipe, the pipeline reports that pipe's modifications.
        self.assertEqual(pipeline.transform_modifies,
                         Pipe2.transform_modifies)
Пример #2
0
    def test_set(self):
        """A pipeline holding only Pipe1 yields an 'a * b' column in 'x'.

        For the input a = b = 2.0 that column must hold the single value 4.0.
        """
        p = Pipeline([Pipe1()])

        result = p.transform({'x': pd.DataFrame({'a': [2.0], 'b': [2.0]})})
        # Compare plain Python lists. assertEqual on a numpy array relies on
        # the truth value of an element-wise comparison, which is only
        # defined for single-element arrays and gives an unhelpful failure
        # message.
        self.assertEqual(result['x'].loc[:, 'a * b'].values.tolist(), [4.0])

        # With a single pipe, the pipeline reports that pipe's modifications.
        self.assertEqual(p.transform_modifies, Pipe1.transform_modifies)
Пример #3
0
    def test_custom_parameters(self):
        """Fit and transform through a pipeline whose pipes are named."""
        named_pipes = [('1', Pipe1()), ('2', Pipe2())]
        pipeline = Pipeline(named_pipes)

        # The fit-parameters dict uses the key '2', matching the name given
        # to Pipe2 above.
        pipeline.fit({'x': ['1', '2', '3']}, {'2': {'unused': 1.0}})
        transformed = pipeline.transform({'x': ['1', '2', '3']})

        # std([1,2,3]) == 0.816496580927726
        expected = [-1.2247448713915887, 0.0, 1.2247448713915887]
        self.assertEqual(transformed['x'], expected)
Пример #4
0
    def test_custom_init(self):
        """Pipeline keeps an OrderedDict of pipes and rejects invalid input."""
        ordered_pipes = collections.OrderedDict()
        ordered_pipes['1'] = Pipe1()
        ordered_pipes['2'] = Pipe2()
        pipeline = Pipeline(ordered_pipes)

        self.assertEqual(pipeline.pipes, ordered_pipes)

        # A named entry whose value is a plain int must be refused.
        invalid_entries = [('1', 1)]
        with self.assertRaises(TypeError):
            Pipeline(invalid_entries)

        # A bare pipe that is not wrapped in a container must be refused too.
        with self.assertRaises(TypeError):
            Pipeline(Pipe1())
Пример #5
0
    def test_logged(self):
        """logged_fit and logged_transform log info messages and no errors."""
        def logged(level):
            # Read the handler afresh on every call rather than caching it.
            return self._handler.messages[level]

        pipeline = Pipeline([Pipe1(), Pipe2()])
        fit_info_count = 8

        pipeline.logged_fit({'x': ['1', '2', '3']}, {'1': {'unused': 1.0}})
        self.assertEqual(logged('error'), [])
        self.assertEqual(len(logged('info')), fit_info_count)

        transformed = pipeline.logged_transform({'x': ['1', '2', '3']})
        self.assertEqual(logged('error'), [])
        # The handler is not reset in between, so the 4 messages from the
        # transform add to the 8 from the fit.
        self.assertEqual(len(logged('info')), fit_info_count + 4)

        # std([1,2,3]) == 0.816496580927726
        expected = [-1.2247448713915887, 0.0, 1.2247448713915887]
        self.assertEqual(transformed['x'], expected)
Пример #6
0
    def test_basic(self):
        """Check the schema properties of a two-pipe pipeline, then run it."""
        pipeline = Pipeline([Pipe1(), Pipe2()])

        # The unnamed pipes show up under the keys '0' and '1'.
        expected_fitted = {'0': {}, '1': {'mean': float, 'var': float}}
        self.assertEqual(pipeline.fitted_parameters, expected_fitted)

        # Both transform and fit need 'x' as a list of strings.
        string_list_schema = {'x': types.List(str)}
        self.assertEqual(pipeline.transform_requires, string_list_schema)
        self.assertEqual(pipeline.fit_requires, string_list_schema)

        pipeline.fit({'x': ['1', '2', '3']}, {'1': {'unused': 1.0}})
        transformed = pipeline.transform({'x': ['1', '2', '3']})

        # std([1,2,3]) == 0.816496580927726
        expected = [-1.2247448713915887, 0.0, 1.2247448713915887]
        self.assertEqual(transformed['x'], expected)
Пример #7
0
    def test_two_fit_schema(self):
        """fit_requires gathers keys that are only needed while fitting."""
        # Per the original author's note: Pipe4 needs 'x1' to fit and Pipe2
        # needs 'x' as floats to fit. fit_requires must list both keys, each
        # with the type of its first occurrence, so 'x' appears as List(str).
        pipeline = Pipeline([Pipe1(), Pipe4(), Pipe2()])

        # 'x1' is absent here: it is not required for transforming.
        expected_transform_requires = {'x': types.List(str)}
        self.assertEqual(pipeline.transform_requires,
                         expected_transform_requires)

        expected_fit_requires = {
            'x': types.List(str),
            'x1': types.List(float),
        }
        self.assertEqual(pipeline.fit_requires, expected_fit_requires)

        expected_modifies = {'x': types.List(float)}
        self.assertEqual(pipeline.transform_modifies, expected_modifies)
Пример #8
0
    def test_combine(self):
        """Pipe1 followed by Pipe2 combine their modifications of 'x'."""
        pipeline = Pipeline([Pipe1(), Pipe2()])

        frame = pd.DataFrame({'a': [2.0], 'b': [2.0]})
        transformed = pipeline.transform({'x': frame})
        self.assertEqual(len(transformed['x'].columns), 2)

        # Both pipes modify 'x', so the pipeline lists both modifications in
        # pipe order.
        combined = [Pipe1.transform_modifies['x'],
                    Pipe2.transform_modifies['x']]
        self.assertEqual(pipeline.transform_modifies, {'x': combined})

        input_frame_type = types.PandasDataFrame({
            'a': np.float64,
            'b': np.float64,
        })
        output_schema = pipeline.transform_schema({'x': input_frame_type})

        # The resulting schema holds 'b' and 'a * b'; 'a' is gone.
        expected_frame_type = types.PandasDataFrame({
            'b': np.float64,
            'a * b': np.float64,
        })
        self.assertEqual(output_schema['x'], expected_frame_type)
Пример #9
0
    def test_two_transform_data(self):
        """Keys needed by different pipes are merged into the requirements."""
        # Original note: "P1 needs 'x', P2 needs 'x1'". Presumably the second
        # pipe in the list (Pipe3) is the one needing 'x1' -- TODO confirm.
        pipeline = Pipeline([Pipe1(), Pipe3(), Pipe2()])

        # The same two keys are expected for transforming and for fitting.
        expected_transform_requires = {
            'x': types.List(str),
            'x1': types.List(str),
        }
        self.assertEqual(pipeline.transform_requires,
                         expected_transform_requires)

        expected_fit_requires = {
            'x': types.List(str),
            'x1': types.List(str),
        }
        self.assertEqual(pipeline.fit_requires, expected_fit_requires)

        # Only 'x' is reported as modified.
        expected_modifies = {'x': types.List(float)}
        self.assertEqual(pipeline.transform_modifies, expected_modifies)
Пример #10
0
    def test_check_transform(self):
        """check_transform returns the problems found, or raises on request."""
        pipeline = Pipeline([Pipe1(), Pipe2()])

        # Valid data: an empty result, with and without the raise flag.
        plain_outcome = pipeline.check_transform({'x': ['1']})
        self.assertEqual(plain_outcome, [])
        raising_outcome = pipeline.check_transform({'x': ['1']}, True)
        self.assertEqual(raising_outcome, [])

        # 'x' is an int instead of a list: exactly one problem is returned.
        problems = pipeline.check_transform({'x': 1})
        self.assertEqual(len(problems), 1)

        # 'x' is a list of ints: with the raise flag this must raise.
        with self.assertRaises(exceptions.WrongType):
            pipeline.check_transform({'x': [1]}, True)
Пример #11
0
    def test_transform_schema(self):
        """transform_schema maps schemas and names the pipe that rejects one."""
        pipeline = Pipeline([Pipe1(), Pipe3(), Pipe2()])

        # 'x1' is passed along without modification, 'x' becomes floats.
        input_schema = {'x': types.List(str), 'x1': types.List(str)}
        expected_schema = {'x': types.List(float), 'x1': types.List(str)}
        self.assertEqual(pipeline.transform_schema(input_schema),
                         expected_schema)

        # Without 'x' the failure is attributed to pipe '0'.
        with self.assertRaises(exceptions.WrongSchema) as caught:
            pipeline.transform_schema({'y': types.List(str)})
        message = str(caught.exception)
        self.assertIn('in transform of pipe \'0\' of Pipeline', message)

        # With 'x' but without 'x1' the failure is attributed to pipe '1'.
        with self.assertRaises(exceptions.WrongSchema) as caught:
            pipeline.transform_schema({'x': types.List(str)})
        message = str(caught.exception)
        self.assertIn('in transform of pipe \'1\' of Pipeline', message)
Пример #12
0
    def test_check_fit(self):
        """check_fit returns the problems found, or raises on request."""
        def parameters():
            # Build a fresh dict per call, like passing a literal each time.
            return {'1': {'unused': 1.0}}

        pipeline = Pipeline([Pipe1(), Pipe2()])

        # Valid data: an empty result, with and without the raise flag.
        plain_outcome = pipeline.check_fit({'x': ['1']}, parameters())
        self.assertEqual(plain_outcome, [])
        raising_outcome = pipeline.check_fit({'x': ['1']}, parameters(), True)
        self.assertEqual(raising_outcome, [])

        # 'x' is an int instead of a list: exactly one problem is returned.
        problems = pipeline.check_fit({'x': 1}, parameters())
        self.assertEqual(len(problems), 1)

        # 'x' is a list of ints: with the raise flag this must raise.
        with self.assertRaises(exceptions.WrongType):
            pipeline.check_fit({'x': [1]}, parameters(), True)

        # Pipe2 on its own rejects a list of strings (presumably because it
        # expects floats -- confirm against Pipe2's declared requirements).
        lone_pipe2 = Pipeline([Pipe2()])
        with self.assertRaises(exceptions.WrongType):
            lone_pipe2.check_fit({'x': ['1']}, parameters(), True)
Пример #13
0
    def test_logged_transform(self):
        """logged_transform reports type problems through the log handler."""
        def count(level):
            # Read the handler afresh each time; it is reset between phases.
            return len(self._handler.messages[level])

        pipeline = Pipeline([Pipe1(), Pipe2()])
        pipeline.fit({'x': ['1', '2', '3']}, {'1': {'unused': 1.0}})

        # Same kind of input as used for fitting: no errors, 4 info messages.
        pipeline.logged_transform({'x': ['1', '2', '3']})
        self.assertEqual(self._handler.messages['error'], [])
        self.assertEqual(count('info'), 4)

        self._handler.reset()

        # Ints instead of strings: one error is logged, still 4 info messages.
        pipeline.logged_transform({'x': [1, 2, 3]})
        self.assertEqual(count('error'), 1)
        self.assertEqual(count('info'), 4)

        self._handler.reset()

        faulty_pipeline = Pipeline([PipeWrongTransform(), Pipe2()])
        faulty_pipeline.fit({'x': ['1', '2', '3']}, {'1': {'unused': 1.0}})
        faulty_pipeline.logged_transform({'x': ['1', '2', '3']})
        # 2 errors: 1 transform_modify and 1 for wrong input to second pipe
        self.assertEqual(count('error'), 2)
        expected_errors = (
            "Wrong type in result 'x' of modified data from transform in 0:\nRequired type: List(float)\nPassed type:   List(int)",
            "Wrong type in argument 'x' of transform in 1:\nRequired type: List(float)\nPassed type:   List(int)",
        )
        for position, expected_message in enumerate(expected_errors):
            self.assertEqual(self._handler.messages['error'][position],
                             expected_message)
        self.assertEqual(count('info'), 4)
Пример #14
0
    def test_requirements(self):
        """The pipeline exposes requirements 'a1' and 'a2' and two checks."""
        pipeline = Pipeline([Pipe1(), Pipe2()])

        expected_requirements = {'a1', 'a2'}
        self.assertEqual(pipeline.requirements, expected_requirements)

        # check_requirements is read as an attribute here, not called.
        checks = pipeline.check_requirements
        self.assertEqual(len(checks), 2)
Пример #15
0
    x, y = x_y_split(df, target_column)

    predict_pipeline.fit({'x': x, 'y': y}, parameters)

    df = pd.read_csv('examples/all/test.csv', index_col='Id')

    result = predict_pipeline.transform({'x': df})['y_pred']

    pd.Series(result, name=target_column,
              index=df.index).to_csv('examples/submission.txt', header=True)


if __name__ == '__main__':
    # Assemble the prediction pipeline. The first three pipes are passed
    # without a name; the last two are given the explicit names 'baseline'
    # and 'model'.
    predict_pipeline = Pipeline([
        SplitNumericCategorical(),
        FillNaN(),
        JoinCategoricalAsOneHot(), ('baseline', BaselineModel()),
        ('model', LogLassoModel())
    ])

    import logging
    import sys

    # Attach a stdout handler to the 'schemaflow' logger, formatting records
    # as "<time> - <logger name> - <level> - <message>".
    # NOTE(review): only the handler's level is set to INFO here; the
    # logger's own level is not set in these lines, so INFO records reach
    # the handler only if the logger level is lowered elsewhere -- confirm.
    logger = logging.getLogger('schemaflow')
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.INFO)
    ch.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(ch)

    # this pipeline is very generic: it does not make any assumptions about the data's format.