예제 #1
0
    def test_lambda(self):
        """LambdaStage: verify output-key generation and an end-to-end run.

        First checks that input/output keys are derived correctly from the
        wrapped callable, then runs a LambdaStage inside a Pipeline and
        compares each CSV output against a direct call of the function.
        """

        # Test output key generation

        # Input keys come from the lambda's parameter names; a single
        # unnamed output defaults to 'output0'.
        l1 = LambdaStage(lambda x, y: 0)
        self.assertEqual(l1.input_keys, ['x', 'y'])
        self.assertEqual(l1.output_keys, ['output0'])

        # n_outputs generates 'output0' .. 'output{n-1}'.
        l2 = LambdaStage(lambda: 0, n_outputs=3)
        self.assertEqual(l2.input_keys, [])
        # range (not Python-2-only xrange) keeps this portable.
        self.assertEqual(l2.output_keys,
                         ['output{}'.format(i) for i in range(3)])

        # Test running in pipeline

        in_data = np_nd_to_sa(np.random.random((100, 10)))
        scale = np_nd_to_sa(np.array(3))
        out_keys = ['augmented', 'log_col', 'sqrt_col', 'scale_col']

        def log1_sqrt2_scale3(A, scale):
            # Returns the augmented array plus each derived column, so the
            # stage exercises multiple named outputs at once.
            names = A.dtype.names
            log_col = np.log(A[names[0]])
            sqrt_col = np.sqrt(A[names[1]])
            scale_col = A[names[2]] * scale[0][0]

            return (append_fields(A, ['log1', 'sqrt2', 'scale3'],
                                  (log_col, sqrt_col, scale_col)), log_col,
                    sqrt_col, scale_col)

        p = Pipeline()

        np_in = p.add(NumpyRead(in_data))
        scale_in = p.add(NumpyRead(scale))

        lambda_stage = p.add(LambdaStage(log1_sqrt2_scale3, out_keys))
        np_in['output'] > lambda_stage['A']
        scale_in['output'] > lambda_stage['scale']

        # One CSV writer per lambda output key. (The old csv_out_stages
        # list was collected but never read, so it has been removed.)
        for key in out_keys:
            stage = p.add(CSVWrite(self._tmp_files('out_{}.csv'.format(key))))
            lambda_stage[key] > stage['input']

        self.run_pipeline(p)

        # Compute the expected results directly for comparison.
        controls = log1_sqrt2_scale3(in_data, scale)

        for i, key in enumerate(out_keys):
            control = controls[i]
            if is_sa(control):
                # Structured arrays are converted to plain ndarrays so they
                # can be compared with the CSV round-trip below.
                control = np_sa_to_nd(control)[0]
            result = self._tmp_files.csv_read('out_{}.csv'.format(key),
                                              as_nd=True)
            self.assertTrue(np.allclose(control, result))
예제 #2
0
    def test_apply_to_selected_cols(self):
        """ApplyToSelectedCols must transform only the selected columns.

        Runs the stage under test next to a control pipeline that splits
        out the selected columns and applies the transformer directly, then
        checks that selected columns match the control and all other
        columns pass through untouched.
        """
        rows = 100
        cols = 10
        random_data = np.random.rand(rows, cols)
        # enough nans so that there /has/ to be a Nan in 1 of our 3 selected cols
        nans = 701
        with_nans = np.copy(random_data)
        for r, c in zip(np.random.randint(0, rows, nans), 
                        np.random.randint(0, cols, nans)):
            # np.nan rather than np.NaN: the np.NaN alias was removed in
            # NumPy 2.0; np.nan works on every NumPy version.
            with_nans[r, c] = np.nan
        # Each trial: (stage class, ctor args, input key, output key, data).
        trials = ((wrap('sklearn.preprocessing.StandardScaler'), 
                   (), 
                   'X_train', 
                   'X_new',
                   np_nd_to_sa(random_data)), 
                  (FillNA, 
                   (0,), 
                   'input', 
                   'output',
                   np_nd_to_sa(with_nans)))
        sel_cols = ('f2', 'f3', 'f4')
        # NOTE(review): only the FillNA trial is run; the StandardScaler
        # trial is deliberately skipped here — confirm whether it should
        # be re-enabled.
        trials = trials[1:]

        for trans_cls, args, in_key, out_key, in_data in trials:
            p = Pipeline()

            # Pipeline under test: apply the transformer to sel_cols only.
            node_in = p.add(NumpyRead(in_data))
            node_selected = p.add(
                ApplyToSelectedCols(sel_cols, trans_cls, *args))
            node_in['output'] > node_selected[in_key]

            node_out = p.add(NumpyWrite())
            node_selected[out_key] > node_out['input']

            # Control pipeline: split out sel_cols and apply the
            # transformer to them directly.
            node_ctrl_split = p.add(SplitColumns(sel_cols))
            node_in['output'] > node_ctrl_split['input']

            node_ctrl_trans = p.add(trans_cls(*args))
            node_ctrl_split['output'] > node_ctrl_trans[in_key]

            node_ctrl_out = p.add(NumpyWrite())
            node_ctrl_trans[out_key] > node_ctrl_out['input']

            self.run_pipeline(p)

            result = node_out.get_stage().result
            ctrl = node_ctrl_out.get_stage().result

            for col in in_data.dtype.names:
                if col in sel_cols:
                    # Selected columns must equal the directly-transformed
                    # control.
                    self.assertTrue(np.allclose(result[col], ctrl[col]))
                else:
                    # Unselected columns must pass through unchanged; zero
                    # the NaNs so allclose can compare them.
                    self.assertTrue(np.allclose(
                        np.nan_to_num(result[col]), 
                        np.nan_to_num(in_data[col])))
예제 #3
0
    def test_apply_to_selected_cols(self):
        """ApplyToSelectedCols should transform only the selected columns,
        leaving all other columns untouched; a parallel control pipeline
        applies the transformer directly for comparison."""
        rows = 100
        cols = 10
        random_data = np.random.rand(rows, cols)
        # enough nans so that there /has/ to be a Nan in 1 of our 3 selected cols
        nans = 701
        with_nans = np.copy(random_data)
        for r, c in zip(np.random.randint(0, rows, nans),
                        np.random.randint(0, cols, nans)):
            # np.nan, not np.NaN: the np.NaN alias was removed in NumPy 2.0.
            with_nans[r, c] = np.nan
        # Each trial: (stage class, ctor args, input key, output key, data).
        trials = ((wrap('sklearn.preprocessing.StandardScaler'), (), 'X_train',
                   'X_new', np_nd_to_sa(random_data)),
                  (FillNA, (0, ), 'input', 'output', np_nd_to_sa(with_nans)))
        sel_cols = ('f2', 'f3', 'f4')
        # NOTE(review): the StandardScaler trial is skipped here — confirm
        # whether it should be re-enabled.
        trials = trials[1:]

        for trans_cls, args, in_key, out_key, in_data in trials:
            p = Pipeline()

            # Pipeline under test: transform only sel_cols.
            node_in = p.add(NumpyRead(in_data))
            node_selected = p.add(
                ApplyToSelectedCols(sel_cols, trans_cls, *args))
            node_in['output'] > node_selected[in_key]

            node_out = p.add(NumpyWrite())
            node_selected[out_key] > node_out['input']

            # Control: split out sel_cols and transform them directly.
            node_ctrl_split = p.add(SplitColumns(sel_cols))
            node_in['output'] > node_ctrl_split['input']

            node_ctrl_trans = p.add(trans_cls(*args))
            node_ctrl_split['output'] > node_ctrl_trans[in_key]

            node_ctrl_out = p.add(NumpyWrite())
            node_ctrl_trans[out_key] > node_ctrl_out['input']

            self.run_pipeline(p)

            result = node_out.get_stage().result
            ctrl = node_ctrl_out.get_stage().result

            for col in in_data.dtype.names:
                if col in sel_cols:
                    # Selected columns match the directly-transformed control.
                    self.assertTrue(np.allclose(result[col], ctrl[col]))
                else:
                    # Unselected columns pass through unchanged; zero the
                    # NaNs so allclose can compare.
                    self.assertTrue(
                        np.allclose(np.nan_to_num(result[col]),
                                    np.nan_to_num(in_data[col])))
예제 #4
0
    def test_identity(self):
        """Identity stage: keys may be given as an input->output dict,
        as plain input keys (outputs inferred with a suffix), as an
        output->input dict, or as plain output keys (inputs inferred)."""
        # (input keys, output keys, ctor argument, pass arg as input spec?)
        trials = [
            (('input0', 'input1'),
             ('output0', 'output1'),
             {'input0': 'output0', 'input1': 'output1'},
             True),
            (('input0', 'input1', 'input2'),
             ('input0_out', 'input1_out', 'input2_out'),
             ('input0', 'input1', 'input2'),
             True),
            (('input0', 'input1'),
             ('output0', 'output1'),
             {'output0': 'input0', 'output1': 'input1'},
             False),
            (('output0_in', 'output1_in', 'output2_in'),
             ('output0', 'output1', 'output2'),
             ('output0', 'output1', 'output2'),
             False),
        ]

        for in_keys, out_keys, arg, arg_is_input in trials:

            pipeline = Pipeline()

            # The constructor argument is either the input spec
            # (positional) or the output spec (keyword).
            if arg_is_input:
                identity_node = pipeline.add(Identity(arg))
            else:
                identity_node = pipeline.add(Identity(output_keys=arg))

            arrays = []
            writers = []
            for in_key, out_key in zip(in_keys, out_keys):

                data = np_nd_to_sa(np.random.random((100, 10)))
                reader = pipeline.add(NumpyRead(data))
                reader['output'] > identity_node[in_key]

                writer = pipeline.add(NumpyWrite())
                identity_node[out_key] > writer['input']

                arrays.append(data)
                writers.append(writer)

            self.run_pipeline(pipeline)

            # Every output must be identical to its corresponding input.
            for data, writer in zip(arrays, writers):
                self.assertTrue(
                    np.array_equal(data, writer.get_stage().result))
예제 #5
0
    def test_identity(self):
        """Exercise the Identity stage with its four key-specification
        modes: input->output dict, inferred output names, output->input
        dict, and inferred input names."""
        trials = [
            (('input0', 'input1'), ('output0', 'output1'),
             {'input0': 'output0', 'input1': 'output1'}, True),
            (('input0', 'input1', 'input2'),
             ('input0_out', 'input1_out', 'input2_out'),
             ('input0', 'input1', 'input2'), True),
            (('input0', 'input1'), ('output0', 'output1'),
             {'output0': 'input0', 'output1': 'input1'}, False),
            (('output0_in', 'output1_in', 'output2_in'),
             ('output0', 'output1', 'output2'),
             ('output0', 'output1', 'output2'), False),
        ]

        for keys_in, keys_out, ctor_arg, as_input in trials:

            p = Pipeline()

            # Pass the spec positionally (input keys) or as output_keys,
            # depending on the trial.
            if as_input:
                ident = p.add(Identity(ctor_arg))
            else:
                ident = p.add(Identity(output_keys=ctor_arg))

            expected = []
            sinks = []
            for key_in, key_out in zip(keys_in, keys_out):

                data = np_nd_to_sa(np.random.random((100, 10)))
                source = p.add(NumpyRead(data))
                source['output'] > ident[key_in]

                sink = p.add(NumpyWrite())
                ident[key_out] > sink['input']

                expected.append(data)
                sinks.append(sink)

            self.run_pipeline(p)

            # Identity must reproduce each input unchanged on its output.
            for data, sink in zip(expected, sinks):
                self.assertTrue(
                    np.array_equal(data, sink.get_stage().result))
예제 #6
0
    def test_kfold(self):
        """KFold stage: each train/test split of each input array must
        match the splits produced by sklearn's KFold directly."""

        n_folds = 3
        n_rows = 6

        X = np.random.randint(0, 1000, (n_rows, 3))
        y = np.random.randint(0, 1000, (n_rows, 1))

        pipeline = Pipeline()

        read_X = pipeline.add(NumpyRead(X))
        read_y = pipeline.add(NumpyRead(y))

        # 2 inputs (X and y) split into n_folds folds.
        kfold_node = pipeline.add(KFold(2, n_folds, random_state=0))
        read_X['output'] > kfold_node['input0']
        read_y['output'] > kfold_node['input1']

        # Build the expected folds with sklearn directly, attaching one
        # CSV writer per (selection, array, fold) output as we go.
        ctrl_kf = SKKFold(n_rows, n_folds=n_folds, random_state=0)
        out_files = []
        expected_folds = []
        arrays = (X, y)
        for fold_i, train_test_inds in enumerate(ctrl_kf):
            for array_i, array in enumerate(arrays):
                for select_i, selection in enumerate(('train', 'test')):
                    out_key = '{}{}_{}'.format(selection, array_i, fold_i)
                    out_file = out_key + '.csv'
                    out_files.append(out_file)
                    writer = pipeline.add(CSVWrite(self._tmp_files(out_file)))
                    kfold_node[out_key] > writer['input']
                    expected_folds.append(
                        np_nd_to_sa(array[train_test_inds[select_i]]))

        self.run_pipeline(pipeline)

        # Every CSV written by the stage must equal its expected fold.
        for out_file, expected in zip(out_files, expected_folds):
            actual = self._tmp_files.csv_read(out_file)
            self.assertTrue(np.array_equal(actual, expected))
예제 #7
0
    def test_kfold(self):
        """Verify the KFold stage output against sklearn KFold splits."""

        fold_count = 3
        row_count = 6

        X = np.random.randint(0, 1000, (row_count, 3))
        y = np.random.randint(0, 1000, (row_count, 1))

        p = Pipeline()

        node_X = p.add(NumpyRead(X))
        node_y = p.add(NumpyRead(y))

        # Stage under test: 2 input arrays, fold_count folds.
        node_kfold = p.add(KFold(2, fold_count, random_state=0))
        node_X['output'] > node_kfold['input0']
        node_y['output'] > node_kfold['input1']

        # Reference splitter; its folds define the expected outputs.
        ctrl_kf = SKKFold(row_count, n_folds=fold_count, random_state=0)
        out_files = []
        expected_folds = []
        arrays = (X, y)
        for fold_i, train_test_inds in enumerate(ctrl_kf):
            for array_i, array in enumerate(arrays):
                for select_i, selection in enumerate(('train', 'test')):
                    # One CSV output per (selection, array, fold) triple.
                    out_key = '{}{}_{}'.format(selection, array_i, fold_i)
                    out_file = '{}.csv'.format(out_key)
                    out_files.append(out_file)
                    csv_node = p.add(CSVWrite(self._tmp_files(out_file)))
                    node_kfold[out_key] > csv_node['input']
                    slice_inds = train_test_inds[select_i]
                    expected_folds.append(np_nd_to_sa(array[slice_inds]))

        self.run_pipeline(p)

        for out_file, expected_fold in zip(out_files, expected_folds):
            written = self._tmp_files.csv_read(out_file)
            self.assertTrue(np.array_equal(written, expected_fold))
예제 #8
0
    def test_lambda(self):
        """LambdaStage: check key generation and a full pipeline run.

        Verifies that input/output keys are derived from the wrapped
        callable, then runs a LambdaStage in a Pipeline and compares each
        CSV output against a direct call of the function.
        """

        # Test output key generation

        # Input keys come from the lambda's parameter names; one unnamed
        # output defaults to 'output0'.
        l1 = LambdaStage(lambda x, y: 0)
        self.assertEqual(l1.input_keys, ['x', 'y'])
        self.assertEqual(l1.output_keys, ['output0'])

        # n_outputs generates 'output0' .. 'output{n-1}'.
        l2 = LambdaStage(lambda: 0, n_outputs=3)
        self.assertEqual(l2.input_keys, [])
        # range (not Python-2-only xrange) keeps this portable.
        self.assertEqual(l2.output_keys, ['output{}'.format(i) for i in
                                          range(3)])

        # Test running in pipeline

        in_data = np_nd_to_sa(np.random.random((100, 10)))
        scale = np_nd_to_sa(np.array(3))
        out_keys = ['augmented', 'log_col', 'sqrt_col', 'scale_col']

        def log1_sqrt2_scale3(A, scale):
            # Returns the augmented array plus each derived column so the
            # stage exercises several named outputs at once.
            names = A.dtype.names
            log_col = np.log(A[names[0]])
            sqrt_col = np.sqrt(A[names[1]])
            scale_col = A[names[2]] * scale[0][0]

            return (append_fields(
                        A, 
                        ['log1', 'sqrt2', 'scale3'], 
                        (log_col, sqrt_col, scale_col)),
                    log_col,
                    sqrt_col,
                    scale_col)

        p = Pipeline()

        np_in = p.add(NumpyRead(in_data))
        scale_in = p.add(NumpyRead(scale))

        lambda_stage = p.add(
            LambdaStage(
                log1_sqrt2_scale3, 
                out_keys))
        np_in['output'] > lambda_stage['A']
        scale_in['output'] > lambda_stage['scale']

        # One CSV writer per lambda output key. (The old csv_out_stages
        # list was collected but never read, so it has been removed.)
        for key in out_keys:
            stage = p.add(
                    CSVWrite(
                        self._tmp_files(
                            'out_{}.csv'.format(key))))
            lambda_stage[key] > stage['input']

        self.run_pipeline(p)

        # Compute expected results directly for comparison.
        controls = log1_sqrt2_scale3(in_data, scale)

        for i, key in enumerate(out_keys):
            control = controls[i]
            if is_sa(control):
                # Convert structured arrays to plain ndarrays for the
                # comparison against the CSV round-trip below.
                control = np_sa_to_nd(control)[0]
            result = self._tmp_files.csv_read(
                        'out_{}.csv'.format(key), 
                        as_nd=True)
            self.assertTrue(np.allclose(control, result))