def test_lambda(self):
    """LambdaStage derives input keys from the function signature and
    generates output keys, and its outputs route correctly in a Pipeline."""
    # Test output key generation
    l1 = LambdaStage(lambda x, y: 0)
    self.assertEqual(l1.input_keys, ['x', 'y'])
    self.assertEqual(l1.output_keys, ['output0'])
    l2 = LambdaStage(lambda: 0, n_outputs=3)
    self.assertEqual(l2.input_keys, [])
    # range (not Python-2-only xrange) so the test runs on Python 2 and 3;
    # inside a list comprehension the two are interchangeable.
    self.assertEqual(l2.output_keys,
                     ['output{}'.format(i) for i in range(3)])

    # Test running in pipeline
    in_data = np_nd_to_sa(np.random.random((100, 10)))
    scale = np_nd_to_sa(np.array(3))
    out_keys = ['augmented', 'log_col', 'sqrt_col', 'scale_col']

    def log1_sqrt2_scale3(A, scale):
        # Returns A augmented with three derived columns, plus each
        # derived column individually -- one value per output key above.
        names = A.dtype.names
        log_col = np.log(A[names[0]])
        sqrt_col = np.sqrt(A[names[1]])
        scale_col = A[names[2]] * scale[0][0]
        return (append_fields(A, ['log1', 'sqrt2', 'scale3'],
                              (log_col, sqrt_col, scale_col)),
                log_col, sqrt_col, scale_col)

    p = Pipeline()
    np_in = p.add(NumpyRead(in_data))
    scale_in = p.add(NumpyRead(scale))
    lambda_stage = p.add(LambdaStage(log1_sqrt2_scale3, out_keys))
    np_in['output'] > lambda_stage['A']
    scale_in['output'] > lambda_stage['scale']
    # Wire one CSV writer per lambda output (the stage list kept by the
    # original code was never used, so it is omitted here).
    for key in out_keys:
        stage = p.add(CSVWrite(self._tmp_files('out_{}.csv'.format(key))))
        lambda_stage[key] > stage['input']
    self.run_pipeline(p)

    # Each pipeline output must match the function applied directly.
    controls = log1_sqrt2_scale3(in_data, scale)
    for control, key in zip(controls, out_keys):
        if is_sa(control):
            control = np_sa_to_nd(control)[0]
        result = self._tmp_files.csv_read('out_{}.csv'.format(key),
                                          as_nd=True)
        self.assertTrue(np.allclose(control, result))
def test_apply_to_selected_cols(self):
    """ApplyToSelectedCols transforms only the selected columns and
    passes the remaining columns through unchanged."""
    rows = 100
    cols = 10
    random_data = np.random.rand(rows, cols)
    # enough nans so that there /has/ to be a Nan in 1 of our 3 selected cols
    nans = 701
    with_nans = np.copy(random_data)
    for r, c in zip(np.random.randint(0, rows, nans),
                    np.random.randint(0, cols, nans)):
        # np.nan, not the np.NaN alias, which was removed in NumPy 2.0
        with_nans[r, c] = np.nan
    trials = ((wrap('sklearn.preprocessing.StandardScaler'), (), 'X_train',
               'X_new', np_nd_to_sa(random_data)),
              (FillNA, (0,), 'input', 'output', np_nd_to_sa(with_nans)))
    sel_cols = ('f2', 'f3', 'f4')
    # NOTE(review): only the FillNA trial is exercised; the StandardScaler
    # trial is deliberately (?) skipped -- confirm whether it should run.
    trials = trials[1:]
    for trans_cls, args, in_key, out_key, in_data in trials:
        p = Pipeline()
        node_in = p.add(NumpyRead(in_data))
        # Path under test: transform applied only to sel_cols.
        node_selected = p.add(
            ApplyToSelectedCols(sel_cols, trans_cls, *args))
        node_in['output'] > node_selected[in_key]
        node_out = p.add(NumpyWrite())
        node_selected[out_key] > node_out['input']
        # Control path: split out sel_cols and apply the transform directly.
        node_ctrl_split = p.add(SplitColumns(sel_cols))
        node_in['output'] > node_ctrl_split['input']
        node_ctrl_trans = p.add(trans_cls(*args))
        node_ctrl_split['output'] > node_ctrl_trans[in_key]
        node_ctrl_out = p.add(NumpyWrite())
        node_ctrl_trans[out_key] > node_ctrl_out['input']
        self.run_pipeline(p)
        result = node_out.get_stage().result
        ctrl = node_ctrl_out.get_stage().result
        for col in in_data.dtype.names:
            if col in sel_cols:
                # Selected columns must match the directly transformed ones.
                self.assertTrue(np.allclose(result[col], ctrl[col]))
            else:
                # Unselected columns pass through; nan_to_num lets NaNs in
                # matching positions compare equal.
                self.assertTrue(np.allclose(np.nan_to_num(result[col]),
                                            np.nan_to_num(in_data[col])))
def test_apply_to_selected_cols(self):
    """ApplyToSelectedCols transforms only the selected columns and
    leaves the other columns untouched."""
    rows = 100
    cols = 10
    random_data = np.random.rand(rows, cols)
    # enough nans so that there /has/ to be a Nan in 1 of our 3 selected cols
    nans = 701
    with_nans = np.copy(random_data)
    for r, c in zip(np.random.randint(0, rows, nans),
                    np.random.randint(0, cols, nans)):
        # np.nan, not the np.NaN alias, which was removed in NumPy 2.0
        with_nans[r, c] = np.nan
    trials = ((wrap('sklearn.preprocessing.StandardScaler'), (), 'X_train',
               'X_new', np_nd_to_sa(random_data)),
              (FillNA, (0,), 'input', 'output', np_nd_to_sa(with_nans)))
    sel_cols = ('f2', 'f3', 'f4')
    # NOTE(review): only the FillNA trial runs; the StandardScaler trial is
    # skipped here -- confirm whether that is intentional.
    trials = trials[1:]
    for trans_cls, args, in_key, out_key, in_data in trials:
        p = Pipeline()
        node_in = p.add(NumpyRead(in_data))
        # Stage under test.
        node_selected = p.add(
            ApplyToSelectedCols(sel_cols, trans_cls, *args))
        node_in['output'] > node_selected[in_key]
        node_out = p.add(NumpyWrite())
        node_selected[out_key] > node_out['input']
        # Control: apply the transform directly to the selected columns.
        node_ctrl_split = p.add(SplitColumns(sel_cols))
        node_in['output'] > node_ctrl_split['input']
        node_ctrl_trans = p.add(trans_cls(*args))
        node_ctrl_split['output'] > node_ctrl_trans[in_key]
        node_ctrl_out = p.add(NumpyWrite())
        node_ctrl_trans[out_key] > node_ctrl_out['input']
        self.run_pipeline(p)
        result = node_out.get_stage().result
        ctrl = node_ctrl_out.get_stage().result
        for col in in_data.dtype.names:
            if col in sel_cols:
                self.assertTrue(np.allclose(result[col], ctrl[col]))
            else:
                # nan_to_num lets NaNs in matching positions compare equal.
                self.assertTrue(
                    np.allclose(np.nan_to_num(result[col]),
                                np.nan_to_num(in_data[col])))
def test_identity(self):
    """Identity forwards each input key's data unchanged to the
    corresponding output key, for both ctor-argument conventions."""
    # Each trial: (input keys, output keys, ctor argument, whether the
    # argument is passed positionally as the input specification).
    trials = [
        (('input0', 'input1'), ('output0', 'output1'),
         {'input0': 'output0', 'input1': 'output1'}, True),
        (('input0', 'input1', 'input2'),
         ('input0_out', 'input1_out', 'input2_out'),
         ('input0', 'input1', 'input2'), True),
        (('input0', 'input1'), ('output0', 'output1'),
         {'output0': 'input0', 'output1': 'input1'}, False),
        (('output0_in', 'output1_in', 'output2_in'),
         ('output0', 'output1', 'output2'),
         ('output0', 'output1', 'output2'), False)]
    for input_keys, output_keys, arg, specify_input in trials:
        expected_arrays = []
        write_nodes = []
        p = Pipeline()
        if specify_input:
            node_id = p.add(Identity(arg))
        else:
            node_id = p.add(Identity(output_keys=arg))
        # Feed each input key fresh random data; capture each output key.
        for in_key, out_key in zip(input_keys, output_keys):
            data = np_nd_to_sa(np.random.random((100, 10)))
            reader = p.add(NumpyRead(data))
            reader['output'] > node_id[in_key]
            writer = p.add(NumpyWrite())
            node_id[out_key] > writer['input']
            expected_arrays.append(data)
            write_nodes.append(writer)
        self.run_pipeline(p)
        for expected, writer in zip(expected_arrays, write_nodes):
            self.assertTrue(
                np.array_equal(expected, writer.get_stage().result))
def test_identity(self):
    """Identity passes every input through to its paired output key,
    whichever way the key mapping is supplied to the constructor."""
    # (input keys, output keys, ctor argument, pass arg as input spec?)
    trials = [
        (('input0', 'input1'), ('output0', 'output1'),
         {'input0': 'output0', 'input1': 'output1'}, True),
        (('input0', 'input1', 'input2'),
         ('input0_out', 'input1_out', 'input2_out'),
         ('input0', 'input1', 'input2'), True),
        (('input0', 'input1'), ('output0', 'output1'),
         {'output0': 'input0', 'output1': 'input1'}, False),
        (('output0_in', 'output1_in', 'output2_in'),
         ('output0', 'output1', 'output2'),
         ('output0', 'output1', 'output2'), False)]
    for input_keys, output_keys, arg, specify_input in trials:
        sources = []
        sinks = []
        p = Pipeline()
        if specify_input:
            identity_node = p.add(Identity(arg))
        else:
            identity_node = p.add(Identity(output_keys=arg))
        for in_key, out_key in zip(input_keys, output_keys):
            # Random structured array in, writer node out, per key pair.
            data = np_nd_to_sa(np.random.random((100, 10)))
            read_node = p.add(NumpyRead(data))
            read_node['output'] > identity_node[in_key]
            write_node = p.add(NumpyWrite())
            identity_node[out_key] > write_node['input']
            sources.append(data)
            sinks.append(write_node)
        self.run_pipeline(p)
        for data, write_node in zip(sources, sinks):
            self.assertTrue(
                np.array_equal(data, write_node.get_stage().result))
def test_kfold(self):
    """KFold's train/test outputs match scikit-learn's KFold splits
    applied directly to the same arrays."""
    n_folds = 3
    n_rows = 6
    X = np.random.randint(0, 1000, (n_rows, 3))
    y = np.random.randint(0, 1000, (n_rows, 1))

    p = Pipeline()
    read_X = p.add(NumpyRead(X))
    read_y = p.add(NumpyRead(y))
    kfold_stage = p.add(KFold(2, n_folds, random_state=0))
    read_X['output'] > kfold_stage['input0']
    read_y['output'] > kfold_stage['input1']

    # Mirror the stage with scikit-learn's KFold (same random_state) and
    # attach one CSV writer per (split, array, fold) output key.
    ctrl_kf = SKKFold(n_rows, n_folds=n_folds, random_state=0)
    out_files = []
    expected_folds = []
    data = (X, y)
    for fold_index, splits in enumerate(ctrl_kf):
        for data_index, array in enumerate(data):
            for split_index, split_name in enumerate(('train', 'test')):
                out_key = '{}{}_{}'.format(split_name, data_index,
                                           fold_index)
                out_file = out_key + '.csv'
                out_files.append(out_file)
                writer = p.add(CSVWrite(self._tmp_files(out_file)))
                kfold_stage[out_key] > writer['input']
                expected_folds.append(
                    np_nd_to_sa(array[splits[split_index]]))
    self.run_pipeline(p)
    for out_file, expected in zip(out_files, expected_folds):
        self.assertTrue(np.array_equal(
            self._tmp_files.csv_read(out_file), expected))
def test_kfold(self):
    """Every train/test output of the KFold stage equals the fold that
    scikit-learn's KFold produces for the same data and random_state."""
    num_folds = 3
    num_rows = 6
    X = np.random.randint(0, 1000, (num_rows, 3))
    y = np.random.randint(0, 1000, (num_rows, 1))

    p = Pipeline()
    source_X = p.add(NumpyRead(X))
    source_y = p.add(NumpyRead(y))
    fold_stage = p.add(KFold(2, num_folds, random_state=0))
    source_X['output'] > fold_stage['input0']
    source_y['output'] > fold_stage['input1']

    # Control folds straight from scikit-learn; hook a CSV writer onto
    # each of the stage's (train/test, array, fold) output keys.
    reference_folds = SKKFold(num_rows, n_folds=num_folds, random_state=0)
    csv_names = []
    controls = []
    inputs = (X, y)
    for fold_no, index_pair in enumerate(reference_folds):
        for input_no, arr in enumerate(inputs):
            for part_no, part in enumerate(('train', 'test')):
                key = '{}{}_{}'.format(part, input_no, fold_no)
                csv_name = key + '.csv'
                csv_names.append(csv_name)
                sink = p.add(CSVWrite(self._tmp_files(csv_name)))
                fold_stage[key] > sink['input']
                controls.append(np_nd_to_sa(arr[index_pair[part_no]]))
    self.run_pipeline(p)
    for csv_name, control in zip(csv_names, controls):
        self.assertTrue(np.array_equal(
            self._tmp_files.csv_read(csv_name), control))
def test_lambda(self):
    """LambdaStage infers input keys from the wrapped function's
    signature, generates output keys, and routes results in a Pipeline."""
    # Test output key generation
    l1 = LambdaStage(lambda x, y: 0)
    self.assertEqual(l1.input_keys, ['x', 'y'])
    self.assertEqual(l1.output_keys, ['output0'])
    l2 = LambdaStage(lambda: 0, n_outputs=3)
    self.assertEqual(l2.input_keys, [])
    # range (not Python-2-only xrange): identical when consumed by a
    # list comprehension, and portable to Python 3.
    self.assertEqual(l2.output_keys,
                     ['output{}'.format(i) for i in range(3)])

    # Test running in pipeline
    in_data = np_nd_to_sa(np.random.random((100, 10)))
    scale = np_nd_to_sa(np.array(3))
    out_keys = ['augmented', 'log_col', 'sqrt_col', 'scale_col']

    def log1_sqrt2_scale3(A, scale):
        # One return value per entry in out_keys: the augmented array,
        # then each derived column on its own.
        names = A.dtype.names
        log_col = np.log(A[names[0]])
        sqrt_col = np.sqrt(A[names[1]])
        scale_col = A[names[2]] * scale[0][0]
        return (append_fields(A, ['log1', 'sqrt2', 'scale3'],
                              (log_col, sqrt_col, scale_col)),
                log_col, sqrt_col, scale_col)

    p = Pipeline()
    np_in = p.add(NumpyRead(in_data))
    scale_in = p.add(NumpyRead(scale))
    lambda_stage = p.add(LambdaStage(log1_sqrt2_scale3, out_keys))
    np_in['output'] > lambda_stage['A']
    scale_in['output'] > lambda_stage['scale']
    # One CSV writer per lambda output; the stage list the original code
    # accumulated was never read, so it is dropped.
    for key in out_keys:
        stage = p.add(CSVWrite(self._tmp_files('out_{}.csv'.format(key))))
        lambda_stage[key] > stage['input']
    self.run_pipeline(p)

    # Pipeline outputs must equal the function applied directly.
    controls = log1_sqrt2_scale3(in_data, scale)
    for control, key in zip(controls, out_keys):
        if is_sa(control):
            control = np_sa_to_nd(control)[0]
        result = self._tmp_files.csv_read('out_{}.csv'.format(key),
                                          as_nd=True)
        self.assertTrue(np.allclose(control, result))