def test_3_stage(self):
    """Run a 3-stage pipeline (CSV read -> impute -> CSV write) and verify
    the written output matches sklearn's Imputer applied directly to the
    same data.

    Fix: removed the unused local ``num_type`` (computed from the first
    cell's dtype but never read) and the unused second unpack target of
    ``np_sa_to_nd``.
    """
    from sklearn.preprocessing import Imputer

    infile_name = path_of_data('missing_vals.csv')

    # Wire up the pipeline: read csv -> impute missing values -> write csv.
    p = Pipeline()
    csv_read_node = p.add(CSVRead(infile_name))
    csv_write_node = p.add(CSVWrite(self._tmp_files.get('out.csv')))
    impute_node = p.add(wrap_and_make_instance(Imputer))

    csv_read_node['output'] > impute_node['X_train']
    impute_node['X_new'] > csv_write_node['input']

    self.run_pipeline(p)

    # Control: run sklearn's Imputer directly on the same input.
    ctrl_imputer = Imputer()
    ctrl_X_sa = np.genfromtxt(infile_name, dtype=None, delimiter=",",
                              names=True)
    ctrl_X_nd, _ = np_sa_to_nd(ctrl_X_sa)
    control = ctrl_imputer.fit_transform(ctrl_X_nd)

    result = self._tmp_files.csv_read('out.csv', True)
    self.assertTrue(np.allclose(result, control))
def __process_in_data(self, in_data):
    """Normalize ``in_data`` into an ``(X, y)`` pair.

    Accepts ``None`` (random data is generated), a csv filename, or an
    already-built ``(X, y)`` tuple.
    """
    if in_data is None:
        # Nothing supplied: fabricate a random binary-classification set.
        X = np.random.random((100, 10))
        y = np.random.randint(0, 2, 100)
        return (X, y)
    if isinstance(in_data, str) and in_data.rsplit('.', 1)[-1] == 'csv':
        # A csv filename: load it; the final column is the label vector.
        table = np_sa_to_nd(csv_read(path_of_data(in_data)))[0]
        return (table[:, :-1], table[:, -1])
    # Otherwise assume in_data is already an (X, y) tuple.
    return (in_data[0], in_data[1])
def __process_in_data(self, in_data):
    """Coerce ``in_data`` into a ``(features, labels)`` pair.

    ``None`` yields random data; a ``"*.csv"`` string is loaded from the
    data directory; anything else is assumed to already be ``(X, y)``.
    """
    if in_data is None:
        # Default: random 100x10 features with random 0/1 labels.
        return (np.random.random((100, 10)),
                np.random.randint(0, 2, 100))
    looks_like_csv = (isinstance(in_data, str) and
                      in_data.split(".")[-1] == "csv")
    if looks_like_csv:
        # Load the csv; split off the last column as the labels.
        arr = np_sa_to_nd(csv_read(path_of_data(in_data)))[0]
        X, y = arr[:, :-1], arr[:, -1]
        return (X, y)
    # assume in_data is a tuple (X, y)
    return (in_data[0], in_data[1])
def test_lambda(self):
    """LambdaStage: key inference from the callable, and an end-to-end
    pipeline run whose outputs are compared against calling the function
    directly."""
    # Test output key generation
    l1 = LambdaStage(lambda x, y: 0)
    self.assertEqual(l1.input_keys, ['x', 'y'])
    self.assertEqual(l1.output_keys, ['output0', ])
    l2 = LambdaStage(lambda: 0, n_outputs=3)
    self.assertEqual(l2.input_keys, [])
    self.assertEqual(l2.output_keys,
                     ['output{}'.format(i) for i in xrange(3)])

    # Test running in pipeline
    in_data = np_nd_to_sa(np.random.random((100, 10)))
    scale = np_nd_to_sa(np.array(3))
    out_keys = ['augmented', 'log_col', 'sqrt_col', 'scale_col']

    def log1_sqrt2_scale3(A, scale):
        # Derive three columns from A and also return A with them appended.
        names = A.dtype.names
        log_col = np.log(A[names[0]])
        sqrt_col = np.sqrt(A[names[1]])
        scale_col = A[names[2]] * scale[0][0]
        augmented = append_fields(
            A, ['log1', 'sqrt2', 'scale3'],
            (log_col, sqrt_col, scale_col))
        return (augmented, log_col, sqrt_col, scale_col)

    p = Pipeline()
    np_in = p.add(NumpyRead(in_data))
    scale_in = p.add(NumpyRead(scale))
    lambda_stage = p.add(LambdaStage(log1_sqrt2_scale3, out_keys))
    np_in['output'] > lambda_stage['A']
    scale_in['output'] > lambda_stage['scale']

    # One CSVWrite sink per lambda output key.
    csv_out_stages = []
    for key in out_keys:
        stage = p.add(CSVWrite(self._tmp_files('out_{}.csv'.format(key))))
        csv_out_stages.append(stage)
        lambda_stage[key] > stage['input']

    self.run_pipeline(p)

    # Each written file should match the function applied directly.
    controls = log1_sqrt2_scale3(in_data, scale)
    for key, control in zip(out_keys, controls):
        if is_sa(control):
            control = np_sa_to_nd(control)[0]
        result = self._tmp_files.csv_read(
            'out_{}.csv'.format(key), as_nd=True)
        self.assertTrue(np.allclose(control, result))
def test_numpy_write(self):
    """NumpyWrite should capture exactly the array that was fed into it."""
    in_data = np.random.rand(10, 10)

    p = Pipeline()
    source = p.add(NumpyRead(in_data))
    sink = p.add(NumpyWrite())
    source['output'] > sink['input']
    self.run_pipeline(p)

    # The sink stores a structured array; convert back before comparing.
    written = np_sa_to_nd(sink.get_stage().result)[0]
    self.assertTrue(np.allclose(in_data, written))
def test_numpy_write(self):
    """Round-trip a random array through NumpyRead -> NumpyWrite and check
    the stored result equals the input."""
    in_data = np.random.rand(10, 10)

    pipeline = Pipeline()
    reader = pipeline.add(NumpyRead(in_data))
    writer = pipeline.add(NumpyWrite())
    reader['output'] > writer['input']
    self.run_pipeline(pipeline)

    # Stored result is structured; unwrap to a plain ndarray to compare.
    stored = writer.get_stage().result
    self.assertTrue(np.allclose(in_data, np_sa_to_nd(stored)[0]))
def test_lambda(self):
    """Verify LambdaStage key generation and a full pipeline run against
    a direct invocation of the wrapped function."""
    # Test output key generation
    stage_two_in = LambdaStage(lambda x, y: 0)
    self.assertEqual(stage_two_in.input_keys, ['x', 'y'])
    self.assertEqual(stage_two_in.output_keys, ['output0', ])
    stage_three_out = LambdaStage(lambda: 0, n_outputs=3)
    self.assertEqual(stage_three_out.input_keys, [])
    expected_keys = ['output{}'.format(i) for i in xrange(3)]
    self.assertEqual(stage_three_out.output_keys, expected_keys)

    # Test running in pipeline
    in_data = np_nd_to_sa(np.random.random((100, 10)))
    scale = np_nd_to_sa(np.array(3))
    out_keys = ['augmented', 'log_col', 'sqrt_col', 'scale_col']

    def log1_sqrt2_scale3(A, scale):
        # Build log/sqrt/scaled columns and an augmented copy of A.
        names = A.dtype.names
        log_col = np.log(A[names[0]])
        sqrt_col = np.sqrt(A[names[1]])
        scale_col = A[names[2]] * scale[0][0]
        return (append_fields(A, ['log1', 'sqrt2', 'scale3'],
                              (log_col, sqrt_col, scale_col)),
                log_col, sqrt_col, scale_col)

    p = Pipeline()
    np_in = p.add(NumpyRead(in_data))
    scale_in = p.add(NumpyRead(scale))
    lambda_stage = p.add(LambdaStage(log1_sqrt2_scale3, out_keys))
    np_in['output'] > lambda_stage['A']
    scale_in['output'] > lambda_stage['scale']

    # Attach a CSVWrite sink to every output of the lambda stage.
    csv_out_stages = []
    for key in out_keys:
        out_file = self._tmp_files('out_{}.csv'.format(key))
        stage = p.add(CSVWrite(out_file))
        csv_out_stages.append(stage)
        lambda_stage[key] > stage['input']

    self.run_pipeline(p)

    controls = log1_sqrt2_scale3(in_data, scale)
    for i, key in enumerate(out_keys):
        control = controls[i]
        if is_sa(control):
            control = np_sa_to_nd(control)[0]
        result = self._tmp_files.csv_read('out_{}.csv'.format(key),
                                          as_nd=True)
        self.assertTrue(np.allclose(control, result))
def csv_read(filename, as_nd=False, dtype=None):
    """Read a csv file (with a header row) into a numpy array.

    Parameters
    ----------
    filename : str
        Path of the csv file to read.
    as_nd : bool
        If True, convert the structured array to a plain ndarray via
        np_sa_to_nd before returning.
    dtype : dtype or None
        Passed through to np.genfromtxt; None lets genfromtxt infer
        per-column types.

    Returns
    -------
    numpy structured array, or plain ndarray when ``as_nd`` is True.
    """
    structured = np.genfromtxt(filename, dtype=dtype, delimiter=",",
                               names=True)
    return np_sa_to_nd(structured)[0] if as_nd else structured