def test_query_complex(self):
    p = Pipeline()

    csv_in = p.add(CSVRead(path_of_data('query.csv')))
    q1_node = p.add(Query("((id == value) and not (use_this_col == 'no'))"
                          "or name == 'fish'"))
    csv_out = p.add(CSVWrite(self._tmp_files('out.csv')))
    csv_comp = p.add(CSVWrite(self._tmp_files('out_comp.csv')))

    # Query emits the matching rows on 'output' and the remaining rows on
    # 'complement'; both are checked against their control files below.
    csv_in['output'] > q1_node['input']
    q1_node['output'] > csv_out['input']
    q1_node['complement'] > csv_comp['input']

    self.run_pipeline(p)

    result = self._tmp_files.csv_read('out.csv')
    ctrl = csv_read(path_of_data('query_ctrl.csv'))
    self.assertTrue(np.array_equal(result, ctrl))

    result = self._tmp_files.csv_read('out_comp.csv')
    ctrl = csv_read(path_of_data('query_ctrl_comp.csv'))
    self.assertTrue(np.array_equal(result, ctrl))

def test_sql(self):
    # Make sure we don't accidentally corrupt our test database
    db_path, db_file_name = self._tmp_files.tmp_copy(
        path_of_data('small.db'))
    db_url = 'sqlite:///{}'.format(db_path)

    q_sel_employees = 'CREATE TABLE {tmp_emp} AS SELECT * FROM employees;'
    # We have to be careful about the datetime type in sqlite3. It will
    # forget if we don't keep reminding it, and if it forgets sqlalchemy
    # will be unhappy. Hence, we can't use CREATE TABLE AS if our table
    # has a DATETIME
    q_sel_hours = ('CREATE TABLE {tmp_hrs} '
                   '(id INT, employee_id INT, time DATETIME, '
                   ' event_type TEXT); '
                   'INSERT INTO {tmp_hrs} SELECT * FROM hours;')
    q_join = ('CREATE TABLE {joined} '
              '(id INT, last_name TEXT, salary REAL, time DATETIME, '
              ' event_type TEXT); '
              'INSERT INTO {joined} '
              'SELECT {tmp_emp}.id, last_name, salary, time, event_type '
              'FROM {tmp_emp} JOIN {tmp_hrs} ON '
              '{tmp_emp}.id = {tmp_hrs}.employee_id;')

    p = Pipeline()

    get_emp = p.add(RunSQL(db_url, q_sel_employees, [], ['tmp_emp'], {}))
    get_hrs = p.add(RunSQL(db_url, q_sel_hours, [], ['tmp_hrs'], {}))
    join = p.add(RunSQL(db_url, q_join, ['tmp_emp', 'tmp_hrs'], ['joined'],
                        {}))
    csv_out = p.add(CSVWrite(self._tmp_files('out.csv')))

    get_emp['tmp_emp'] > join['tmp_emp']
    get_hrs['tmp_hrs'] > join['tmp_hrs']
    join['joined'] > csv_out['input']

    self.run_pipeline(p)

    ctrl = csv_read(path_of_data('test_transform_test_sql_ctrl.csv'))
    result = self._tmp_files.csv_read('out.csv')

    # Because Numpy insists on printing times with local offsets, but
    # not every computer has the same offset, we have to force it back
    # into UTC
    for i, dt in enumerate(result['time']):
        # .item() makes a datetime, which we can format correctly later
        # http://stackoverflow.com/questions/25134639/how-to-force-python-print-numpy-datetime64-with-specified-timezone
        result['time'][i] = np.datetime64(dt).item().strftime(
            '%Y-%m-%dT%H:%M:%S')

    # Then we have to make the string field smaller
    new_cols = []
    for col in result.dtype.names:
        new_cols.append(result[col].astype(ctrl.dtype[col]))
    result = merge_arrays(new_cols, flatten=True)
    result.dtype.names = ctrl.dtype.names

    self.assertTrue(np.array_equal(result, ctrl))

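# Illustrative sketch (not part of the original tests): test_sql above
# normalizes numpy datetime64 values so they print without a local UTC
# offset. On a second-resolution datetime64, .item() yields a naive
# datetime.datetime, which strftime then formats with no offset. The
# timestamp below is made up.
import numpy as np

dt64 = np.datetime64('2006-06-15T09:30:00')
assert dt64.item().strftime('%Y-%m-%dT%H:%M:%S') == '2006-06-15T09:30:00'
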
def test_sql(self):
    # Make sure we don't accidentally corrupt our test database
    db_path, db_file_name = self._tmp_files.tmp_copy(
        path_of_data('small.db'))
    db_url = 'sqlite:///{}'.format(db_path)

    for tbl_name in ('employees', 'hours'):
        uo_in = UObject(UObjectPhase.Write)
        uo_in.from_sql(db_url, {}, tbl_name, False)
        uo_in.write_to_read_phase()
        sa = uo_in.to_np()

        uo_out = UObject(UObjectPhase.Write)
        uo_out.from_np(sa)
        uo_out.write_to_read_phase()
        tbl_result, conn_result, db_url, conn_params = uo_out.to_sql(
            db_url, {}, '{}_out'.format(tbl_name))
        result = conn_result.execute(
            sqlalchemy.sql.select([tbl_result])).fetchall()
        tbl_result.drop(conn_result)

        ctrl_engine = sqlalchemy.create_engine(db_url)
        md = sqlalchemy.MetaData()
        md.reflect(ctrl_engine)
        tbl_ctrl = md.tables[tbl_name]
        ctrl = ctrl_engine.execute(
            sqlalchemy.sql.select([tbl_ctrl])).fetchall()

        self.assertEqual(result, ctrl)

def test_3_stage(self):
    from sklearn.preprocessing import Imputer

    infile_name = path_of_data('missing_vals.csv')

    p = Pipeline()

    csv_read_node = p.add(CSVRead(infile_name))
    csv_write_node = p.add(CSVWrite(self._tmp_files.get('out.csv')))
    impute_node = p.add(wrap_and_make_instance(Imputer))

    csv_read_node['output'] > impute_node['X_train']
    impute_node['X_new'] > csv_write_node['input']

    self.run_pipeline(p)

    ctrl_imputer = Imputer()
    ctrl_X_sa = np.genfromtxt(infile_name, dtype=None, delimiter=",",
                              names=True)
    num_type = ctrl_X_sa[0][0].dtype
    ctrl_X_nd, ctrl_X_sa_type = np_sa_to_nd(ctrl_X_sa)
    ctrl_X_new_nd = ctrl_imputer.fit_transform(ctrl_X_nd)
    control = ctrl_X_new_nd

    result = self._tmp_files.csv_read('out.csv', True)

    self.assertTrue(np.allclose(result, control))

def test_label_encode(self):
    p = Pipeline()

    csv_in = p.add(CSVRead(path_of_data('categories.csv')))
    le = p.add(LabelEncode())
    csv_out = p.add(CSVWrite(self._tmp_files('out.csv')))

    csv_in['output'] > le['input']
    le['output'] > csv_out['input']

    self.run_pipeline(p)

    result = self._tmp_files.csv_read('out.csv')
    ctrl = csv_read(
        path_of_data('test_transform_test_label_encode_ctrl.csv'))

    self.assertTrue(np.array_equal(result, ctrl))

def test_fill_na(self):
    p = Pipeline()

    csv_in = p.add(CSVRead(path_of_data('missing_vals_mixed.csv')))
    fill_na = p.add(FillNA(-1))
    csv_out = p.add(CSVWrite(self._tmp_files('out.csv')))

    csv_in['output'] > fill_na['input']
    fill_na['output'] > csv_out['input']

    self.run_pipeline(p)

    result = self._tmp_files.csv_read('out.csv')
    ctrl = csv_read(path_of_data('test_transform_test_fill_na_ctrl.csv'))

    self.assertTrue(np.array_equal(result, ctrl))

def test_pickle(self):
    # TODO this just makes sure the object can be pickled. It doesn't
    # verify that the unpickled object is correct
    uo = UObject(UObjectPhase.Write)
    np_array = np.array([[0]])
    uo.from_np(np_array)

    self.__pickle('upsg.export.csv.CSVWrite', path_of_data('_out.csv'))
    self.__pickle('upsg.fetch.csv.CSVRead', path_of_data('mixed_csv.csv'))
    self.__pickle('upsg.fetch.np.NumpyRead', np.array([[0]]))
    self.__pickle('upsg.transform.split.SplitTrainTest')
    self.__pickle('upsg.transform.split.SplitY', 0)
    self.__pickle('upsg.transform.rename_cols.RenameCols',
                  {'name': 'rename'})
    self.__pickle(wrap('sklearn.preprocessing.Imputer'), strategy='mean',
                  missing_values='NaN')
    self.__pickle(wrap('sklearn.svm.SVC'), gamma=0.1)
    self.__pickle(wrap('sklearn.metrics.roc_curve'))

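# Illustrative sketch (not part of the original tests): the TODO in
# test_pickle notes that the test never checks the unpickled object. One
# possible shape for such a check, shown here with a plain ndarray rather
# than a real stage, is a round-trip equality assertion.
import pickle

import numpy as np

original = np.array([[0]])
restored = pickle.loads(pickle.dumps(original))
assert np.array_equal(original, restored)
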
def __process_in_data(self, in_data):
    if in_data is None:
        return (np.random.random((100, 10)),
                np.random.randint(0, 2, 100))
    elif isinstance(in_data, str) and in_data.split(".")[-1] == "csv":
        a = np_sa_to_nd(csv_read(path_of_data(in_data)))[0]
        return (a[:, :-1], a[:, -1])
    # assume in_data is a tuple (X, y)
    return (in_data[0], in_data[1])

def test_csv_load_store(self):
    filename = path_of_data('mixed_csv.csv')

    uo = UObject(UObjectPhase.Write)
    uo.from_csv(filename)
    uo.write_to_read_phase()

    result = uo.to_np()
    control = np.genfromtxt(filename, dtype=None, delimiter=",",
                            names=True)

    self.assertTrue(np.array_equal(result, control))

def test_split_columns(self):
    p = Pipeline()

    csv_in = p.add(CSVRead(path_of_data('numbers.csv')))
    split = p.add(SplitColumns(('F1', 'F3')))
    csv_out_sel = p.add(CSVWrite(self._tmp_files('out_sel.csv')))
    csv_out_rest = p.add(CSVWrite(self._tmp_files('out_rest.csv')))

    csv_in['output'] > split['input']
    split['output'] > csv_out_sel['input']
    split['complement'] > csv_out_rest['input']

    self.run_pipeline(p)

    result = self._tmp_files.csv_read('out_sel.csv')
    ctrl = csv_read(path_of_data('test_split_columns_ctrl_selected.csv'))
    self.assertTrue(np.array_equal(result, ctrl))

    result = self._tmp_files.csv_read('out_rest.csv')
    ctrl = csv_read(path_of_data('test_split_columns_ctrl_rest.csv'))
    self.assertTrue(np.array_equal(result, ctrl))

def test_toaster(self):
    dt = DataToaster()
    # Read in a csv
    dt.from_csv(path_of_data('test_toaster.csv'))
    # Training is data before 2006-06-15; testing is after. The column
    # giving us classification is 'cat'
    dt.split_by_query('cat', "date < DT('2006-06-15')")
    # Select features (manually, in this case)
    dt.transform_select_cols(('factor_1', 'factor_2'))
    # Do some last-minute cleanup
    dt.transform_with_sklearn('sklearn.preprocessing.StandardScaler')
    # Try a bunch of classifiers and parameters
    dt.classify_and_report(report_file_name=self._tmp_files('report.html'))
    dt.run()

    self.assertTrue(os.path.isfile(self._tmp_files('report.html')))

def test_rw(self):
    infile_name = path_of_data('mixed_csv.csv')

    p = Pipeline()

    csv_read_node = p.add(CSVRead(infile_name))
    csv_write_node = p.add(CSVWrite(self._tmp_files.get('out.csv')))

    csv_read_node['output'] > csv_write_node['input']

    self.run_pipeline(p)

    control = np.genfromtxt(infile_name, dtype=None, delimiter=",",
                            names=True)
    result = self._tmp_files.csv_read('out.csv')

    self.assertTrue(np.array_equal(result, control))

def test_rename_cols(self):
    infile_name = path_of_data('mixed_csv.csv')
    rename_dict = {'name': 'designation', 'height': 'tallness'}

    p = Pipeline()

    csv_read_node = p.add(CSVRead(infile_name))
    trans_node = p.add(RenameCols(rename_dict))
    csv_write_node = p.add(CSVWrite(self._tmp_files('out.csv')))

    csv_read_node['output'] > trans_node['input']
    trans_node['output'] > csv_write_node['input']

    self.run_pipeline(p)

    # Compare column-name sets directly; np.array_equal is meant for
    # arrays, not sets.
    control = {'id', 'designation', 'tallness'}
    result = set(self._tmp_files.csv_read('out.csv').dtype.names)

    self.assertEqual(result, control)

def test_timify(self):
    in_file = path_of_data('with_dates.csv')

    p = Pipeline()

    csv_in = p.add(CSVRead(in_file))
    timify = p.add(Timify())
    csv_in['output'] > timify['input']
    np_out = p.add(NumpyWrite())
    timify['output'] > np_out['input']

    self.run_pipeline(p)

    result = np_out.get_stage().result

    ctrl_raw = csv_read(in_file)
    ctrl_dtype = np.dtype([(name, '<M8[D]') if 'dt' in name else
                           (name, fmt) for name, fmt in
                           ctrl_raw.dtype.descr])
    ctrl_better = csv_read(in_file, dtype=ctrl_dtype)

    self.assertEqual(result.dtype, ctrl_better.dtype)
    self.assertTrue(np.array_equal(result, ctrl_better))
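
# Illustrative sketch (not part of the original tests): test_timify rebuilds
# the control dtype so that every field whose name contains 'dt' becomes a
# day-resolution datetime64 while the other fields keep their original format
# strings. The field names below are made up.
import numpy as np

raw = np.dtype([('id', '<i8'), ('dt_start', 'S10')])
better = np.dtype([(name, '<M8[D]') if 'dt' in name else (name, fmt)
                   for name, fmt in raw.descr])
assert better == np.dtype([('id', '<i8'), ('dt_start', '<M8[D]')])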