class TestGenerator(TestCase): create_statement = "create table test_table_float_fea " \ "(features float, label int)" drop_statement = "drop table if exists test_table_float_fea" insert_statement = "insert into test_table_float_fea (features,label)" \ " values(1.0, 0), (2.0, 1)" @unittest.skipUnless(testing.get_driver() == "mysql", "skip non mysql tests") def test_generator(self): driver = testing.get_driver() user, password, host, port, database, _ = parseMySQLDSN( testing.get_mysql_dsn()) conn = connect(driver, database, user=user, password=password, host=host, port=int(port)) # prepare test data execute(driver, conn, self.drop_statement) execute(driver, conn, self.create_statement) execute(driver, conn, self.insert_statement) column_name_to_type = { "features": { "feature_name": "features", "delimiter": "", "dtype": "float32", "is_sparse": False, "shape": [] } } label_meta = {"feature_name": "label", "shape": [], "delimiter": ""} gen = db_generator(conn, "SELECT * FROM test_table_float_fea", label_meta) idx = 0 for row, label in gen(): features = read_features_from_row(row, ["features"], ["features"], column_name_to_type) d = (features, label) if idx == 0: self.assertEqual(d, (((1.0, ), ), 0)) elif idx == 1: self.assertEqual(d, (((2.0, ), ), 1)) idx += 1 self.assertEqual(idx, 2) @unittest.skipUnless(testing.get_driver() == "mysql", "skip non mysql tests") def test_generate_fetch_size(self): label_meta = {"feature_name": "label", "shape": [], "delimiter": ""} gen = db_generator(testing.get_singleton_db_connection(), 'SELECT * FROM iris.train limit 10', label_meta, fetch_size=4) self.assertEqual(len([g for g in gen()]), 10)
class TestXGBoostTrain(unittest.TestCase): @unittest.skipUnless(testing.get_driver() == "mysql", "skip non mysql tests") def test_train(self): ds = testing.get_datasource() original_sql = """SELECT * FROM iris.train TO TRAIN xgboost.gbtree WITH objective="multi:softmax", num_boost_round=20, num_class=3, validation.select="SELECT * FROM iris.test" INTO iris.xgboost_train_model_test; """ select = "SELECT * FROM iris.train" val_select = "SELECT * FROM iris.test" train_params = { "num_boost_round": 20, } model_params = {"num_class": 3, "objective": "multi:softmax"} eval_result = train(ds, original_sql, select, val_select, "xgboost.gbtree", "", None, NumericColumn(FieldDesc(name="class")), model_params, train_params, None, "iris.xgboost_train_model_test", None) self.assertLess(eval_result['train']['merror'][-1], 0.01) self.assertLess(eval_result['validate']['merror'][-1], 0.01)
class TestXGBoostTrain(TestCase): @unittest.skipUnless(testing.get_driver() == "mysql", "skip non mysql tests") def test_train(self): ds = testing.get_datasource() select = "SELECT * FROM iris.train" val_select = "SELECT * FROM iris.test" feature_column_names = [ feature_metas[k]["feature_name"] for k in feature_metas ] is_pai = False pai_train_table = "" train_params = {"num_boost_round": 20} model_params = {"num_classes": 3} with tempfile.TemporaryDirectory() as tmp_dir_name: train_fn = os.path.join(tmp_dir_name, 'train.txt') val_fn = os.path.join(tmp_dir_name, 'val.txt') dtrain = xgb_dataset(ds, train_fn, select, feature_metas, feature_column_names, label_meta, is_pai, pai_train_table) dval = xgb_dataset(ds, val_fn, val_select, feature_metas, feature_column_names, label_meta, is_pai, pai_train_table) eval_result = train(dtrain, train_params, model_params, dval) self.assertLess(eval_result['train']['rmse'][-1], 0.01) self.assertLess(eval_result['validate']['rmse'][-1], 0.01)
class TestFetchVerifyColumnNameAndType(unittest.TestCase): def generate_select(self, table, columns): return "SELECT %s FROM %s" % (",".join(columns), table) @unittest.skipUnless(testing.get_driver() in ["mysql", "hive"], "skip non mysql/hive tests") def test_verify_column_name_and_type(self): conn = testing.get_singleton_db_connection() train_table = "iris.train" test_table = "iris.test" train_select = [ "petal_length", "petal_width", "sepal_length", "sepal_width", "class" ] test_select = train_select verify_column_name_and_type( conn, self.generate_select(train_table, train_select), self.generate_select(test_table, test_select), "class") test_select = [ "petal_length", "petal_width", "sepal_length", "sepal_width" ] verify_column_name_and_type( conn, self.generate_select(train_table, train_select), self.generate_select(test_table, test_select), "class") test_select = ["petal_length", "petal_width", "sepal_length"] with self.assertRaises(ValueError): verify_column_name_and_type( conn, self.generate_select(train_table, train_select), self.generate_select(test_table, test_select), "class") cursor = conn.cursor() name_and_type = dict(db.get_table_schema(conn, test_table)) new_table_name = "iris.verifier_test_table" name_and_type["petal_length"] = "VARCHAR(255)" # change the data type create_column_str = ",".join( ["%s %s" % (n, t) for n, t in name_and_type.items()]) drop_sql = "DROP TABLE IF EXISTS %s" % new_table_name create_sql = "CREATE TABLE %s(%s)" % (new_table_name, create_column_str) cursor.execute(drop_sql) cursor.execute(create_sql) with self.assertRaises(ValueError): test_select = train_select verify_column_name_and_type( conn, self.generate_select(train_table, train_select), self.generate_select(new_table_name, test_select), "class") cursor.execute(drop_sql) cursor.close()
class TestXGBoostTrain(unittest.TestCase): @unittest.skipUnless(testing.get_driver() == "mysql", "skip non mysql tests") def test_train(self): ds = testing.get_datasource() original_sql = """SELECT * FROM iris.train TO TRAIN xgboost.gbtree WITH objective="multi:softmax", num_boost_round=20, num_class=3, validation.select="SELECT * FROM iris.test" INTO iris.xgboost_train_model_test; """ select = "SELECT * FROM iris.train" val_select = "SELECT * FROM iris.test" train_params = { "num_boost_round": 20, } model_params = {"num_class": 3, "objective": "multi:softmax"} with temp_file.TemporaryDirectory(as_cwd=True): eval_result = train(ds, original_sql, select, val_select, "xgboost.gbtree", "", None, NumericColumn(FieldDesc(name="class")), model_params, train_params, None, "iris.xgboost_train_model_test", None) self.assertLess(eval_result['train']['merror'][-1], 0.01) self.assertLess(eval_result['validate']['merror'][-1], 0.01) with temp_file.TemporaryDirectory(as_cwd=True): pred_original_sql = """SELECT * FROM iris.test TO PREDICT iris.xgboost_pred_result.pred_val USING iris.xgboost_train_model_test;""" pred(ds, pred_original_sql, "SELECT * FROM iris.test", "iris.xgboost_train_model_test", "pred_val", model_params, "iris.xgboost_pred_result") with temp_file.TemporaryDirectory(as_cwd=True): explain_original_sql = """SELECT * FROM iris.test TO EXPLAIN iris.xgboost_train_model_test INTO iris.xgboost_explain_result;""" explain(ds, explain_original_sql, "SELECT * FROM iris.test", "iris.xgboost_train_model_test", model_params, "iris.xgboost_explain_result") with temp_file.TemporaryDirectory(as_cwd=True): evaluate_original_sql = """SELECT * FROM iris.test TO EVALUATE iris.xgboost_train_model_test WITH label_col=class INTO iris.xgboost_evaluate_result;""" evaluate(ds, evaluate_original_sql, "SELECT * FROM iris.test", "class", "iris.xgboost_train_model_test", model_params, "iris.xgboost_evaluate_result")
def test_mysql(self): driver = testing.get_driver() user, password, host, port, database, _ = parseMySQLDSN( testing.get_mysql_dsn()) conn = connect(driver, database, user=user, password=password, host=host, port=port) self._do_test(driver, conn) conn.close() conn = testing.get_singleton_db_connection() self._do_test(driver, conn)
class TestFetchSamples(unittest.TestCase): @unittest.skipUnless(testing.get_driver() in ["mysql", "hive"], "skip non mysql/hive tests") def test_fetch_sample(self): conn = testing.get_singleton_db_connection() select = "SELECT * FROM iris.train" name_and_type = db.selected_columns_and_types(conn, select) expect_field_names = [item[0] for item in name_and_type] expect_field_types = [item[1] for item in name_and_type] column_num = len(name_and_type) gen = fetch_samples(conn, select, n=0) self.assertTrue(gen is None) gen = fetch_samples(conn, select, n=-1) row_num = length(gen()) self.assertTrue(np.array_equal(gen.field_names, expect_field_names)) self.assertTrue(np.array_equal(gen.field_types, expect_field_types)) self.assertGreater(row_num, 25) gen = fetch_samples(conn, select, n=25) n = 0 self.assertTrue(np.array_equal(gen.field_names, expect_field_names)) self.assertTrue(np.array_equal(gen.field_types, expect_field_types)) for rows in gen(): self.assertEqual(len(rows), column_num) n += 1 self.assertEqual(n, 25) gen = fetch_samples(conn, select, n=10) self.assertTrue(np.array_equal(gen.field_names, expect_field_names)) self.assertTrue(np.array_equal(gen.field_types, expect_field_types)) self.assertEqual(length(gen()), 10) gen = fetch_samples(conn, "%s LIMIT 1" % select, n=1000) self.assertTrue(np.array_equal(gen.field_names, expect_field_names)) self.assertTrue(np.array_equal(gen.field_types, expect_field_types)) self.assertEqual(length(gen()), 1) gen = fetch_samples(conn, select, n=row_num * 2) self.assertTrue(np.array_equal(gen.field_names, expect_field_names)) self.assertTrue(np.array_equal(gen.field_types, expect_field_types)) self.assertEqual(length(gen()), row_num)
def test_hive(self): driver = testing.get_driver() user, password, host, port, database, _, _ = parseHiveDSN( testing.get_hive_dsn()) conn = connect(driver, database, user=user, password=password, host=host, port=port) self._do_test(driver, conn, hdfs_namenode_addr="127.0.0.1:8020", hive_location="/sqlflow") conn.close() conn = testing.get_singleton_db_connection() self._do_test(driver, conn) self._do_test_hive_specified_db(driver, conn, hdfs_namenode_addr="127.0.0.1:8020", hive_location="/sqlflow")
def test_generator(self): driver = testing.get_driver() user, password, host, port, database, _ = parseMySQLDSN( testing.get_mysql_dsn()) conn = connect(driver, database, user=user, password=password, host=host, port=int(port)) # prepare test data execute(driver, conn, self.drop_statement) execute(driver, conn, self.create_statement) execute(driver, conn, self.insert_statement) column_name_to_type = { "features": { "feature_name": "features", "delimiter": "", "dtype": "float32", "is_sparse": False, "shape": [] } } label_meta = {"feature_name": "label", "shape": [], "delimiter": ""} gen = db_generator(conn, "SELECT * FROM test_table_float_fea", label_meta) idx = 0 for row, label in gen(): features = read_features_from_row(row, ["features"], ["features"], column_name_to_type) d = (features, label) if idx == 0: self.assertEqual(d, (((1.0, ), ), 0)) elif idx == 1: self.assertEqual(d, (((2.0, ), ), 1)) idx += 1 self.assertEqual(idx, 2)
# # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License import unittest from runtime import testing from runtime.dbapi.pyalisa.pop import Pop @unittest.skipUnless(testing.get_driver() == "alisa", "Skip non-alisa test") class TestPop(unittest.TestCase): def test_signature(self): params = { "name": "由由", "age": "3", "homepage": "http://little4.kg?true" } sign = Pop.signature(params, "POST", "test_secret_key") self.assertEqual("6kvgvUDEHtFdZKj8+HhtAS1ovHY=", sign) def test_percent_encode(self): # "~-_." should not be encoded param = "~-_." encoded = Pop.percent_encode(param) self.assertEqual("~-_.", encoded)
class TestDB(TestCase): create_statement = "create table test_db (features text, label int)" hive_create_statement = 'create table test_db (features string, ' \ 'label int) ROW FORMAT DELIMITED FIELDS ' \ 'TERMINATED BY "\001"' select_statement = "select * from test_db" drop_statement = "drop table if exists test_db" @unittest.skipUnless(testing.get_driver() == "mysql", "skip non mysql tests") def test_mysql(self): driver = testing.get_driver() user, password, host, port, database, _ = parseMySQLDSN( testing.get_mysql_dsn()) conn = connect(driver, database, user=user, password=password, host=host, port=port) self._do_test(driver, conn) conn.close() conn = testing.get_singleton_db_connection() self._do_test(driver, conn) @unittest.skipUnless(testing.get_driver() == "hive", "skip non hive tests") def test_hive(self): driver = testing.get_driver() user, password, host, port, database, _, _ = parseHiveDSN( testing.get_hive_dsn()) conn = connect(driver, database, user=user, password=password, host=host, port=port) self._do_test(driver, conn, hdfs_namenode_addr="127.0.0.1:8020", hive_location="/sqlflow") conn.close() conn = testing.get_singleton_db_connection() self._do_test(driver, conn) self._do_test_hive_specified_db(driver, conn, hdfs_namenode_addr="127.0.0.1:8020", hive_location="/sqlflow") def _do_test_hive_specified_db(self, driver, conn, hdfs_namenode_addr="", hive_location=""): create_db = '''create database if not exists test_db''' create_tbl = '''create table test_db.tbl (features string, label int) ROW FORMAT DELIMITED FIELDS TERMINATED BY "\001"''' drop_tbl = '''drop table if exists test_db.tbl''' select_tbl = '''select * from test_db.tbl''' table_schema = ["label", "features"] values = [(1, '5,6,1,2')] * 10 execute(driver, conn, create_db) execute(driver, conn, drop_tbl) execute(driver, conn, create_tbl) with buffered_db_writer(driver, conn, "test_db.tbl", table_schema, buff_size=10, hdfs_namenode_addr=hdfs_namenode_addr, hive_location=hive_location) as w: for row in values: w.write(row) field_names, data = execute(driver, conn, select_tbl) expect_features = ['5,6,1,2'] * 10 expect_labels = [1] * 10 self.assertEqual(field_names, ['features', 'label']) self.assertEqual(expect_features, data[0]) self.assertEqual(expect_labels, data[1]) def _do_test(self, driver, conn, hdfs_namenode_addr="", hive_location=""): table_name = "test_db" table_schema = ["label", "features"] values = [(1, '5,6,1,2')] * 10 execute(driver, conn, self.drop_statement) if driver == "hive": execute(driver, conn, self.hive_create_statement) else: execute(driver, conn, self.create_statement) with buffered_db_writer(driver, conn, table_name, table_schema, buff_size=10, hdfs_namenode_addr=hdfs_namenode_addr, hive_location=hive_location) as w: for row in values: w.write(row) field_names, data = execute(driver, conn, self.select_statement) expect_features = ['5,6,1,2'] * 10 expect_labels = [1] * 10 self.assertEqual(field_names, ['features', 'label']) self.assertEqual(expect_features, data[0]) self.assertEqual(expect_labels, data[1])
def test_get_table_schema(self): driver = testing.get_driver() conn = testing.get_singleton_db_connection() if driver == "mysql": schema = get_table_schema(conn, "iris.train") expect = ( ('sepal_length', 'FLOAT'), ('sepal_width', 'FLOAT'), ('petal_length', 'FLOAT'), ('petal_width', 'FLOAT'), ('class', 'INT(11)'), ) self.assertTrue(np.array_equal(expect, schema)) schema = selected_columns_and_types( conn, "SELECT sepal_length, petal_width * 2.3 new_petal_width, " "class FROM iris.train") expect = [ ("sepal_length", "FLOAT"), ("new_petal_width", "DOUBLE"), ("class", "INT"), ] self.assertTrue(np.array_equal(expect, schema)) elif driver == "hive": schema = get_table_schema(conn, "iris.train") expect = ( ('sepal_length', 'FLOAT'), ('sepal_width', 'FLOAT'), ('petal_length', 'FLOAT'), ('petal_width', 'FLOAT'), ('class', 'INT'), ) self.assertTrue(np.array_equal(expect, schema)) schema = selected_columns_and_types( conn, "SELECT sepal_length, petal_width * 2.3 AS new_petal_width, " "class FROM iris.train") expect = [ ("sepal_length", "FLOAT"), ("new_petal_width", "FLOAT"), ("class", "INT"), ] self.assertTrue(np.array_equal(expect, schema)) elif driver == "maxcompute": case_db = os.getenv("SQLFLOW_TEST_DB_MAXCOMPUTE_PROJECT") table = "%s.sqlflow_test_iris_train" % case_db schema = get_table_schema(conn, table) expect = [ ('sepal_length', 'DOUBLE'), ('sepal_width', 'DOUBLE'), ('petal_length', 'DOUBLE'), ('petal_width', 'DOUBLE'), ('class', 'BIGINT'), ] self.assertTrue(np.array_equal(expect, schema)) schema = selected_columns_and_types( conn, "SELECT sepal_length, petal_width * 2.3 new_petal_width, " "class FROM %s" % table) expect = [ ("sepal_length", "DOUBLE"), ("new_petal_width", "DOUBLE"), ("class", "BIGINT"), ] self.assertTrue(np.array_equal(expect, schema))
class TestLimitSelect(TestCase): def test_limit_select(self): self.assertEqual("SELECT * FROM t LIMIT 2", limit_select("SELECT * FROM t LIMIT 30", 2)) self.assertEqual("SELECT * FROM t LIMIT 30; \t", limit_select("SELECT * FROM t LIMIT 30; \t", 100)) self.assertEqual("SELECT * FROM t LIMIT 3", limit_select("SELECT * FROM t", 3)) self.assertEqual("SELECT * FROM t \t LIMIT 4; ", limit_select("SELECT * FROM t \t ; ", 4)) @unittest.skipIf(testing.get_driver() == "maxcompute", "skip non mysql tests") class TestQuery(TestCase): def test_query(self): conn = connect_with_data_source(testing.get_datasource()) gen = query(conn, "select * from iris.train limit 1") rows = [row for row in gen()] self.assertEqual(1, len(rows)) query(conn, "drop table if exists A") query(conn, "create table A(a int);") query(conn, "insert into A values(1)") gen = query(conn, "select * from A;") rows = [row for row in gen()] self.assertEqual(1, len(rows))
class TestXGBoostTrain(unittest.TestCase): def get_table_row_count(self, conn, table): ret = list(conn.query("SELECT COUNT(*) FROM %s" % table)) self.assertEqual(len(ret), 1) ret = ret[0] self.assertEqual(len(ret), 1) return ret[0] def get_table_schema(self, conn, table): name_and_types = conn.get_table_schema(table) return dict(name_and_types) @unittest.skipUnless(testing.get_driver() == "mysql", "skip non mysql tests") def test_main(self): ds = testing.get_datasource() original_sql = """SELECT * FROM iris.train TO TRAIN xgboost.gbtree WITH objective="multi:softmax", num_boost_round=20, num_class=3, validation.select="SELECT * FROM iris.test" INTO iris.xgboost_train_model_test; """ select = "SELECT * FROM iris.train" val_select = "SELECT * FROM iris.test" train_params = {"num_boost_round": 20} model_params = {"num_class": 3, "objective": "multi:softmax"} save_name = "iris.xgboost_train_model_test" class_name = "class" with temp_file.TemporaryDirectory(as_cwd=True): eval_result = train(original_sql=original_sql, model_image="sqlflow:step", estimator_string="xgboost.gbtree", datasource=ds, select=select, validation_select=val_select, model_params=model_params, train_params=train_params, feature_column_map=None, label_column=NumericColumn( FieldDesc(name=class_name)), save=save_name) self.assertLess(eval_result['train']['merror'][-1], 0.01) self.assertLess(eval_result['validate']['merror'][-1], 0.01) conn = db.connect_with_data_source(ds) pred_select = "SELECT * FROM iris.test" pred(ds, pred_select, "iris.predict_result_table", class_name, save_name) self.assertEqual( self.get_table_row_count(conn, "iris.test"), self.get_table_row_count(conn, "iris.predict_result_table")) schema1 = self.get_table_schema(conn, "iris.test") schema2 = self.get_table_schema(conn, "iris.predict_result_table") self.assertEqual(len(schema1), len(schema2)) for name in schema1: if name == 'class': self.assertEqual(schema2[name], "BIGINT") continue self.assertTrue(name in schema2) self.assertEqual(schema1[name], schema2[name]) diff_schema = schema2.keys() - schema1.keys() self.assertEqual(len(diff_schema), 0) evaluate(ds, pred_select, "iris.evaluate_result_table", save_name, 'class', ['accuracy_score']) eval_schema = self.get_table_schema(conn, "iris.evaluate_result_table") self.assertEqual(eval_schema.keys(), set(['loss', 'accuracy_score']))
# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License import unittest from unittest import TestCase from runtime import testing from runtime.dbapi.hive import HiveConnection @unittest.skipUnless(testing.get_driver() == "hive", "Skip non-hive test") class TestHiveConnection(TestCase): def test_connecion(self): try: conn = HiveConnection(testing.get_datasource()) conn.close() except: # noqa: E722 self.fail() def test_query(self): conn = HiveConnection(testing.get_datasource()) rs = conn.query("select * from notexist limit 1") self.assertFalse(rs.success()) self.assertTrue("Table not found" in rs.error()) rs = conn.query("select * from train limit 1")
# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License import unittest from unittest import TestCase from runtime import testing from runtime.dbapi.mysql import MySQLConnection @unittest.skipUnless(testing.get_driver() == "mysql", "Skip non-mysql test") class TestMySQLConnection(TestCase): def test_connecion(self): try: conn = MySQLConnection(testing.get_datasource()) conn.close() except: # noqa: E722 self.fail() def test_query(self): conn = MySQLConnection(testing.get_datasource()) rs = conn.query("select * from notexist limit 1") self.assertFalse(rs.success()) rs = conn.query("select * from train limit 1") self.assertTrue(rs.success())
class TestGenerator(TestCase): create_statement = """create table test_table_float_fea (f1 float, f2 int, f3str VARCHAR(255), f4sparse VARCHAR(255), f5dense VARCHAR(255), label int)""" drop_statement = "drop table if exists test_table_float_fea" insert_statement = """insert into test_table_float_fea (f1,f2,f3str,f4sparse,f5dense,label) values(1.0,1,'a','1:1.0 2:2.0','1,2,3',0), (NULL,NULL,NULL,NULL,'1,2,3',1)""" @unittest.skipUnless(testing.get_driver() == "mysql", "skip non mysql tests") def test_generator(self): conn = connect(testing.get_datasource()) # prepare test data conn.execute(self.drop_statement) conn.execute(self.create_statement) conn.execute(self.insert_statement) column_name_to_type = { "f1": { "feature_name": "f1", "delimiter": "", "dtype": "float32", "is_sparse": False, "shape": [] }, "f2": { "feature_name": "f2", "delimiter": "", "dtype": "int64", "is_sparse": False, "shape": [] }, "f3str": { "feature_name": "f3str", "delimiter": "", "dtype": "string", "is_sparse": False, "shape": [] }, "f4sparse": { "feature_name": "f4sparse", "delimiter": "", "dtype": "float32", "is_sparse": True, "shape": [], "format": "kv" }, "f5dense": { "feature_name": "f5dense", "delimiter": ",", "dtype": "int64", "is_sparse": False, "shape": [3] } } label_meta = {"feature_name": "label", "shape": [], "delimiter": ""} gen = db_generator(conn, "SELECT * FROM test_table_float_fea", label_meta) idx = 0 for row, label in gen(): if idx == 0: features = read_features_from_row( row, ["f1", "f2", "f3str", "f4sparse", "f5dense"], ["f1", "f2", "f3str", "f4sparse", "f5dense"], column_name_to_type) self.assertEqual(1.0, features[0][0]) self.assertEqual(1, features[1][0]) self.assertEqual('a', features[2][0]) self.assertTrue( np.array_equal(np.array([[1], [2]]), features[3][0])) self.assertTrue( np.array_equal(np.array([1., 2.], dtype=np.float32), features[3][1])) self.assertTrue( np.array_equal(np.array([1, 2, 3]), features[4][0])) self.assertEqual(0, label) elif idx == 1: try: features = read_features_from_row( row, ["f1", "f2", "f3str", "f4sparse", "f5dense"], ["f1", "f2", "f3str", "f4sparse", "f5dense"], column_name_to_type) except Exception as e: self.assertTrue(isinstance(e, ValueError)) features = read_features_from_row( row, ["f1", "f2", "f3str", "f4sparse", "f5dense"], ["f1", "f2", "f3str", "f4sparse", "f5dense"], column_name_to_type, is_xgboost=True) self.assertEqual(XGBOOST_NULL_MAGIC, features[0][0]) self.assertEqual(int(XGBOOST_NULL_MAGIC), features[1][0]) self.assertEqual("", features[2][0]) self.assertTrue(np.array_equal(np.array([]), features[3][0])) self.assertTrue(np.array_equal(np.array([]), features[3][1])) self.assertTrue( np.array_equal(np.array([1, 2, 3]), features[4][0])) self.assertEqual(1, label) idx += 1 self.assertEqual(idx, 2) @unittest.skipUnless(testing.get_driver() == "mysql", "skip non mysql tests") def test_generate_fetch_size(self): label_meta = {"feature_name": "label", "shape": [], "delimiter": ""} gen = db_generator(testing.get_singleton_db_connection(), 'SELECT * FROM iris.train limit 10', label_meta) self.assertEqual(len([g for g in gen()]), 10)
def check_main_impl(self, estimator): if testing.get_driver() != "mysql": return ds = testing.get_datasource() original_sql = """SELECT * FROM iris.train TO TRAIN %s WITH model.hidden_units=[32,64], model.n_classes=3, validation.select="SELECT * FROM iris.test" LABEL class INTO iris.tensorflow_train_model_test; """ % estimator select = "SELECT * FROM iris.train" val_select = "SELECT * FROM iris.test" train_params = {"batch_size": 10} model_params = {"n_classes": 3, "hidden_units": [32, 64]} save_name = "iris.tensorflow_train_model_test" class_name = "class" with temp_file.TemporaryDirectory(as_cwd=True): train(original_sql=original_sql, model_image="sqlflow:step", estimator_string=estimator, datasource=ds, select=select, validation_select=val_select, model_params=model_params, train_params=train_params, validation_params=None, feature_column_map=None, label_column=NumericColumn( FieldDesc(name=class_name, shape=[])), save=save_name, load=None) conn = db.connect_with_data_source(ds) pred_select = "SELECT * FROM iris.test" with temp_file.TemporaryDirectory(as_cwd=True): pred(ds, pred_select, "iris.predict_result_table", class_name, save_name) self.assertEqual( self.get_table_row_count(conn, "iris.test"), self.get_table_row_count(conn, "iris.predict_result_table")) schema1 = self.get_table_schema(conn, "iris.test") schema2 = self.get_table_schema(conn, "iris.predict_result_table") self.assertEqual(len(schema1), len(schema2)) for name in schema1: if name == 'class': self.assertEqual(schema2[name], "BIGINT") continue self.assertTrue(name in schema2) self.assertEqual(schema1[name], schema2[name]) diff_schema = schema2.keys() - schema1.keys() self.assertEqual(len(diff_schema), 0) with temp_file.TemporaryDirectory(as_cwd=True): evaluate(ds, select, "iris.evaluate_result_table", save_name, class_name, {'validation.metrics': 'Accuracy'}) eval_schema = self.get_table_schema(conn, "iris.evaluate_result_table") eval_schema = set([k.lower() for k in eval_schema.keys()]) self.assertEqual(eval_schema, set(['loss', 'accuracy'])) with temp_file.TemporaryDirectory(as_cwd=True): explain(ds, select, None, {"plot_type": "bar"}, "iris.explain_result_table", save_name) explain_schema = self.get_table_schema(conn, "iris.explain_result_table") self.assertEqual( explain_schema.keys(), set(['petal_length', 'petal_width', 'sepal_length', 'sepal_width'])) conn.close()
iris_feature_column_names = [ "sepal_length", "sepal_width", "petal_length", "petal_width", ] feature_column_map = { "feature_columns": [fc.NumericColumn(fd.FieldDesc(name="sepal_length"))] } label_column = fc.NumericColumn(fd.FieldDesc(name="class")) @unittest.skipUnless(testing.get_driver() == "maxcompute" and testing.get_submitter() == "pai", "skip non PAI tests") class SubmitPAITrainTask(TestCase): def test_submit_pai_train_task(self): model_params = dict() model_params["hidden_units"] = [10, 20] model_params["n_classes"] = 3 original_sql = """ SELECT * FROM alifin_jtest_dev.sqlflow_test_iris_train TO TRAIN DNNClassifier WITH model.n_classes = 3, model.hidden_units = [10, 20] LABEL class INTO e2etest_pai_dnn;"""
class TestDB(TestCase): create_statement = "create table test_db (features text, label int)" hive_create_statement = 'create table test_db (features string, ' \ 'label int) ROW FORMAT DELIMITED FIELDS ' \ 'TERMINATED BY "\001"' select_statement = "select * from test_db" drop_statement = "drop table if exists test_db" @unittest.skipUnless(testing.get_driver() == "mysql", "skip non mysql tests") def test_mysql(self): conn = connect(testing.get_datasource()) self._do_test(conn) conn.close() @unittest.skipUnless(testing.get_driver() == "hive", "skip non hive tests") def test_hive(self): uri = testing.get_datasource() conn = connect(uri) self._do_test(conn) self._do_test_hive_specified_db(conn) def _do_test_hive_specified_db(self, conn): create_db = '''create database if not exists test_db''' create_tbl = '''create table test_db.tbl (features string, label int) ROW FORMAT DELIMITED FIELDS TERMINATED BY "\001"''' drop_tbl = '''drop table if exists test_db.tbl''' select_tbl = '''select * from test_db.tbl''' table_schema = ["label", "features"] values = [(1, '5,6,1,2')] * 10 self.assertTrue(conn.execute(create_db)) self.assertTrue(conn.execute(drop_tbl)) self.assertTrue(conn.execute(create_tbl)) with buffered_db_writer(conn, "test_db.tbl", table_schema, buff_size=10) as w: for row in values: w.write(row) field_names, data = execute(conn, select_tbl) expect_result = [('5,6,1,2', 1)] * 10 self.assertEqual(field_names, ['features', 'label']) self.assertEqual(expect_result, data) def _do_test(self, conn): table_name = "test_db" table_schema = ["features", "label"] values = [('5,6,1,2', 1)] * 10 conn.execute(self.drop_statement) if conn.driver == "hive": conn.execute(self.hive_create_statement) else: conn.execute(self.create_statement) with buffered_db_writer(conn, table_name, table_schema, buff_size=10) as w: for row in values: w.write(row) field_names, data = execute(conn, self.select_statement) self.assertEqual(table_schema, field_names) self.assertEqual(values, data)
class TestLimitSelect(TestCase): def test_limit_select(self): self.assertEqual("SELECT * FROM t LIMIT 2", limit_select("SELECT * FROM t LIMIT 30", 2)) self.assertEqual("SELECT * FROM t LIMIT 30; \t", limit_select("SELECT * FROM t LIMIT 30; \t", 100)) self.assertEqual("SELECT * FROM t LIMIT 3", limit_select("SELECT * FROM t", 3)) self.assertEqual("SELECT * FROM t \t LIMIT 4; ", limit_select("SELECT * FROM t \t ; ", 4)) @unittest.skipIf(testing.get_driver() == "maxcompute", "skip maxcompute tests") class TestQuery(TestCase): def test_query(self): conn = connect_with_data_source(testing.get_datasource()) rs = conn.query("select * from iris.train limit 1") rows = [row for row in rs] self.assertEqual(1, len(rows)) conn.execute("drop table if exists A") conn.execute("create table A(a int);") conn.execute("insert into A values(1)") rs = conn.query("select * from A;") rows = [row for row in rs] self.assertEqual(1, len(rows)) conn.query("truncate table A")
self.assertEqual(fd.infer_string_data_format(s), DataFormat.PLAIN) class TestGetMaxIndexOfKeyValueString(unittest.TestCase): def infer_index(self, string): field_desc = FieldDesc(shape=[0]) fd.fill_kv_field_desc(string, field_desc) return field_desc.shape[0] def test_infer_index(self): self.assertEqual(self.infer_index("1:2 3:4 2:1"), 4) self.assertEqual(self.infer_index("7:2\t 3:-4 10:20"), 11) self.assertEqual(self.infer_index("1:7"), 2) @unittest.skipUnless(testing.get_driver() in ["mysql", "hive"], "skip non MySQL and Hive tests") class TestFeatureDerivationWithMockedFeatures(unittest.TestCase): def check_json_dump(self, features): dump_json = json.dumps(features, cls=fc.JSONEncoderWithFeatureColumn) new_features = json.loads(dump_json, cls=fc.JSONDecoderWithFeatureColumn) new_dump_json = json.dumps(new_features, cls=fc.JSONEncoderWithFeatureColumn) self.assertEqual(dump_json, new_dump_json) def test_without_cross(self): features = { 'feature_columns': [ EmbeddingColumn(dimension=256, combiner="mean", name="c3"), EmbeddingColumn(category_column=CategoryIDColumn(
class TestXGBoostTrain(unittest.TestCase): def get_table_row_count(self, conn, table): rs = conn.query("SELECT COUNT(*) FROM %s" % table) ret = list(rs) rs.close() self.assertEqual(len(ret), 1) ret = ret[0] self.assertEqual(len(ret), 1) return ret[0] def get_table_schema(self, conn, table): name_and_types = conn.get_table_schema(table) return dict(name_and_types) @unittest.skipUnless(testing.get_driver() == "mysql", "skip non mysql tests") def test_main(self): ds = testing.get_datasource() original_sql = """SELECT * FROM iris.train TO TRAIN xgboost.gbtree WITH objective="multi:softprob", num_boost_round=20, num_class=3, validation.select="SELECT * FROM iris.test" LABEL class INTO iris.xgboost_train_model_test; """ select = "SELECT * FROM iris.train" val_select = "SELECT * FROM iris.test" train_params = {"num_boost_round": 20} model_params = {"num_class": 3, "objective": "multi:softprob"} save_name = "iris.xgboost_train_model_test" class_name = "class" with temp_file.TemporaryDirectory(as_cwd=True): eval_result = train(datasource=ds, original_sql=original_sql, select=select, validation_select=val_select, estimator_string="xgboost.gbtree", model_image="sqlflow:step", feature_column_map=None, label_column=NumericColumn( FieldDesc(name=class_name)), model_params=model_params, train_params=train_params, validation_params=None, save=save_name, load=None) self.assertLess(eval_result['train']['merror'][-1], 0.01) self.assertLess(eval_result['validate']['merror'][-1], 0.01) conn = db.connect_with_data_source(ds) pred_select = "SELECT * FROM iris.test" with temp_file.TemporaryDirectory(as_cwd=True): result_column_names, train_label_idx = create_predict_table( conn, select, "iris.predict_result_table", FieldDesc(name=class_name), "class") predict(ds, pred_select, "iris.predict_result_table", result_column_names, train_label_idx, save_name) self.assertEqual( self.get_table_row_count(conn, "iris.test"), self.get_table_row_count(conn, "iris.predict_result_table")) schema1 = self.get_table_schema(conn, "iris.test") schema2 = self.get_table_schema(conn, "iris.predict_result_table") self.assertEqual(len(schema1), len(schema2)) for name in schema1: if name == 'class': self.assertEqual(schema2[name], "BIGINT") continue self.assertTrue(name in schema2) self.assertEqual(schema1[name], schema2[name]) diff_schema = schema2.keys() - schema1.keys() self.assertEqual(len(diff_schema), 0) with temp_file.TemporaryDirectory(as_cwd=True): result_column_names = create_evaluate_table( conn, "iris.evaluate_result_table", ["accuracy_score"]) evaluate(ds, pred_select, "iris.evaluate_result_table", save_name, label_name='class', model_params={'validation.metrics': 'accuracy_score'}, result_column_names=result_column_names) eval_schema = self.get_table_schema(conn, "iris.evaluate_result_table") self.assertEqual(eval_schema.keys(), set(['loss', 'accuracy_score'])) with temp_file.TemporaryDirectory(as_cwd=True): feature_column_names = [ "petal_width", "petal_length", "sepal_width", "sepal_length" ] create_explain_table(conn, EstimatorType.XGBOOST, "TreeExplainer", "xgboost.gbtree", "iris.explain_result_table", feature_column_names) explain(ds, select, "TreeExplainer", {"plot_type": "decision"}, "iris.explain_result_table", save_name) explain_schema = self.get_table_schema(conn, "iris.explain_result_table") self.assertEqual(explain_schema.keys(), set(feature_column_names)) with temp_file.TemporaryDirectory(as_cwd=True): create_explain_table(conn, EstimatorType.XGBOOST, "XGBoostExplainer", "xgboost.gbtree", "iris.explain_result_table_2", feature_column_names) explain(ds, select, "XGBoostExplainer", {}, "iris.explain_result_table_2", save_name) explain_schema = self.get_table_schema(conn, "iris.explain_result_table_2") self.assertEqual(explain_schema.keys(), set(['feature', 'fscore', 'gain'])) conn.close()