def test_category_id_column(self):
    """CategoryIDColumn keeps its key and bucket count on both backends."""
    column = CategoryIDColumn(FieldDesc(name='c1'), 128)
    for backend in (TENSORFLOW, XGBOOST):
        compiled = self.compile_fc(column, backend)
        self.assertEqual(compiled.key, 'c1')
        self.assertEqual(compiled.num_buckets, 128)

    # A vocabulary attached to the FieldDesc must survive compilation too.
    column = CategoryIDColumn(
        FieldDesc(name='c1', vocabulary=set(['a', 'b'])), 128)
    for backend in (TENSORFLOW, XGBOOST):
        compiled = self.compile_fc(column, backend)
        self.assertEqual(sorted(compiled.vocabulary_list), ['a', 'b'])
def test_category_hash_column(self):
    """CategoryHashColumn keeps its key and hash bucket size on compile."""
    column = CategoryHashColumn(
        FieldDesc(name='c1', dtype=DataType.STRING), 32)
    for backend in (TENSORFLOW, XGBOOST):
        compiled = self.compile_fc(column, backend)
        self.assertEqual(compiled.key, 'c1')
        self.assertEqual(compiled.hash_bucket_size, 32)
def test_numeric_column(self):
    """NumericColumn keeps its key and shape on both backends."""
    column = NumericColumn(FieldDesc(name='c1', shape=(2, 3)))
    for backend in (TENSORFLOW, XGBOOST):
        compiled = self.compile_fc(column, backend)
        self.assertEqual(compiled.key, 'c1')
        self.assertEqual(compiled.shape, (2, 3))
def test_train(self):
    # Trains an XGBoost multi-class model on the iris dataset and checks
    # that both the training and validation error rates converge below 1%.
    ds = testing.get_datasource()
    # The original SQLFlow statement; passed through train_params as
    # recorded metadata only.
    original_sql = """SELECT * FROM iris.train TO TRAIN xgboost.gbtree WITH objective="multi:softmax", num_boost_round=20, num_class=3, validation.select="SELECT * FROM iris.test" INTO iris.xgboost_train_model_test; """
    select = "SELECT * FROM iris.train"
    val_select = "SELECT * FROM iris.test"
    train_params = {
        "num_boost_round": 20,
        "original_sql": original_sql,
        "feature_column_map": None,
        "label_column": NumericColumn(FieldDesc(name="class")),
        "model_image": "sqlflow:step"
    }
    model_params = {"num_class": 3, "objective": "multi:softmax"}
    eval_result = train(ds, "xgboost.gbtree", select, val_select,
                        model_params, "iris.xgboost_train_model_test",
                        None, train_params)
    # merror is XGBoost's multi-class error rate; [-1] is the last round.
    self.assertLess(eval_result['train']['merror'][-1], 0.01)
    self.assertLess(eval_result['validate']['merror'][-1], 0.01)
def test_bucket_column(self):
    """BucketColumn keeps its source column key and boundaries."""
    source = NumericColumn(FieldDesc(name='c1', shape=(1, )))
    bucketized = BucketColumn(source, (-10, -5, 3, 7))
    for backend in (TENSORFLOW, XGBOOST):
        compiled = self.compile_fc(bucketized, backend)
        self.assertEqual(compiled.source_column.key, 'c1')
        self.assertEqual(compiled.boundaries, (-10, -5, 3, 7))
def test_cross_column(self):
    """CrossColumn compiles for TensorFlow; the XGBoost backend asserts."""
    crossed = CrossColumn(['c1', NumericColumn(FieldDesc(name='c2'))], 4096)
    compiled = self.compile_fc(crossed, TENSORFLOW)
    self.assertEqual(list(compiled.keys), ['c1', 'c2'])
    self.assertEqual(compiled.hash_bucket_size, 4096)
    # Cross columns are rejected by the XGBoost compilation path.
    with self.assertRaises(AssertionError):
        self.compile_fc(crossed, XGBOOST)
def test_indicator_column(self):
    """IndicatorColumn preserves its wrapped category column on compile."""
    category = CategoryIDColumn(FieldDesc(name='c1'), 128)
    indicator = IndicatorColumn(category_column=category)
    for backend in (TENSORFLOW, XGBOOST):
        compiled_indicator = self.compile_fc(indicator, backend)
        inner = compiled_indicator.categorical_column
        self.assertEqual(inner.key, 'c1')
        self.assertEqual(inner.num_buckets, 128)
def test_seq_category_id_column(self):
    """SeqCategoryIDColumn is TensorFlow-only; the XGBoost path asserts."""
    seq_column = SeqCategoryIDColumn(FieldDesc(name='c1'), 64)
    compiled = self.compile_fc(seq_column, TENSORFLOW)
    # NOTE: the compiled TensorFlow sequence column exposes no `key`
    # attribute, so only the bucket count can be verified here.
    self.assertEqual(compiled.num_buckets, 64)
    with self.assertRaises(AssertionError):
        self.compile_fc(seq_column, XGBOOST)
def get_field_desc(self):
    """Collect the FieldDesc objects of all crossed keys.

    A plain-string key contributes a default string-typed FieldDesc of
    shape [1]; a NumericColumn key contributes its own field descriptions.

    Returns:
        A list of FieldDesc objects, one or more per key.

    Raises:
        ValueError: if a key is neither a string nor a NumericColumn.
    """
    results = []
    for key in self.keys:
        if isinstance(key, NumericColumn):
            results.extend(key.get_field_desc())
        elif isinstance(key, six.string_types):
            results.append(
                FieldDesc(name=key, dtype=DataType.STRING, shape=[1]))
        else:
            raise ValueError("unsupported type %s" % type(key))
    return results
def new_default_field_desc(name):
    """ Create a new default FieldDesc object.

    Args:
        name: the FieldDesc name.

    Returns:
        A FieldDesc object whose name is the given name, and the data
        type is INT64.
    """
    return FieldDesc(name=name, dtype=DataType.INT64)
def test_train(self):
    # Exercises the full XGBoost workflow — train, predict, explain and
    # evaluate — each step inside its own temporary working directory.
    ds = testing.get_datasource()
    original_sql = """SELECT * FROM iris.train TO TRAIN xgboost.gbtree WITH objective="multi:softmax", num_boost_round=20, num_class=3, validation.select="SELECT * FROM iris.test" INTO iris.xgboost_train_model_test; """
    select = "SELECT * FROM iris.train"
    val_select = "SELECT * FROM iris.test"
    train_params = {
        "num_boost_round": 20,
    }
    model_params = {"num_class": 3, "objective": "multi:softmax"}
    # Train: both train and validation multi-class error must drop
    # below 1% by the final boosting round.
    with temp_file.TemporaryDirectory(as_cwd=True):
        eval_result = train(ds, original_sql, select, val_select,
                            "xgboost.gbtree", "", None,
                            NumericColumn(FieldDesc(name="class")),
                            model_params, train_params, None,
                            "iris.xgboost_train_model_test", None)
        self.assertLess(eval_result['train']['merror'][-1], 0.01)
        self.assertLess(eval_result['validate']['merror'][-1], 0.01)
    # Predict into iris.xgboost_pred_result using the saved model.
    with temp_file.TemporaryDirectory(as_cwd=True):
        pred_original_sql = """SELECT * FROM iris.test TO PREDICT iris.xgboost_pred_result.pred_val USING iris.xgboost_train_model_test;"""
        pred(ds, pred_original_sql, "SELECT * FROM iris.test",
             "iris.xgboost_train_model_test", "pred_val", model_params,
             "iris.xgboost_pred_result")
    # Explain the trained model into iris.xgboost_explain_result.
    with temp_file.TemporaryDirectory(as_cwd=True):
        explain_original_sql = """SELECT * FROM iris.test TO EXPLAIN iris.xgboost_train_model_test INTO iris.xgboost_explain_result;"""
        explain(ds, explain_original_sql, "SELECT * FROM iris.test",
                "iris.xgboost_train_model_test", model_params,
                "iris.xgboost_explain_result")
    # Evaluate against the label column into iris.xgboost_evaluate_result.
    with temp_file.TemporaryDirectory(as_cwd=True):
        evaluate_original_sql = """SELECT * FROM iris.test TO EVALUATE iris.xgboost_train_model_test WITH label_col=class INTO iris.xgboost_evaluate_result;"""
        evaluate(ds, evaluate_original_sql, "SELECT * FROM iris.test",
                 "class", "iris.xgboost_train_model_test", model_params,
                 "iris.xgboost_evaluate_result")
def test_embedding_column(self):
    """EmbeddingColumn is TensorFlow-only and keeps its inner column."""
    hashed = CategoryHashColumn(
        FieldDesc(name='c1', dtype=DataType.STRING), 32)
    embedding = EmbeddingColumn(
        category_column=hashed, combiner='sum', dimension=23)
    compiled = self.compile_fc(embedding, TENSORFLOW)
    self.assertEqual(compiled.combiner, 'sum')
    self.assertEqual(compiled.dimension, 23)
    inner = compiled.categorical_column
    self.assertEqual(inner.key, 'c1')
    self.assertEqual(inner.hash_bucket_size, 32)
    # Embedding columns are rejected by the XGBoost compilation path.
    with self.assertRaises(AssertionError):
        self.compile_fc(embedding, XGBOOST)
def test_no_column_clause(self):
    # With no COLUMN clause, every SELECT field except the label must be
    # derived as a dense FLOAT32 NumericColumn of shape [1].
    columns = [
        "sepal_length",
        "sepal_width",
        "petal_length",
        "petal_width",
    ]
    select = "select %s, class from iris.train" % ",".join(columns)
    conn = testing.get_singleton_db_connection()
    features = None
    label = NumericColumn(
        FieldDesc(name='class', dtype=DataType.INT64, shape=[1]))
    features, label = fd.infer_feature_columns(conn, select, features,
                                               label)
    # Both derived structures must survive a JSON round trip.
    self.check_json_dump(features)
    self.check_json_dump(label)
    self.assertEqual(len(features), 1)
    self.assertTrue("feature_columns" in features)
    features = features["feature_columns"]
    self.assertEqual(len(features), 4)
    # Derived features appear in SELECT order, one FieldDesc each.
    for i, f in enumerate(features):
        self.assertTrue(isinstance(f, NumericColumn))
        self.assertEqual(len(f.get_field_desc()), 1)
        field_desc = f.get_field_desc()[0]
        self.assertEqual(field_desc.name, columns[i])
        self.assertEqual(field_desc.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [1])
    # The label keeps its INT64 dtype; its shape is normalized to [].
    self.assertTrue(isinstance(label, NumericColumn))
    self.assertEqual(len(label.get_field_desc()), 1)
    field_desc = label.get_field_desc()[0]
    self.assertEqual(field_desc.name, "class")
    self.assertEqual(field_desc.dtype, DataType.INT64)
    self.assertEqual(field_desc.format, DataFormat.PLAIN)
    self.assertFalse(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [])
def _from_dict(cls, d):
    """Rebuild a SeqCategoryIDColumn from its dict representation."""
    desc = FieldDesc.from_dict(d["field_desc"])
    return SeqCategoryIDColumn(desc, d["bucket_size"])
def _from_dict(cls, d):
    """Rebuild a NumericColumn from its dict representation."""
    return NumericColumn(FieldDesc.from_dict(d["field_desc"]))
def test_main(self):
    # Covers the step-based XGBoost entry points: train, predict with an
    # explicitly created result table, evaluate, and two explainers,
    # verifying the schema of every result table.
    ds = testing.get_datasource()
    original_sql = """SELECT * FROM iris.train TO TRAIN xgboost.gbtree WITH objective="multi:softprob", num_boost_round=20, num_class=3, validation.select="SELECT * FROM iris.test" LABEL class INTO iris.xgboost_train_model_test; """
    select = "SELECT * FROM iris.train"
    val_select = "SELECT * FROM iris.test"
    train_params = {"num_boost_round": 20}
    model_params = {"num_class": 3, "objective": "multi:softprob"}
    save_name = "iris.xgboost_train_model_test"
    class_name = "class"
    # Train: both error curves must end below 1%.
    with temp_file.TemporaryDirectory(as_cwd=True):
        eval_result = train(datasource=ds,
                            original_sql=original_sql,
                            select=select,
                            validation_select=val_select,
                            estimator_string="xgboost.gbtree",
                            model_image="sqlflow:step",
                            feature_column_map=None,
                            label_column=NumericColumn(
                                FieldDesc(name=class_name)),
                            model_params=model_params,
                            train_params=train_params,
                            validation_params=None,
                            save=save_name,
                            load=None)
        self.assertLess(eval_result['train']['merror'][-1], 0.01)
        self.assertLess(eval_result['validate']['merror'][-1], 0.01)
    conn = db.connect_with_data_source(ds)
    pred_select = "SELECT * FROM iris.test"
    # Predict: the result table must mirror iris.test row-for-row, with
    # the label column rewritten as BIGINT and no extra columns.
    with temp_file.TemporaryDirectory(as_cwd=True):
        result_column_names, train_label_idx = create_predict_table(
            conn, select, "iris.predict_result_table",
            FieldDesc(name=class_name), "class")
        predict(ds, pred_select, "iris.predict_result_table",
                result_column_names, train_label_idx, save_name)
        self.assertEqual(
            self.get_table_row_count(conn, "iris.test"),
            self.get_table_row_count(conn, "iris.predict_result_table"))
        schema1 = self.get_table_schema(conn, "iris.test")
        schema2 = self.get_table_schema(conn, "iris.predict_result_table")
        self.assertEqual(len(schema1), len(schema2))
        for name in schema1:
            if name == 'class':
                self.assertEqual(schema2[name], "BIGINT")
                continue
            self.assertTrue(name in schema2)
            self.assertEqual(schema1[name], schema2[name])
        diff_schema = schema2.keys() - schema1.keys()
        self.assertEqual(len(diff_schema), 0)
    # Evaluate: the result table holds loss plus the requested metric.
    with temp_file.TemporaryDirectory(as_cwd=True):
        result_column_names = create_evaluate_table(
            conn, "iris.evaluate_result_table", ["accuracy_score"])
        evaluate(ds, pred_select, "iris.evaluate_result_table", save_name,
                 label_name='class',
                 model_params={'validation.metrics': 'accuracy_score'},
                 result_column_names=result_column_names)
        eval_schema = self.get_table_schema(conn,
                                            "iris.evaluate_result_table")
        self.assertEqual(eval_schema.keys(),
                         set(['loss', 'accuracy_score']))
    # Explain with TreeExplainer: one result column per input feature.
    with temp_file.TemporaryDirectory(as_cwd=True):
        feature_column_names = [
            "petal_width", "petal_length", "sepal_width", "sepal_length"
        ]
        create_explain_table(conn, EstimatorType.XGBOOST, "TreeExplainer",
                             "xgboost.gbtree", "iris.explain_result_table",
                             feature_column_names)
        explain(ds, select, "TreeExplainer", {"plot_type": "decision"},
                "iris.explain_result_table", save_name)
        explain_schema = self.get_table_schema(conn,
                                               "iris.explain_result_table")
        self.assertEqual(explain_schema.keys(), set(feature_column_names))
    # Explain with XGBoostExplainer: feature/fscore/gain columns.
    with temp_file.TemporaryDirectory(as_cwd=True):
        create_explain_table(conn, EstimatorType.XGBOOST,
                             "XGBoostExplainer", "xgboost.gbtree",
                             "iris.explain_result_table_2",
                             feature_column_names)
        explain(ds, select, "XGBoostExplainer", {},
                "iris.explain_result_table_2", save_name)
        explain_schema = self.get_table_schema(
            conn, "iris.explain_result_table_2")
        self.assertEqual(explain_schema.keys(),
                         set(['feature', 'fscore', 'gain']))
    conn.close()
def test_main(self):
    # XGBoost train/predict/evaluate workflow using the variant pred and
    # evaluate helpers (no explicit result-table creation step).
    ds = testing.get_datasource()
    original_sql = """SELECT * FROM iris.train TO TRAIN xgboost.gbtree WITH objective="multi:softmax", num_boost_round=20, num_class=3, validation.select="SELECT * FROM iris.test" INTO iris.xgboost_train_model_test; """
    select = "SELECT * FROM iris.train"
    val_select = "SELECT * FROM iris.test"
    train_params = {"num_boost_round": 20}
    model_params = {"num_class": 3, "objective": "multi:softmax"}
    save_name = "iris.xgboost_train_model_test"
    class_name = "class"
    # Train: both error curves must end below 1%.
    with temp_file.TemporaryDirectory(as_cwd=True):
        eval_result = train(original_sql=original_sql,
                            model_image="sqlflow:step",
                            estimator_string="xgboost.gbtree",
                            datasource=ds,
                            select=select,
                            validation_select=val_select,
                            model_params=model_params,
                            train_params=train_params,
                            feature_column_map=None,
                            label_column=NumericColumn(
                                FieldDesc(name=class_name)),
                            save=save_name)
        self.assertLess(eval_result['train']['merror'][-1], 0.01)
        self.assertLess(eval_result['validate']['merror'][-1], 0.01)
    conn = db.connect_with_data_source(ds)
    pred_select = "SELECT * FROM iris.test"
    # Predict: result table mirrors iris.test, label column as BIGINT.
    pred(ds, pred_select, "iris.predict_result_table", class_name,
         save_name)
    self.assertEqual(
        self.get_table_row_count(conn, "iris.test"),
        self.get_table_row_count(conn, "iris.predict_result_table"))
    schema1 = self.get_table_schema(conn, "iris.test")
    schema2 = self.get_table_schema(conn, "iris.predict_result_table")
    self.assertEqual(len(schema1), len(schema2))
    for name in schema1:
        if name == 'class':
            self.assertEqual(schema2[name], "BIGINT")
            continue
        self.assertTrue(name in schema2)
        self.assertEqual(schema1[name], schema2[name])
    diff_schema = schema2.keys() - schema1.keys()
    self.assertEqual(len(diff_schema), 0)
    # Evaluate: result table holds loss plus the requested metric.
    evaluate(ds, pred_select, "iris.evaluate_result_table", save_name,
             'class', ['accuracy_score'])
    eval_schema = self.get_table_schema(conn, "iris.evaluate_result_table")
    self.assertEqual(eval_schema.keys(), set(['loss', 'accuracy_score']))
def test_without_cross(self):
    # Feature derivation with partial COLUMN info: c3's embedding lacks a
    # category column (to be derived from the data), c5's is fully
    # specified, and c1/c2/c4/c6 are derived entirely from the data.
    features = {
        'feature_columns': [
            EmbeddingColumn(dimension=256, combiner="mean", name="c3"),
            EmbeddingColumn(category_column=CategoryIDColumn(
                FieldDesc(name="c5",
                          dtype=DataType.INT64,
                          shape=[10000],
                          delimiter=",",
                          is_sparse=True),
                bucket_size=5000),
                            dimension=64,
                            combiner="sqrtn",
                            name="c5"),
        ]
    }
    label = NumericColumn(
        FieldDesc(name="class", dtype=DataType.INT64, shape=[1]))
    select = "select c1, c2, c3, c4, c5, c6, class " \
             "from feature_derivation_case.train"
    conn = testing.get_singleton_db_connection()
    features, label = fd.infer_feature_columns(conn, select, features,
                                               label)
    # Both derived structures must survive a JSON round trip.
    self.check_json_dump(features)
    self.check_json_dump(label)
    self.assertEqual(len(features), 1)
    self.assertTrue("feature_columns" in features)
    features = features["feature_columns"]
    self.assertEqual(len(features), 6)
    # c1: derived as a dense FLOAT32 scalar.
    fc1 = features[0]
    self.assertTrue(isinstance(fc1, NumericColumn))
    self.assertEqual(len(fc1.get_field_desc()), 1)
    field_desc = fc1.get_field_desc()[0]
    self.assertEqual(field_desc.name, "c1")
    self.assertEqual(field_desc.dtype, DataType.FLOAT32)
    self.assertEqual(field_desc.format, DataFormat.PLAIN)
    self.assertFalse(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [1])
    # c2: derived as a dense FLOAT32 scalar.
    fc2 = features[1]
    self.assertTrue(isinstance(fc2, NumericColumn))
    self.assertEqual(len(fc2.get_field_desc()), 1)
    field_desc = fc2.get_field_desc()[0]
    self.assertEqual(field_desc.name, "c2")
    self.assertEqual(field_desc.dtype, DataType.FLOAT32)
    self.assertEqual(field_desc.format, DataFormat.PLAIN)
    self.assertFalse(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [1])
    # c3: embedding whose category column was derived (CSV of 4 INT64
    # values, bucket size 10); user-given dimension/combiner are kept.
    fc3 = features[2]
    self.assertTrue(isinstance(fc3, EmbeddingColumn))
    self.assertEqual(len(fc3.get_field_desc()), 1)
    field_desc = fc3.get_field_desc()[0]
    self.assertEqual(field_desc.name, "c3")
    self.assertEqual(field_desc.dtype, DataType.INT64)
    self.assertEqual(field_desc.format, DataFormat.CSV)
    self.assertFalse(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [4])
    self.assertEqual(fc3.dimension, 256)
    self.assertEqual(fc3.combiner, "mean")
    self.assertEqual(fc3.name, "c3")
    self.assertTrue(isinstance(fc3.category_column, CategoryIDColumn))
    self.assertEqual(fc3.category_column.bucket_size, 10)
    # c4: derived as a dense CSV FLOAT32 vector of length 4.
    fc4 = features[3]
    self.assertTrue(isinstance(fc4, NumericColumn))
    self.assertEqual(len(fc4.get_field_desc()), 1)
    field_desc = fc4.get_field_desc()[0]
    self.assertEqual(field_desc.name, "c4")
    self.assertEqual(field_desc.dtype, DataType.FLOAT32)
    self.assertEqual(field_desc.format, DataFormat.CSV)
    self.assertFalse(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [4])
    # c5: kept exactly as the user specified it (sparse embedding).
    fc5 = features[4]
    self.assertTrue(isinstance(fc5, EmbeddingColumn))
    self.assertEqual(len(fc5.get_field_desc()), 1)
    field_desc = fc5.get_field_desc()[0]
    self.assertEqual(field_desc.name, "c5")
    self.assertEqual(field_desc.dtype, DataType.INT64)
    self.assertEqual(field_desc.format, DataFormat.CSV)
    self.assertTrue(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [10000])
    self.assertEqual(fc5.dimension, 64)
    self.assertEqual(fc5.combiner, "sqrtn")
    self.assertEqual(fc5.name, "c5")
    self.assertTrue(isinstance(fc5.category_column, CategoryIDColumn))
    self.assertEqual(fc5.category_column.bucket_size, 5000)
    # c6: string column with a derived vocabulary, auto-embedded with a
    # bucket per vocabulary entry.
    fc6 = features[5]
    self.assertTrue(isinstance(fc6, EmbeddingColumn))
    self.assertEqual(len(fc6.get_field_desc()), 1)
    field_desc = fc6.get_field_desc()[0]
    self.assertEqual(field_desc.name, "c6")
    self.assertEqual(field_desc.dtype, DataType.STRING)
    self.assertEqual(field_desc.format, DataFormat.PLAIN)
    self.assertFalse(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [1])
    self.assertEqual(field_desc.vocabulary,
                     set(['FEMALE', 'MALE', 'NULL']))
    self.assertEqual(fc6.dimension, 128)
    self.assertEqual(fc6.combiner, "sum")
    self.assertEqual(fc6.name, "c6")
    self.assertTrue(isinstance(fc6.category_column, CategoryIDColumn))
    self.assertEqual(fc6.category_column.bucket_size, 3)
    # Label: keeps INT64 dtype; shape is normalized to [].
    self.assertTrue(isinstance(label, NumericColumn))
    self.assertEqual(len(label.get_field_desc()), 1)
    field_desc = label.get_field_desc()[0]
    self.assertEqual(field_desc.name, "class")
    self.assertEqual(field_desc.dtype, DataType.INT64)
    self.assertEqual(field_desc.format, DataFormat.PLAIN)
    self.assertFalse(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [])
def test_with_cross(self):
    """Feature derivation when CROSS columns are present.

    c1 and c2 appear both as explicit numeric columns and crossed with
    each other; c4 x c5 crosses columns not listed individually, so c3 is
    the only column derived purely from the data.  Verifies the dtype,
    format, shape and sparsity of every derived column and of the label.
    """
    c1 = NumericColumn(
        FieldDesc(name='c1', dtype=DataType.INT64, shape=[1]))
    c2 = NumericColumn(
        FieldDesc(name='c2', dtype=DataType.INT64, shape=[1]))
    c4 = NumericColumn(
        FieldDesc(name='c4', dtype=DataType.INT64, shape=[1]))
    c5 = NumericColumn(
        FieldDesc(name='c5', dtype=DataType.INT64, shape=[1],
                  is_sparse=True))
    features = {
        'feature_columns': [
            c1,
            c2,
            CrossColumn([c4, c5], 128),
            CrossColumn([c1, c2], 256),
        ]
    }
    label = NumericColumn(
        FieldDesc(name='class', dtype=DataType.INT64, shape=[1]))
    select = "select c1, c2, c3, c4, c5, class " \
             "from feature_derivation_case.train"
    conn = testing.get_singleton_db_connection()
    features, label = fd.infer_feature_columns(conn, select, features,
                                               label)
    # Both derived structures must survive a JSON round trip.
    self.check_json_dump(features)
    self.check_json_dump(label)
    self.assertEqual(len(features), 1)
    self.assertTrue("feature_columns" in features)
    features = features["feature_columns"]
    self.assertEqual(len(features), 5)
    # c1 and c2 are converted to dense FLOAT32 scalars.
    fc1 = features[0]
    self.assertTrue(isinstance(fc1, NumericColumn))
    self.assertEqual(len(fc1.get_field_desc()), 1)
    field_desc = fc1.get_field_desc()[0]
    self.assertEqual(field_desc.name, "c1")
    self.assertEqual(field_desc.dtype, DataType.FLOAT32)
    self.assertEqual(field_desc.format, DataFormat.PLAIN)
    self.assertFalse(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [1])
    fc2 = features[1]
    self.assertTrue(isinstance(fc2, NumericColumn))
    self.assertEqual(len(fc2.get_field_desc()), 1)
    field_desc = fc2.get_field_desc()[0]
    self.assertEqual(field_desc.name, "c2")
    self.assertEqual(field_desc.dtype, DataType.FLOAT32)
    self.assertEqual(field_desc.format, DataFormat.PLAIN)
    self.assertFalse(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [1])
    # c3 is derived from the data as a CSV-encoded INT64 vector.
    fc3 = features[2]
    self.assertTrue(isinstance(fc3, NumericColumn))
    self.assertEqual(len(fc3.get_field_desc()), 1)
    field_desc = fc3.get_field_desc()[0]
    self.assertEqual(field_desc.name, "c3")
    self.assertEqual(field_desc.dtype, DataType.INT64)
    self.assertEqual(field_desc.format, DataFormat.CSV)
    self.assertFalse(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [4])
    # The c4 x c5 cross keeps both source field descriptions.
    fc4 = features[3]
    self.assertTrue(isinstance(fc4, CrossColumn))
    self.assertEqual(len(fc4.get_field_desc()), 2)
    field_desc1 = fc4.get_field_desc()[0]
    self.assertEqual(field_desc1.name, "c4")
    self.assertEqual(field_desc1.dtype, DataType.FLOAT32)
    self.assertEqual(field_desc1.format, DataFormat.CSV)
    self.assertEqual(field_desc1.shape, [4])
    self.assertFalse(field_desc1.is_sparse)
    field_desc2 = fc4.get_field_desc()[1]
    self.assertEqual(field_desc2.name, "c5")
    self.assertEqual(field_desc2.dtype, DataType.INT64)
    self.assertEqual(field_desc2.format, DataFormat.CSV)
    self.assertTrue(field_desc2.is_sparse)
    # The c1 x c2 cross likewise keeps its two field descriptions.
    fc5 = features[4]
    self.assertTrue(isinstance(fc5, CrossColumn))
    # Bug fix: the original re-checked len(fc4.get_field_desc()) here;
    # fc5 is the column under test.
    self.assertEqual(len(fc5.get_field_desc()), 2)
    field_desc1 = fc5.get_field_desc()[0]
    self.assertEqual(field_desc1.name, "c1")
    self.assertEqual(field_desc1.dtype, DataType.FLOAT32)
    self.assertEqual(field_desc1.format, DataFormat.PLAIN)
    self.assertEqual(field_desc1.shape, [1])
    self.assertFalse(field_desc1.is_sparse)
    field_desc2 = fc5.get_field_desc()[1]
    self.assertEqual(field_desc2.name, "c2")
    self.assertEqual(field_desc2.dtype, DataType.FLOAT32)
    self.assertEqual(field_desc2.format, DataFormat.PLAIN)
    self.assertEqual(field_desc2.shape, [1])
    self.assertFalse(field_desc2.is_sparse)
    # Label: keeps INT64 dtype; shape is normalized to [].
    self.assertTrue(isinstance(label, NumericColumn))
    self.assertEqual(len(label.get_field_desc()), 1)
    field_desc = label.get_field_desc()[0]
    self.assertEqual(field_desc.name, "class")
    self.assertEqual(field_desc.dtype, DataType.INT64)
    self.assertEqual(field_desc.format, DataFormat.PLAIN)
    self.assertFalse(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [])
def test_metadata(self):
    # Builds a metadata dict via collect_metadata, verifies every field,
    # then checks the dict survives a save/load round trip on disk.
    original_sql = ''' SELECT c1, c2, class FROM my_db.train_table TO TRAIN my_docker_image:latest/DNNClassifier WITH model.n_classes = 3, model.hidden_units = [16, 32], validation.select="SELECT c1, c2, class FROM my_db.val_table" INTO my_db.my_dnn_model; '''
    select = "SELECT c1, c2, class FROM my_db.train_table"
    validation_select = "SELECT c1, c2, class FROM my_db.val_table"
    model_repo_image = "my_docker_image:latest"
    estimator = "DNNClassifier"
    attributes = {
        'n_classes': 3,
        'hidden_units': [16, 32],
    }
    features = {
        'feature_columns': [
            NumericColumn(FieldDesc(name='c1', shape=[3], delimiter=",")),
            NumericColumn(FieldDesc(name='c2', shape=[1])),
        ],
    }
    label = NumericColumn(FieldDesc(name='class', shape=[5],
                                    delimiter=','))

    def check_metadata(meta):
        # Every collected field must match what was passed in, including
        # the extra keyword metadata (my_data).
        self.assertEqual(meta['original_sql'], original_sql)
        self.assertEqual(meta['select'], select)
        self.assertEqual(meta['validation_select'], validation_select)
        self.assertEqual(meta['model_repo_image'], model_repo_image)
        self.assertEqual(meta['class_name'], estimator)
        self.assertEqual(meta['attributes'], attributes)
        meta_features = meta['features']
        meta_label = meta['label']
        self.assertEqual(len(meta_features), 1)
        self.assertEqual(len(meta_features['feature_columns']), 2)
        meta_features = meta_features['feature_columns']
        self.assertEqual(type(meta_features[0]), NumericColumn)
        self.assertEqual(type(meta_features[1]), NumericColumn)
        field_desc = meta_features[0].get_field_desc()[0]
        self.assertEqual(field_desc.name, 'c1')
        self.assertEqual(field_desc.shape, [3])
        self.assertEqual(field_desc.delimiter, ',')
        field_desc = meta_features[1].get_field_desc()[0]
        self.assertEqual(field_desc.name, 'c2')
        self.assertEqual(field_desc.shape, [1])
        self.assertEqual(type(meta_label), NumericColumn)
        field_desc = meta_label.get_field_desc()[0]
        self.assertEqual(field_desc.name, 'class')
        self.assertEqual(field_desc.shape, [5])
        self.assertEqual(field_desc.delimiter, ',')
        self.assertEqual(meta['evaluation'], {'accuracy': 0.5})
        self.assertEqual(meta['my_data'], 0.25)

    meta = collect_metadata(original_sql, select, validation_select,
                            model_repo_image, estimator, attributes,
                            features, label, {'accuracy': 0.5},
                            my_data=0.25)
    check_metadata(meta)
    # The metadata must be identical after being saved and reloaded.
    save_metadata(self.file_name, meta)
    meta = load_metadata(self.file_name)
    check_metadata(meta)
def infer_index(self, string):
    """Return the index fill_kv_field_desc infers from a key-value string."""
    desc = FieldDesc(shape=[0])
    fd.fill_kv_field_desc(string, desc)
    return desc.shape[0]
def check_main_impl(self, estimator):
    # Shared TensorFlow workflow check (train, predict, evaluate,
    # explain) parameterized by estimator name; runs only against the
    # MySQL driver.
    if testing.get_driver() != "mysql":
        return
    ds = testing.get_datasource()
    original_sql = """SELECT * FROM iris.train TO TRAIN %s WITH model.hidden_units=[32,64], model.n_classes=3, validation.select="SELECT * FROM iris.test" LABEL class INTO iris.tensorflow_train_model_test; """ % estimator
    select = "SELECT * FROM iris.train"
    val_select = "SELECT * FROM iris.test"
    train_params = {"batch_size": 10}
    model_params = {"n_classes": 3, "hidden_units": [32, 64]}
    save_name = "iris.tensorflow_train_model_test"
    class_name = "class"
    # Train the model; each step runs in its own temporary cwd.
    with temp_file.TemporaryDirectory(as_cwd=True):
        train(original_sql=original_sql,
              model_image="sqlflow:step",
              estimator_string=estimator,
              datasource=ds,
              select=select,
              validation_select=val_select,
              model_params=model_params,
              train_params=train_params,
              validation_params=None,
              feature_column_map=None,
              label_column=NumericColumn(
                  FieldDesc(name=class_name, shape=[])),
              save=save_name,
              load=None)
    conn = db.connect_with_data_source(ds)
    pred_select = "SELECT * FROM iris.test"
    # Predict: result table mirrors iris.test, label column as BIGINT,
    # with no extra columns.
    with temp_file.TemporaryDirectory(as_cwd=True):
        pred(ds, pred_select, "iris.predict_result_table", class_name,
             save_name)
        self.assertEqual(
            self.get_table_row_count(conn, "iris.test"),
            self.get_table_row_count(conn, "iris.predict_result_table"))
        schema1 = self.get_table_schema(conn, "iris.test")
        schema2 = self.get_table_schema(conn, "iris.predict_result_table")
        self.assertEqual(len(schema1), len(schema2))
        for name in schema1:
            if name == 'class':
                self.assertEqual(schema2[name], "BIGINT")
                continue
            self.assertTrue(name in schema2)
            self.assertEqual(schema1[name], schema2[name])
        diff_schema = schema2.keys() - schema1.keys()
        self.assertEqual(len(diff_schema), 0)
    # Evaluate: result table holds loss plus the requested metric
    # (column names are case-normalized before comparison).
    with temp_file.TemporaryDirectory(as_cwd=True):
        evaluate(ds, select, "iris.evaluate_result_table", save_name,
                 class_name, {'validation.metrics': 'Accuracy'})
        eval_schema = self.get_table_schema(conn,
                                            "iris.evaluate_result_table")
        eval_schema = set([k.lower() for k in eval_schema.keys()])
        self.assertEqual(eval_schema, set(['loss', 'accuracy']))
    # Explain: one result column per input feature.
    with temp_file.TemporaryDirectory(as_cwd=True):
        explain(ds, select, None, {"plot_type": "bar"},
                "iris.explain_result_table", save_name)
        explain_schema = self.get_table_schema(conn,
                                               "iris.explain_result_table")
        self.assertEqual(
            explain_schema.keys(),
            set(['petal_length', 'petal_width', 'sepal_length',
                 'sepal_width']))
    conn.close()