def evaluate_step(datasource,
                  select,
                  result_table,
                  model,
                  label_name,
                  model_params,
                  pai_table=None):
    """Evaluate a trained model on the data set given by `select` and
    write the metric values into `result_table`."""
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    if model_params is None:
        model_params = {}

    # Parse the evaluation attributes given in the WITH clause.
    validation_metrics = model_params.get("validation.metrics", "Accuracy")
    validation_metrics = [m.strip() for m in validation_metrics.split(',')]
    validation_steps = model_params.get("validation.steps", None)
    batch_size = model_params.get("validation.batch_size", 1)
    verbose = model_params.get("validation.verbose", 0)

    conn = db.connect_with_data_source(datasource)
    create_evaluate_table(conn, result_table, validation_metrics)
    conn.close()

    # Restore the attributes and feature/label descriptions recorded at
    # training time.
    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    # The label column of the evaluation data may differ from the one used
    # at training time.
    train_label_desc.name = label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    _evaluate(datasource=datasource,
              estimator_string=estimator_string,
              select=select,
              result_table=result_table,
              feature_columns=feature_columns,
              feature_column_names=feature_column_names,
              feature_metas=feature_metas,
              label_meta=label_meta,
              model_params=model_params,
              validation_metrics=validation_metrics,
              save=save,
              batch_size=batch_size,
              validation_steps=validation_steps,
              verbose=verbose,
              pai_table=pai_table)
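
# A minimal usage sketch for evaluate_step, assuming a reachable database
# and a model previously trained through SQLFlow. The DSN, model and table
# names below are hypothetical:
#
#   evaluate_step(
#       datasource="mysql://root:root@tcp(127.0.0.1:3306)/?maxAllowedPacket=0",
#       select="SELECT * FROM iris.test",
#       result_table="iris.evaluate_result_table",
#       model="iris.my_dnn_model",
#       label_name="class",
#       model_params={"validation.metrics": "Accuracy"})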

def submit_local_evaluate(datasource,
                          original_sql,
                          select,
                          label_name,
                          model,
                          model_params,
                          result_table,
                          user=""):
    """Run a model evaluation locally, dispatching to the XGBoost or
    TensorFlow implementation according to the saved model type."""
    model = Model.load_from_db(datasource, model)
    if model.get_type() == EstimatorType.XGBOOST:
        evaluate_func = xgboost_evaluate
        # XGBoost models default to sklearn-style metric names.
        validation_metrics = model_params.get("validation.metrics",
                                              "accuracy_score")
    else:
        evaluate_func = tf_evaluate
        # TensorFlow models default to Keras-style metric names.
        validation_metrics = model_params.get("validation.metrics",
                                              "Accuracy")

    conn = db.connect_with_data_source(datasource)
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]
    result_column_names = create_evaluate_table(conn, result_table,
                                                validation_metrics)
    conn.close()

    evaluate_func(datasource=datasource,
                  select=select,
                  result_table=result_table,
                  model=model,
                  label_name=label_name,
                  model_params=model_params,
                  result_column_names=result_column_names)

def test_main(self):
    ds = testing.get_datasource()
    original_sql = """SELECT * FROM iris.train
    TO TRAIN xgboost.gbtree
    WITH
        objective="multi:softprob",
        num_boost_round=20,
        num_class=3,
        validation.select="SELECT * FROM iris.test"
    LABEL class
    INTO iris.xgboost_train_model_test;
    """
    select = "SELECT * FROM iris.train"
    val_select = "SELECT * FROM iris.test"
    train_params = {"num_boost_round": 20}
    model_params = {"num_class": 3, "objective": "multi:softprob"}
    save_name = "iris.xgboost_train_model_test"
    class_name = "class"

    # Train an XGBoost model and check that the training/validation error
    # converged.
    with temp_file.TemporaryDirectory(as_cwd=True):
        eval_result = train(datasource=ds,
                            original_sql=original_sql,
                            select=select,
                            validation_select=val_select,
                            estimator_string="xgboost.gbtree",
                            model_image="sqlflow:step",
                            feature_column_map=None,
                            label_column=NumericColumn(
                                FieldDesc(name=class_name)),
                            model_params=model_params,
                            train_params=train_params,
                            validation_params=None,
                            save=save_name,
                            load=None)
        self.assertLess(eval_result['train']['merror'][-1], 0.01)
        self.assertLess(eval_result['validate']['merror'][-1], 0.01)

    conn = db.connect_with_data_source(ds)

    # Predict into a result table and verify its row count and schema.
    pred_select = "SELECT * FROM iris.test"
    with temp_file.TemporaryDirectory(as_cwd=True):
        result_column_names, train_label_idx = create_predict_table(
            conn, select, "iris.predict_result_table",
            FieldDesc(name=class_name), "class")
        predict(ds, pred_select, "iris.predict_result_table",
                result_column_names, train_label_idx, save_name)

    self.assertEqual(
        self.get_table_row_count(conn, "iris.test"),
        self.get_table_row_count(conn, "iris.predict_result_table"))

    schema1 = self.get_table_schema(conn, "iris.test")
    schema2 = self.get_table_schema(conn, "iris.predict_result_table")
    self.assertEqual(len(schema1), len(schema2))
    for name in schema1:
        if name == 'class':
            self.assertEqual(schema2[name], "BIGINT")
            continue
        self.assertTrue(name in schema2)
        self.assertEqual(schema1[name], schema2[name])

    diff_schema = schema2.keys() - schema1.keys()
    self.assertEqual(len(diff_schema), 0)

    # Evaluate the model and verify the metric columns.
    with temp_file.TemporaryDirectory(as_cwd=True):
        result_column_names = create_evaluate_table(
            conn, "iris.evaluate_result_table", ["accuracy_score"])
        evaluate(ds,
                 pred_select,
                 "iris.evaluate_result_table",
                 save_name,
                 label_name='class',
                 model_params={'validation.metrics': 'accuracy_score'},
                 result_column_names=result_column_names)
        eval_schema = self.get_table_schema(conn,
                                            "iris.evaluate_result_table")
        self.assertEqual(eval_schema.keys(),
                         set(['loss', 'accuracy_score']))

    # Explain the model with the SHAP TreeExplainer.
    with temp_file.TemporaryDirectory(as_cwd=True):
        feature_column_names = [
            "petal_width", "petal_length", "sepal_width", "sepal_length"
        ]
        create_explain_table(conn, EstimatorType.XGBOOST, "TreeExplainer",
                             "xgboost.gbtree", "iris.explain_result_table",
                             feature_column_names)
        explain(ds, select, "TreeExplainer", {"plot_type": "decision"},
                "iris.explain_result_table", save_name)
        explain_schema = self.get_table_schema(conn,
                                               "iris.explain_result_table")
        self.assertEqual(explain_schema.keys(), set(feature_column_names))

    # Explain the model with XGBoost's built-in feature importance.
    with temp_file.TemporaryDirectory(as_cwd=True):
        create_explain_table(conn, EstimatorType.XGBOOST,
                             "XGBoostExplainer", "xgboost.gbtree",
                             "iris.explain_result_table_2",
                             feature_column_names)
        explain(ds, select, "XGBoostExplainer", {},
                "iris.explain_result_table_2", save_name)
        explain_schema = self.get_table_schema(conn,
                                               "iris.explain_result_table_2")
        self.assertEqual(explain_schema.keys(),
                         set(['feature', 'fscore', 'gain']))

    conn.close()

def submit_pai_evaluate(datasource,
                        original_sql,
                        select,
                        label_name,
                        model,
                        model_params,
                        result_table,
                        user=""):
    """Submit a PAI evaluation task

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
            curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO EVALUATE" statement.
        select: string
            SQL statement to get the evaluation data set.
        label_name: string
            The label name to evaluate.
        model: string
            Model to load and evaluate.
        model_params: dict
            Params for the evaluation job, corresponding to the WITH clause.
        result_table: string
            The table name to save the evaluation result.
        user: string
            A string to identify the user, used to load the model from the
            user's directory.
    """
    params = dict(locals())
    project = table_ops.get_project(datasource)
    # Qualify the result table with the current project if necessary.
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)
    params["result_table"] = result_table

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)
    model_type, estimator = pai_model.get_saved_model_type_and_estimator(
        datasource, model)
    if model_type == EstimatorType.PAIML:
        raise SQLFlowDiagnostic("PAI model evaluation is not supported yet.")

    if model_type == EstimatorType.XGBOOST:
        params["entry_type"] = "evaluate_xgb"
        validation_metrics = model_params.get("validation.metrics",
                                              "accuracy_score")
    else:
        params["entry_type"] = "evaluate_tf"
        validation_metrics = model_params.get("validation.metrics",
                                              "Accuracy")

    validation_metrics = [m.strip() for m in validation_metrics.split(",")]
    with db.connect_with_data_source(datasource) as conn:
        result_column_names = create_evaluate_table(conn, result_table,
                                                    validation_metrics)

    with table_ops.create_tmp_tables_guard(select, datasource) as data_table:
        params["pai_table"] = data_table
        params["result_column_names"] = result_column_names

        # Run the job locally if possible; otherwise package the entry
        # program and submit it as a PAI task.
        if try_pai_local_run(params, oss_model_path):
            return

        conf = cluster_conf.get_cluster_config(model_params)
        with temp_file.TemporaryDirectory(prefix="sqlflow", dir="/tmp") as cwd:
            prepare_archive(cwd, estimator, oss_model_path, params)
            cmd = get_pai_tf_cmd(
                conf, "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                "file://" + os.path.join(cwd, PARAMS_FILE), ENTRY_FILE, model,
                oss_model_path, data_table, "", result_table, project)
            submit_pai_task(cmd, datasource)
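
# The MaxCompute datasource string above bundles credentials, the endpoint
# and the current project. The sketch below shows one way such a URI can be
# parsed; table_ops.get_project is assumed to do something equivalent, and
# _get_project_sketch is a hypothetical helper, not part of SQLFlow's API.
def _get_project_sketch(datasource):
    """Extract curr_project from a URI like
    maxcompute://ak:[email protected]/api?curr_project=test_ci&scheme=http
    """
    from six.moves.urllib.parse import parse_qs, urlparse
    return parse_qs(urlparse(datasource).query)["curr_project"][0]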

def evaluate(datasource,
             select,
             result_table,
             model,
             label_name=None,
             model_params=None,
             pai_table="",
             oss_model_path=""):
    """Evaluate a trained XGBoost model and write the metric values into
    result_table."""
    if model_params is None:
        model_params = {}
    validation_metrics = model_params.get("validation.metrics",
                                          "accuracy_score")
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    is_pai = pai_table != ""
    if is_pai:
        assert (oss_model_path != "")
        # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
        # in xgboost/train.py
        oss.load_file(oss_model_path, "my_model")
        (estimator, model_params, train_params, feature_metas,
         feature_column_names, train_label_desc,
         fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")
    else:
        if isinstance(model, six.string_types):
            model = Model.load_from_db(datasource, model)
        else:
            assert isinstance(
                model, Model), "not supported model type %s" % type(model)

        model_params = model.get_meta("attributes")
        fc_map_ir = model.get_meta("features")
        train_label_desc = model.get_meta("label").get_field_desc()[0]

    # The label column of the evaluation data may differ from the one used
    # at training time.
    if label_name:
        train_label_desc.name = label_name

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names = create_evaluate_table(conn, result_table,
                                                validation_metrics)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=train_label_desc.to_dict(dtype_to_string=True),
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn)

        for i, pred_dmatrix in enumerate(dpred):
            # xgb_dataset dumps each batch into a separate feature file.
            feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name, train_label_desc,
                                   result_table, result_column_names,
                                   validation_metrics, conn)

    conn.close()
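
# For XGBoost models, the names in "validation.metrics" match functions in
# sklearn.metrics (e.g. "accuracy_score", "mean_absolute_error"). The sketch
# below shows how such a name list can be resolved and evaluated;
# _store_evaluate_result is assumed to do something equivalent before writing
# the row into result_table, and _compute_metrics_sketch is a hypothetical
# helper, not part of this module.
def _compute_metrics_sketch(metric_names, labels, preds):
    """Look up each metric name in sklearn.metrics and apply it to
    (labels, preds), e.g.:
        _compute_metrics_sketch(["accuracy_score"], [0, 1, 2], [0, 1, 1])
        -> {"accuracy_score": 0.6666...}
    """
    import sklearn.metrics
    return {
        name: getattr(sklearn.metrics, name)(labels, preds)
        for name in metric_names
    }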