def submit_local_pred(datasource, original_sql, select, model, label_name,
                      pred_params, result_table, user=""):
    """Dispatch a local prediction task to the XGBoost or TensorFlow
    runner according to the type of the saved model."""
    model = Model.load_from_db(datasource, model)
    if model.get_type() == EstimatorType.XGBOOST:
        pred_func = xgboost_pred
    else:
        pred_func = tf_pred

    if model.get_meta("label") is None:
        train_label_desc = None
    else:
        train_label_desc = model.get_meta("label").get_field_desc()[0]

    if pred_params is None:
        extra_result_cols = []
    else:
        extra_result_cols = pred_params.get("predict.extra_outputs", "")
        extra_result_cols = [
            c.strip() for c in extra_result_cols.split(",") if c.strip()
        ]

    with db.connect_with_data_source(datasource) as conn:
        result_column_names, train_label_idx = create_predict_table(
            conn, select, result_table, train_label_desc, label_name,
            extra_result_cols)

    pred_func(datasource=datasource,
              select=select,
              result_table=result_table,
              result_column_names=result_column_names,
              train_label_idx=train_label_idx,
              model=model,
              extra_result_cols=extra_result_cols)
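# The "predict.extra_outputs" attribute arrives as a single comma-separated
# string, so the code above must both split it and drop empty fragments.
# Below is a minimal standalone sketch of that parsing; the helper name and
# the sample attribute values are made up for illustration.
def _demo_parse_extra_outputs(pred_params):
    if pred_params is None:
        return []
    raw = pred_params.get("predict.extra_outputs", "")
    return [c.strip() for c in raw.split(",") if c.strip()]


assert _demo_parse_extra_outputs(None) == []
assert _demo_parse_extra_outputs({"predict.extra_outputs": " "}) == []
assert _demo_parse_extra_outputs(
    {"predict.extra_outputs": "prob, raw_score"}) == ["prob", "raw_score"]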
def predict(datasource,
            select,
            result_table,
            label_name,
            model,
            pai_table="",
            oss_model_path=""):
    """PAI XGBoost prediction wrapper
    This function does some preparation for the local prediction, e.g.
    downloading the model from OSS and extracting metadata.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        result_table: table to save prediction result
        label_name: prediction label column
        model: a Model object, or the name of a model to load from the
            database
        pai_table: tmp table which holds the data from select when running
            on PAI
        oss_model_path: the model path on OSS
    """
    is_pai = True if pai_table != "" else False
    if is_pai:
        # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
        # in xgboost/train.py
        oss.load_file(oss_model_path, "my_model")
        (estimator, model_params, train_params, feature_metas,
         feature_column_names, train_label_desc,
         fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")
    else:
        if isinstance(model, six.string_types):
            model = Model.load_from_db(datasource, model)
        else:
            assert isinstance(
                model, Model), "not supported model type %s" % type(model)

        model_params = model.get_meta("attributes")
        fc_map_ir = model.get_meta("features")
        train_label_desc = model.get_meta("label").get_field_desc()[0]

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, label_name)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=None,
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            raw_data_dir=raw_data_dir)  # NOTE: default to use external memory

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            feature_file_name = os.path.join(
                tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_predict_result(preds, result_table, result_column_names,
                                  train_label_idx, feature_file_name, conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()
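# The cache=True flag passed to xgb_dataset above makes each yielded DMatrix
# use XGBoost's external-memory mode, so one prediction batch never has to
# fit entirely in RAM. A minimal sketch of that underlying XGBoost pattern,
# outside of SQLFlow; the file names below are placeholders.
def _demo_external_memory_predict():
    import xgboost as xgb

    # Appending "#prefix.cache" to a libsvm file path asks XGBoost to stream
    # the data through an on-disk cache instead of loading it all at once.
    dpred = xgb.DMatrix("predict.txt#predict.cache")
    bst = xgb.Booster()
    bst.load_model("my_model")  # same hard-coded name as in xgboost/train.py
    return bst.predict(dpred)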
def predict_step(datasource,
                 select,
                 result_table,
                 label_name,
                 model,
                 pai_table=None):
    """Run a TensorFlow prediction step: rebuild feature columns from the
    saved model's metadata, then predict with either a Keras model or a TF
    estimator and write the results into result_table."""
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    label_meta = model.get_meta("label")
    train_label_desc = label_meta.get_field_desc()[0] if label_meta else None
    train_label_name = train_label_desc.name if train_label_desc else None
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    is_pai = True if pai_table else False
    if is_pai:
        select = "SELECT * FROM %s" % pai_table

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, label_name)

    if is_pai:
        conn.close()
        conn = PaiIOConnection.from_table(pai_table)
        select = None

    selected_cols = result_column_names[0:-1]
    if train_label_idx >= 0:
        selected_cols = selected_cols[0:train_label_idx] + [
            train_label_name
        ] + selected_cols[train_label_idx:]

    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)
    predict_generator = db.db_generator(conn, select)

    pop_optimizer_and_loss(model_params)

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      label_name, conn, predict_generator, selected_cols)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, label_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
    conn.close()
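# create_predict_table returns the result columns without the training label,
# so when the input data still contains that label (train_label_idx >= 0) its
# name must be spliced back into the columns that db_generator will read.
# A worked example of the splice, using made-up column names:
def _demo_selected_cols():
    result_column_names = ["sepal_length", "sepal_width", "pred_class"]
    train_label_name = "class"
    train_label_idx = 1  # the label sat between the two features

    selected_cols = result_column_names[0:-1]  # drop the prediction column
    if train_label_idx >= 0:
        selected_cols = selected_cols[0:train_label_idx] + [
            train_label_name
        ] + selected_cols[train_label_idx:]
    assert selected_cols == ["sepal_length", "class", "sepal_width"]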
def test_main(self):
    ds = testing.get_datasource()
    original_sql = """SELECT * FROM iris.train
TO TRAIN xgboost.gbtree
WITH
    objective="multi:softprob",
    num_boost_round=20,
    num_class=3,
    validation.select="SELECT * FROM iris.test"
LABEL class
INTO iris.xgboost_train_model_test;
"""
    select = "SELECT * FROM iris.train"
    val_select = "SELECT * FROM iris.test"
    train_params = {"num_boost_round": 20}
    model_params = {"num_class": 3, "objective": "multi:softprob"}
    save_name = "iris.xgboost_train_model_test"
    class_name = "class"

    with temp_file.TemporaryDirectory(as_cwd=True):
        eval_result = train(datasource=ds,
                            original_sql=original_sql,
                            select=select,
                            validation_select=val_select,
                            estimator_string="xgboost.gbtree",
                            model_image="sqlflow:step",
                            feature_column_map=None,
                            label_column=NumericColumn(
                                FieldDesc(name=class_name)),
                            model_params=model_params,
                            train_params=train_params,
                            validation_params=None,
                            save=save_name,
                            load=None)
        self.assertLess(eval_result['train']['merror'][-1], 0.01)
        self.assertLess(eval_result['validate']['merror'][-1], 0.01)

    conn = db.connect_with_data_source(ds)
    pred_select = "SELECT * FROM iris.test"

    with temp_file.TemporaryDirectory(as_cwd=True):
        result_column_names, train_label_idx = create_predict_table(
            conn, select, "iris.predict_result_table",
            FieldDesc(name=class_name), "class")
        predict(ds, pred_select, "iris.predict_result_table",
                result_column_names, train_label_idx, save_name)

    self.assertEqual(
        self.get_table_row_count(conn, "iris.test"),
        self.get_table_row_count(conn, "iris.predict_result_table"))

    schema1 = self.get_table_schema(conn, "iris.test")
    schema2 = self.get_table_schema(conn, "iris.predict_result_table")
    self.assertEqual(len(schema1), len(schema2))
    for name in schema1:
        if name == 'class':
            self.assertEqual(schema2[name], "BIGINT")
            continue
        self.assertTrue(name in schema2)
        self.assertEqual(schema1[name], schema2[name])

    diff_schema = schema2.keys() - schema1.keys()
    self.assertEqual(len(diff_schema), 0)

    with temp_file.TemporaryDirectory(as_cwd=True):
        result_column_names = create_evaluate_table(
            conn, "iris.evaluate_result_table", ["accuracy_score"])
        evaluate(ds,
                 pred_select,
                 "iris.evaluate_result_table",
                 save_name,
                 label_name='class',
                 model_params={'validation.metrics': 'accuracy_score'},
                 result_column_names=result_column_names)
        eval_schema = self.get_table_schema(conn,
                                            "iris.evaluate_result_table")
        self.assertEqual(eval_schema.keys(),
                         set(['loss', 'accuracy_score']))

    with temp_file.TemporaryDirectory(as_cwd=True):
        feature_column_names = [
            "petal_width", "petal_length", "sepal_width", "sepal_length"
        ]
        create_explain_table(conn, EstimatorType.XGBOOST, "TreeExplainer",
                             "xgboost.gbtree", "iris.explain_result_table",
                             feature_column_names)
        explain(ds, select, "TreeExplainer", {"plot_type": "decision"},
                "iris.explain_result_table", save_name)
        explain_schema = self.get_table_schema(conn,
                                               "iris.explain_result_table")
        self.assertEqual(explain_schema.keys(), set(feature_column_names))

    with temp_file.TemporaryDirectory(as_cwd=True):
        create_explain_table(conn, EstimatorType.XGBOOST,
                             "XGBoostExplainer", "xgboost.gbtree",
                             "iris.explain_result_table_2",
                             feature_column_names)
        explain(ds, select, "XGBoostExplainer", {},
                "iris.explain_result_table_2", save_name)
        explain_schema = self.get_table_schema(
            conn, "iris.explain_result_table_2")
        self.assertEqual(explain_schema.keys(),
                         set(['feature', 'fscore', 'gain']))

    conn.close()
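# The assertions above lean on small helpers from the test base class;
# get_table_row_count, for instance, could look like the DB-API sketch
# below. This is a hypothetical implementation for illustration only, not
# the actual helper.
def _demo_get_table_row_count(conn, table):
    cursor = conn.cursor()
    cursor.execute("SELECT COUNT(*) FROM %s" % table)
    row_count = cursor.fetchone()[0]
    cursor.close()
    return row_count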
def submit_pai_predict(datasource, original_sql, select, model, label_name,
                       pred_params, result_table, user=""):
    """This function packs the needed params and resources into a tarball
    and submits a prediction task to PAI.

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO PREDICT" statement.
        select: string
            SQL statement to get prediction data set.
        model: string
            Model to load and do prediction.
        label_name: string
            Name of the label column if it does not exist in the select
            result.
        pred_params: dict
            Params for prediction, corresponding to the WITH clause.
        result_table: string
            The table name to save prediction result.
        user: string
            A string to identify the user, used to load the model from the
            user's directory.
    """
    params = dict(locals())

    # format resultTable name to "db.table" to let the codegen form a
    # submitting argument of format "odps://project/tables/table_name"
    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)

    model_metas = Model.load_metadata_from_db(datasource, model)
    model_type = model_metas.get_type()
    estimator = model_metas.get_meta("class_name")
    setup_predict_entry(params, model_type)

    train_label = model_metas.get_meta("label")
    if train_label is not None:
        train_label_desc = train_label.get_field_desc()[0]
    else:
        train_label_desc = None

    if pred_params is None:
        extra_result_cols = []
    else:
        extra_result_cols = pred_params.get("predict.extra_outputs", "")
        extra_result_cols = [
            c.strip() for c in extra_result_cols.split(",") if c.strip()
        ]

    with db.connect_with_data_source(datasource) as conn:
        result_column_names, train_label_idx = create_predict_table(
            conn, select, result_table, train_label_desc, label_name,
            extra_result_cols)

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    # TODO(typhoonzero): Do **NOT** create tmp table when the select statement
    # is like: "SELECT fields,... FROM table"
    with table_ops.create_tmp_tables_guard(select, datasource) as data_table:
        del params["label_name"]
        params["pai_table"] = data_table
        params["result_column_names"] = result_column_names
        params["train_label_idx"] = train_label_idx
        params["extra_result_cols"] = extra_result_cols

        if try_pai_local_run(params, oss_model_path):
            return

        with temp_file.TemporaryDirectory(prefix="sqlflow", dir="/tmp") as cwd:
            prepare_archive(cwd, estimator, oss_model_path, params)
            cmd = get_pai_predict_cmd(
                datasource, project, oss_model_path, model, data_table,
                result_table, model_type, pred_params,
                "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                "file://" + os.path.join(cwd, PARAMS_FILE))
            submit_pai_task(cmd, datasource)
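# dict(locals()) on the first line of submit_pai_predict snapshots every
# argument into a plain dict before any other local variable is assigned,
# which is how the full parameter set gets forwarded to the PAI entry
# script. A standalone illustration of the pattern; the function and
# argument names below are made up.
def _demo_snapshot_params(datasource, select, result_table, user=""):
    params = dict(locals())  # must run before other locals are assigned
    not_captured = "later locals do not leak into params"  # noqa: F841
    return params


assert _demo_snapshot_params("ds", "SELECT 1", "t") == {
    "datasource": "ds",
    "select": "SELECT 1",
    "result_table": "t",
    "user": "",
}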