def predict_step(datasource, select, data_table, result_table, label_column,
                 oss_model_path):
    """PAI TensorFlow prediction wrapper
    This function does some preparation for the local prediction, say,
    downloading the model from OSS, extracting metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        label_column: prediction label column
        oss_model_path: the model path on OSS
    """
    try:
        tf.enable_eager_execution()
    except:  # noqa: E722
        pass
    (estimator, feature_column_names, feature_column_names_map, feature_metas,
     label_meta, model_params,
     feature_columns_code) = oss.load_metas(oss_model_path,
                                            "tensorflow_model_desc")
    fc_map_ir = feature_columns_code
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.TENSORFLOW)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])

    is_estimator = is_tf_estimator(import_model(estimator))

    # Keras single node is using h5 format to save the model, no need to deal
    # with export model format. Keras distributed mode will use estimator, so
    # this is also needed.
    model_local_dir = oss_model_path.split("/")[-1]
    if is_estimator:
        oss.load_file(oss_model_path, "exported_path")
        # NOTE(typhoonzero): directory "model_save" is hardcoded in
        # codegen/tensorflow/codegen.go
        oss.load_dir("%s/%s" % (oss_model_path, model_local_dir))
    else:
        oss.load_dir(os.path.join(oss_model_path, "model_save"))

    _predict(datasource=datasource,
             estimator_string=estimator,
             select=select,
             result_table=result_table,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_column_names_map=feature_column_names_map,
             train_label_name=label_meta["feature_name"],
             result_col_name=label_column,
             feature_metas=feature_metas,
             model_params=model_params,
             save=model_local_dir,
             batch_size=1,
             pai_table=data_table)
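
# How the local checkpoint directory above is derived: the estimator
# checkpoint is loaded into (and later read from) a directory named after
# the last segment of the OSS model path, so the path's basename doubles as
# the local save directory. A minimal sketch; the path value is hypothetical.
def _example_model_local_dir():
    oss_model_path = "oss://bucket/project/my_dnn_model"
    model_local_dir = oss_model_path.split("/")[-1]
    assert model_local_dir == "my_dnn_model"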
def evaluate_step(datasource, select, data_table, result_table,
                  oss_model_path, metrics):
    """PAI TensorFlow evaluate wrapper
    This function does some preparation for the local evaluation, say,
    downloading the model from OSS, extracting metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        oss_model_path: the model path on OSS
        metrics: metrics to evaluate
    """
    (estimator, feature_column_names, feature_column_names_map, feature_metas,
     label_meta, model_params,
     feature_columns_code) = oss.load_metas(oss_model_path,
                                            "tensorflow_model_desc")
    fc_map_ir = feature_columns_code
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.TENSORFLOW)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])

    # NOTE(typhoonzero): No need to eval model_params["optimizer"] and
    # model_params["loss"] because predicting does not need these parameters.

    is_estimator = is_tf_estimator(import_model(estimator))

    # Keras single node is using h5 format to save the model, no need to deal
    # with export model format. Keras distributed mode will use estimator, so
    # this is also needed.
    model_name = oss_model_path.split("/")[-1]
    if is_estimator:
        oss.load_file(oss_model_path, "exported_path")
        # NOTE(typhoonzero): directory "model_save" is hardcoded in
        # codegen/tensorflow/codegen.go
        oss.load_dir("%s/%s" % (oss_model_path, model_name))
    else:
        oss.load_dir(os.path.join(oss_model_path, "model_save"))

    _evaluate(datasource=datasource,
              estimator_string=estimator,
              select=select,
              result_table=result_table,
              feature_columns=feature_columns,
              feature_column_names=feature_column_names,
              feature_metas=feature_metas,
              label_meta=label_meta,
              model_params=model_params,
              validation_metrics=metrics,
              save="model_save",
              batch_size=1,
              validation_steps=None,
              verbose=0,
              pai_table=data_table)
def predict(datasource, select, data_table, result_table, label_column,
            oss_model_path):
    """PAI TensorFlow prediction wrapper
    This function does some preparation for the local prediction, say,
    downloading the model from OSS, extracting metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        label_column: prediction label column
        oss_model_path: the model path on OSS
    """
    try:
        tf.enable_eager_execution()
    except:  # noqa: E722
        pass
    (estimator, feature_column_names, feature_column_names_map, feature_metas,
     label_meta, model_params,
     feature_columns_code) = oss.load_metas(oss_model_path,
                                            "tensorflow_model_desc")
    feature_columns = eval(feature_columns_code)

    # NOTE(typhoonzero): No need to eval model_params["optimizer"] and
    # model_params["loss"] because predicting does not need these parameters.

    is_estimator = is_tf_estimator(import_model(estimator))

    # Keras single node is using h5 format to save the model, no need to deal
    # with export model format. Keras distributed mode will use estimator, so
    # this is also needed.
    if is_estimator:
        oss.load_file(oss_model_path, "exported_path")
        # NOTE(typhoonzero): directory "model_save" is hardcoded in
        # codegen/tensorflow/codegen.go
        oss.load_dir("%s/model_save" % oss_model_path)
    else:
        oss.load_file(oss_model_path, "model_save")

    _predict(datasource=datasource,
             estimator_string=estimator,
             select=select,
             result_table=result_table,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_column_names_map=feature_column_names_map,
             train_label_name=label_meta["feature_name"],
             result_col_name=label_column,
             feature_metas=feature_metas,
             model_params=model_params,
             save="model_save",
             batch_size=1,
             pai_table=data_table)
def explain_step(datasource, select, data_table, result_table, label_column,
                 oss_model_path):
    """PAI TensorFlow explain wrapper
    This function does some preparation for the local explanation, say,
    downloading the model from OSS, extracting metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save the explanation result
        label_column: name of the label column
        oss_model_path: the model path on OSS
    """
    try:
        tf.enable_eager_execution()
    except Exception as e:
        sys.stderr.write("warning: failed to enable_eager_execution: %s" % e)
    (estimator, feature_column_names, feature_column_names_map, feature_metas,
     label_meta, model_params,
     feature_columns_code) = oss.load_metas(oss_model_path,
                                            "tensorflow_model_desc")
    fc_map_ir = feature_columns_code
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.TENSORFLOW)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])

    # NOTE(typhoonzero): No need to eval model_params["optimizer"] and
    # model_params["loss"] because predicting does not need these parameters.

    is_estimator = is_tf_estimator(import_model(estimator))

    # Keras single node is using h5 format to save the model, no need to deal
    # with export model format. Keras distributed mode will use estimator, so
    # this is also needed.
    model_name = oss_model_path.split("/")[-1]
    if is_estimator:
        oss.load_file(oss_model_path, "exported_path")
        # NOTE(typhoonzero): directory "model_save" is hardcoded in
        # codegen/tensorflow/codegen.go
        oss.load_dir("%s/%s" % (oss_model_path, model_name))
    else:
        oss.load_dir(os.path.join(oss_model_path, "model_save"))

    # (TODO: lhw) use oss to store result image
    _explain(datasource=datasource,
             estimator_string=estimator,
             select=select,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_metas=feature_metas,
             label_meta=label_meta,
             model_params=model_params,
             save="model_save",
             result_table=result_table,
             pai_table=data_table,
             oss_dest=None,
             oss_ak=None,
             oss_sk=None,
             oss_endpoint=None,
             oss_bucket_name=None)
def explain_step(datasource, select, data_table, explainer, result_table,
                 label_column, oss_model_path):
    """Do XGBoost model explanation, this function uses selected data to
    explain the model stored at oss_model_path

    Args:
        datasource: the datasource to load explain data
        select: SQL statement to get the data set
        data_table: tmp table to save the explain data
        explainer: name of the explainer to use
        result_table: table to store the explanation result
        label_column: name of the label column
        oss_model_path: path to the model to be explained
    """
    # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
    # in xgboost/train.py
    oss.load_file(oss_model_path, "my_model")
    (estimator, model_params, train_params, feature_field_meta,
     feature_column_names, label_field_meta,
     fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    summary_params = dict()
    for k in model_params:
        if k.startswith("summary."):
            summary_key = k.replace("summary.", "")
            summary_params[summary_key] = model_params[k]

    explain_xgb(
        datasource=datasource,
        select=select,
        feature_field_meta=feature_field_meta,
        feature_column_names=feature_column_names,
        label_meta=label_field_meta,
        summary_params=summary_params,
        explainer=explainer,
        result_table=result_table,
        is_pai=True,
        pai_explain_table=data_table,
        # (TODO:lhw) save/load explain result storage info into/from FLAGS
        oss_dest="",
        oss_ak="",
        oss_sk="",
        oss_endpoint="",
        oss_bucket_name="",
        transform_fn=transform_fn,
        feature_column_code=fc_map_ir)
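
# The "summary." prefix convention above is easiest to see with a concrete
# example: any model attribute whose key starts with "summary." is stripped
# of the prefix and forwarded to the explainer as a summary-plot parameter.
# A minimal sketch; the attribute names and values here are hypothetical.
def _example_summary_params():
    model_params = {"summary.plot_type": "dot", "summary.alpha": 1,
                    "eta": 0.3}
    summary_params = dict()
    for k in model_params:
        if k.startswith("summary."):
            summary_params[k.replace("summary.", "")] = model_params[k]
    assert summary_params == {"plot_type": "dot", "alpha": 1}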
def explain(datasource, select, data_table, result_table, label_column,
            oss_model_path):
    """Do XGBoost model explanation, this function uses selected data to
    explain the model stored at oss_model_path

    Args:
        datasource: the datasource to load explain data
        select: SQL statement to get the data set
        data_table: tmp table to save the explain data
        result_table: table to store the explanation result
        label_column: name of the label column
        oss_model_path: path to the model to be explained
    """
    # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
    # in xgboost/train.py
    oss.load_file(oss_model_path, "my_model")
    (estimator, model_params, train_params, feature_field_meta,
     feature_column_names, label_field_meta,
     feature_column_code) = oss.load_metas(oss_model_path,
                                           "xgboost_model_desc")

    feature_column_transformers = eval('[{}]'.format(feature_column_code))
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *feature_column_transformers)

    explain_xgb(
        datasource=datasource,
        select=select,
        feature_field_meta=feature_field_meta,
        feature_column_names=feature_column_names,
        label_meta=label_field_meta,
        summary_params={},
        result_table=result_table,
        is_pai=True,
        pai_explain_table=data_table,
        hdfs_namenode_addr="",
        hive_location="",
        hdfs_user="",
        hdfs_pass="",
        # (TODO:lhw) save/load explain result storage info into/from FLAGS
        oss_dest="",
        oss_ak="",
        oss_sk="",
        oss_endpoint="",
        oss_bucket_name="",
        transform_fn=transform_fn,
        feature_column_code=feature_column_code)
def predict(datasource, select, data_table, result_table, label_column,
            oss_model_path):
    """PAI XGBoost prediction wrapper
    This function does some preparation for the local prediction, say,
    downloading the model from OSS, extracting metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        label_column: prediction label column
        oss_model_path: the model path on OSS
    """
    # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
    # in xgboost/train.py
    oss.load_file(oss_model_path, "my_model")
    (estimator, model_params, train_params, feature_metas,
     feature_column_names, label_meta,
     feature_column_code) = oss.load_metas(oss_model_path,
                                           "xgboost_model_desc")

    pred_label_meta = copy.copy(label_meta)
    pred_label_meta["feature_name"] = label_column

    feature_column_transformers = eval('[{}]'.format(feature_column_code))
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *feature_column_transformers)

    pred(datasource=datasource,
         select=select,
         feature_metas=feature_metas,
         feature_column_names=feature_column_names,
         train_label_meta=label_meta,
         pred_label_meta=pred_label_meta,
         result_table=result_table,
         is_pai=True,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         pai_table=data_table,
         model_params=model_params,
         train_params=train_params,
         transform_fn=transform_fn,
         feature_column_code=feature_column_code)
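
# The eval('[{}]'.format(...)) trick above wraps the comma-separated
# transformer expressions stored in feature_column_code in brackets so that
# evaluating the string yields a Python list. A sketch of the mechanics using
# plain builtins; in the real metadata the string holds xgboost_extended
# feature column constructor calls, not builtins.
def _example_eval_feature_column_code():
    feature_column_code = "abs(-3), int('42')"
    transformers = eval('[{}]'.format(feature_column_code))
    assert transformers == [3, 42]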
def predict_step(datasource, select, data_table, result_table, label_column,
                 oss_model_path):
    """PAI XGBoost prediction wrapper
    This function does some preparation for the local prediction, say,
    downloading the model from OSS, extracting metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        label_column: prediction label column
        oss_model_path: the model path on OSS
    """
    # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
    # in xgboost/train.py
    oss.load_file(oss_model_path, "my_model")
    (estimator, model_params, train_params, feature_metas,
     feature_column_names, label_meta,
     fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")

    pred_label_meta = copy.copy(label_meta)
    pred_label_meta["feature_name"] = label_column

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    pred(datasource=datasource,
         select=select,
         feature_metas=feature_metas,
         feature_column_names=feature_column_names,
         train_label_meta=label_meta,
         pred_label_meta=pred_label_meta,
         result_table=result_table,
         is_pai=True,
         pai_table=data_table,
         model_params=model_params,
         train_params=train_params,
         transform_fn=transform_fn,
         feature_column_code=fc_map_ir)
def predict(datasource,
            select,
            result_table,
            label_name,
            model,
            pai_table="",
            oss_model_path=""):
    """PAI XGBoost prediction wrapper
    This function does some preparation for the local prediction, say,
    downloading the model from OSS, extracting metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        result_table: table to save prediction result
        label_name: prediction label column
        model: a Model object or the name of the model to load
        pai_table: tmp table which holds the data from select (PAI only)
        oss_model_path: the model path on OSS (PAI only)
    """
    is_pai = pai_table != ""
    if is_pai:
        # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
        # in xgboost/train.py
        oss.load_file(oss_model_path, "my_model")
        (estimator, model_params, train_params, feature_metas,
         feature_column_names, train_label_desc,
         fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")
    else:
        if isinstance(model, six.string_types):
            model = Model.load_from_db(datasource, model)
        else:
            assert isinstance(
                model, Model), "not supported model type %s" % type(model)

        model_params = model.get_meta("attributes")
        fc_map_ir = model.get_meta("features")
        train_label_desc = model.get_meta("label").get_field_desc()[0]

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, label_name)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=None,
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            raw_data_dir=raw_data_dir)  # NOTE: default to use external memory

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            feature_file_name = os.path.join(
                tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_predict_result(preds, result_table, result_column_names,
                                  train_label_idx, feature_file_name, conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()
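
# A minimal sketch of the Booster load/predict cycle used above, without
# SQLFlow's dataset plumbing. It assumes the model file "my_model" (the name
# hard coded in xgboost/train.py) and one SVM-format batch file, as produced
# by xgb_dataset, already exist on disk; the batch file name is hypothetical.
def _example_booster_predict(batch_file="predict.txt_0"):
    bst = xgb.Booster()
    bst.load_model("my_model")
    dmatrix = xgb.DMatrix(batch_file)
    return bst.predict(dmatrix)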
def explain(datasource,
            select,
            explainer,
            model_params,
            result_table,
            model,
            pai_table="",
            oss_model_path=""):
    """Do XGBoost model explanation, this function uses selected data to
    explain the model stored at oss_model_path or in the model database

    Args:
        datasource: the datasource to load explain data
        select: SQL statement to get the data set
        explainer: name of the explainer; "XGBoostExplainer" uses XGBoost's
            native feature importance, "" or "TreeExplainer" uses SHAP
        model_params: explain parameters; keys prefixed with "summary." are
            passed to the SHAP summary plot
        result_table: table to store the explanation result
        model: a Model object or the name of the model to load
        pai_table: tmp table which holds the explain data (PAI only)
        oss_model_path: path to the model to be explained (PAI only)
    """
    if model_params is None:
        model_params = {}
    summary_params = dict()
    for k in model_params:
        if k.startswith("summary."):
            summary_key = k.replace("summary.", "")
            summary_params[summary_key] = model_params[k]

    is_pai = pai_table != ""
    if is_pai:
        # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
        # in xgboost/train.py
        oss.load_file(oss_model_path, "my_model")
        (estimator, model_params, train_params, feature_field_meta,
         feature_column_names, label_desc,
         fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")
        label_meta = label_desc.to_dict(dtype_to_string=True)
    else:
        if isinstance(model, six.string_types):
            model = Model.load_from_db(datasource, model)
        else:
            assert isinstance(
                model, Model), "not supported model type %s" % type(model)

        fc_map_ir = model.get_meta("features")
        label_meta = model.get_meta("label").get_field_desc()[0].to_dict(
            dtype_to_string=True)

    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(fc_map_ir, EstimatorType.XGBOOST)
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    dataset = xgb_shap_dataset(datasource, select, feature_column_names,
                               label_meta, feature_metas, transform_fn)

    bst = xgb.Booster()
    bst.load_model("my_model")

    if explainer == "XGBoostExplainer":
        xgb_native_explain(bst, datasource, result_table)
    else:
        # When explainer is "" or "TreeExplainer", use SHAP by default.
        shap_explain(bst, datasource, dataset, summary_params, result_table)
def evaluate(datasource,
             select,
             result_table,
             model,
             label_name=None,
             model_params=None,
             pai_table="",
             oss_model_path=""):
    """PAI XGBoost evaluation wrapper
    This function does some preparation for the local evaluation, say,
    downloading the model from OSS, extracting metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        result_table: table to save evaluation result
        model: a Model object or the name of the model to load
        label_name: name of the label column
        model_params: evaluation parameters, e.g. validation.metrics
        pai_table: tmp table which holds the data from select (PAI only)
        oss_model_path: the model path on OSS (PAI only)
    """
    if model_params is None:
        model_params = {}
    validation_metrics = model_params.get("validation.metrics",
                                          "accuracy_score")
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    is_pai = pai_table != ""
    if is_pai:
        assert oss_model_path != ""
        # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
        # in xgboost/train.py
        oss.load_file(oss_model_path, "my_model")
        (estimator, model_params, train_params, feature_metas,
         feature_column_names, train_label_desc,
         fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")
    else:
        if isinstance(model, six.string_types):
            model = Model.load_from_db(datasource, model)
        else:
            assert isinstance(
                model, Model), "not supported model type %s" % type(model)

        model_params = model.get_meta("attributes")
        fc_map_ir = model.get_meta("features")
        train_label_desc = model.get_meta("label").get_field_desc()[0]

    if label_name:
        train_label_desc.name = label_name

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names = create_evaluate_table(conn, result_table,
                                                validation_metrics)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=train_label_desc.to_dict(dtype_to_string=True),
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn)

        for i, pred_dmatrix in enumerate(dpred):
            feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name, train_label_desc,
                                   result_table, result_column_names,
                                   validation_metrics, conn)

    conn.close()
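
# How the "validation.metrics" attribute above is parsed: a single
# comma-separated string becomes a list of metric names, with accuracy_score
# as the default when the attribute is absent. A minimal sketch; the second
# metric name here is a hypothetical value, not one the source guarantees.
def _example_validation_metrics():
    model_params = {
        "validation.metrics": "accuracy_score, mean_absolute_error"
    }
    metrics = model_params.get("validation.metrics", "accuracy_score")
    metrics = [m.strip() for m in metrics.split(",")]
    assert metrics == ["accuracy_score", "mean_absolute_error"]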