def predict_step(datasource, select, data_table, result_table, label_column,
                 oss_model_path):
    """PAI TensorFlow prediction wrapper

    This function does some preparation for the local prediction, say,
    downloading the model from OSS, extracting metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        label_column: prediction label column
        oss_model_path: the model path on OSS
    """
    try:
        tf.enable_eager_execution()
    except:  # noqa: E722
        pass
    (estimator, feature_column_names, feature_column_names_map, feature_metas,
     label_meta, model_params,
     feature_columns_code) = oss.load_metas(oss_model_path,
                                            "tensorflow_model_desc")
    fc_map_ir = feature_columns_code
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.TENSORFLOW)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])

    is_estimator = is_tf_estimator(import_model(estimator))

    # Keras single node is using h5 format to save the model, no need to deal
    # with export model format. Keras distributed mode will use estimator, so
    # this is also needed.
    model_local_dir = oss_model_path.split("/")[-1]
    if is_estimator:
        oss.load_file(oss_model_path, "exported_path")
        # NOTE(typhoonzero): directory "model_save" is hardcoded in
        # codegen/tensorflow/codegen.go
        oss.load_dir("%s/%s" % (oss_model_path, model_local_dir))
    else:
        oss.load_dir(os.path.join(oss_model_path, "model_save"))

    _predict(datasource=datasource,
             estimator_string=estimator,
             select=select,
             result_table=result_table,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_column_names_map=feature_column_names_map,
             train_label_name=label_meta["feature_name"],
             result_col_name=label_column,
             feature_metas=feature_metas,
             model_params=model_params,
             save=model_local_dir,
             batch_size=1,
             pai_table=data_table)
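
# A minimal usage sketch for the wrapper above. The DSN, table names and OSS
# path are hypothetical; in practice this step is generated and invoked by
# the SQLFlow PAI submitter rather than called by hand:
#
#   predict_step(
#       datasource="maxcompute://<ak>:<sk>@service.example.com/api?curr_project=demo",
#       select="SELECT * FROM demo.iris_test",
#       data_table="demo.tmp_predict_input",
#       result_table="demo.iris_predict_result",
#       label_column="class",
#       oss_model_path="oss://sqlflow-models/demo/my_dnn_model/")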
def pai_download_table_data_worker(dname, feature_metas, feature_column_names,
                                   label_meta, pai_table, slice_id,
                                   slice_count, feature_column_code,
                                   raw_data_dir):
    import runtime.xgboost as xgboost_extended
    if isinstance(feature_column_code, dict):
        # NOTE(typhoonzero): feature_column_code is a dict of
        # runtime.feature.column in refactored step code.
        feature_column_transformers = compile_ir_feature_columns(
            feature_column_code, EstimatorType.XGBOOST)
        transform_fn = \
            xgboost_extended.feature_column.ComposedColumnTransformer(
                feature_column_names,
                *feature_column_transformers["feature_columns"])
    else:
        feature_column_transformers = eval('[{}]'.format(feature_column_code))
        transform_fn = \
            xgboost_extended.feature_column.ComposedColumnTransformer(
                feature_column_names, *feature_column_transformers)

    conn = PaiIOConnection.from_table(pai_table, slice_id, slice_count)
    gen = db.db_generator(conn, None, label_meta=label_meta)()
    selected_cols = db.selected_cols(conn, None)
    filename = "{}/{}.txt".format(dname, slice_id)
    dump_dmatrix(filename,
                 gen,
                 feature_column_names,
                 feature_metas,
                 label_meta,
                 selected_cols,
                 transform_fn=transform_fn,
                 raw_data_dir=raw_data_dir)
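
# Sketch: fanning the download worker out over table slices. On PAI the
# slice_id/slice_count normally come from the worker's task index; the pool
# below only illustrates the calling contract and every value (table name,
# slice count, metadata dicts) is hypothetical:
#
#   from multiprocessing import Pool
#
#   slice_count = 4
#   args = [(dname, feature_metas, feature_column_names, label_meta,
#            "demo.train_table", slice_id, slice_count, fc_map_ir,
#            raw_data_dir) for slice_id in range(slice_count)]
#   with Pool(slice_count) as pool:
#       pool.starmap(pai_download_table_data_worker, args)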
def evaluate_step(datasource, select, data_table, result_table,
                  oss_model_path, metrics):
    """PAI TensorFlow evaluate wrapper

    This function does some preparation for the local evaluation, say,
    downloading the model from OSS, extracting metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save evaluation result
        oss_model_path: the model path on OSS
        metrics: metrics to evaluate
    """
    (estimator, feature_column_names, feature_column_names_map, feature_metas,
     label_meta, model_params,
     feature_columns_code) = oss.load_metas(oss_model_path,
                                            "tensorflow_model_desc")
    fc_map_ir = feature_columns_code
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.TENSORFLOW)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])

    # NOTE(typhoonzero): No need to eval model_params["optimizer"] and
    # model_params["loss"] because evaluating does not need these parameters.
    is_estimator = is_tf_estimator(import_model(estimator))

    # Keras single node is using h5 format to save the model, no need to deal
    # with export model format. Keras distributed mode will use estimator, so
    # this is also needed.
    model_name = oss_model_path.split("/")[-1]
    if is_estimator:
        oss.load_file(oss_model_path, "exported_path")
        # NOTE(typhoonzero): directory "model_save" is hardcoded in
        # codegen/tensorflow/codegen.go
        oss.load_dir("%s/%s" % (oss_model_path, model_name))
    else:
        oss.load_dir(os.path.join(oss_model_path, "model_save"))

    _evaluate(datasource=datasource,
              estimator_string=estimator,
              select=select,
              result_table=result_table,
              feature_columns=feature_columns,
              feature_column_names=feature_column_names,
              feature_metas=feature_metas,
              label_meta=label_meta,
              model_params=model_params,
              validation_metrics=metrics,
              save="model_save",
              batch_size=1,
              validation_steps=None,
              verbose=0,
              pai_table=data_table)
def compile_fc(self, fc, model_type):
    # Test helper: compile a single IR feature column and return the
    # resulting runtime feature-column object.
    fc_dict = {"feature_columns": [fc]}
    rt_fc_dict = compile_ir_feature_columns(fc_dict, model_type)
    self.assertEqual(len(rt_fc_dict), 1)
    self.assertTrue("feature_columns" in rt_fc_dict)
    fc_list = rt_fc_dict.get("feature_columns")
    self.assertEqual(len(fc_list), 1)
    return fc_list[0]
def evaluate_step(datasource, select, result_table, model, label_name,
                  model_params, pai_table=None):
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    if model_params is None:
        model_params = {}
    validation_metrics = model_params.get("validation.metrics", "Accuracy")
    validation_metrics = [m.strip() for m in validation_metrics.split(',')]
    validation_steps = model_params.get("validation.steps", None)
    batch_size = model_params.get("validation.batch_size", 1)
    verbose = model_params.get("validation.verbose", 0)

    conn = db.connect_with_data_source(datasource)
    create_evaluate_table(conn, result_table, validation_metrics)
    conn.close()

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())
    train_label_desc.name = label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    _evaluate(datasource=datasource,
              estimator_string=estimator_string,
              select=select,
              result_table=result_table,
              feature_columns=feature_columns,
              feature_column_names=feature_column_names,
              feature_metas=feature_metas,
              label_meta=label_meta,
              model_params=model_params,
              validation_metrics=validation_metrics,
              save=save,
              batch_size=batch_size,
              validation_steps=validation_steps,
              verbose=verbose,
              pai_table=pai_table)
def explain_step(datasource, select, data_table, result_table, label_column,
                 oss_model_path):
    """PAI TensorFlow explain wrapper: download the model from OSS, extract
    metadata, then run the local explanation."""
    try:
        tf.enable_eager_execution()
    except Exception as e:
        sys.stderr.write("warning: failed to enable_eager_execution: %s" % e)

    (estimator, feature_column_names, feature_column_names_map, feature_metas,
     label_meta, model_params,
     feature_columns_code) = oss.load_metas(oss_model_path,
                                            "tensorflow_model_desc")
    fc_map_ir = feature_columns_code
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.TENSORFLOW)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])

    # NOTE(typhoonzero): No need to eval model_params["optimizer"] and
    # model_params["loss"] because explaining does not need these parameters.
    is_estimator = is_tf_estimator(import_model(estimator))

    # Keras single node is using h5 format to save the model, no need to deal
    # with export model format. Keras distributed mode will use estimator, so
    # this is also needed.
    model_name = oss_model_path.split("/")[-1]
    if is_estimator:
        oss.load_file(oss_model_path, "exported_path")
        # NOTE(typhoonzero): directory "model_save" is hardcoded in
        # codegen/tensorflow/codegen.go
        oss.load_dir("%s/%s" % (oss_model_path, model_name))
    else:
        oss.load_dir(os.path.join(oss_model_path, "model_save"))

    # TODO(lhw): use oss to store result image
    _explain(datasource=datasource,
             estimator_string=estimator,
             select=select,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_metas=feature_metas,
             label_meta=label_meta,
             model_params=model_params,
             save="model_save",
             result_table=result_table,
             pai_table=data_table,
             oss_dest=None,
             oss_ak=None,
             oss_sk=None,
             oss_endpoint=None,
             oss_bucket_name=None)
def explain_step(datasource, select, data_table, explainer, result_table,
                 label_column, oss_model_path):
    """XGBoost model explanation. This function uses the selected data to
    explain the model stored at oss_model_path.

    Args:
        datasource: the datasource to load explain data
        select: SQL statement to get the data set
        data_table: tmp table to save the explain data
        explainer: the explainer to use, e.g. TreeExplainer
        result_table: table to store the explanation result
        label_column: name of the label column
        oss_model_path: path to the model to be explained
    """
    # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
    # in xgboost/train.py
    oss.load_file(oss_model_path, "my_model")
    (estimator, model_params, train_params, feature_field_meta,
     feature_column_names, label_field_meta,
     fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    summary_params = dict()
    for k in model_params:
        if k.startswith("summary."):
            summary_key = k.replace("summary.", "")
            summary_params[summary_key] = model_params[k]

    explain_xgb(
        datasource=datasource,
        select=select,
        feature_field_meta=feature_field_meta,
        feature_column_names=feature_column_names,
        label_meta=label_field_meta,
        summary_params=summary_params,
        explainer=explainer,
        result_table=result_table,
        is_pai=True,
        pai_explain_table=data_table,
        # TODO(lhw): save/load explain result storage info into/from FLAGS
        oss_dest="",
        oss_ak="",
        oss_sk="",
        oss_endpoint="",
        oss_bucket_name="",
        transform_fn=transform_fn,
        feature_column_code=fc_map_ir)
def predict_step(datasource, select, data_table, result_table, label_column,
                 oss_model_path):
    """PAI XGBoost prediction wrapper

    This function does some preparation for the local prediction, say,
    downloading the model from OSS, extracting metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        label_column: prediction label column
        oss_model_path: the model path on OSS
    """
    # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
    # in xgboost/train.py
    oss.load_file(oss_model_path, "my_model")
    (estimator, model_params, train_params, feature_metas,
     feature_column_names, label_meta,
     fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")

    pred_label_meta = copy.copy(label_meta)
    pred_label_meta["feature_name"] = label_column

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    pred(datasource=datasource,
         select=select,
         feature_metas=feature_metas,
         feature_column_names=feature_column_names,
         train_label_meta=label_meta,
         # pass the renamed copy so the result column uses label_column
         # (the original passed label_meta here, leaving pred_label_meta
         # unused, which looks like a bug)
         pred_label_meta=pred_label_meta,
         result_table=result_table,
         is_pai=True,
         pai_table=data_table,
         model_params=model_params,
         train_params=train_params,
         transform_fn=transform_fn,
         feature_column_code=fc_map_ir)
def predict(datasource, select, result_table, result_column_names,
            train_label_idx, model, extra_result_cols=[], pai_table=None):
    """Do prediction using a trained XGBoost model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        result_column_names (list[str]): the column names of result_table.
        train_label_idx (int): the index of the training label among the
            selected columns, or -1 if the label is not selected.
        model (Model|str): the model object or where to load the model.
        extra_result_cols (list[str]): extra columns to write to the result.
        pai_table (str): the PAI input table name when running on PAI.
    """
    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        # NOTE(typhoonzero): must run Model.load_from_db in a temp
        # directory, calling pyodps in current directory on PAI
        # workers will cause paiio fails.
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    model_params = model.get_meta("attributes")
    fc_map_ir = model.get_meta("features")
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    is_pai = True if pai_table else False
    if is_pai:
        conn = PaiIOConnection.from_table(pai_table)
    else:
        conn = db.connect_with_data_source(datasource)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(datasource=datasource,
                            fn=pred_fn,
                            dataset_sql=select,
                            feature_metas=feature_metas,
                            feature_column_names=feature_column_names,
                            label_meta=None,
                            cache=True,
                            batch_size=10000,
                            transform_fn=transform_fn,
                            raw_data_dir=raw_data_dir,
                            is_pai=is_pai,
                            pai_table=pai_table,
                            pai_single_file=True,
                            feature_column_code=fc_map_ir)

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            if is_pai:
                feature_file_name = os.path.join(tmp_dir_name,
                                                 "predict.txt.raw")
            else:
                feature_file_name = os.path.join(
                    tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_predict_result(preds, result_table, result_column_names,
                                  train_label_idx, feature_file_name, conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()
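
# Calling-convention sketch (hypothetical names): result_column_names and
# train_label_idx are expected to come from create_predict_table, which the
# step wrappers run before delegating here:
#
#   conn = db.connect_with_data_source(datasource)
#   result_column_names, train_label_idx = create_predict_table(
#       conn, "SELECT * FROM iris.test", "iris.predict_result",
#       train_label_desc, "class")
#   conn.close()
#   predict(datasource, "SELECT * FROM iris.test", "iris.predict_result",
#           result_column_names, train_label_idx,
#           "sqlflow_models.my_xgb_model")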
def train(original_sql,
          model_image,
          estimator_string,
          datasource,
          select,
          validation_select,
          model_params,
          train_params,
          feature_column_map,
          label_column,
          save,
          load=None):
    """
    Train, evaluate and save the XGBoost model locally.

    Args:
        original_sql (str): the original SQL statement.
        model_image (str): the model repo docker image.
        estimator_string (str): the XGBoost booster type like xgboost.gbtree.
        datasource (str): the database connection URI.
        select (str): the SQL statement for training.
        validation_select (str): the SQL statement for evaluation.
        model_params (dict): the XGBoost model parameters.
        train_params (dict): the training parameters, can have
            disk_cache(bool), batch_size(int), epoch(int) settings
            in the dict.
        feature_column_map (dict): the feature column map to do derivation.
        label_column (FeatureColumn): the label column.
        save (str): the table name to save the trained model and meta.
        load (str): the table name to load the pretrained model.

    Returns:
        A dict which indicates the evaluation result.
    """
    conn = db.connect_with_data_source(datasource)
    fc_map_ir, fc_label_ir = infer_feature_columns(conn,
                                                   select,
                                                   feature_column_map,
                                                   label_column,
                                                   n=1000)
    fc_map = compile_ir_feature_columns(fc_map_ir, EstimatorType.XGBOOST)
    feature_column_list = fc_map["feature_columns"]
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])
    label_meta = label_column.get_field_desc()[0].to_dict()

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *feature_column_list)

    disk_cache = False
    batch_size = None
    epoch = 1
    if "disk_cache" in train_params:
        disk_cache = train_params.pop("disk_cache")
    if "batch_size" in train_params:
        batch_size = train_params.pop("batch_size")
    if "epoch" in train_params:
        epoch = train_params.pop("epoch")

    def build_dataset(fn, slct):
        return xgb_dataset(datasource,
                           fn,
                           slct,
                           feature_metas,
                           feature_column_names,
                           label_meta,
                           cache=disk_cache,
                           batch_size=batch_size,
                           epoch=epoch,
                           transform_fn=transform_fn)

    file_name = "my_model"
    if load:
        Model.load_from_db(datasource, load)
        bst = xgb.Booster()
        bst.load_model(file_name)
    else:
        bst = None

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        train_fn = os.path.join(tmp_dir_name, 'train.txt')
        val_fn = os.path.join(tmp_dir_name, 'val.txt')
        train_dataset = build_dataset(train_fn, select)
        if validation_select:
            val_dataset = build_dataset(val_fn, validation_select)
        else:
            val_dataset = None

        eval_result = dict()
        watchlist = [None]
        if val_dataset:
            # The `xgboost.train` API only accepts the XGBoost DMatrix
            # object as the training or validation dataset, so we should
            # convert the generator to DMatrix.
            if isinstance(val_dataset, types.GeneratorType):
                val_dataset = list(val_dataset)[0]
            watchlist.append((val_dataset, "validate"))

        for per_batch_dmatrix in train_dataset:
            watchlist[0] = (per_batch_dmatrix, "train")
            bst = xgb.train(model_params,
                            per_batch_dmatrix,
                            evals=watchlist,
                            evals_result=eval_result,
                            xgb_model=bst,
                            **train_params)
            print("Evaluation result: %s" % eval_result)

    meta = collect_metadata(original_sql=original_sql,
                            select=select,
                            validation_select=validation_select,
                            model_repo_image=model_image,
                            class_name=estimator_string,
                            attributes=model_params,
                            features=fc_map_ir,
                            label=fc_label_ir,
                            evaluation=eval_result,
                            num_workers=1)

    save_model_to_local_file(bst, model_params, file_name)
    model = Model(EstimatorType.XGBOOST, meta)
    model.save_to_db(datasource, save)
    return eval_result
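
# Usage sketch for local training (hypothetical DSN, tables and parameters;
# feature_column_map and label_column come from SQLFlow's COLUMN-clause IR).
# Note that num_boost_round is forwarded to xgboost.train via **train_params,
# while batch_size/epoch/disk_cache are popped off first:
#
#   eval_result = train(
#       original_sql="SELECT * FROM iris.train TO TRAIN xgboost.gbtree ...",
#       model_image="sqlflow/sqlflow",
#       estimator_string="xgboost.gbtree",
#       datasource="mysql://root:root@tcp(127.0.0.1:3306)/",
#       select="SELECT * FROM iris.train",
#       validation_select="SELECT * FROM iris.test",
#       model_params={"objective": "multi:softprob", "num_class": 3},
#       train_params={"num_boost_round": 30, "batch_size": 1024},
#       feature_column_map=feature_column_map,
#       label_column=label_column,
#       save="sqlflow_models.my_xgb_model")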
def explain_step(datasource,
                 select,
                 explainer,
                 model_params,
                 result_table,
                 model,
                 pai_table=None,
                 oss_dest=None,
                 oss_ak=None,
                 oss_sk=None,
                 oss_endpoint=None,
                 oss_bucket_name=None):
    """
    Do explanation to a trained TensorFlow model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        explainer (str): the explainer to explain the model.
            Not used in TensorFlow models.
        model_params (dict): the parameters for evaluation.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.
        pai_table (str): the PAI input table name when running on PAI.
        oss_dest/oss_ak/oss_sk/oss_endpoint/oss_bucket_name (str): where and
            how to store the explanation image on OSS (required on PAI).

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    plot_type = model_params.get("summary.plot_type", "bar")

    train_attributes = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    label_name = model_params.get("label_col", train_label_desc.name)
    train_label_desc.name = label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    if pai_table:
        assert oss_dest, "oss_dest must be given when submit to PAI"
    else:
        assert oss_dest is None

    if os.environ.get('DISPLAY', '') == '':
        print('no display found. Using non-interactive Agg backend')
        matplotlib.use('Agg')

    _explain(datasource=datasource,
             estimator_string=estimator_string,
             select=select,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_metas=feature_metas,
             label_meta=label_meta,
             model_params=train_attributes,
             save=save,
             pai_table=pai_table,
             plot_type=plot_type,
             result_table=result_table,
             oss_dest=oss_dest,
             oss_ak=oss_ak,
             oss_sk=oss_sk,
             oss_endpoint=oss_endpoint,
             oss_bucket_name=oss_bucket_name)
    print_image_as_base64_html('summary.png')
def evaluate(datasource,
             select,
             result_table,
             model,
             pred_label_name=None,
             model_params=None):
    """
    Do evaluation to a trained TensorFlow model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.
        pred_label_name (str): the label column name.
        model_params (dict): the parameters for evaluation.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    # guard against the default None (the original called .get on
    # model_params directly, which would fail when it is omitted)
    if model_params is None:
        model_params = {}
    validation_metrics = model_params.get("validation.metrics", "Accuracy")
    validation_metrics = [m.strip() for m in validation_metrics.split(',')]
    validation_steps = model_params.get("validation.steps", None)
    batch_size = model_params.get("validation.batch_size", 1)
    verbose = model_params.get("validation.verbose", 0)

    conn = db.connect_with_data_source(datasource)
    create_evaluate_table(conn, result_table, validation_metrics)
    conn.close()

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())
    train_label_desc.name = pred_label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    _evaluate(datasource=datasource,
              estimator_string=estimator_string,
              select=select,
              result_table=result_table,
              feature_columns=feature_columns,
              feature_column_names=feature_column_names,
              feature_metas=feature_metas,
              label_meta=label_meta,
              model_params=model_params,
              validation_metrics=validation_metrics,
              save=save,
              batch_size=batch_size,
              validation_steps=validation_steps,
              verbose=verbose)
def evaluate(datasource,
             select,
             result_table,
             model,
             pred_label_name=None,
             model_params=None):
    """
    Do evaluation to a trained XGBoost model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.
        pred_label_name (str): the label column name.
        model_params (dict): the parameters for evaluation.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    if model_params is None:
        model_params = {}
    validation_metrics = model_params.get("validation.metrics", "Accuracy")
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    if pred_label_name:
        train_label_desc.name = pred_label_name

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names = create_evaluate_table(conn, result_table,
                                                validation_metrics)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=train_label_desc.to_dict(dtype_to_string=True),
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn)

        for i, pred_dmatrix in enumerate(dpred):
            feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name,
                                   train_label_desc, result_table,
                                   result_column_names, validation_metrics,
                                   conn)

    conn.close()
def train(original_sql,
          model_image,
          estimator_string,
          datasource,
          select,
          validation_select,
          model_params,
          train_params,
          validation_params,
          feature_column_map,
          label_column,
          save,
          load=None,
          pai_table="",
          pai_val_table=""):
    is_pai = True if pai_table != "" else False
    is_dist_train = False
    FLAGS = None
    oss_model_dir = ""
    if is_pai:
        FLAGS = define_tf_flags()
        num_workers = len(FLAGS.worker_hosts.split(","))
        is_dist_train = num_workers > 1
        oss_model_dir = FLAGS.sqlflow_oss_modeldir
        try:
            oss_path_to_load = train_params.pop("oss_path_to_load")
            if load:
                oss.load_file(oss_path_to_load, "my_model")
        except:  # noqa: E722
            pass

    feature_columns = compile_ir_feature_columns(feature_column_map,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(feature_column_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    label_meta = label_column.get_field_desc()[0].to_dict(
        dtype_to_string=True)

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    batch_size = train_params.pop("batch_size", None)
    epoch = train_params.pop("epoch", 1)
    load_pretrained_model = True if load else False
    disk_cache = train_params.pop("disk_cache", False)

    if is_dist_train:
        # NOTE(typhoonzero): dist_train returns None
        dist_train(flags=FLAGS,
                   datasource=datasource,
                   select=select,
                   model_params=model_params,
                   train_params=train_params,
                   feature_metas=feature_metas,
                   feature_column_names=feature_column_names,
                   label_meta=label_meta,
                   validation_select=validation_select,
                   disk_cache=disk_cache,
                   batch_size=batch_size,
                   epoch=epoch,
                   load_pretrained_model=load_pretrained_model,
                   is_pai=True,
                   pai_train_table=pai_table,
                   pai_validate_table=pai_val_table,
                   oss_model_dir=oss_model_dir,
                   transform_fn=transform_fn,
                   feature_column_code=feature_column_map,
                   model_repo_image=model_image,
                   original_sql=original_sql)
    else:
        return local_train(original_sql,
                           model_image,
                           estimator_string,
                           datasource,
                           select,
                           validation_select,
                           model_params,
                           train_params,
                           feature_metas,
                           feature_column_names,
                           feature_column_map,
                           label_column,
                           transform_fn,
                           save,
                           load=load,
                           is_pai=is_pai,
                           oss_model_dir=oss_model_dir)
def pred(datasource, select, result_table, pred_label_name, model):
    """
    Do prediction using a trained model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        pred_label_name (str): the output label name to predict.
        model (Model|str): the model object or where to load the model.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, pred_label_name)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=None,
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            raw_data_dir=raw_data_dir)  # NOTE: default to use external memory

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            feature_file_name = os.path.join(
                tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_predict_result(preds, result_table, result_column_names,
                                  train_label_idx, feature_file_name, conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()
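
# Usage sketch (hypothetical DSN and table names). Passing the model by name
# loads it from the model table that train() saved to:
#
#   pred(datasource="mysql://root:root@tcp(127.0.0.1:3306)/",
#        select="SELECT * FROM iris.test",
#        result_table="iris.predict_result",
#        pred_label_name="class",
#        model="sqlflow_models.my_xgb_model")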
def predict_step(datasource, select, result_table, label_name, model,
                 pai_table=None):
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    label_meta = model.get_meta("label")
    train_label_desc = label_meta.get_field_desc()[0] if label_meta else None
    train_label_name = train_label_desc.name if train_label_desc else None
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    is_pai = True if pai_table else False
    if is_pai:
        select = "SELECT * FROM %s" % pai_table

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, label_name)

    if is_pai:
        conn.close()
        conn = PaiIOConnection.from_table(pai_table)
        select = None

    selected_cols = result_column_names[0:-1]
    if train_label_idx >= 0:
        selected_cols = selected_cols[0:train_label_idx] + [
            train_label_name
        ] + selected_cols[train_label_idx:]

    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)
    predict_generator = db.db_generator(conn, select)

    pop_optimizer_and_loss(model_params)

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional model need field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      label_name, conn, predict_generator, selected_cols)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, label_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
    conn.close()
def train_step(original_sql,
               model_image,
               estimator_string,
               datasource,
               select,
               validation_select,
               model_params,
               train_params,
               validation_params,
               feature_column_map,
               label_column,
               save,
               load=None,
               pai_table=None,
               pai_val_table=None):
    if model_params is None:
        model_params = {}
    if train_params is None:
        train_params = {}
    if validation_params is None:
        validation_params = {}

    if load:
        Model.load_from_db(datasource, load)
        load = "model_save"
    else:
        load = None

    is_pai = True if pai_table else False

    fc_map = compile_ir_feature_columns(feature_column_map,
                                        EstimatorType.TENSORFLOW)
    field_descs = get_ordered_field_descs(feature_column_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # no label for clustering model
    label_meta = None
    if label_column:
        label_meta = label_column.get_field_desc()[0].to_dict(
            dtype_to_string=True)

    feature_column_names_map = dict()
    for target in feature_column_map:
        fclist = feature_column_map[target]
        feature_column_names_map[target] = [
            fc.get_field_desc()[0].name for fc in fclist
        ]

    # Construct optimizer objects to pass to model initializer.
    # The original model_params is serializable (do not have tf.xxx objects).
    model_params_constructed = copy.deepcopy(model_params)
    for optimizer_arg in ["optimizer", "dnn_optimizer", "linear_optimizer"]:
        if optimizer_arg in model_params_constructed:
            model_params_constructed[optimizer_arg] = get_tf_optimizer(
                model_params_constructed[optimizer_arg])

    if "loss" in model_params_constructed:
        model_params_constructed["loss"] = get_tf_loss(
            model_params_constructed["loss"])

    # extract params for training.
    verbose = train_params.get("verbose", 1)
    batch_size = train_params.get("batch_size", 1)
    epoch = train_params.get("epoch", 1)
    save_checkpoints_steps = train_params.get("save_checkpoints_steps", 100)
    max_steps = train_params.get("max_steps", None)
    if max_steps is not None and max_steps <= 0:
        max_steps = None

    validation_metrics = validation_params.get("metrics", "Accuracy")
    validation_metrics = [v.strip() for v in validation_metrics.split(",")]
    validation_steps = validation_params.get("steps", 1)
    validation_start_delay_secs = validation_params.get("start_delay_secs", 0)
    validation_throttle_secs = validation_params.get("throttle_secs", 0)

    estimator = import_model(estimator_string)
    is_estimator = is_tf_estimator(estimator)

    # always use verbose == 1 when using PAI to get more logs
    if verbose < 1:
        verbose = 1
    set_log_level(verbose, is_estimator)

    model_params_constructed.update(fc_map)

    FLAGS = define_tf_flags()
    set_oss_environs(FLAGS)
    num_workers = len(FLAGS.worker_hosts.split(","))
    worker_id = FLAGS.task_index

    train_dataset_fn = get_dataset_fn(select,
                                      datasource,
                                      feature_column_names,
                                      feature_metas,
                                      label_meta,
                                      is_pai,
                                      pai_table,
                                      batch_size,
                                      epochs=epoch,
                                      shuffle_size=1000,
                                      num_workers=num_workers,
                                      worker_id=worker_id)
    val_dataset_fn = None
    if validation_select or pai_val_table:
        val_dataset_fn = get_dataset_fn(validation_select, datasource,
                                        feature_column_names, feature_metas,
                                        label_meta, is_pai, pai_val_table,
                                        batch_size)

    model_meta = collect_metadata(original_sql=original_sql,
                                  select=select,
                                  validation_select=validation_select,
                                  model_repo_image=model_image,
                                  class_name=estimator_string,
                                  attributes=model_params,
                                  features=feature_column_map,
                                  label=label_column)

    # FIXME(typhoonzero): avoid saving model_meta twice; keras_train_and_save
    # and estimator_train_and_save also dump model_meta to a file under cwd.
    # We should only keep the model.save_to_db part.
    save_dir = "model_save"
    if not is_estimator:
        if isinstance(estimator, types.FunctionType):
            # functional model need field_metas parameter
            model_params_constructed["field_metas"] = feature_metas
        keras_train_and_save(estimator, model_params_constructed, save_dir,
                             FLAGS, train_dataset_fn, val_dataset_fn,
                             label_meta, epoch, verbose, validation_metrics,
                             validation_steps, load, model_meta, is_pai)
    else:
        estimator_train_and_save(estimator, model_params_constructed,
                                 save_dir, FLAGS, train_dataset_fn,
                                 val_dataset_fn, max_steps,
                                 validation_start_delay_secs,
                                 validation_throttle_secs,
                                 save_checkpoints_steps, validation_metrics,
                                 load, model_meta)

    # save model to DB/OSS
    model = Model(EstimatorType.TENSORFLOW, model_meta)
    if num_workers == 1 or worker_id == 0:
        saved = model.save_to_db(datasource,
                                 save,
                                 oss_model_dir=FLAGS.sqlflow_oss_modeldir)
        print("Model saved to DB: %s" % saved)

    print("Done training")
def evaluate(datasource,
             select,
             result_table,
             model,
             label_name=None,
             model_params=None,
             result_column_names=[],
             pai_table=None):
    """Do evaluation to a trained XGBoost model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.
        label_name (str): the label column name.
        model_params (dict): the parameters for evaluation.
        result_column_names (list[str]): the column names of result_table.
        pai_table (str): the PAI input table name when running on PAI.
    """
    if model_params is None:
        model_params = {}
    validation_metrics = model_params.get("validation.metrics",
                                          "accuracy_score")
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    model_params = model.get_meta("attributes")
    fc_map_ir = model.get_meta("features")
    train_label = model.get_meta("label")
    train_label_desc = train_label.get_field_desc()[0]
    if label_name:
        train_label_desc.name = label_name

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    is_pai = True if pai_table else False
    if is_pai:
        conn = PaiIOConnection.from_table(pai_table)
    else:
        conn = db.connect_with_data_source(datasource)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=train_label_desc.to_dict(dtype_to_string=True),
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            is_pai=is_pai,
            pai_table=pai_table,
            pai_single_file=True,
            feature_column_code=fc_map_ir)

        for i, pred_dmatrix in enumerate(dpred):
            if is_pai:
                feature_file_name = pred_fn
            else:
                feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name,
                                   train_label_desc, result_table,
                                   result_column_names, validation_metrics,
                                   conn)

    conn.close()
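
# Usage sketch: the validation.* keys below are read from model_params before
# it is replaced by the trained model's attributes (all values hypothetical):
#
#   evaluate(datasource="mysql://root:root@tcp(127.0.0.1:3306)/",
#            select="SELECT * FROM iris.test",
#            result_table="iris.evaluate_result",
#            model="sqlflow_models.my_xgb_model",
#            label_name="class",
#            model_params={"validation.metrics":
#                          "accuracy_score,mean_absolute_error"},
#            result_column_names=["accuracy_score", "mean_absolute_error"])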
def explain(datasource, select, explainer, model_params, result_table, model):
    """
    Do explanation to a trained TensorFlow model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        explainer (str): the explainer to explain the model.
            Not used in TensorFlow models.
        model_params (dict): the parameters for evaluation.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    plot_type = model_params.get("summary.plot_type", "bar")

    train_attributes = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    label_name = model_params.get("label_col", train_label_desc.name)
    train_label_desc.name = label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    if result_table:
        conn = db.connect_with_data_source(datasource)
        if estimator_string.startswith("BoostedTrees"):
            column_defs = [
                "feature %s" %
                DataType.to_db_field_type(conn.driver, DataType.STRING),
                "dfc %s" %
                DataType.to_db_field_type(conn.driver, DataType.FLOAT32),
                "gain %s" %
                DataType.to_db_field_type(conn.driver, DataType.FLOAT32),
            ]
        else:
            selected_cols = db.selected_cols(conn, select)
            if label_name in selected_cols:
                selected_cols.remove(label_name)

            name_to_shape = dict([(fd.name, fd.shape) for fd in field_descs])
            column_defs = []
            float_field_type = DataType.to_db_field_type(
                conn.driver, DataType.FLOAT32)
            for name in selected_cols:
                shape = name_to_shape.get(name, None)
                if shape is None:
                    raise ValueError("cannot find column %s" % name)

                size = int(np.prod(shape))
                if size == 1:
                    column_def = "%s %s" % (name, float_field_type)
                    column_defs.append(column_def)
                else:
                    for i in six.moves.range(size):
                        column_def = "%s_%d %s" % (name, i, float_field_type)
                        column_defs.append(column_def)

        drop_sql = "DROP TABLE IF EXISTS %s;" % result_table
        create_sql = "CREATE TABLE %s (%s);" % (result_table,
                                                ",".join(column_defs))
        conn.execute(drop_sql)
        conn.execute(create_sql)
        conn.close()

    _explain(datasource=datasource,
             estimator_string=estimator_string,
             select=select,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_metas=feature_metas,
             label_meta=label_meta,
             model_params=train_attributes,
             save=save,
             plot_type=plot_type,
             result_table=result_table)

    with open('summary.png', 'rb') as f:
        img = f.read()
    img = base64.b64encode(img)
    if six.PY3:
        img = img.decode('utf-8')
    img = "<div align='center'><img src='data:image/png;base64,%s' /></div>" \
        % img
    print(img)
def explain(datasource,
            select,
            explainer,
            model_params,
            result_table,
            model,
            pai_table="",
            oss_model_path=""):
    """XGBoost model explanation. This function uses the selected data to
    explain the model stored locally or at oss_model_path.

    Args:
        datasource: the datasource to load explain data
        select: SQL statement to get the data set
        explainer: the explainer to use, e.g. TreeExplainer
        model_params: the parameters for explanation, e.g. summary.* keys
        result_table: table to store the explanation result
        model (Model|str): the model object or where to load the model
        pai_table: the PAI input table name when running on PAI
        oss_model_path: path to the model to be explained on OSS
    """
    if model_params is None:
        model_params = {}
    summary_params = dict()
    for k in model_params:
        if k.startswith("summary."):
            summary_key = k.replace("summary.", "")
            summary_params[summary_key] = model_params[k]

    is_pai = True if pai_table != "" else False
    if is_pai:
        # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
        # in xgboost/train.py
        oss.load_file(oss_model_path, "my_model")
        (estimator, model_params, train_params, feature_field_meta,
         feature_column_names, label_desc,
         fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")
        label_meta = label_desc.to_dict(dtype_to_string=True)
    else:
        if isinstance(model, six.string_types):
            model = Model.load_from_db(datasource, model)
        else:
            assert isinstance(
                model, Model), "not supported model type %s" % type(model)
        fc_map_ir = model.get_meta("features")
        label_meta = model.get_meta("label").get_field_desc()[0].to_dict(
            dtype_to_string=True)

    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    # Use EstimatorType.XGBOOST directly: on the PAI path `model` may still
    # be a string, so the original model.get_type() call would fail there.
    compiled_fc = compile_ir_feature_columns(fc_map_ir, EstimatorType.XGBOOST)
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    dataset = xgb_shap_dataset(datasource, select, feature_column_names,
                               label_meta, feature_metas, transform_fn)

    bst = xgb.Booster()
    bst.load_model("my_model")
    if explainer == "XGBoostExplainer":
        xgb_native_explain(bst, datasource, result_table)
    else:
        # when explainer is "" or "TreeExplainer" use SHAP by default.
        shap_explain(bst, datasource, dataset, summary_params, result_table)
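
# The explainer argument selects the backend, mirroring the branch above
# (hypothetical DSN and names):
#
#   # native XGBoost feature-importance explanation written to a table
#   explain(ds, "SELECT * FROM iris.train", "XGBoostExplainer", {},
#           "iris.explain_result", "sqlflow_models.my_xgb_model")
#
#   # "" or "TreeExplainer" falls through to SHAP
#   explain(ds, "SELECT * FROM iris.train", "TreeExplainer",
#           {"summary.plot_type": "bar"}, "", "sqlflow_models.my_xgb_model")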
def train_step(original_sql,
               model_image,
               estimator_string,
               datasource,
               select,
               validation_select,
               pai_table,
               pai_val_table,
               model_params,
               train_params,
               feature_column_map,
               label_column,
               save,
               load=None):
    FLAGS = define_tf_flags()
    num_workers = len(FLAGS.worker_hosts.split(","))
    is_dist_train = num_workers > 1
    oss_model_dir = FLAGS.sqlflow_oss_modeldir

    oss_path_to_load = train_params.pop("oss_path_to_load")
    if load:
        oss.load_file(oss_path_to_load, "my_model")

    conn = db.connect_with_data_source(datasource)
    fc_map_ir, fc_label_ir = infer_feature_columns(conn,
                                                   select,
                                                   feature_column_map,
                                                   label_column,
                                                   n=1000)
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])
    label_meta = label_column.get_field_desc()[0].to_dict()

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    batch_size = train_params.pop("batch_size", None)
    epoch = train_params.pop("epoch", 1)
    load_pretrained_model = True if load else False
    disk_cache = train_params.pop("disk_cache", False)

    if is_dist_train:
        dist_train(flags=FLAGS,
                   datasource=datasource,
                   select=select,
                   model_params=model_params,
                   train_params=train_params,
                   feature_metas=feature_metas,
                   feature_column_names=feature_column_names,
                   label_meta=label_meta,
                   validation_select=validation_select,
                   disk_cache=disk_cache,
                   batch_size=batch_size,
                   epoch=epoch,
                   load_pretrained_model=load_pretrained_model,
                   is_pai=True,
                   pai_train_table=pai_table,
                   pai_validate_table=pai_val_table,
                   oss_model_dir=oss_model_dir,
                   transform_fn=transform_fn,
                   feature_column_code=fc_map_ir,
                   model_repo_image=model_image,
                   original_sql=original_sql)
    else:
        local_train(datasource=datasource,
                    select=select,
                    model_params=model_params,
                    train_params=train_params,
                    feature_metas=feature_metas,
                    feature_column_names=feature_column_names,
                    label_meta=label_meta,
                    validation_select=validation_select,
                    disk_cache=disk_cache,
                    batch_size=batch_size,
                    epoch=epoch,
                    load_pretrained_model=load_pretrained_model,
                    is_pai=True,
                    pai_train_table=pai_table,
                    pai_validate_table=pai_val_table,
                    rank=0,
                    nworkers=1,
                    oss_model_dir=oss_model_dir,
                    transform_fn=transform_fn,
                    feature_column_code=fc_map_ir,
                    model_repo_image=model_image,
                    original_sql=original_sql)
def explain(datasource,
            select,
            explainer,
            model_params,
            result_table,
            model,
            pai_table="",
            oss_model_path="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None):
    """Do explanation to a trained XGBoost model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to explain.
        explainer (str): the explainer to use, e.g. TreeExplainer.
        model_params (dict): the parameters for explanation, e.g. summary.*.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.
        pai_table (str): the PAI input table name when running on PAI.
        oss_model_path (str): the model path on OSS.
        oss_dest/oss_ak/oss_sk/oss_endpoint/oss_bucket_name (str): where and
            how to store the explanation image on OSS.
    """
    if model_params is None:
        model_params = {}
    summary_params = dict()
    for k in model_params:
        if k.startswith("summary."):
            summary_key = k.replace("summary.", "")
            summary_params[summary_key] = model_params[k]

    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    fc_map_ir = model.get_meta("features")
    label_meta = model.get_meta("label").get_field_desc()[0].to_dict(
        dtype_to_string=True)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    is_pai = True if pai_table else False

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(fc_map_ir, EstimatorType.XGBOOST)
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    dataset = xgb_shap_dataset(datasource, select, feature_column_names,
                               label_meta, feature_metas, is_pai, pai_table,
                               transform_fn)

    if explainer == "XGBoostExplainer":
        xgb_native_explain(bst, datasource, result_table)
    else:
        # when explainer is "" or "TreeExplainer" use SHAP by default.
        shap_explain(bst,
                     datasource,
                     dataset,
                     summary_params,
                     result_table,
                     is_pai=is_pai,
                     oss_dest=oss_dest,
                     oss_ak=oss_ak,
                     oss_sk=oss_sk,
                     oss_endpoint=oss_endpoint,
                     oss_bucket_name=oss_bucket_name)
def shap_explain(booster, datasource, select, summary_params, result_table,
                 model):
    train_fc_map = model.get_meta("features")
    label_meta = model.get_meta("label").get_field_desc()[0].to_dict(
        dtype_to_string=True)

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    dataset = xgb_shap_dataset(datasource, select, feature_column_names,
                               label_meta, feature_metas, transform_fn)

    tree_explainer = shap.TreeExplainer(booster)
    shap_values = tree_explainer.shap_values(dataset)
    if result_table:
        conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): shap_values may be a list of shape
        # [3, num_samples, num_features]. We use the first dimension here and
        # should find out when to use the other two. When shap_values is not
        # a list it can be used directly.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values

        columns = list(dataset.columns)
        dtypes = [DataType.to_db_field_type(conn.driver, DataType.FLOAT32)
                  ] * len(columns)
        _create_table(conn, result_table, columns, dtypes)
        with db.buffered_db_writer(conn, result_table, columns) as w:
            for row in to_write:
                w.write(list(row))
        conn.close()

    if summary_params.get("plot_type") == "decision":
        shap_interaction_values = tree_explainer.shap_interaction_values(
            dataset)
        expected_value = tree_explainer.expected_value
        if isinstance(shap_interaction_values, list):
            shap_interaction_values = shap_interaction_values[0]
        if isinstance(expected_value, list):
            expected_value = expected_value[0]

        plot_func = lambda: shap.decision_plot(  # noqa: E731
            expected_value,
            shap_interaction_values,
            dataset,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1)
    else:
        plot_func = lambda: shap.summary_plot(  # noqa: E731
            shap_values, dataset, show=False, **summary_params)

    filename = 'summary.png'
    with temp_file.TemporaryDirectory(as_cwd=True):
        explainer.plot_and_save(plot_func, filename=filename)
        with open(filename, 'rb') as f:
            img = f.read()
    img = base64.b64encode(img)
    if six.PY3:
        img = img.decode('utf-8')
    img = "<div align='center'><img src='data:image/png;base64,%s' /></div>" \
        % img
    print(img)
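
# How summary_params steers the plot above: "plot_type" == "decision" builds
# a SHAP decision plot from interaction values; any other dict is forwarded
# as keyword arguments to shap.summary_plot. Hypothetical examples:
#
#   shap_explain(bst, ds, "SELECT * FROM iris.train",
#                {"plot_type": "decision"}, "", model)
#   shap_explain(bst, ds, "SELECT * FROM iris.train",
#                {"plot_type": "dot", "max_display": 20}, "", model)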
def predict(datasource,
            select,
            result_table,
            label_name,
            model,
            pai_table="",
            oss_model_path=""):
    """XGBoost prediction wrapper

    This function does some preparation for the prediction, say, downloading
    the model from OSS or the DB, extracting metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        result_table: table to save prediction result
        label_name: prediction label column
        model (Model|str): the model object or where to load the model
        pai_table: tmp table which holds the input data when running on PAI
        oss_model_path: the model path on OSS
    """
    is_pai = True if pai_table != "" else False
    if is_pai:
        # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
        # in xgboost/train.py
        oss.load_file(oss_model_path, "my_model")
        (estimator, model_params, train_params, feature_metas,
         feature_column_names, train_label_desc,
         fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")
    else:
        if isinstance(model, six.string_types):
            model = Model.load_from_db(datasource, model)
        else:
            assert isinstance(
                model, Model), "not supported model type %s" % type(model)
        model_params = model.get_meta("attributes")
        fc_map_ir = model.get_meta("features")
        train_label_desc = model.get_meta("label").get_field_desc()[0]

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, label_name)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=None,
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            raw_data_dir=raw_data_dir)  # NOTE: default to use external memory

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            feature_file_name = os.path.join(
                tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_predict_result(preds, result_table, result_column_names,
                                  train_label_idx, feature_file_name, conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()
def evaluate(datasource,
             select,
             result_table,
             load,
             pred_label_name=None,
             validation_metrics=["accuracy_score"]):
    """
    Do evaluation to a trained XGBoost model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        load (str): where the trained model stores.
        pred_label_name (str): the label column name.
        validation_metrics (list[str]): the evaluation metric names.

    Returns:
        None.
    """
    model = Model.load_from_db(datasource, load)

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    if pred_label_name:
        train_label_desc.name = pred_label_name

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names = _create_evaluate_table(conn, result_table,
                                                 validation_metrics)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")

        dpred = xgb_dataset(datasource=datasource,
                            fn=pred_fn,
                            dataset_sql=select,
                            feature_metas=feature_metas,
                            feature_column_names=feature_column_names,
                            label_meta=train_label_desc.to_dict(),
                            cache=True,
                            batch_size=10000,
                            transform_fn=transform_fn)

        for i, pred_dmatrix in enumerate(dpred):
            feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name,
                                   train_label_desc, result_table,
                                   result_column_names, validation_metrics,
                                   conn)

    conn.close()