def load_from_oss(oss_model_dir, local_dir=None):
    """
    Fetch a saved model from OSS and unpack it into local_dir.

    The model tarball and the model object file are both loaded into a
    temporary directory; the tarball is unpacked into ``local_dir`` and
    the model object file is decoded into a Model.

    Args:
        oss_model_dir (str): the OSS model directory to load. It is
            in the format of oss://bucket/path/to/dir/.
        local_dir (str): the local directory to unpack into. Defaults to
            the current working directory when None.

    Returns:
        Model: a Model object representing the model type and meta
            information.
    """
    target_dir = os.getcwd() if local_dir is None else local_dir

    with temp_file.TemporaryDirectory() as work_dir:
        # Step 1: the tarball holding the saved model files.
        archive_path = os.path.join(work_dir, TARBALL_NAME)
        oss.load_file(oss_model_dir, archive_path, TARBALL_NAME)
        Model._unzip(target_dir, archive_path)

        # Step 2: the JSON description of the model object.
        meta_path = os.path.join(work_dir, MODEL_OBJ_FILE_NAME)
        oss.load_file(oss_model_dir, meta_path, MODEL_OBJ_FILE_NAME)
        with open(meta_path, "r") as meta_file:
            meta_dict = json.load(meta_file,
                                  cls=JSONDecoderWithFeatureColumn)
            loaded_model = Model._from_dict(meta_dict)

    return loaded_model
def predict_step(datasource, select, data_table, result_table, label_column,
                 oss_model_path):
    """PAI TensorFlow prediction wrapper.

    This function does some preparation for the local prediction: it loads
    the model description and the saved model from OSS, rebuilds the feature
    columns from their IR, and then calls ``_predict``.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        label_column: prediction label column
        oss_model_path: the model path on OSS
    """
    import sys  # local import: only needed for the warning below

    try:
        tf.enable_eager_execution()
    except Exception as e:
        # Best effort only: prediction goes on even if eager execution
        # cannot be enabled. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        sys.stderr.write(
            "warning: failed to enable_eager_execution: %s\n" % e)

    # The feature column names and metas stored in the description are
    # ignored; both are re-derived from the feature column IR below.
    (estimator, _, feature_column_names_map, _, label_meta, model_params,
     feature_columns_code) = oss.load_metas(oss_model_path,
                                            "tensorflow_model_desc")
    fc_map_ir = feature_columns_code
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.TENSORFLOW)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {fd.name: fd.to_dict() for fd in field_descs}

    is_estimator = is_tf_estimator(import_model(estimator))

    # Keras single node is using h5 format to save the model, no need to deal
    # with export model format. Keras distributed mode will use estimator, so
    # this is also needed.
    model_local_dir = oss_model_path.split("/")[-1]
    if is_estimator:
        oss.load_file(oss_model_path, "exported_path")
        # NOTE(typhoonzero): directory "model_save" is hardcoded in
        # codegen/tensorflow/codegen.go
        # NOTE(review): the directory loaded here is named after the last
        # segment of oss_model_path, not "model_save" -- confirm the note
        # above is still current.
        oss.load_dir("%s/%s" % (oss_model_path, model_local_dir))
    else:
        oss.load_dir(os.path.join(oss_model_path, "model_save"))

    _predict(datasource=datasource,
             estimator_string=estimator,
             select=select,
             result_table=result_table,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_column_names_map=feature_column_names_map,
             train_label_name=label_meta["feature_name"],
             result_col_name=label_column,
             feature_metas=feature_metas,
             model_params=model_params,
             save=model_local_dir,
             batch_size=1,
             pai_table=data_table)
def evaluate_step(datasource, select, data_table, result_table,
                  oss_model_path, metrics):
    """PAI TensorFlow evaluation wrapper.

    Prepares a local evaluation run: loads the model description and the
    saved model from OSS, rebuilds the feature columns from their IR, then
    hands everything over to ``_evaluate``.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save the evaluation result
        oss_model_path: the model path on OSS
        metrics: metrics to evaluate
    """
    model_desc = oss.load_metas(oss_model_path, "tensorflow_model_desc")
    # Stored column names and metas are discarded; they are rebuilt from
    # the feature column IR right below.
    (estimator, _, _, _, label_meta, model_params, column_ir) = model_desc

    feature_columns = compile_ir_feature_columns(column_ir,
                                                 EstimatorType.TENSORFLOW)
    ordered_descs = get_ordered_field_descs(column_ir)
    feature_column_names = [desc.name for desc in ordered_descs]
    feature_metas = {desc.name: desc.to_dict() for desc in ordered_descs}

    # NOTE(typhoonzero): No need to eval model_params["optimizer"] and
    # model_params["loss"] because predicting do not need these parameters.

    is_estimator = is_tf_estimator(import_model(estimator))

    # Keras single node saves the model in h5 format, so only estimator
    # models (including Keras distributed mode) need the exported path.
    model_name = oss_model_path.rsplit("/", 1)[-1]
    if not is_estimator:
        oss.load_dir(os.path.join(oss_model_path, "model_save"))
    else:
        oss.load_file(oss_model_path, "exported_path")
        # NOTE(typhoonzero): directory "model_save" is hardcoded in
        # codegen/tensorflow/codegen.go
        oss.load_dir("%s/%s" % (oss_model_path, model_name))

    # NOTE(review): save is always "model_save" here even though the
    # estimator branch loads a directory named after model_name -- confirm.
    _evaluate(datasource=datasource,
              select=select,
              pai_table=data_table,
              result_table=result_table,
              estimator_string=estimator,
              model_params=model_params,
              feature_columns=feature_columns,
              feature_column_names=feature_column_names,
              feature_metas=feature_metas,
              label_meta=label_meta,
              validation_metrics=metrics,
              validation_steps=None,
              save="model_save",
              batch_size=1,
              verbose=0)
def predict(datasource, select, data_table, result_table, label_column,
            oss_model_path):
    """PAI TensorFlow prediction wrapper.

    This function does some preparation for the local prediction: it loads
    the model description and the saved model from OSS, evaluates the stored
    feature column code, and then calls ``_predict``.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        label_column: prediction label column
        oss_model_path: the model path on OSS
    """
    import sys  # local import: only needed for the warning below

    try:
        tf.enable_eager_execution()
    except Exception as e:
        # Best effort only: prediction goes on even if eager execution
        # cannot be enabled. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        sys.stderr.write(
            "warning: failed to enable_eager_execution: %s\n" % e)

    (estimator, feature_column_names, feature_column_names_map, feature_metas,
     label_meta, model_params,
     feature_columns_code) = oss.load_metas(oss_model_path,
                                            "tensorflow_model_desc")

    # SECURITY NOTE(review): eval() executes whatever code string is stored
    # in the model description on OSS. This is only safe while that
    # description is written exclusively by trusted training jobs.
    feature_columns = eval(feature_columns_code)

    # NOTE(typhoonzero): No need to eval model_params["optimizer"] and
    # model_params["loss"] because predicting do not need these parameters.

    is_estimator = is_tf_estimator(import_model(estimator))

    # Keras single node is using h5 format to save the model, no need to deal
    # with export model format. Keras distributed mode will use estimator, so
    # this is also needed.
    if is_estimator:
        oss.load_file(oss_model_path, "exported_path")
        # NOTE(typhoonzero): directory "model_save" is hardcoded in
        # codegen/tensorflow/codegen.go
        oss.load_dir("%s/model_save" % oss_model_path)
    else:
        oss.load_file(oss_model_path, "model_save")

    _predict(datasource=datasource,
             estimator_string=estimator,
             select=select,
             result_table=result_table,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_column_names_map=feature_column_names_map,
             train_label_name=label_meta["feature_name"],
             result_col_name=label_column,
             feature_metas=feature_metas,
             model_params=model_params,
             save="model_save",
             batch_size=1,
             pai_table=data_table)
def explain_step(datasource, select, data_table, result_table, label_column,
                 oss_model_path):
    """PAI TensorFlow explanation wrapper.

    Loads the model description and the saved model from OSS, rebuilds the
    feature columns from their IR and calls ``_explain``.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save the explanation result
        label_column: not used by this function
        oss_model_path: the model path on OSS
    """
    try:
        tf.enable_eager_execution()
    except Exception as err:
        # Best effort: report the failure and carry on.
        sys.stderr.write("warning: failed to enable_eager_execution: %s" %
                         err)

    model_desc = oss.load_metas(oss_model_path, "tensorflow_model_desc")
    # Stored column names, names map and metas are discarded; names and
    # metas are rebuilt from the feature column IR right below.
    (estimator, _, _, _, label_meta, model_params, column_ir) = model_desc

    feature_columns = compile_ir_feature_columns(column_ir,
                                                 EstimatorType.TENSORFLOW)
    ordered_descs = get_ordered_field_descs(column_ir)
    feature_column_names = [desc.name for desc in ordered_descs]
    feature_metas = {desc.name: desc.to_dict() for desc in ordered_descs}

    # NOTE(typhoonzero): No need to eval model_params["optimizer"] and
    # model_params["loss"] because predicting do not need these parameters.

    is_estimator = is_tf_estimator(import_model(estimator))

    # Keras single node saves the model in h5 format, so only estimator
    # models (including Keras distributed mode) need the exported path.
    model_name = oss_model_path.rsplit("/", 1)[-1]
    if not is_estimator:
        oss.load_dir(os.path.join(oss_model_path, "model_save"))
    else:
        oss.load_file(oss_model_path, "exported_path")
        # NOTE(typhoonzero): directory "model_save" is hardcoded in
        # codegen/tensorflow/codegen.go
        oss.load_dir("%s/%s" % (oss_model_path, model_name))

    # (TODO: lhw) use oss to store result image
    _explain(datasource=datasource,
             select=select,
             pai_table=data_table,
             result_table=result_table,
             estimator_string=estimator,
             model_params=model_params,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_metas=feature_metas,
             label_meta=label_meta,
             save="model_save",
             oss_dest=None,
             oss_ak=None,
             oss_sk=None,
             oss_endpoint=None,
             oss_bucket_name=None)
def explain_step(datasource, select, data_table, explainer, result_table,
                 label_column, oss_model_path):
    """Do XGBoost model explanation, this function uses the selected data to
    explain the model stored at oss_model_path.

    Args:
        datasource: The datasource to load explain data
        select: SQL statement to get the data set
        data_table: tmp table to save the explain data
        explainer: name of the explainer, passed through to explain_xgb
        result_table: table to store the explanation result
        label_column: name of the label column (not used by this function)
        oss_model_path: path to the model to be explained
    """
    # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
    # in xgboost/train.py
    oss.load_file(oss_model_path, "my_model")

    # The estimator and train params entries of the description are not
    # needed for explanation.
    (_, model_params, _, feature_field_meta, feature_column_names,
     label_field_meta, fc_map_ir) = oss.load_metas(oss_model_path,
                                                   "xgboost_model_desc")

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    # Collect the "summary.*" attributes with the prefix removed. Only the
    # leading prefix is stripped: str.replace would also delete any later
    # "summary." occurrence inside the key.
    prefix = "summary."
    summary_params = {
        k[len(prefix):]: model_params[k]
        for k in model_params if k.startswith(prefix)
    }

    explain_xgb(
        datasource=datasource,
        select=select,
        feature_field_meta=feature_field_meta,
        feature_column_names=feature_column_names,
        label_meta=label_field_meta,
        summary_params=summary_params,
        explainer=explainer,
        result_table=result_table,
        is_pai=True,
        pai_explain_table=data_table,
        # (TODO:lhw) save/load explain result storage info into/from FLAGS
        oss_dest="",
        oss_ak="",
        oss_sk="",
        oss_endpoint="",
        oss_bucket_name="",
        transform_fn=transform_fn,
        feature_column_code=fc_map_ir)
def explain(datasource, select, data_table, result_table, label_column,
            oss_model_path):
    """Explain an XGBoost model stored at oss_model_path using the selected
    data.

    Args:
        datasource: The datasource to load explain data
        select: SQL statement to get the data set
        data_table: tmp table to save the explain data
        result_table: table to store the explanation result
        label_column: name of the label column (not used by this function)
        oss_model_path: path to the model to be explained
    """
    # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
    # in xgboost/train.py
    oss.load_file(oss_model_path, "my_model")

    model_desc = oss.load_metas(oss_model_path, "xgboost_model_desc")
    (estimator, model_params, train_params, feature_field_meta,
     feature_column_names, label_field_meta, feature_column_code) = model_desc

    # NOTE(review): eval() runs the code string stored with the model; this
    # assumes the stored description is trusted.
    transformer_list = eval('[{}]'.format(feature_column_code))
    composed_transformer_cls = (
        xgboost_extended.feature_column.ComposedColumnTransformer)
    transform_fn = composed_transformer_cls(feature_column_names,
                                            *transformer_list)

    explain_xgb(
        datasource=datasource,
        select=select,
        result_table=result_table,
        pai_explain_table=data_table,
        is_pai=True,
        feature_field_meta=feature_field_meta,
        feature_column_names=feature_column_names,
        label_meta=label_field_meta,
        summary_params={},
        hdfs_namenode_addr="",
        hive_location="",
        hdfs_user="",
        hdfs_pass="",
        # (TODO:lhw) save/load explain result storage info into/from FLAGS
        oss_dest="",
        oss_ak="",
        oss_sk="",
        oss_endpoint="",
        oss_bucket_name="",
        transform_fn=transform_fn,
        feature_column_code=feature_column_code)
def predict(datasource, select, data_table, result_table, label_column,
            oss_model_path):
    """PAI XGBoost prediction wrapper.

    This function does some preparation for the local prediction: it loads
    the model file and the model description from OSS, rebuilds the column
    transformers, and then calls ``pred``.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        label_column: prediction label column
        oss_model_path: the model path on OSS
    """
    # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
    # in xgboost/train.py
    oss.load_file(oss_model_path, "my_model")

    (estimator, model_params, train_params, feature_metas,
     feature_column_names, label_meta,
     feature_column_code) = oss.load_metas(oss_model_path,
                                           "xgboost_model_desc")

    # The prediction label meta is the training label meta with its name
    # replaced by the requested prediction column. A (shallow) copy keeps
    # the training meta itself untouched.
    pred_label_meta = copy.copy(label_meta)
    pred_label_meta["feature_name"] = label_column

    # SECURITY NOTE(review): eval() executes the code string stored in the
    # model description; this assumes the description is trusted.
    feature_column_transformers = eval('[{}]'.format(feature_column_code))
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *feature_column_transformers)

    pred(datasource=datasource,
         select=select,
         feature_metas=feature_metas,
         feature_column_names=feature_column_names,
         train_label_meta=label_meta,
         # Fixed: this used to pass label_meta again, which left the
         # pred_label_meta built above unused.
         pred_label_meta=pred_label_meta,
         result_table=result_table,
         is_pai=True,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         pai_table=data_table,
         model_params=model_params,
         train_params=train_params,
         transform_fn=transform_fn,
         feature_column_code=feature_column_code)
def predict_step(datasource, select, data_table, result_table, label_column,
                 oss_model_path):
    """PAI XGBoost prediction wrapper.

    This function does some preparation for the local prediction: it loads
    the model file and the model description from OSS, rebuilds the feature
    columns from their IR, and then calls ``pred``.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        label_column: prediction label column
        oss_model_path: the model path on OSS
    """
    # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
    # in xgboost/train.py
    oss.load_file(oss_model_path, "my_model")

    # The stored feature metas and column names are ignored; both are
    # re-derived from the feature column IR below.
    (estimator, model_params, train_params, _, _, label_meta,
     fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")

    # The prediction label meta is the training label meta with its name
    # replaced by the requested prediction column. A (shallow) copy keeps
    # the training meta itself untouched.
    pred_label_meta = copy.copy(label_meta)
    pred_label_meta["feature_name"] = label_column

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {fd.name: fd.to_dict() for fd in field_descs}

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    pred(datasource=datasource,
         select=select,
         feature_metas=feature_metas,
         feature_column_names=feature_column_names,
         train_label_meta=label_meta,
         # Fixed: this used to pass label_meta again, which left the
         # pred_label_meta built above unused.
         pred_label_meta=pred_label_meta,
         result_table=result_table,
         is_pai=True,
         pai_table=data_table,
         model_params=model_params,
         train_params=train_params,
         transform_fn=transform_fn,
         feature_column_code=fc_map_ir)
def load_from_oss(oss_model_dir, local_dir=None):
    """
    Fetch the saved model tarball from OSS and unpack it into local_dir.

    Args:
        oss_model_dir (str): the OSS model directory to load. It is
            in the format of oss://bucket/path/to/dir/.
        local_dir (str): the local directory to unpack into. Defaults to
            the current working directory when None.

    Returns:
        Model: the result of Model._unzip, a Model object representing the
            model type and meta information.
    """
    destination = local_dir if local_dir is not None else os.getcwd()

    # The tarball only lives in a temporary directory for as long as it
    # takes to unpack it.
    with temp_file.TemporaryDirectory() as scratch_dir:
        archive_path = os.path.join(scratch_dir, TARBALL_NAME)
        oss.load_file(oss_model_dir, archive_path, TARBALL_NAME)
        unpacked = Model._unzip(destination, archive_path)
        return unpacked
def predict(datasource,
            select,
            result_table,
            label_name,
            model,
            pai_table="",
            oss_model_path=""):
    """XGBoost prediction wrapper for both PAI and non-PAI runs.

    This function does some preparation for the local prediction (loading
    the model and its metadata, rebuilding the feature columns), creates the
    prediction table and then predicts batch by batch.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        result_table: table to save prediction result
        label_name: prediction label column
        model: a Model object, or a string that is passed to
            Model.load_from_db; only used when pai_table is empty
        pai_table: a non-empty value selects PAI mode, where the model and
            its description are loaded from oss_model_path
        oss_model_path: the model path on OSS (PAI mode only)
    """
    is_pai = pai_table != ""
    if is_pai:
        # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
        # in xgboost/train.py
        oss.load_file(oss_model_path, "my_model")
        # Only the model params, label description and feature column IR
        # are needed; feature names/metas are rebuilt from the IR below.
        (_, model_params, _, _, _, train_label_desc,
         fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")
    else:
        if isinstance(model, six.string_types):
            model = Model.load_from_db(datasource, model)
        else:
            assert isinstance(
                model, Model), "not supported model type %s" % type(model)

        model_params = model.get_meta("attributes")
        fc_map_ir = model.get_meta("features")
        train_label_desc = model.get_meta("label").get_field_desc()[0]

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {
        fd.name: fd.to_dict(dtype_to_string=True)
        for fd in field_descs
    }

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    # NOTE(review): both branches read the booster from the local file
    # "my_model"; in non-PAI mode this presumably relies on
    # Model.load_from_db having placed it in the working directory --
    # confirm.
    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    try:
        result_column_names, train_label_idx = create_predict_table(
            conn, select, result_table, train_label_desc, label_name)

        with temp_file.TemporaryDirectory() as tmp_dir_name:
            pred_fn = os.path.join(tmp_dir_name, "predict.txt")
            raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

            dpred = xgb_dataset(
                datasource=datasource,
                fn=pred_fn,
                dataset_sql=select,
                feature_metas=feature_metas,
                feature_column_names=feature_column_names,
                label_meta=None,
                cache=True,
                batch_size=10000,
                transform_fn=transform_fn,
                raw_data_dir=raw_data_dir)  # NOTE: default to use external memory

            print("Start predicting XGBoost model...")
            for idx, pred_dmatrix in enumerate(dpred):
                # One raw feature file per batch under raw_data_dir.
                feature_file_name = os.path.join(
                    tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
                preds = _calc_predict_result(bst, pred_dmatrix, model_params)
                _store_predict_result(preds, result_table,
                                      result_column_names, train_label_idx,
                                      feature_file_name, conn)
            print("Done predicting. Predict table : %s" % result_table)
    finally:
        # Always release the connection, even when prediction fails.
        conn.close()
def train(original_sql,
          model_image,
          estimator_string,
          datasource,
          select,
          validation_select,
          model_params,
          train_params,
          validation_params,
          feature_column_map,
          label_column,
          save,
          load=None,
          pai_table="",
          pai_val_table=""):
    """Train an XGBoost model, locally or distributed on PAI.

    PAI mode is selected by a non-empty pai_table. In PAI mode the number
    of worker hosts in the flags decides between distributed training
    (more than one worker) and local training.

    Args:
        original_sql: the original SQL statement, passed to the trainer
        model_image: the model repo image, passed to the trainer
        estimator_string: the estimator name, passed to local_train
        datasource: the datasource from which to get data
        select: SQL statement selecting the training data
        validation_select: SQL statement selecting the validation data
        model_params: model parameters, passed to the trainer
        train_params: dict of training parameters; the keys
            "oss_path_to_load" (PAI mode only), "batch_size", "epoch" and
            "disk_cache" are popped from it here
        validation_params: not used by this function
        feature_column_map: feature column IR used to build the feature
            columns
        label_column: label column object; its first field description
            provides the label meta
        save: passed to local_train
        load: when truthy, a pre-trained model is loaded first
        pai_table: PAI training table; non-empty enables PAI mode
        pai_val_table: PAI validation table

    Returns:
        The result of local_train for non-distributed runs; None for
        distributed runs.
    """
    import sys  # local import: only needed for the warning below

    is_pai = pai_table != ""
    is_dist_train = False
    FLAGS = None
    oss_model_dir = ""
    if is_pai:
        FLAGS = define_tf_flags()
        num_workers = len(FLAGS.worker_hosts.split(","))
        is_dist_train = num_workers > 1
        oss_model_dir = FLAGS.sqlflow_oss_modeldir

        # Loading the pre-trained model is best effort: a missing
        # "oss_path_to_load" key or a failed download must not abort the
        # run. Unlike the former bare except, KeyboardInterrupt /
        # SystemExit still propagate and failures are reported.
        oss_path_to_load = train_params.pop("oss_path_to_load", None)
        if load and oss_path_to_load is not None:
            try:
                oss.load_file(oss_path_to_load, "my_model")
            except Exception as e:
                sys.stderr.write(
                    "warning: failed to load pre-trained model from %s: %s\n"
                    % (oss_path_to_load, e))

    feature_columns = compile_ir_feature_columns(feature_column_map,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(feature_column_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {
        fd.name: fd.to_dict(dtype_to_string=True)
        for fd in field_descs
    }
    label_meta = label_column.get_field_desc()[0].to_dict(dtype_to_string=True)

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    # These keys are consumed here so they are not left in train_params
    # when it is handed to the trainer.
    batch_size = train_params.pop("batch_size", None)
    epoch = train_params.pop("epoch", 1)
    load_pretrained_model = bool(load)
    disk_cache = train_params.pop("disk_cache", False)

    if is_dist_train:
        # NOTE(typhoonzero): dist_train returns None
        dist_train(flags=FLAGS,
                   datasource=datasource,
                   select=select,
                   model_params=model_params,
                   train_params=train_params,
                   feature_metas=feature_metas,
                   feature_column_names=feature_column_names,
                   label_meta=label_meta,
                   validation_select=validation_select,
                   disk_cache=disk_cache,
                   batch_size=batch_size,
                   epoch=epoch,
                   load_pretrained_model=load_pretrained_model,
                   is_pai=True,
                   pai_train_table=pai_table,
                   pai_validate_table=pai_val_table,
                   oss_model_dir=oss_model_dir,
                   transform_fn=transform_fn,
                   feature_column_code=feature_column_map,
                   model_repo_image=model_image,
                   original_sql=original_sql)
    else:
        return local_train(original_sql,
                           model_image,
                           estimator_string,
                           datasource,
                           select,
                           validation_select,
                           model_params,
                           train_params,
                           feature_metas,
                           feature_column_names,
                           feature_column_map,
                           label_column,
                           transform_fn,
                           save,
                           load=load,
                           is_pai=is_pai,
                           oss_model_dir=oss_model_dir)
def explain(datasource,
            select,
            explainer,
            model_params,
            result_table,
            model,
            pai_table="",
            oss_model_path=""):
    """Do XGBoost model explanation, this function uses the selected data to
    explain a trained model.

    Args:
        datasource: The datasource to load explain data
        select: SQL statement to get the data set
        explainer: "XGBoostExplainer" selects the native XGBoost
            explanation; any other value (including "" and
            "TreeExplainer") selects SHAP
        model_params: explanation attributes; keys starting with
            "summary." are passed to the SHAP summary with that prefix
            removed. None is treated as an empty dict
        result_table: table to store the explanation result
        model: a Model object, or a string that is passed to
            Model.load_from_db
        pai_table: a non-empty value selects PAI mode, where the model and
            its description are loaded from oss_model_path
        oss_model_path: path to the model to be explained (PAI mode only)
    """
    if model_params is None:
        model_params = {}

    # Only the leading "summary." prefix is stripped: str.replace would
    # also delete any later "summary." occurrence inside the key.
    prefix = "summary."
    summary_params = {
        k[len(prefix):]: model_params[k]
        for k in model_params if k.startswith(prefix)
    }

    is_pai = pai_table != ""
    if is_pai:
        # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
        # in xgboost/train.py
        oss.load_file(oss_model_path, "my_model")
        # Only the label description and feature column IR are needed;
        # feature names/metas are rebuilt from the IR below.
        (_, _, _, _, _, label_desc,
         fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")
        label_meta = label_desc.to_dict(dtype_to_string=True)
    else:
        if isinstance(model, six.string_types):
            model = Model.load_from_db(datasource, model)
        else:
            assert isinstance(
                model, Model), "not supported model type %s" % type(model)

        fc_map_ir = model.get_meta("features")
        label_meta = model.get_meta("label").get_field_desc()[0].to_dict(
            dtype_to_string=True)

    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {
        fd.name: fd.to_dict(dtype_to_string=True)
        for fd in field_descs
    }

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    # NOTE(review): model.get_type() is also called in PAI mode, where
    # `model` is not validated to be a Model object -- confirm callers
    # always pass one.
    compiled_fc = compile_ir_feature_columns(fc_map_ir, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    dataset = xgb_shap_dataset(datasource, select, feature_column_names,
                               label_meta, feature_metas, transform_fn)

    bst = xgb.Booster()
    bst.load_model("my_model")

    if explainer == "XGBoostExplainer":
        xgb_native_explain(bst, datasource, result_table)
    else:
        # when explainer is "" or "TreeExplainer" use SHAP by default.
        shap_explain(bst, datasource, dataset, summary_params, result_table)
def train(datasource,
          estimator_string,
          select,
          validation_select,
          feature_columns,
          feature_column_names,
          feature_metas=None,
          label_meta=None,
          model_params=None,
          train_params=None,
          validation_metrics=None,
          disk_cache=False,
          save="",
          batch_size=None,
          epoch=1,
          validation_steps=1,
          verbose=0,
          max_steps=None,
          validation_start_delay_secs=0,
          validation_throttle_secs=0,
          save_checkpoints_steps=100,
          log_every_n_iter=10,
          load_pretrained_model=False,
          is_pai=True,
          pai_table="",
          pai_val_table="",
          feature_columns_code="",
          model_repo_image="",
          original_sql="",
          oss_model_dir_to_load="",
          feature_column_names_map=None):
    """PAI XGBoost training entry point.

    Reads the PAI flags, optionally loads a pre-trained model from OSS and
    then runs distributed training (more than one worker host) or local
    training.

    Only the parameters listed below are used by this function; the
    remaining ones are accepted but not read here.

    Args:
        datasource: the datasource from which to get data
        select: SQL statement selecting the training data
        validation_select: SQL statement selecting the validation data
        feature_columns: column transformers combined into the transform
            function
        feature_column_names: names of the feature columns
        feature_metas: feature meta dict; None means an empty dict
        label_meta: label meta dict; None means an empty dict
        model_params: model parameter dict; None means an empty dict
        train_params: training parameter dict; None means an empty dict
        validation_metrics: not used by this function; None stands in for
            the former ["Accuracy"] default
        disk_cache, batch_size, epoch: passed to the trainer
        load_pretrained_model: when true, "my_model" is loaded from
            oss_model_dir_to_load before training
        pai_table: PAI training table
        pai_val_table: PAI validation table
        feature_columns_code: passed to the trainer as feature_column_code
        model_repo_image: passed to the trainer
        original_sql: passed to the trainer
        oss_model_dir_to_load: OSS path of the pre-trained model
    """
    # The dict defaults used to be shared mutable objects created once at
    # definition time and handed to the trainers; create fresh ones per
    # call instead.
    feature_metas = {} if feature_metas is None else feature_metas
    label_meta = {} if label_meta is None else label_meta
    model_params = {} if model_params is None else model_params
    train_params = {} if train_params is None else train_params

    FLAGS = define_tf_flags()
    num_workers = len(FLAGS.worker_hosts.split(","))
    is_dist_train = num_workers > 1
    oss_model_dir = FLAGS.sqlflow_oss_modeldir

    if load_pretrained_model:
        oss.load_file(oss_model_dir_to_load, "my_model")

    # NOTE: in the current implementation, we are generating a transform_fn
    # from COLUMN clause. The transform_fn is executed during the process of
    # dumping the original data into DMatrix SVM file.
    transform_fn = ComposedColumnTransformer(feature_column_names,
                                             *feature_columns)

    if is_dist_train:
        dist_train(flags=FLAGS,
                   datasource=datasource,
                   select=select,
                   model_params=model_params,
                   train_params=train_params,
                   feature_metas=feature_metas,
                   feature_column_names=feature_column_names,
                   label_meta=label_meta,
                   validation_select=validation_select,
                   disk_cache=disk_cache,
                   batch_size=batch_size,
                   epoch=epoch,
                   load_pretrained_model=load_pretrained_model,
                   is_pai=True,
                   pai_train_table=pai_table,
                   pai_validate_table=pai_val_table,
                   oss_model_dir=oss_model_dir,
                   transform_fn=transform_fn,
                   feature_column_code=feature_columns_code,
                   model_repo_image=model_repo_image,
                   original_sql=original_sql)
    else:
        local_train(datasource=datasource,
                    select=select,
                    model_params=model_params,
                    train_params=train_params,
                    feature_metas=feature_metas,
                    feature_column_names=feature_column_names,
                    label_meta=label_meta,
                    validation_select=validation_select,
                    disk_cache=disk_cache,
                    batch_size=batch_size,
                    epoch=epoch,
                    load_pretrained_model=load_pretrained_model,
                    is_pai=True,
                    pai_train_table=pai_table,
                    pai_validate_table=pai_val_table,
                    rank=0,
                    nworkers=1,
                    oss_model_dir=oss_model_dir,
                    transform_fn=transform_fn,
                    feature_column_code=feature_columns_code,
                    model_repo_image=model_repo_image,
                    original_sql=original_sql)
def train_step(original_sql,
               model_image,
               estimator_string,
               datasource,
               select,
               validation_select,
               pai_table,
               pai_val_table,
               model_params,
               train_params,
               feature_column_map,
               label_column,
               save,
               load=None):
    """PAI XGBoost training step.

    Reads the PAI flags, optionally loads a pre-trained model from OSS,
    infers the feature columns from the selected data and then runs
    distributed training (more than one worker host) or local training.

    Args:
        original_sql: the original SQL statement, passed to the trainer
        model_image: the model repo image, passed to the trainer
        estimator_string: not used by this function
        datasource: the datasource from which to get data
        select: SQL statement selecting the training data
        validation_select: SQL statement selecting the validation data
        pai_table: PAI training table
        pai_val_table: PAI validation table
        model_params: model parameters, passed to the trainer
        train_params: dict of training parameters; must contain
            "oss_path_to_load" (a KeyError is raised otherwise). That key
            and "batch_size", "epoch" and "disk_cache" are popped here
        feature_column_map: feature columns used for inference
        label_column: label column object; its first field description
            provides the label meta
        save: not used by this function
        load: when truthy, a pre-trained model is loaded first
    """
    FLAGS = define_tf_flags()
    num_workers = len(FLAGS.worker_hosts.split(","))
    is_dist_train = num_workers > 1
    oss_model_dir = FLAGS.sqlflow_oss_modeldir

    oss_path_to_load = train_params.pop("oss_path_to_load")
    if load:
        oss.load_file(oss_path_to_load, "my_model")

    # The connection is only needed for feature inference; close it as soon
    # as that is done (it used to be left open).
    conn = db.connect_with_data_source(datasource)
    try:
        # NOTE(review): the inferred label IR (second return value) is not
        # used; label_meta below comes from label_column as passed in --
        # confirm this is intended.
        fc_map_ir, _ = infer_feature_columns(conn,
                                             select,
                                             feature_column_map,
                                             label_column,
                                             n=1000)
    finally:
        conn.close()

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {fd.name: fd.to_dict() for fd in field_descs}
    label_meta = label_column.get_field_desc()[0].to_dict()

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    # These keys are consumed here so they are not left in train_params
    # when it is handed to the trainer.
    batch_size = train_params.pop("batch_size", None)
    epoch = train_params.pop("epoch", 1)
    load_pretrained_model = bool(load)
    disk_cache = train_params.pop("disk_cache", False)

    if is_dist_train:
        dist_train(flags=FLAGS,
                   datasource=datasource,
                   select=select,
                   model_params=model_params,
                   train_params=train_params,
                   feature_metas=feature_metas,
                   feature_column_names=feature_column_names,
                   label_meta=label_meta,
                   validation_select=validation_select,
                   disk_cache=disk_cache,
                   batch_size=batch_size,
                   epoch=epoch,
                   load_pretrained_model=load_pretrained_model,
                   is_pai=True,
                   pai_train_table=pai_table,
                   pai_validate_table=pai_val_table,
                   oss_model_dir=oss_model_dir,
                   transform_fn=transform_fn,
                   feature_column_code=fc_map_ir,
                   model_repo_image=model_image,
                   original_sql=original_sql)
    else:
        local_train(datasource=datasource,
                    select=select,
                    model_params=model_params,
                    train_params=train_params,
                    feature_metas=feature_metas,
                    feature_column_names=feature_column_names,
                    label_meta=label_meta,
                    validation_select=validation_select,
                    disk_cache=disk_cache,
                    batch_size=batch_size,
                    epoch=epoch,
                    load_pretrained_model=load_pretrained_model,
                    is_pai=True,
                    pai_train_table=pai_table,
                    pai_validate_table=pai_val_table,
                    rank=0,
                    nworkers=1,
                    oss_model_dir=oss_model_dir,
                    transform_fn=transform_fn,
                    feature_column_code=fc_map_ir,
                    model_repo_image=model_image,
                    original_sql=original_sql)
def evaluate(datasource,
             select,
             result_table,
             model,
             label_name=None,
             model_params=None,
             pai_table="",
             oss_model_path=""):
    """Evaluate a trained XGBoost model and store the metrics.

    Loads the model and its metadata (from OSS in PAI mode, otherwise from
    the given model), creates the evaluation result table, predicts on the
    selected data batch by batch and stores the evaluation result of each
    batch.

    Args:
        datasource: the datasource from which to get data
        select: SQL statement selecting the evaluation data
        result_table: table to save the evaluation result
        model: a Model object, or a string that is passed to
            Model.load_from_db; only used when pai_table is empty
        label_name: when given, replaces the name of the training label
            description (the description object is modified in place)
        model_params: evaluation attributes; "validation.metrics" is a
            comma-separated list of metric names and defaults to
            "accuracy_score". None is treated as an empty dict
        pai_table: a non-empty value selects PAI mode, where the model and
            its description are loaded from oss_model_path
        oss_model_path: the model path on OSS; must be non-empty in PAI
            mode
    """
    if model_params is None:
        model_params = {}

    validation_metrics = model_params.get("validation.metrics",
                                          "accuracy_score")
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    is_pai = pai_table != ""
    if is_pai:
        assert (oss_model_path != "")
        # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
        # in xgboost/train.py
        oss.load_file(oss_model_path, "my_model")
        # From here on model_params are the ones stored with the model;
        # feature names/metas are rebuilt from the IR below.
        (_, model_params, _, _, _, train_label_desc,
         fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")
    else:
        if isinstance(model, six.string_types):
            model = Model.load_from_db(datasource, model)
        else:
            assert isinstance(
                model, Model), "not supported model type %s" % type(model)

        model_params = model.get_meta("attributes")
        fc_map_ir = model.get_meta("features")
        train_label_desc = model.get_meta("label").get_field_desc()[0]

    if label_name:
        train_label_desc.name = label_name

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {
        fd.name: fd.to_dict(dtype_to_string=True)
        for fd in field_descs
    }

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    try:
        result_column_names = create_evaluate_table(conn, result_table,
                                                    validation_metrics)

        with temp_file.TemporaryDirectory() as tmp_dir_name:
            pred_fn = os.path.join(tmp_dir_name, "predict.txt")

            dpred = xgb_dataset(
                datasource=datasource,
                fn=pred_fn,
                dataset_sql=select,
                feature_metas=feature_metas,
                feature_column_names=feature_column_names,
                label_meta=train_label_desc.to_dict(dtype_to_string=True),
                cache=True,
                batch_size=10000,
                transform_fn=transform_fn)

            for i, pred_dmatrix in enumerate(dpred):
                # Per-batch feature file: the dataset file name plus the
                # batch index suffix.
                feature_file_name = pred_fn + "_%d" % i
                preds = _calc_predict_result(bst, pred_dmatrix, model_params)
                _store_evaluate_result(preds, feature_file_name,
                                       train_label_desc, result_table,
                                       result_column_names,
                                       validation_metrics, conn)
    finally:
        # Always release the connection, even when evaluation fails.
        conn.close()