def shap_explain(booster,
                 datasource,
                 dataset,
                 summary_params,
                 result_table="",
                 is_pai=False,
                 oss_dest=None,
                 oss_ak=None,
                 oss_sk=None,
                 oss_endpoint=None,
                 oss_bucket_name=None):
    tree_explainer = shap.TreeExplainer(booster)
    shap_values = tree_explainer.shap_values(dataset)
    if result_table:
        if is_pai:
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): shap_values may be a list of shape
        # [3, num_samples, num_features]; use the first dimension
        # here, should find out when to use the other two. When
        # shap_values is not a list it can be used directly.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values
        columns = list(dataset.columns)
        with db.buffered_db_writer(conn, result_table, columns) as w:
            for row in to_write:
                w.write(list(row))
        conn.close()

    if summary_params.get("plot_type") == "decision":
        shap_interaction_values = tree_explainer.shap_interaction_values(
            dataset)
        expected_value = tree_explainer.expected_value
        if isinstance(shap_interaction_values, list):
            shap_interaction_values = shap_interaction_values[0]
        if isinstance(expected_value, list):
            expected_value = expected_value[0]
        plot_func = lambda: shap.decision_plot(  # noqa: E731
            expected_value,
            shap_interaction_values,
            dataset,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1)
    else:
        plot_func = lambda: shap.summary_plot(  # noqa: E731
            shap_values, dataset, show=False, **summary_params)

    explainer.plot_and_save(plot_func,
                            oss_dest=oss_dest,
                            oss_ak=oss_ak,
                            oss_sk=oss_sk,
                            oss_endpoint=oss_endpoint,
                            oss_bucket_name=oss_bucket_name,
                            filename='summary')
def pai_download_table_data_worker(dname, feature_metas, feature_column_names,
                                   label_meta, pai_table, slice_id,
                                   slice_count, feature_column_code,
                                   raw_data_dir):
    import runtime.xgboost as xgboost_extended
    if isinstance(feature_column_code, dict):
        # NOTE(typhoonzero): feature_column_code is a dict of
        # runtime.feature.column in refactored step code.
        feature_column_transformers = compile_ir_feature_columns(
            feature_column_code, EstimatorType.XGBOOST)
        transform_fn = \
            xgboost_extended.feature_column.ComposedColumnTransformer(
                feature_column_names,
                *feature_column_transformers["feature_columns"])
    else:
        feature_column_transformers = eval('[{}]'.format(feature_column_code))
        transform_fn = \
            xgboost_extended.feature_column.ComposedColumnTransformer(
                feature_column_names, *feature_column_transformers)

    conn = PaiIOConnection.from_table(pai_table, slice_id, slice_count)
    gen = db.db_generator(conn, None, label_meta=label_meta)()
    selected_cols = db.selected_cols(conn, None)
    filename = "{}/{}.txt".format(dname, slice_id)
    dump_dmatrix(filename,
                 gen,
                 feature_column_names,
                 feature_metas,
                 label_meta,
                 selected_cols,
                 transform_fn=transform_fn,
                 raw_data_dir=raw_data_dir)
def evaluate(datasource,
             estimator_string,
             select,
             result_table,
             feature_columns,
             feature_column_names,
             feature_metas={},
             label_meta={},
             model_params={},
             validation_metrics=["Accuracy"],
             save="",
             batch_size=1,
             validation_steps=None,
             verbose=0,
             pai_table=""):
    FLAGS = define_tf_flags()
    set_oss_environs(FLAGS)

    estimator_cls = import_model(estimator_string)
    is_estimator = is_tf_estimator(estimator_cls)
    set_log_level(verbose, is_estimator)

    is_pai = True if pai_table else False
    eval_dataset = get_dataset_fn(select,
                                  datasource,
                                  feature_column_names,
                                  feature_metas,
                                  label_meta,
                                  is_pai=is_pai,
                                  pai_table=pai_table,
                                  batch_size=batch_size)

    model_params.update(feature_columns)
    pop_optimizer_and_loss(model_params)

    if is_estimator:
        with open("exported_path", "r") as fid:
            exported_path = str(fid.read())

        model_params["warm_start_from"] = exported_path
        estimator = estimator_cls(**model_params)
        result_metrics = estimator_evaluate(estimator, eval_dataset,
                                            validation_metrics)
    else:
        keras_model = init_model_with_feature_column(estimator_cls,
                                                     model_params)
        keras_model_pkg = sys.modules[estimator_cls.__module__]
        result_metrics = keras_evaluate(keras_model, eval_dataset, save,
                                        keras_model_pkg, validation_metrics)

    if result_table:
        metric_name_list = ["loss"] + validation_metrics
        if is_pai:
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)
        write_result_metrics(result_metrics, metric_name_list, result_table,
                             conn)
        conn.close()
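# The evaluate() function above delegates the actual table write to
# write_result_metrics, whose implementation is not shown in this section.
# A minimal, hypothetical sketch of such a helper follows, assuming the same
# db.buffered_db_writer interface used elsewhere in this section and that
# result_metrics is a dict keyed by metric name; the helper name is made up.
def _write_result_metrics_sketch(result_metrics, metric_name_list,
                                 result_table, conn):
    # one row, one column per metric name (e.g. ["loss", "Accuracy"])
    with db.buffered_db_writer(conn, result_table, metric_name_list) as w:
        w.write([result_metrics.get(name) for name in metric_name_list])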
def shap_explain(datasource,
                 select,
                 feature_field_meta,
                 feature_column_names,
                 label_meta,
                 summary_params,
                 result_table="",
                 is_pai=False,
                 pai_explain_table="",
                 oss_dest=None,
                 oss_ak=None,
                 oss_sk=None,
                 oss_endpoint=None,
                 oss_bucket_name=None,
                 transform_fn=None,
                 feature_column_code=""):
    x = xgb_shap_dataset(datasource,
                         select,
                         feature_column_names,
                         label_meta,
                         feature_field_meta,
                         is_pai,
                         pai_explain_table,
                         transform_fn=transform_fn,
                         feature_column_code=feature_column_code)

    shap_values, shap_interaction_values, expected_value = xgb_shap_values(x)

    if result_table != "":
        if is_pai:
            from runtime.dbapi.paiio import PaiIOConnection
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): shap_values may be a list of shape
        # [3, num_samples, num_features]; use the first dimension
        # here, should find out when to use the other two. When
        # shap_values is not a list it can be used directly.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values
        write_shap_values(to_write, conn, result_table, feature_column_names)

    if summary_params.get("plot_type") == "decision":
        explainer.plot_and_save(
            lambda: shap.decision_plot(expected_value,
                                       shap_interaction_values,
                                       x,
                                       show=False,
                                       feature_display_range=slice(
                                           None, -40, -1),
                                       alpha=1), oss_dest, oss_ak, oss_sk,
            oss_endpoint, oss_bucket_name)
    else:
        explainer.plot_and_save(
            lambda: shap.summary_plot(
                shap_values, x, show=False, **summary_params), oss_dest,
            oss_ak, oss_sk, oss_endpoint, oss_bucket_name)
def pred_imp(datasource,
             select,
             feature_metas,
             feature_column_names,
             train_label_meta,
             pred_label_meta,
             result_table,
             is_pai=False,
             pai_table="",
             model_params=None,
             train_params=None,
             transform_fn=None,
             feature_column_code="",
             rank=0,
             nworkers=1):
    print("rank={} nworkers={}".format(rank, nworkers))
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    else:
        conn = PaiIOConnection.from_table(pai_table)
    dpred = xgb_dataset(
        datasource=datasource,
        fn='predict.txt',
        dataset_sql=select,
        feature_metas=feature_metas,
        feature_column_names=feature_column_names,
        label_meta=None,
        is_pai=is_pai,
        pai_table=pai_table,
        pai_single_file=True,
        cache=True,
        batch_size=DEFAULT_PREDICT_BATCH_SIZE,
        rank=rank,
        nworkers=nworkers,
        transform_fn=transform_fn,
        feature_column_code=feature_column_code,
        raw_data_dir="predict.raw.dir")  # NOTE: default to use external memory
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load model
    print("{} Start predicting XGBoost model...".format(datetime.now()))

    if not model_params:
        model_params = load_metadata("model_meta.json")["attributes"]

    selected_cols = db.selected_cols(conn, select)

    feature_file_id = 0
    train_label_name = train_label_meta["feature_name"]
    pred_label_name = pred_label_meta["feature_name"]
    for pred_dmatrix in dpred:
        predict_and_store_result(bst, pred_dmatrix, feature_file_id,
                                 model_params, selected_cols,
                                 train_label_name, pred_label_name,
                                 feature_column_names, feature_metas, is_pai,
                                 conn, result_table, rank)
        feature_file_id += 1
    print("{} Done predicting. Predict table: {}".format(
        datetime.now(), result_table))
def pred(datasource,
         estimator_string,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_column_names_map,
         train_label_name,
         result_col_name,
         feature_metas={},
         model_params={},
         pred_params={},
         save="",
         batch_size=1,
         pai_table=""):
    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)

    if pai_table != "":
        conn = PaiIOConnection.from_table(pai_table)
        selected_cols = db.selected_cols(conn, None)
        predict_generator = db.db_generator(conn, None)
    else:
        conn = db.connect_with_data_source(datasource)
        selected_cols = db.selected_cols(conn, select)
        predict_generator = db.db_generator(conn, select)

    pop_optimizer_and_loss(model_params)

    if pred_params is None:
        extra_result_cols = []
    else:
        extra_result_cols = pred_params.get("extra_outputs", "")
        extra_result_cols = [
            c.strip() for c in extra_result_cols.split(",") if c.strip()
        ]

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      result_col_name, conn, predict_generator, selected_cols,
                      extra_result_cols)
    else:
        # TODO(sneaxiy): support extra_result_cols for estimator
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, result_col_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
def explain(datasource,
            select,
            feature_field_meta,
            feature_column_names,
            label_meta,
            summary_params,
            explainer="TreeExplainer",
            result_table="",
            is_pai=False,
            pai_explain_table="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None,
            transform_fn=None,
            feature_column_code=""):
    if explainer == "XGBoostExplainer":
        if result_table == "":
            raise ValueError("XGBoostExplainer must be used with INTO to "
                             "output the result to a table.")
        bst = xgb.Booster()
        bst.load_model("my_model")
        gain_map = bst.get_score(importance_type="gain")
        fscore_map = bst.get_fscore()
        if is_pai:
            from runtime.dbapi.paiio import PaiIOConnection
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)

        all_feature_keys = list(gain_map.keys())
        all_feature_keys.sort()
        with db.buffered_db_writer(conn, result_table,
                                   ["feature", "fscore", "gain"], 100) as w:
            for fkey in all_feature_keys:
                row = [fkey, fscore_map[fkey], gain_map[fkey]]
                w.write(list(row))
    else:
        # when explainer is "" or "TreeExplainer" use SHAP by default.
        shap_explain(datasource,
                     select,
                     feature_field_meta,
                     feature_column_names,
                     label_meta,
                     summary_params,
                     result_table=result_table,
                     is_pai=is_pai,
                     pai_explain_table=pai_explain_table,
                     oss_dest=oss_dest,
                     oss_ak=oss_ak,
                     oss_sk=oss_sk,
                     oss_endpoint=oss_endpoint,
                     oss_bucket_name=oss_bucket_name,
                     transform_fn=transform_fn,
                     feature_column_code=feature_column_code)
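# Illustration only, not part of the runtime: bst.get_fscore() returns a dict
# mapping the booster's feature names (e.g. "f0", "f1" when no explicit names
# were set) to the number of splits that use each feature, and
# bst.get_score(importance_type="gain") maps the same names to the average
# gain of those splits. Features never used in any split are absent from both
# dicts, which is why the writer loop above iterates over gain_map's keys
# rather than over all feature columns.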
def pred(datasource,
         select,
         feature_metas,
         feature_column_names,
         train_label_meta,
         pred_label_meta,
         result_table,
         is_pai=False,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         pai_table="",
         model_params=None,
         train_params=None,
         transform_fn=None,
         feature_column_code=""):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    else:
        conn = PaiIOConnection.from_table(pai_table)
    dpred = xgb_dataset(
        datasource=datasource,
        fn='predict.txt',
        dataset_sql=select,
        feature_metas=feature_metas,
        feature_column_names=feature_column_names,
        label_meta=None,
        is_pai=is_pai,
        pai_table=pai_table,
        pai_single_file=True,
        cache=True,
        batch_size=DEFAULT_PREDICT_BATCH_SIZE,
        transform_fn=transform_fn,
        feature_column_code=feature_column_code,
        raw_data_dir="predict.raw.dir")  # NOTE: default to use external memory
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load model
    print("Start predicting XGBoost model...")

    selected_cols = db.selected_cols(conn, select)

    feature_file_id = 0
    train_label_name = train_label_meta["feature_name"]
    pred_label_name = pred_label_meta["feature_name"]
    for pred_dmatrix in dpred:
        predict_and_store_result(bst, pred_dmatrix, feature_file_id,
                                 model_params, selected_cols,
                                 train_label_name, pred_label_name,
                                 feature_column_names, feature_metas, is_pai,
                                 conn, result_table, hdfs_namenode_addr,
                                 hive_location, hdfs_user, hdfs_pass)
        feature_file_id += 1
    print("Done predicting. Predict table : %s" % result_table)
def _explain(datasource,
             estimator_string,
             select,
             feature_columns,
             feature_column_names,
             feature_metas={},
             label_meta={},
             model_params={},
             save="",
             pai_table="",
             plot_type='bar',
             result_table="",
             oss_dest=None,
             oss_ak=None,
             oss_sk=None,
             oss_endpoint=None,
             oss_bucket_name=None):
    estimator_cls = import_model(estimator_string)
    FLAGS = tf.app.flags.FLAGS
    model_params["model_dir"] = FLAGS.checkpointDir
    model_params.update(feature_columns)

    def _input_fn():
        dataset = input_fn("",
                           datasource,
                           feature_column_names,
                           feature_metas,
                           label_meta,
                           is_pai=True,
                           pai_table=pai_table)
        return dataset.batch(1).cache()

    estimator = init_model_with_feature_column(estimator_cls, model_params)
    driver = "paiio"
    conn = PaiIOConnection.from_table(result_table) if result_table else None
    if estimator_cls in (tf.estimator.BoostedTreesClassifier,
                         tf.estimator.BoostedTreesRegressor):
        explain_boosted_trees(datasource, estimator, _input_fn, plot_type,
                              result_table, feature_column_names, driver,
                              conn, "", "", "", "", oss_dest, oss_ak, oss_sk,
                              oss_endpoint, oss_bucket_name)
    else:
        shap_dataset = pd.DataFrame(columns=feature_column_names)
        for i, (features, label) in enumerate(_input_fn()):
            shap_dataset.loc[i] = [
                item.numpy()[0][0] for item in features.values()
            ]
        explain_dnns(datasource, estimator, shap_dataset, plot_type,
                     result_table, feature_column_names, driver, conn, "", "",
                     "", "", oss_dest, oss_ak, oss_sk, oss_endpoint,
                     oss_bucket_name)
def get_pai_table_slice_count(table, nworkers, batch_size):
    if batch_size is None or batch_size <= 0:
        batch_size = 4096  # default batch_size

    row_cnt = PaiIOConnection.from_table(table).get_table_row_num()

    assert row_cnt >= nworkers, \
        "Data number {} should not be less than worker number {}".format(
            row_cnt, nworkers)

    slice_num_per_worker = max(int(row_cnt / (nworkers * batch_size)), 1)
    slice_count = slice_num_per_worker * nworkers

    print('row_cnt = {}, slice_count = {}, nworkers = {}'.format(
        row_cnt, slice_count, nworkers))

    return slice_count
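# Worked example for the slice arithmetic above (illustration only): with
# row_cnt = 1000000 rows, nworkers = 8 and batch_size = 4096,
# slice_num_per_worker is max(int(1000000 / (8 * 4096)), 1) = 30 and
# slice_count is 30 * 8 = 240, so each of the 8 workers can read 30 roughly
# batch-sized slices via PaiIOConnection.from_table(table, slice_id, 240).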
def _evaluate(datasource,
              estimator_string,
              select,
              result_table,
              feature_columns,
              feature_column_names,
              feature_metas={},
              label_meta={},
              model_params={},
              validation_metrics=["Accuracy"],
              save="",
              batch_size=1,
              validation_steps=None,
              verbose=0,
              pai_table=""):
    estimator_cls = import_model(estimator_string)
    is_estimator = is_tf_estimator(estimator_cls)
    set_log_level(verbose, is_estimator)
    eval_dataset = get_dataset_fn(select,
                                  datasource,
                                  feature_column_names,
                                  feature_metas,
                                  label_meta,
                                  is_pai=True,
                                  pai_table=pai_table,
                                  batch_size=batch_size)

    model_params.update(feature_columns)
    pop_optimizer_and_loss(model_params)

    if is_estimator:
        FLAGS = tf.app.flags.FLAGS
        model_params["model_dir"] = FLAGS.checkpointDir
        estimator = estimator_cls(**model_params)
        result_metrics = estimator_evaluate(estimator, eval_dataset,
                                            validation_metrics)
    else:
        keras_model = init_model_with_feature_column(estimator_cls,
                                                     model_params)
        keras_model_pkg = sys.modules[estimator_cls.__module__]
        result_metrics = keras_evaluate(keras_model, eval_dataset, save,
                                        keras_model_pkg, validation_metrics)

    if result_table:
        metric_name_list = ["loss"] + validation_metrics
        write_result_metrics(result_metrics, metric_name_list, result_table,
                             PaiIOConnection.from_table(result_table))
def evaluate(datasource,
             select,
             feature_metas,
             feature_column_names,
             label_meta,
             result_table,
             validation_metrics=["accuracy_score"],
             is_pai=False,
             pai_table="",
             model_params=None,
             transform_fn=None,
             feature_column_code=""):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    else:
        conn = PaiIOConnection.from_table(pai_table)
    dpred = xgb_dataset(datasource,
                        'predict.txt',
                        select,
                        feature_metas,
                        feature_column_names,
                        label_meta,
                        is_pai,
                        pai_table,
                        True,
                        True,
                        batch_size=DEFAULT_PREDICT_BATCH_SIZE,
                        transform_fn=transform_fn,
                        feature_column_code=feature_column_code
                        )  # NOTE: default to use external memory
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load model
    if not model_params:
        model_params = load_metadata("model_meta.json")["attributes"]
    print("Start evaluating XGBoost model...")
    feature_file_id = 0
    for pred_dmatrix in dpred:
        evaluate_and_store_result(bst, pred_dmatrix, feature_file_id,
                                  validation_metrics, model_params,
                                  feature_column_names, label_meta, is_pai,
                                  conn, result_table)
        feature_file_id += 1
    print("Done evaluating. Result table : %s" % result_table)
def pai_download_table_data_worker(dname, feature_metas, feature_column_names,
                                   label_meta, pai_table, slice_id,
                                   slice_count, feature_column_code,
                                   raw_data_dir):
    import runtime.xgboost as xgboost_extended
    feature_column_transformers = eval('[{}]'.format(feature_column_code))
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *feature_column_transformers)
    conn = PaiIOConnection.from_table(pai_table, slice_id, slice_count)
    gen = db.db_generator(conn, None, label_meta=label_meta)()
    selected_cols = db.selected_cols(conn, None)
    filename = "{}/{}.txt".format(dname, slice_id)
    dump_dmatrix(filename,
                 gen,
                 feature_column_names,
                 feature_metas,
                 label_meta,
                 selected_cols,
                 transform_fn=transform_fn,
                 raw_data_dir=raw_data_dir)
def _predict(datasource,
             estimator_string,
             select,
             result_table,
             feature_columns,
             feature_column_names,
             feature_column_names_map,
             train_label_name,
             result_col_name,
             feature_metas={},
             model_params={},
             save="",
             batch_size=1,
             pai_table=""):
    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)

    conn = PaiIOConnection.from_table(pai_table)
    selected_cols = db.selected_cols(conn, None)
    predict_generator = db.db_generator(conn, None)

    pop_optimizer_and_loss(model_params)

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      result_col_name, conn, predict_generator, selected_cols)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, result_col_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
def xgb_shap_dataset(datasource,
                     select,
                     feature_column_names,
                     label_meta,
                     feature_metas,
                     is_pai,
                     pai_explain_table,
                     transform_fn=None,
                     feature_column_code=""):
    if is_pai:
        # TODO(lhw): we may specify pai_explain_table in datasource
        # and discard the condition statement here.
        conn = PaiIOConnection.from_table(pai_explain_table)
        stream = db.db_generator(conn, None, label_meta)
    else:
        conn = db.connect_with_data_source(datasource)
        stream = db.db_generator(conn, select, label_meta)
    selected_cols = db.selected_cols(conn, select)

    if transform_fn:
        feature_names = transform_fn.get_feature_column_names()
    else:
        feature_names = feature_column_names

    xs = None
    dtypes = []
    sizes = []
    offsets = []

    i = 0
    for row, label in stream():
        features = db.read_features_from_row(row,
                                              selected_cols,
                                              feature_column_names,
                                              feature_metas,
                                              is_xgboost=True)
        if transform_fn:
            features = transform_fn(features)

        flatten_features = []
        for j, feature in enumerate(features):
            if len(feature) == 3:  # convert sparse to dense
                col_indices, values, dense_shape = feature
                size = int(np.prod(dense_shape))
                row_indices = np.zeros(shape=[col_indices.size])
                sparse_matrix = scipy.sparse.csr_matrix(
                    (values, (row_indices, col_indices)), shape=[1, size])
                values = sparse_matrix.toarray()
            else:
                values = feature[0]

            if isinstance(values, np.ndarray):
                flatten_features.extend(values.flatten().tolist())
                if i == 0:
                    sizes.append(values.size)
                    dtypes.append(infer_dtype(values))
            else:
                flatten_features.append(values)
                if i == 0:
                    sizes.append(1)
                    dtypes.append(infer_dtype(values))

        # Create the column name according to the feature number
        # of each column.
        #
        # If the column "c" contains only 1 feature, the result
        # column name would be "c" too.
        #
        # If the column "c" contains 3 features,
        # the result column names would be "c_0", "c_1" and "c_2".
        if i == 0:
            offsets = np.cumsum([0] + sizes)
            column_names = []
            for j in six.moves.range(len(offsets) - 1):
                start = offsets[j]
                end = offsets[j + 1]
                if end - start == 1:
                    column_names.append(feature_names[j])
                else:
                    for k in six.moves.range(start, end):
                        column_names.append('{}_{}'.format(
                            feature_names[j], k))

            xs = pd.DataFrame(columns=column_names)

        xs.loc[i] = flatten_features

        i += 1
    # NOTE(typhoonzero): set dtype to the feature's actual type, or the dtype
    # may be "object". Use below code to reproduce:
    # import pandas as pd
    # feature_column_names=["a", "b"]
    # xs = pd.DataFrame(columns=feature_column_names)
    # for i in range(10):
    #     xs.loc[i] = [int(j) for j in range(2)]
    # print(xs.dtypes)
    columns = xs.columns
    for i, dtype in enumerate(dtypes):
        for j in six.moves.range(offsets[i], offsets[i + 1]):
            xs[columns[j]] = xs[columns[j]].astype(dtype)

    return xs
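# A minimal, self-contained sketch (illustration only, the helper name is
# made up) of the sparse-to-dense step inside xgb_shap_dataset above: a
# feature arriving as (col_indices, values, dense_shape) is expanded into one
# dense row, which is why a single column "c" with dense_shape [3] becomes
# the columns "c_0", "c_1" and "c_2" of the SHAP DataFrame.
def _densify_sparse_feature_sketch():
    import numpy as np
    import scipy.sparse

    col_indices = np.array([0, 2])   # positions of the non-zero entries
    values = np.array([1.5, 3.0])    # the non-zero values
    dense_shape = [3]                # total width of this feature column

    size = int(np.prod(dense_shape))
    row_indices = np.zeros(shape=[col_indices.size])
    dense = scipy.sparse.csr_matrix((values, (row_indices, col_indices)),
                                    shape=[1, size]).toarray()
    # dense == [[1.5, 0.0, 3.0]]; flattened it fills "c_0", "c_1" and "c_2"
    return dense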
def predict_step(datasource,
                 select,
                 result_table,
                 label_name,
                 model,
                 pai_table=None):
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    label_meta = model.get_meta("label")
    train_label_desc = label_meta.get_field_desc()[0] if label_meta else None
    train_label_name = train_label_desc.name if train_label_desc else None
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    is_pai = True if pai_table else False
    if is_pai:
        select = "SELECT * FROM %s" % pai_table

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, label_name)

    if is_pai:
        conn.close()
        conn = PaiIOConnection.from_table(pai_table)
        select = None

    selected_cols = result_column_names[0:-1]
    if train_label_idx >= 0:
        selected_cols = selected_cols[0:train_label_idx] + [
            train_label_name
        ] + selected_cols[train_label_idx:]

    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)
    predict_generator = db.db_generator(conn, select)

    pop_optimizer_and_loss(model_params)

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      label_name, conn, predict_generator, selected_cols)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, label_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
    conn.close()
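# Worked example (illustration only) for the selected_cols re-ordering in
# predict_step above: if create_predict_table returns result_column_names
# ["sepal_length", "sepal_width", "class"] (the last entry being the output
# column) and train_label_idx == 1, then selected_cols starts as
# ["sepal_length", "sepal_width"] and the training label name is re-inserted
# at index 1, giving ["sepal_length", train_label_name, "sepal_width"];
# presumably this keeps the column names aligned with the rows yielded by
# db_generator, which still contain the training label column.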
def predict(datasource,
            select,
            result_table,
            result_column_names,
            train_label_idx,
            model,
            extra_result_cols=[],
            pai_table=None):
    """Load an XGBoost model and predict on the data selected by `select`
    (or read from `pai_table` on PAI), writing the results into
    `result_table`.
    """
    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        # NOTE(typhoonzero): must run Model.load_from_db in a temp
        # directory; calling pyodps in the current directory on PAI
        # workers will cause paiio to fail.
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    model_params = model.get_meta("attributes")
    fc_map_ir = model.get_meta("features")
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    is_pai = True if pai_table else False
    if is_pai:
        conn = PaiIOConnection.from_table(pai_table)
    else:
        conn = db.connect_with_data_source(datasource)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(datasource=datasource,
                            fn=pred_fn,
                            dataset_sql=select,
                            feature_metas=feature_metas,
                            feature_column_names=feature_column_names,
                            label_meta=None,
                            cache=True,
                            batch_size=10000,
                            transform_fn=transform_fn,
                            raw_data_dir=raw_data_dir,
                            is_pai=is_pai,
                            pai_table=pai_table,
                            pai_single_file=True,
                            feature_column_code=fc_map_ir)

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            if is_pai:
                feature_file_name = os.path.join(tmp_dir_name,
                                                 "predict.txt.raw")
            else:
                feature_file_name = os.path.join(
                    tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_predict_result(preds, result_table, result_column_names,
                                  train_label_idx, feature_file_name, conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()
def explain(datasource,
            estimator_string,
            select,
            feature_columns,
            feature_column_names,
            feature_metas={},
            label_meta={},
            model_params={},
            save="",
            pai_table="",
            plot_type='bar',
            result_table="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None):
    estimator_cls = import_model(estimator_string)
    is_pai = True if pai_table else False
    if is_pai:
        FLAGS = tf.app.flags.FLAGS
        model_params["model_dir"] = FLAGS.checkpointDir
        select = ""
    else:
        if is_tf_estimator(estimator_cls):
            model_params['model_dir'] = save
    model_params.update(feature_columns)
    pop_optimizer_and_loss(model_params)

    def _input_fn():
        dataset = input_fn(select,
                           datasource,
                           feature_column_names,
                           feature_metas,
                           label_meta,
                           is_pai=is_pai,
                           pai_table=pai_table)
        return dataset.batch(1).cache()

    estimator = init_model_with_feature_column(estimator_cls, model_params)
    if not is_tf_estimator(estimator_cls):
        load_keras_model_weights(estimator, save)

    if is_pai:
        conn = PaiIOConnection.from_table(
            result_table) if result_table else None
    else:
        conn = connect_with_data_source(datasource)

    if estimator_cls in (tf.estimator.BoostedTreesClassifier,
                         tf.estimator.BoostedTreesRegressor):
        explain_boosted_trees(datasource, estimator, _input_fn, plot_type,
                              result_table, feature_column_names, conn,
                              oss_dest, oss_ak, oss_sk, oss_endpoint,
                              oss_bucket_name)
    else:
        shap_dataset = pd.DataFrame(columns=feature_column_names)
        for i, (features, label) in enumerate(_input_fn()):
            shap_dataset.loc[i] = [
                item.numpy()[0][0] for item in features.values()
            ]
        explain_dnns(datasource, estimator, shap_dataset, plot_type,
                     result_table, feature_column_names, conn, oss_dest,
                     oss_ak, oss_sk, oss_endpoint, oss_bucket_name)

    if conn is not None:
        conn.close()
def explain(datasource,
            select,
            feature_field_meta,
            feature_column_names,
            label_meta,
            summary_params,
            result_table="",
            is_pai=False,
            pai_explain_table="",
            hdfs_namenode_addr="",
            hive_location="",
            hdfs_user="",
            hdfs_pass="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None,
            transform_fn=None,
            feature_column_code=""):
    x = xgb_shap_dataset(datasource,
                         select,
                         feature_column_names,
                         label_meta,
                         feature_field_meta,
                         is_pai,
                         pai_explain_table,
                         transform_fn=transform_fn,
                         feature_column_code=feature_column_code)

    shap_values, shap_interaction_values, expected_value = xgb_shap_values(x)

    if result_table != "":
        if is_pai:
            from runtime.dbapi.paiio import PaiIOConnection
            conn = PaiIOConnection.from_table(result_table)
            # TODO(typhoonzero): the shape of shap_values is
            # (3, num_samples, num_features), use the first
            # dimension here, should find out how to use
            # the other two.
        else:
            conn = db.connect_with_data_source(datasource)
        write_shap_values(shap_values[0], conn, result_table,
                          feature_column_names)
        return

    if summary_params.get("plot_type") == "decision":
        explainer.plot_and_save(
            lambda: shap.decision_plot(expected_value,
                                       shap_interaction_values,
                                       x,
                                       show=False,
                                       feature_display_range=slice(
                                           None, -40, -1),
                                       alpha=1), oss_dest, oss_ak, oss_sk,
            oss_endpoint, oss_bucket_name)
    else:
        explainer.plot_and_save(
            lambda: shap.summary_plot(
                shap_values, x, show=False, **summary_params), oss_dest,
            oss_ak, oss_sk, oss_endpoint, oss_bucket_name)
def evaluate(datasource,
             select,
             result_table,
             model,
             label_name=None,
             model_params=None,
             result_column_names=[],
             pai_table=None):
    """Load an XGBoost model, evaluate it on the data selected by `select`
    (or read from `pai_table` on PAI), and write the metric results into
    `result_table`.
    """
    if model_params is None:
        model_params = {}
    validation_metrics = model_params.get("validation.metrics",
                                          "accuracy_score")
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    model_params = model.get_meta("attributes")
    fc_map_ir = model.get_meta("features")
    train_label = model.get_meta("label")
    train_label_desc = train_label.get_field_desc()[0]

    if label_name:
        train_label_desc.name = label_name

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    is_pai = True if pai_table else False
    if is_pai:
        conn = PaiIOConnection.from_table(pai_table)
    else:
        conn = db.connect_with_data_source(datasource)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=train_label_desc.to_dict(dtype_to_string=True),
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            is_pai=is_pai,
            pai_table=pai_table,
            pai_single_file=True,
            feature_column_code=fc_map_ir)

        for i, pred_dmatrix in enumerate(dpred):
            if is_pai:
                feature_file_name = pred_fn
            else:
                feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name, train_label_desc,
                                   result_table, result_column_names,
                                   validation_metrics, conn)

    conn.close()
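# Hypothetical sketch (not the actual implementation of
# _store_evaluate_result, and the helper name is made up): one way metric
# names such as "accuracy_score" can be resolved is by looking them up on
# sklearn.metrics and applying them to the true and predicted labels.
def _calc_validation_metrics_sketch(y_true, y_pred, validation_metrics):
    import sklearn.metrics

    results = {}
    for metric_name in validation_metrics:
        # e.g. "accuracy_score" resolves to sklearn.metrics.accuracy_score
        metric_fn = getattr(sklearn.metrics, metric_name)
        results[metric_name] = metric_fn(y_true, y_pred)
    return results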