def explain_dnns(datasource, estimator, shap_dataset, plot_type, result_table,
                 feature_column_names, is_pai, pai_table, hdfs_namenode_addr,
                 hive_location, hdfs_user, hdfs_pass):
    def predict(d):
        def input_fn():
            return tf.data.Dataset.from_tensor_slices(
                dict(pd.DataFrame(d, columns=shap_dataset.columns))).batch(1)

        return np.array(
            [p['probabilities'][0] for p in estimator.predict(input_fn)])

    shap_values = shap.KernelExplainer(predict,
                                       shap_dataset).shap_values(shap_dataset)
    # Debug output: dump the raw SHAP values and the per-row value counts.
    print(shap_values)
    for row in shap_values:
        print(list(row))
        print(len(list(row)))
    if result_table != "":
        if is_pai:
            write_shap_values(shap_values, "pai_maxcompute", None,
                              result_table, feature_column_names,
                              hdfs_namenode_addr, hive_location, hdfs_user,
                              hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_shap_values(shap_values, conn.driver, conn, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)
    else:
        explainer.plot_and_save(lambda: shap.summary_plot(
            shap_values, shap_dataset, show=False, plot_type=plot_type))
def explain_dnns(datasource, estimator, shap_dataset, plot_type, result_table,
                 feature_column_names, is_pai, pai_table, hdfs_namenode_addr,
                 hive_location, hdfs_user, hdfs_pass):
    def predict(d):
        def input_fn():
            return tf.data.Dataset.from_tensor_slices(
                dict(pd.DataFrame(
                    d, columns=shap_dataset.columns))).batch(1000)

        return np.array(
            [p['probabilities'][-1] for p in estimator.predict(input_fn)])

    if len(shap_dataset) > 100:
        # Reduce to 16 weighted samples to speed up
        shap_dataset_summary = shap.kmeans(shap_dataset, 16)
    else:
        shap_dataset_summary = shap_dataset
    shap_values = shap.KernelExplainer(
        predict, shap_dataset_summary).shap_values(shap_dataset, l1_reg="aic")
    if result_table != "":
        if is_pai:
            write_shap_values(shap_values, "pai_maxcompute", None,
                              result_table, feature_column_names,
                              hdfs_namenode_addr, hive_location, hdfs_user,
                              hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_shap_values(shap_values, conn.driver, conn, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)
    else:
        explainer.plot_and_save(lambda: shap.summary_plot(
            shap_values, shap_dataset, show=False, plot_type=plot_type))
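# Why the kmeans summarization above matters: KernelExplainer re-evaluates the
# model once per background sample for every feature-subset perturbation, so a
# large background set makes explanation time explode. A minimal, runnable
# sketch of the same pattern with a scikit-learn model (the model and toy data
# are assumptions for illustration; they are not part of this module):
import numpy as np
import pandas as pd
import shap
from sklearn.linear_model import LogisticRegression

X = pd.DataFrame(np.random.rand(200, 4), columns=["f0", "f1", "f2", "f3"])
y = (X["f0"] + X["f1"] > 1.0).astype(int)
model = LogisticRegression().fit(X, y)

# Summarize the 200-row background to 16 weighted centroids, as above.
background = shap.kmeans(X, 16)
kernel_explainer = shap.KernelExplainer(
    lambda d: model.predict_proba(d)[:, -1], background)
shap_values = kernel_explainer.shap_values(X, l1_reg="aic")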
def explain(datasource, select, feature_field_meta, label_name,
            summary_params):
    feature_column_names = [k["name"] for k in feature_field_meta]
    feature_specs = {k['name']: k for k in feature_field_meta}
    x = xgb_shap_dataset(datasource, select, feature_column_names, label_name,
                         feature_specs)
    shap_values = xgb_shap_values(x)
    # save summary.png using the default backend
    explainer.plot_and_save(lambda: shap.summary_plot(
        shap_values, x, show=False, **summary_params))
def explain(datasource,
            select,
            feature_field_meta,
            feature_column_names,
            label_spec,
            summary_params,
            result_table="",
            is_pai=False,
            pai_explain_table="",
            hdfs_namenode_addr="",
            hive_location="",
            hdfs_user="",
            hdfs_pass="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None):
    x = xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                         feature_field_meta, is_pai, pai_explain_table)
    shap_values, shap_interaction_values, expected_value = xgb_shap_values(x)
    if result_table != "":
        if is_pai:
            # TODO(typhoonzero): the shape of shap_values is
            # (3, num_samples, num_features); use the first dimension here,
            # should find out how to use the other two.
            write_shap_values(shap_values[0], "pai_maxcompute", None,
                              result_table, feature_column_names,
                              hdfs_namenode_addr, hive_location, hdfs_user,
                              hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_shap_values(shap_values[0], conn.driver, conn, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)
        return

    if summary_params.get("plot_type") == "decision":
        explainer.plot_and_save(
            lambda: shap.decision_plot(expected_value,
                                       shap_interaction_values,
                                       x,
                                       show=False,
                                       feature_display_range=slice(
                                           None, -40, -1),
                                       alpha=1), is_pai, oss_dest, oss_ak,
            oss_sk, oss_endpoint, oss_bucket_name)
    else:
        explainer.plot_and_save(
            lambda: shap.summary_plot(
                shap_values, x, show=False, **summary_params), is_pai,
            oss_dest, oss_ak, oss_sk, oss_endpoint, oss_bucket_name)
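# What xgb_shap_values() presumably computes, judging from the unpacking
# above: per-sample SHAP values, SHAP interaction values, and the explainer's
# expected value. A runnable sketch using shap.TreeExplainer directly (the
# XGBoost model and toy data are assumptions for illustration):
import numpy as np
import pandas as pd
import shap
import xgboost as xgb

x = pd.DataFrame(np.random.rand(100, 3), columns=["a", "b", "c"])
y = (x["a"] > 0.5).astype(int)
bst = xgb.train({"objective": "binary:logistic"}, xgb.DMatrix(x, label=y),
                num_boost_round=10)

tree_explainer = shap.TreeExplainer(bst)
shap_values = tree_explainer.shap_values(x)
shap_interaction_values = tree_explainer.shap_interaction_values(x)
expected_value = tree_explainer.expected_value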
def explain_dnns(datasource, estimator, shap_dataset, plot_type, result_table,
                 feature_column_names, is_pai, pai_table, hdfs_namenode_addr,
                 hive_location, hdfs_user, hdfs_pass, oss_dest, oss_ak,
                 oss_sk, oss_endpoint, oss_bucket_name):
    def predict(d):
        if len(d) == 1:
            # This is to make sure the SHAP progress bar displays properly:
            # 1. The newline makes the progress bar string captured in pipe
            # 2. The ASCII control code moves cursor up twice for alignment
            print("\033[A" * 2)

        def input_fn():
            return tf.data.Dataset.from_tensor_slices(
                dict(pd.DataFrame(
                    d, columns=shap_dataset.columns))).batch(1000)

        if plot_type == 'bar':
            # Keep the full per-class output vector so KernelExplainer
            # produces one SHAP matrix per class.
            predictions = [
                p['logits'] if 'logits' in p else p['predictions']
                for p in estimator.predict(input_fn)
            ]
        else:
            # Use only the last output so KernelExplainer produces a single
            # SHAP matrix.
            predictions = [
                p['logits'][-1] if 'logits' in p else p['predictions'][-1]
                for p in estimator.predict(input_fn)
            ]
        return np.array(predictions)

    if len(shap_dataset) > 100:
        # Reduce to 16 weighted samples to speed up
        shap_dataset_summary = shap.kmeans(shap_dataset, 16)
    else:
        shap_dataset_summary = shap_dataset
    shap_values = shap.KernelExplainer(
        predict, shap_dataset_summary).shap_values(shap_dataset, l1_reg="aic")
    if result_table != "":
        if is_pai:
            write_shap_values(shap_values, "pai_maxcompute", None,
                              result_table, feature_column_names,
                              hdfs_namenode_addr, hive_location, hdfs_user,
                              hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_shap_values(shap_values, conn.driver, conn, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)
    explainer.plot_and_save(
        lambda: shap.summary_plot(
            shap_values, shap_dataset, show=False, plot_type=plot_type),
        is_pai, oss_dest, oss_ak, oss_sk, oss_endpoint, oss_bucket_name)
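# The plot_type == 'bar' branch above keeps the whole logits vector, and
# KernelExplainer then returns one SHAP matrix per model output (a list),
# which summary_plot renders as a grouped bar chart; the scalar branch yields
# a single matrix. A minimal sketch of that contract (the toy functions and
# data are assumptions for illustration):
import numpy as np
import shap

background = np.random.rand(20, 3)

def f_vector(d):
    # One value per class and sample -> shap_values is a list of matrices.
    return np.stack([d.sum(axis=1), -d.sum(axis=1)], axis=1)

def f_scalar(d):
    # One value per sample -> shap_values is a single matrix.
    return d.sum(axis=1)

per_class_values = shap.KernelExplainer(
    f_vector, background).shap_values(background)
single_values = shap.KernelExplainer(
    f_scalar, background).shap_values(background)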
def explain_dnns(datasource, estimator, shap_dataset, plot_type, result_table,
                 feature_column_names, hdfs_namenode_addr, hive_location,
                 hdfs_user, hdfs_pass):
    def predict(d):
        def input_fn():
            return tf.data.Dataset.from_tensor_slices(
                dict(pd.DataFrame(d, columns=shap_dataset.columns))).batch(1)

        return np.array(
            [p['probabilities'][0] for p in estimator.predict(input_fn)])

    shap_values = shap.KernelExplainer(predict,
                                       shap_dataset).shap_values(shap_dataset)
    explainer.plot_and_save(lambda: shap.summary_plot(
        shap_values, shap_dataset, show=False, plot_type=plot_type))
def explain_boosted_trees(datasource, estimator, input_fn, plot_type,
                          result_table, feature_column_names,
                          hdfs_namenode_addr, hive_location, hdfs_user,
                          hdfs_pass):
    result = estimator.experimental_predict_with_explanations(input_fn)
    pred_dicts = list(result)
    # 'dfc' holds the per-example directional feature contributions.
    df_dfc = pd.DataFrame([pred['dfc'] for pred in pred_dicts])
    dfc_mean = df_dfc.abs().mean()
    if result_table != "":
        conn = connect_with_data_source(datasource)
        gain = estimator.experimental_feature_importances(normalize=True)
        create_explain_result_table(conn, result_table)
        write_dfc_result(dfc_mean, gain, result_table, conn,
                         feature_column_names, hdfs_namenode_addr,
                         hive_location, hdfs_user, hdfs_pass)
    explainer.plot_and_save(lambda: eval(plot_type)(df_dfc))
def explain(datasource, select, feature_field_meta, label_spec,
            summary_params):
    feature_column_names = [k["name"] for k in feature_field_meta]
    feature_specs = {k['name']: k for k in feature_field_meta}
    x = xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                         feature_specs)
    shap_values, shap_interaction_values, expected_value = xgb_shap_values(x)
    if summary_params.get("plot_type") == "decision":
        explainer.plot_and_save(lambda: shap.decision_plot(
            expected_value,
            shap_interaction_values,
            x,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1))
    else:
        explainer.plot_and_save(lambda: shap.summary_plot(
            shap_values, x, show=False, **summary_params))
def explain_boosted_trees(datasource, estimator, input_fn, plot_type,
                          result_table, feature_column_names, is_pai,
                          pai_table, hdfs_namenode_addr, hive_location,
                          hdfs_user, hdfs_pass, oss_dest, oss_ak, oss_sk,
                          oss_endpoint, oss_bucket_name):
    result = estimator.experimental_predict_with_explanations(input_fn)
    pred_dicts = list(result)
    df_dfc = pd.DataFrame([pred['dfc'] for pred in pred_dicts])
    dfc_mean = df_dfc.abs().mean()
    gain = estimator.experimental_feature_importances(normalize=True)
    if result_table != "":
        if is_pai:
            write_dfc_result(dfc_mean, gain, result_table, "pai_maxcompute",
                             None, feature_column_names, hdfs_namenode_addr,
                             hive_location, hdfs_user, hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_dfc_result(dfc_mean, gain, result_table, conn.driver, conn,
                             feature_column_names, hdfs_namenode_addr,
                             hive_location, hdfs_user, hdfs_pass)
    explainer.plot_and_save(lambda: eval(plot_type)(df_dfc), is_pai, oss_dest,
                            oss_ak, oss_sk, oss_endpoint, oss_bucket_name)
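# eval(plot_type) above executes whatever expression arrives in plot_type; a
# hypothetical hardening (an assumption, not what this module does) would
# whitelist the plotting callables instead:
import shap

_PLOT_FUNCS = {
    "shap.summary_plot": shap.summary_plot,
    # extend with the other plotting callables this module supports
}

def plot_dfc(plot_type, df_dfc):
    try:
        plot_func = _PLOT_FUNCS[plot_type]
    except KeyError:
        raise ValueError("unsupported plot_type: %s" % plot_type)
    return plot_func(df_dfc)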