def xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                     feature_specs, is_pai, pai_explain_table):
    """Load the rows to be explained into a pandas DataFrame for SHAP.

    Args:
        datasource: data source string used by db.connect_with_data_source
            (ignored when is_pai is True).
        select: SQL statement selecting the explain data (non-PAI path).
        feature_column_names: ordered list of feature column names.
        label_spec: dict describing the label; only "feature_name" is read.
        feature_specs: dict mapping feature name -> metadata; "dtype" is
            used to cast each DataFrame column to its real type.
        is_pai: if True, read from a MaxCompute table instead of `select`.
        pai_explain_table: "project.table" name of the MaxCompute table.

    Returns:
        pandas.DataFrame with one column per feature, dtypes set from
        feature_specs.
    """
    label_column_name = label_spec["feature_name"]
    if is_pai:
        pai_table_parts = pai_explain_table.split(".")
        # Fixed spelling ("formatted") for consistency with the other
        # functions in this file.
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        stream = db.pai_maxcompute_db_generator(formatted_pai_table,
                                                feature_column_names,
                                                label_column_name,
                                                feature_specs)
    else:
        conn = db.connect_with_data_source(datasource)
        stream = db.db_generator(conn.driver, conn, select,
                                 feature_column_names, label_spec,
                                 feature_specs)
    xs = pd.DataFrame(columns=feature_column_names)
    # enumerate replaces the original manual row counter.
    for i, row in enumerate(stream()):
        # row[0] is the feature part of the generated row; each item is
        # indexable and item[0] is taken as the cell value — presumably a
        # (value,) wrapper produced by the generator (TODO confirm).
        xs.loc[i] = [item[0] for item in row[0]]
    # NOTE(typhoonzero): set dtype to the feature's actual type, or the dtype
    # may be "object". Use below code to reproduce:
    # import pandas as pd
    # feature_column_names=["a", "b"]
    # xs = pd.DataFrame(columns=feature_column_names)
    # for i in range(10):
    #     xs.loc[i] = [int(j) for j in range(2)]
    # print(xs.dtypes)
    for fname in feature_column_names:
        xs[fname] = xs[fname].astype(feature_specs[fname]["dtype"])
    return xs
def eval_input_fn(batch_size, cache=False):
    """Build the tf.data.Dataset used as evaluation/prediction input.

    Reads feature_column_names, feature_metas, is_pai, pai_table, conn and
    select from the enclosing scope (closure variables).

    Args:
        batch_size: number of rows per emitted batch.
        cache: if True, cache the dataset after batching.
    """
    # NOTE: vector columns like 23,21,3,2,0,0 should use shape None.
    # Sparse columns are produced as (indices, values, shape) int triples;
    # dense columns use their declared dtype.
    feature_types = [
        (tf.int64, tf.int32, tf.int64) if feature_metas[name]["is_sparse"]
        else get_dtype(feature_metas[name]["dtype"])
        for name in feature_column_names
    ]
    if is_pai:
        parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (parts[0], parts[1])
        gen = db.pai_maxcompute_db_generator(formatted_pai_table,
                                             feature_column_names, None,
                                             feature_metas)
    else:
        gen = db.db_generator(conn.driver, conn, select,
                              feature_column_names, None, feature_metas)
    dataset = tf.data.Dataset.from_generator(gen, (tuple(feature_types), ))
    ds_mapper = functools.partial(parse_sparse_feature_predict,
                                  feature_column_names=feature_column_names,
                                  feature_metas=feature_metas)
    dataset = dataset.map(ds_mapper).batch(batch_size)
    return dataset.cache() if cache else dataset
def pai_download_table_data_worker(dname, feature_specs, feature_column_names,
                                   label_spec, pai_table, slice_id,
                                   slice_count, feature_column_code,
                                   raw_data_dir):
    """Download one slice of a MaxCompute table and dump it as a DMatrix
    text file named "<dname>/<slice_id>.txt".

    Args:
        dname: output directory for the dumped slice file.
        feature_specs: dict mapping feature name -> metadata.
        feature_column_names: ordered list of feature column names.
        label_spec: dict with the label's "feature_name", or falsy when the
            data has no label column.
        pai_table: MaxCompute table to read (passed through to the db
            generator unchanged).
        slice_id: index of the slice this worker reads.
        slice_count: total number of slices the table is split into.
        feature_column_code: Python source of a comma-separated list of
            feature column transformer constructors.
        raw_data_dir: forwarded to dump_dmatrix; presumably where untransformed
            rows are also written — TODO confirm against dump_dmatrix.
    """
    # Imported lazily; NOTE(review): presumably because workers run in
    # separate processes — confirm.
    import sqlflow_submitter.xgboost as xgboost_extended
    # SECURITY NOTE(review): eval() executes feature_column_code as Python.
    # This is only safe because the code string is generated by the SQLFlow
    # compiler, not taken from end users — do not feed untrusted input here.
    feature_column_transformers = eval('[{}]'.format(feature_column_code))
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *feature_column_transformers)
    label_column_name = label_spec['feature_name'] if label_spec else None
    # The generator factory is called immediately ( trailing () ) to get the
    # row iterator for this slice.
    gen = db.pai_maxcompute_db_generator(pai_table, feature_column_names,
                                         label_column_name, feature_specs,
                                         slice_id=slice_id,
                                         slice_count=slice_count)()
    selected_cols = db.pai_selected_cols(pai_table)
    filename = "{}/{}.txt".format(dname, slice_id)
    dump_dmatrix(filename, gen, feature_column_names, feature_specs,
                 label_spec, selected_cols, transform_fn=transform_fn,
                 raw_data_dir=raw_data_dir)
def pai_download_table_data_worker(dname, feature_specs, feature_column_names,
                                   label_spec, pai_table, slice_id):
    """Download slice `slice_id` of a MaxCompute table and dump it to
    "<dname>/<slice_id>.txt" as DMatrix text (SLICE_NUM slices in total)."""
    if label_spec:
        label_column_name = label_spec['feature_name']
    else:
        label_column_name = None
    # Build the generator factory for this slice, then call it to obtain
    # the actual row iterator.
    row_gen = db.pai_maxcompute_db_generator(pai_table, feature_column_names,
                                             label_column_name, feature_specs,
                                             slice_id=slice_id,
                                             slice_count=SLICE_NUM)()
    out_path = "{}/{}.txt".format(dname, slice_id)
    dump_dmatrix(out_path, row_gen, feature_column_names, feature_specs,
                 label_spec)
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_column_names_map,
                      feature_columns, feature_metas, result_col_name,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass, is_pai, pai_table):
    """Run prediction with an exported estimator SavedModel and write one
    result row per input row into `result_table`.

    Output columns are the input feature columns followed by
    `result_col_name` (class id for classifiers, prediction value for
    regressors). Reads the SavedModel path from the local file
    "exported_path" written by the training step.
    """
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    # Result schema: all feature columns plus the prediction column.
    column_names = feature_column_names[:]
    column_names.append(result_col_name)
    if is_pai:
        driver = "pai_maxcompute"
        conn = None
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        predict_generator = db.pai_maxcompute_db_generator(
            formatted_pai_table, feature_column_names, None, feature_metas)()
    else:
        driver = conn.driver
        predict_generator = db.db_generator(conn.driver, conn, select,
                                            feature_column_names, None,
                                            feature_metas)()
    # Load from the exported model; "exported_path" holds its location.
    with open("exported_path", "r") as fn:
        export_path = fn.read()
    if tf_is_version2():
        imported = tf.saved_model.load(export_path)
    else:
        imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        """Append the i-th feature of row batch `x` to a tf.train.Example."""
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            # Delimited (vector) column.
            if feature_metas[feature_name]["is_sparse"]:
                # NOTE(typhoonzero): sparse feature will get
                # (indices, values, shape) here, use indices only.
                values = x[0][i][0].flatten()
            else:
                values = x[0][i].flatten()
            if dtype_str == "float32" or dtype_str == "float64":
                example.features.feature[feature_name].float_list.value.extend(
                    list(values))
            elif dtype_str == "int32" or dtype_str == "int64":
                example.features.feature[feature_name].int64_list.value.extend(
                    list(values))
        else:
            # Scalar column: the feature column object decides how int
            # values must be serialized (numeric columns are fed as floats).
            if "feature_columns" in feature_columns:
                idx = feature_column_names.index(feature_name)
                fc = feature_columns["feature_columns"][idx]
            else:
                # DNNLinearCombinedXXX estimators have dnn_feature_columns
                # and linear_feature_columns params; search both groups.
                # Narrowed the original bare excepts to the exceptions the
                # lookups actually raise, and guaranteed `fc` is bound
                # before use (the original could leave it unbound when the
                # index lookup succeeded but the column fetch failed).
                fc = None
                for group in ("dnn_feature_columns", "linear_feature_columns"):
                    try:
                        idx = feature_column_names_map[group].index(
                            feature_name)
                        fc = feature_columns[group][idx]
                        break
                    except (KeyError, ValueError):
                        # group absent, or feature not in this group
                        continue
                if fc is None:
                    # BUGFIX: the original message never interpolated %s.
                    raise ValueError(
                        "can not find feature %s in all feature columns" %
                        feature_name)
            if dtype_str == "float32" or dtype_str == "float64":
                # need to pass a tuple(float, )
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "int32" or dtype_str == "int64":
                numeric_type = type(tf.feature_column.numeric_column("tmp"))
                # NOTE(review): exact-type comparison kept from the
                # original; numeric columns expect float input.
                if type(fc) == numeric_type:
                    example.features.feature[
                        feature_name].float_list.value.extend(
                            (float(x[0][i][0]), ))
                else:
                    example.features.feature[
                        feature_name].int64_list.value.extend(
                            (int(x[0][i][0]), ))
            elif dtype_str == "string":
                example.features.feature[feature_name].bytes_list.value.extend(
                    x[0][i])

    def predict(x):
        """Serialize one row to tf.train.Example and call the exported
        "predict" signature."""
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with db.buffered_db_writer(driver, conn, result_table, column_names, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for features in predict_generator:
            result = predict(features)
            row = []
            for idx, _ in enumerate(feature_column_names):
                per_feature = features[0][idx]
                if isinstance(per_feature, (tuple, list)):
                    # sparse feature: tuple (indices, values, shape) — write
                    # the first element only
                    val = per_feature[0]
                elif isinstance(per_feature, np.ndarray):
                    val = per_feature
                else:
                    # BUGFIX: plain scalar features previously left `val`
                    # unbound (or stale from the previous column).
                    val = per_feature
                row.append(str(val))
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, result_col_name,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass, is_pai, pai_table):
    """Run prediction with an exported estimator SavedModel and write one
    result row per input row into `result_table`.

    Output columns are the input feature columns followed by
    `result_col_name` (class id for classifiers, prediction value for
    regressors). The SavedModel location comes from the local file
    "exported_path"; when `save` starts with "oss://" the path is prefixed
    with the OSS bucket taken from `save`.
    """
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    # Result schema: all feature columns plus the prediction column.
    column_names = feature_column_names[:]
    column_names.append(result_col_name)
    if is_pai:
        driver = "pai_maxcompute"
        conn = None
        pai_table_parts = pai_table.split(".")
        # Fixed spelling ("formatted") for consistency with the other
        # functions in this file.
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        predict_generator = db.pai_maxcompute_db_generator(
            formatted_pai_table, feature_column_names, None, feature_metas)()
    else:
        driver = conn.driver
        predict_generator = db.db_generator(conn.driver, conn, select,
                                            feature_column_names, None,
                                            feature_metas)()
    # Load from the exported model. Both the OSS and local branches read the
    # same "exported_path" file, so do it once (deduplicated).
    with open("exported_path", "r") as fn:
        export_path = fn.read()
    if save.startswith("oss://"):
        # Model lives on OSS: drop any "?role_arn=..." style query suffix
        # from `save` and prepend the bucket to the exported path.
        export_path = save.split("?")[0] + export_path
    if TF_VERSION_2:
        imported = tf.saved_model.load(export_path)
    else:
        imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        """Append the i-th feature of row batch `x` to a tf.train.Example."""
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            # Delimited (vector) column.
            if feature_metas[feature_name]["is_sparse"]:
                # NOTE(typhoonzero): sparse feature will get
                # (indices, values, shape) here, use indices only.
                values = x[0][i][0].flatten()
            else:
                values = x[0][i].flatten()
            if dtype_str == "float32" or dtype_str == "float64":
                example.features.feature[feature_name].float_list.value.extend(
                    list(values))
            elif dtype_str == "int32" or dtype_str == "int64":
                example.features.feature[feature_name].int64_list.value.extend(
                    list(values))
        else:
            # Scalar column.
            if dtype_str == "float32" or dtype_str == "float64":
                # need to pass a tuple(float, )
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "int32" or dtype_str == "int64":
                # FIXME(typhoonzero): figure out why int64 features need to
                # convert to float
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "string":
                example.features.feature[feature_name].bytes_list.value.extend(
                    x[0][i])

    def predict(x):
        """Serialize one row to tf.train.Example and call the exported
        "predict" signature."""
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with db.buffered_db_writer(driver, conn, result_table, column_names, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for features in predict_generator:
            result = predict(features)
            row = []
            for idx, _ in enumerate(feature_column_names):
                per_feature = features[0][idx]
                if isinstance(per_feature, (tuple, list)):
                    # sparse feature: tuple (indices, values, shape) — write
                    # the first element only
                    val = per_feature[0]
                elif isinstance(per_feature, np.ndarray):
                    val = per_feature
                else:
                    # BUGFIX: plain scalar features previously left `val`
                    # unbound (or stale from the previous column).
                    val = per_feature
                row.append(str(val))
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
def xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                     feature_specs, is_pai, pai_explain_table,
                     transform_fn=None, feature_column_code=""):
    """Load the rows to be explained into a pandas DataFrame for SHAP.

    Reads either from a MaxCompute table (is_pai) or through `select` on a
    regular data source, optionally applying `transform_fn` to every row,
    and casts every DataFrame column to the dtype inferred from the first
    row's values.
    """
    label_column_name = label_spec["feature_name"]
    if is_pai:
        pai_table_parts = pai_explain_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        stream = db.pai_maxcompute_db_generator(formatted_pai_table,
                                                feature_column_names,
                                                label_column_name,
                                                feature_specs)
        selected_cols = db.pai_selected_cols(formatted_pai_table)
    else:
        conn = db.connect_with_data_source(datasource)
        stream = db.db_generator(conn.driver, conn, select,
                                 feature_column_names, label_spec,
                                 feature_specs)
        selected_cols = db.selected_cols(conn.driver, conn, select)

    if transform_fn:
        column_names = transform_fn.get_column_names()
    else:
        column_names = feature_column_names

    def infer_dtype(value):
        """Map a feature value to the pandas dtype used for its column."""
        if isinstance(value, np.ndarray):
            if value.dtype in (np.float32, np.float64):
                return 'float32'
            if value.dtype in (np.int32, np.int64):
                return 'int64'
            raise ValueError('Not supported data type {}'.format(value.dtype))
        if isinstance(value, (np.float32, np.float64, float)):
            return 'float32'
        if isinstance(value, (np.int32, np.int64, six.integer_types)):
            return 'int64'
        raise ValueError('Not supported data type {}'.format(type(value)))

    # NOTE(sneaxiy): pandas.DataFrame does not support Tensor whose rank is
    # larger than 2. But `INDICATOR` would generate one hot vector for each
    # element, and pandas.DataFrame would not accept `INDICATOR` results as
    # its input. In a word, we do not support `TO EXPLAIN` when using
    # `INDICATOR`.
    xs = pd.DataFrame(columns=column_names)
    dtypes = []
    for row_idx, (row, label) in enumerate(stream()):
        features = db.read_features_from_row(row, selected_cols,
                                             feature_column_names,
                                             feature_specs)
        if transform_fn:
            features = transform_fn(features)
        # TODO(sneaxiy): support sparse features in `TO EXPLAIN`
        features = [item[0] for item in features]
        xs.loc[row_idx] = features
        if row_idx == 0:
            # Column dtypes are inferred once, from the first row.
            dtypes = [infer_dtype(f) for f in features]
    # NOTE(typhoonzero): set dtype to the feature's actual type, or the dtype
    # may be "object". Use below code to reproduce:
    # import pandas as pd
    # feature_column_names=["a", "b"]
    # xs = pd.DataFrame(columns=feature_column_names)
    # for i in range(10):
    #     xs.loc[i] = [int(j) for j in range(2)]
    # print(xs.dtypes)
    for dtype, name in zip(dtypes, column_names):
        xs[name] = xs[name].astype(dtype)
    return xs