def eval_input_fn(batch_size, cache=False):
    feature_types = []
    for name in feature_column_names:
        # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
        if feature_metas[name]["is_sparse"]:
            feature_types.append((tf.int64, tf.int32, tf.int64))
        else:
            feature_types.append(get_dtype(feature_metas[name]["dtype"]))

    if is_pai:
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        gen = db.pai_maxcompute_db_generator(formatted_pai_table,
                                             feature_column_names, None,
                                             feature_metas)
        selected_cols = feature_column_names
    else:
        gen = db.db_generator(driver, conn, select, feature_column_names,
                              None, feature_metas)
        selected_cols = db.selected_cols(driver, conn, select)
    tf_gen = tf_generator(gen, selected_cols, feature_column_names,
                          feature_metas)
    dataset = tf.data.Dataset.from_generator(tf_gen, (tuple(feature_types), ))
    ds_mapper = functools.partial(
        parse_sparse_feature_predict,
        feature_column_names=feature_column_names,
        feature_metas=feature_metas)
    dataset = dataset.map(ds_mapper).batch(batch_size)
    if cache:
        dataset = dataset.cache()
    return dataset
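
# --- Hedged usage sketch (not in the original module) ---
# eval_input_fn closes over feature_column_names, feature_metas, is_pai,
# pai_table, driver, conn and select, so it is meant to be defined inside
# an evaluation routine. `estimator` below stands for any tf.estimator
# instance; the batch size is arbitrary.
def _example_evaluate(estimator, batch_size=32):
    return estimator.evaluate(
        input_fn=lambda: eval_input_fn(batch_size, cache=True))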
def pred(datasource,
         select,
         feature_metas,
         feature_column_names,
         label_meta,
         result_table,
         is_pai=False,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         pai_table="",
         model_params=None,
         train_params=None,
         transform_fn=None,
         feature_column_code=""):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    else:
        conn = None
    label_name = label_meta["feature_name"]

    dpred = xgb_dataset(
        datasource=datasource,
        fn='predict.txt',
        dataset_sql=select,
        feature_specs=feature_metas,
        feature_column_names=feature_column_names,
        label_spec=None,
        is_pai=is_pai,
        pai_table=pai_table,
        pai_single_file=True,
        cache=True,
        batch_size=DEFAULT_PREDICT_BATCH_SIZE,
        transform_fn=transform_fn,
        feature_column_code=feature_column_code,
        raw_data_dir="predict.raw.dir")  # NOTE: default to use external memory

    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load the trained model
    print("Start predicting XGBoost model...")

    if is_pai:
        pai_table = "odps://{}/tables/{}".format(*pai_table.split("."))
        selected_cols = db.pai_selected_cols(pai_table)
    else:
        selected_cols = db.selected_cols(conn.driver, conn, select)

    feature_file_id = 0
    for pred_dmatrix in dpred:
        predict_and_store_result(bst, pred_dmatrix, feature_file_id,
                                 model_params, selected_cols, label_name,
                                 is_pai, conn, result_table,
                                 hdfs_namenode_addr, hive_location, hdfs_user,
                                 hdfs_pass)
        feature_file_id += 1
    print("Done predicting. Predict table: %s" % result_table)
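
# --- Hedged usage sketch (not in the original module) ---
# Invoking pred() for a non-PAI run. The datasource URI and table names
# are illustrative assumptions; pred() itself loads the booster from the
# fixed path "my_model".
def _example_pred(feature_metas, feature_column_names):
    pred(datasource="mysql://root:root@tcp(127.0.0.1:3306)/?maxAllowedPacket=0",
         select="SELECT * FROM iris.test",
         feature_metas=feature_metas,
         feature_column_names=feature_column_names,
         label_meta={"feature_name": "class"},
         result_table="iris.predict_result")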
def input_fn(select,
             datasource,
             feature_column_names,
             feature_metas,
             label_meta,
             is_pai=False,
             pai_table="",
             num_workers=1,
             worker_id=0):
    feature_types = []
    shapes = []
    for name in feature_column_names:
        # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
        if feature_metas[name]["is_sparse"]:
            feature_types.append((tf.int64, tf.int32, tf.int64))
            shapes.append((None, None, None))
        else:
            feature_types.append(get_dtype(feature_metas[name]["dtype"]))
            shapes.append(feature_metas[name]["shape"])

    if is_pai:
        pai_table = "odps://{}/tables/{}".format(*pai_table.split("."))
        return pai_dataset(pai_table,
                           feature_column_names,
                           label_meta,
                           feature_metas,
                           slice_id=worker_id,
                           slice_count=num_workers)
    else:
        conn = db.connect_with_data_source(datasource)
        gen = db.db_generator(conn.driver, conn, select,
                              feature_column_names, label_meta, feature_metas)
        selected_cols = db.selected_cols(conn.driver, conn, select)
    gen = tf_generator(gen, selected_cols, feature_column_names, feature_metas)

    # Clustering models do not have a label
    if not label_meta or label_meta["feature_name"] == "":
        dataset = tf.data.Dataset.from_generator(gen,
                                                 (tuple(feature_types), ),
                                                 (tuple(shapes), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    else:
        dataset = tf.data.Dataset.from_generator(
            gen, (tuple(feature_types), getattr(tf, label_meta["dtype"])),
            (tuple(shapes), label_meta["shape"]))
        ds_mapper = functools.partial(
            parse_sparse_feature,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    return dataset.map(ds_mapper)
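
# --- Hedged usage sketch (not in the original module) ---
# Feeding input_fn to estimator training. input_fn returns an unbatched
# dataset, so the caller shuffles and batches it; the SELECT statement,
# shuffle buffer and batch size below are illustrative.
def _example_train(estimator, datasource, feature_column_names,
                   feature_metas, label_meta):
    def train_input_fn():
        dataset = input_fn("SELECT * FROM iris.train", datasource,
                           feature_column_names, feature_metas, label_meta)
        return dataset.shuffle(1000).batch(32)

    estimator.train(input_fn=train_input_fn, max_steps=100)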
def xgb_dataset(datasource,
                fn,
                dataset_sql,
                feature_specs,
                feature_column_names,
                label_spec,
                is_pai=False,
                pai_table="",
                pai_single_file=False,
                cache=False,
                batch_size=None,
                epoch=1,
                rank=0,
                nworkers=1):
    if is_pai:
        for dmatrix in pai_dataset(
                fn,
                feature_specs,
                feature_column_names,
                label_spec,
                "odps://{}/tables/{}".format(*pai_table.split(".")),
                pai_single_file,
                cache,
                rank,
                nworkers,
                batch_size=batch_size):
            yield dmatrix
        return

    conn = db.connect_with_data_source(datasource)
    gen = db.db_generator(conn.driver, conn, dataset_sql,
                          feature_column_names, label_spec, feature_specs)()
    selected_cols = db.selected_cols(conn.driver, conn, dataset_sql)
    for _ in range(epoch):
        step = 0
        # the filename per batch is [filename]_[step]
        step_file_name = "%s_%d" % (fn, step)
        written_rows = dump_dmatrix(step_file_name, gen, feature_column_names,
                                    feature_specs, label_spec, selected_cols)
        while written_rows > 0:
            yield load_dmatrix('{0}#{0}.cache'.format(step_file_name)
                               if cache else step_file_name)
            os.remove(step_file_name)
            step += 1
            step_file_name = "%s_%d" % (fn, step)
            written_rows = dump_dmatrix(step_file_name, gen,
                                        feature_column_names, feature_specs,
                                        label_spec, selected_cols)
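
# --- Hedged usage sketch (not in the original module) ---
# Consuming the DMatrix stream for training. Passing the previous booster
# via xgb_model continues training across the per-step files; the params
# and round count below are illustrative.
def _example_xgb_train(datasource, feature_specs, feature_column_names,
                       label_spec):
    booster = None
    for dmatrix in xgb_dataset(datasource, "train.txt",
                               "SELECT * FROM iris.train", feature_specs,
                               feature_column_names, label_spec):
        booster = xgb.train({"objective": "multi:softprob", "num_class": 3},
                            dmatrix,
                            num_boost_round=10,
                            xgb_model=booster)
    return booster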
def xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                     feature_specs, is_pai, pai_explain_table):
    label_column_name = label_spec["feature_name"]
    if is_pai:
        pai_table_parts = pai_explain_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        stream = db.pai_maxcompute_db_generator(formatted_pai_table,
                                                feature_column_names,
                                                label_column_name,
                                                feature_specs)
        selected_cols = feature_column_names[:]
    else:
        conn = db.connect_with_data_source(datasource)
        stream = db.db_generator(conn.driver, conn, select,
                                 feature_column_names, label_spec,
                                 feature_specs)
        selected_cols = db.selected_cols(conn.driver, conn, select)

    xs = pd.DataFrame(columns=feature_column_names)
    i = 0
    for row, label in stream():
        features = db.read_features_from_row(row, selected_cols,
                                             feature_column_names,
                                             feature_specs)
        xs.loc[i] = [item[0] for item in features]
        i += 1
    # NOTE(typhoonzero): set dtype to the feature's actual type, or the
    # dtype may be "object". Use the code below to reproduce:
    # import pandas as pd
    # feature_column_names=["a", "b"]
    # xs = pd.DataFrame(columns=feature_column_names)
    # for i in range(10):
    #     xs.loc[i] = [int(j) for j in range(2)]
    # print(xs.dtypes)
    for fname in feature_column_names:
        dtype = feature_specs[fname]["dtype"]
        xs[fname] = xs[fname].astype(dtype)
    return xs
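
# --- Hedged usage sketch (not in the original module) ---
# Explaining a trained booster with the DataFrame built above, assuming
# `import shap` at module level; TreeExplainer and summary_plot are
# standard SHAP APIs, and the SELECT statement is illustrative.
def _example_shap(bst, datasource, feature_column_names, label_spec,
                  feature_specs):
    x = xgb_shap_dataset(datasource, "SELECT * FROM iris.train",
                         feature_column_names, label_spec, feature_specs,
                         False, "")
    shap_values = shap.TreeExplainer(bst).shap_values(x)
    shap.summary_plot(shap_values, x)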
def xgb_shap_dataset(datasource,
                     select,
                     feature_column_names,
                     label_spec,
                     feature_specs,
                     is_pai,
                     pai_explain_table,
                     transform_fn=None,
                     feature_column_code=""):
    label_column_name = label_spec["feature_name"]
    if is_pai:
        pai_table_parts = pai_explain_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        stream = db.pai_maxcompute_db_generator(formatted_pai_table,
                                                feature_column_names,
                                                label_column_name,
                                                feature_specs)
        selected_cols = db.pai_selected_cols(formatted_pai_table)
    else:
        conn = db.connect_with_data_source(datasource)
        stream = db.db_generator(conn.driver, conn, select,
                                 feature_column_names, label_spec,
                                 feature_specs)
        selected_cols = db.selected_cols(conn.driver, conn, select)

    if transform_fn:
        column_names = transform_fn.get_column_names()
    else:
        column_names = feature_column_names

    # NOTE(sneaxiy): pandas.DataFrame does not support tensors whose rank
    # is larger than 2, but `INDICATOR` generates a one-hot vector for each
    # element, which pandas.DataFrame would not accept as input. In short,
    # we do not support `TO EXPLAIN` when using `INDICATOR`.
    xs = pd.DataFrame(columns=column_names)
    dtypes = []
    i = 0
    for row, label in stream():
        features = db.read_features_from_row(row, selected_cols,
                                             feature_column_names,
                                             feature_specs)
        if transform_fn:
            features = transform_fn(features)
        # TODO(sneaxiy): support sparse features in `TO EXPLAIN`
        features = [item[0] for item in features]
        xs.loc[i] = features
        if i == 0:
            for f in features:
                if isinstance(f, np.ndarray):
                    if f.dtype == np.float32 or f.dtype == np.float64:
                        dtypes.append('float32')
                    elif f.dtype == np.int32 or f.dtype == np.int64:
                        dtypes.append('int64')
                    else:
                        raise ValueError(
                            'Unsupported data type {}'.format(f.dtype))
                elif isinstance(f, (np.float32, np.float64, float)):
                    dtypes.append('float32')
                elif isinstance(f, (np.int32, np.int64, six.integer_types)):
                    dtypes.append('int64')
                else:
                    raise ValueError(
                        'Unsupported data type {}'.format(type(f)))
        i += 1
    # NOTE(typhoonzero): set dtype to the feature's actual type, or the
    # dtype may be "object". Use the code below to reproduce:
    # import pandas as pd
    # feature_column_names=["a", "b"]
    # xs = pd.DataFrame(columns=feature_column_names)
    # for i in range(10):
    #     xs.loc[i] = [int(j) for j in range(2)]
    # print(xs.dtypes)
    for dtype, name in zip(dtypes, column_names):
        xs[name] = xs[name].astype(dtype)
    return xs
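
# --- Hedged usage sketch (not in the original module) ---
# The transform_fn path: any callable that also exposes get_column_names()
# satisfies the contract checked above. The assertion and arguments here
# are illustrative.
def _example_shap_with_transform(datasource, feature_column_names,
                                 label_spec, feature_specs, transform_fn):
    assert hasattr(transform_fn, "get_column_names")
    return xgb_shap_dataset(datasource,
                            "SELECT * FROM iris.train",
                            feature_column_names,
                            label_spec,
                            feature_specs,
                            False,
                            "",
                            transform_fn=transform_fn)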
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_column_names_map,
                      feature_columns, feature_metas, result_col_name,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass, is_pai, pai_table):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)

    column_names = feature_column_names[:]
    column_names.append(result_col_name)

    if is_pai:
        driver = "pai_maxcompute"
        conn = None
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        selected_cols = db.pai_selected_cols(formatted_pai_table)
        predict_generator = db.pai_maxcompute_db_generator(
            formatted_pai_table, feature_column_names, None, feature_metas)()
    else:
        driver = conn.driver
        # pass all selected columns through to the prediction result table
        selected_cols = db.selected_cols(conn.driver, conn, select)
        predict_generator = db.db_generator(conn.driver, conn, select,
                                            feature_column_names, None,
                                            feature_metas)()

    write_cols, target_col_index = write_cols_from_selected(
        result_col_name, selected_cols)

    # load from the exported model
    with open("exported_path", "r") as fn:
        export_path = fn.read()
    if tf_is_version2():
        imported = tf.saved_model.load(export_path)
    else:
        imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["is_sparse"]:
                # NOTE(typhoonzero): sparse features arrive as
                # (indices, values, shape) here; use indices only
                values = x[0][i][0].flatten()
            else:
                values = x[0][i].flatten()
            if dtype_str == "float32" or dtype_str == "float64":
                example.features.feature[feature_name].float_list.value.extend(
                    list(values))
            elif dtype_str == "int32" or dtype_str == "int64":
                example.features.feature[feature_name].int64_list.value.extend(
                    list(values))
        else:
            if "feature_columns" in feature_columns:
                idx = feature_column_names.index(feature_name)
                fc = feature_columns["feature_columns"][idx]
            else:
                # DNNLinearCombinedXXX models have dnn_feature_columns and
                # linear_feature_columns params.
                idx = -1
                try:
                    idx = feature_column_names_map[
                        "dnn_feature_columns"].index(feature_name)
                    fc = feature_columns["dnn_feature_columns"][idx]
                except (KeyError, ValueError):
                    try:
                        idx = feature_column_names_map[
                            "linear_feature_columns"].index(feature_name)
                        fc = feature_columns["linear_feature_columns"][idx]
                    except (KeyError, ValueError):
                        pass
                if idx == -1:
                    raise ValueError(
                        "cannot find feature %s in all feature columns" %
                        feature_name)
            if dtype_str == "float32" or dtype_str == "float64":
                # need to pass a tuple(float, )
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "int32" or dtype_str == "int64":
                numeric_type = type(tf.feature_column.numeric_column("tmp"))
                if type(fc) == numeric_type:
                    example.features.feature[
                        feature_name].float_list.value.extend(
                            (float(x[0][i][0]), ))
                else:
                    example.features.feature[
                        feature_name].int64_list.value.extend(
                            (int(x[0][i][0]), ))
            elif dtype_str == "string":
                example.features.feature[feature_name].bytes_list.value.extend(
                    x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with db.buffered_db_writer(driver, conn, result_table, write_cols, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for row, _ in predict_generator:
            features = db.read_features_from_row(row, selected_cols,
                                                 feature_column_names,
                                                 feature_metas)
            result = predict((features, ))
            if target_col_index != -1:
                del row[target_col_index]
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
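
# --- Hedged illustration (not in the original module) ---
# What add_to_example assembles: a tf.train.Example proto with one typed
# value list per column, serialized before being fed to the SavedModel
# "predict" signature. The column names and values below are made up.
def _example_proto():
    example = tf.train.Example()
    example.features.feature["sepal_length"].float_list.value.extend((5.1, ))
    example.features.feature["class_bucket"].int64_list.value.extend((3, ))
    return example.SerializeToString()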
def xgb_dataset(datasource,
                fn,
                dataset_sql,
                feature_specs,
                feature_column_names,
                label_spec,
                is_pai=False,
                pai_table="",
                pai_single_file=False,
                cache=False,
                batch_size=None,
                epoch=1,
                rank=0,
                nworkers=1,
                transform_fn=None,
                feature_column_code="",
                raw_data_dir=None):
    if raw_data_dir:
        # raw_data_dir is needed when predicting, because we should write
        # the raw data from the source db into the dest db instead of the
        # transformed data produced by `transform_fn(features)`. If
        # raw_data_dir is not None, the raw data from the source db is
        # written into a separate file.
        if os.path.exists(raw_data_dir):
            shutil.rmtree(raw_data_dir, ignore_errors=True)
        os.mkdir(raw_data_dir)

    if is_pai:
        for dmatrix in pai_dataset(
                fn,
                feature_specs,
                feature_column_names,
                label_spec,
                "odps://{}/tables/{}".format(*pai_table.split(".")),
                pai_single_file,
                cache,
                rank,
                nworkers,
                batch_size=batch_size,
                feature_column_code=feature_column_code,
                raw_data_dir=raw_data_dir):
            yield dmatrix
        return

    conn = db.connect_with_data_source(datasource)
    gen = db.db_generator(conn.driver, conn, dataset_sql,
                          feature_column_names, label_spec, feature_specs)()
    selected_cols = db.selected_cols(conn.driver, conn, dataset_sql)
    for _ in six.moves.range(epoch):
        step = 0
        # the filename per batch is [filename]_[step]
        step_file_name = "%s_%d" % (fn, step)
        written_rows = dump_dmatrix(step_file_name,
                                    gen,
                                    feature_column_names,
                                    feature_specs,
                                    label_spec,
                                    selected_cols,
                                    transform_fn=transform_fn,
                                    raw_data_dir=raw_data_dir)
        while written_rows > 0:
            yield load_dmatrix('{0}#{0}.cache'.format(step_file_name)
                               if cache else step_file_name)
            os.remove(step_file_name)
            step += 1
            step_file_name = "%s_%d" % (fn, step)
            written_rows = dump_dmatrix(step_file_name,
                                        gen,
                                        feature_column_names,
                                        feature_specs,
                                        label_spec,
                                        selected_cols,
                                        transform_fn=transform_fn,
                                        raw_data_dir=raw_data_dir)
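
# --- Hedged note (not in the original module) ---
# The "{0}#{0}.cache" path above uses XGBoost's external-memory
# convention: the part after "#" names the cache file XGBoost builds
# next to the data. A minimal illustration with the helper used above;
# the default filename is illustrative.
def _example_load_cached(step_file_name="train.txt_0"):
    return load_dmatrix("{0}#{0}.cache".format(step_file_name))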