def test_generator(self):
    driver = os.environ.get('SQLFLOW_TEST_DB')
    if driver == "mysql":
        database = "iris"
        user = os.environ.get('SQLFLOW_TEST_DB_MYSQL_USER') or "root"
        password = os.environ.get('SQLFLOW_TEST_DB_MYSQL_PASSWD') or "root"
        conn = connect(driver,
                       database,
                       user=user,
                       password=password,
                       host="127.0.0.1",
                       port="3306")
        # prepare test data
        execute(driver, conn, self.drop_statement)
        execute(driver, conn, self.create_statement)
        execute(driver, conn, self.insert_statement)

        column_name_to_type = {
            "features": {
                "feature_name": "features",
                "delimiter": "",
                "dtype": "float32",
                "is_sparse": False,
                "shape": []
            }
        }
        gen = db_generator(driver, conn, "SELECT * FROM test_table_float_fea",
                           ["features"], "label", column_name_to_type)
        idx = 0
        for d in gen():
            if idx == 0:
                self.assertEqual(d, ((1.0, ), [0]))
            elif idx == 1:
                self.assertEqual(d, ((2.0, ), [1]))
            idx += 1
        self.assertEqual(idx, 2)
def xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                     feature_specs, is_pai, pai_explain_table):
    label_column_name = label_spec["feature_name"]
    if is_pai:
        pai_table_parts = pai_explain_table.split(".")
        formated_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                      pai_table_parts[1])
        stream = db.pai_maxcompute_db_generator(formated_pai_table,
                                                feature_column_names,
                                                label_column_name,
                                                feature_specs)
    else:
        conn = db.connect_with_data_source(datasource)
        stream = db.db_generator(conn.driver, conn, select,
                                 feature_column_names, label_spec,
                                 feature_specs)
    xs = pd.DataFrame(columns=feature_column_names)
    i = 0
    for row in stream():
        xs.loc[i] = [item[0] for item in row[0]]
        i += 1
    # NOTE(typhoonzero): set dtype to the feature's actual type, or the dtype
    # may be "object". Use below code to reproduce:
    # import pandas as pd
    # feature_column_names=["a", "b"]
    # xs = pd.DataFrame(columns=feature_column_names)
    # for i in range(10):
    #     xs.loc[i] = [int(j) for j in range(2)]
    # print(xs.dtypes)
    for fname in feature_column_names:
        dtype = feature_specs[fname]["dtype"]
        xs[fname] = xs[fname].astype(dtype)
    return xs
def input_fn(select, conn, feature_column_names, feature_metas, label_meta):
    feature_types = []
    shapes = []
    for name in feature_column_names:
        # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
        if feature_metas[name]["is_sparse"]:
            feature_types.append((tf.int64, tf.int32, tf.int64))
            shapes.append((None, None, None))
        else:
            feature_types.append(get_dtype(feature_metas[name]["dtype"]))
            shapes.append(feature_metas[name]["shape"])

    gen = db_generator(conn.driver, conn, select, feature_column_names,
                       label_meta["feature_name"], feature_metas)
    # Clustering models do not have a label.
    if label_meta["feature_name"] == "":
        dataset = tf.data.Dataset.from_generator(gen,
                                                 (tuple(feature_types), ),
                                                 (tuple(shapes), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    else:
        dataset = tf.data.Dataset.from_generator(
            gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])),
            (tuple(shapes), label_meta["shape"]))
        ds_mapper = functools.partial(
            parse_sparse_feature,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    return dataset.map(ds_mapper)
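
# The (types, shapes) pair handed to tf.data.Dataset.from_generator above mirrors
# the tuples that db_generator yields. Below is a minimal, self-contained sketch of
# that contract using toy in-memory data only; the generator, feature names, and
# values are illustrative assumptions, not SQLFlow code, and it assumes TF 2.x
# eager execution.
import tensorflow as tf


def toy_gen():
    # stand-in for db_generator(...): each yield is (dense_feature_tuple, label)
    for features, label in [((1.0, 0.5), 0), ((2.0, 1.5), 1)]:
        yield features, label


toy_dataset = tf.data.Dataset.from_generator(
    toy_gen,
    output_types=((tf.float32, tf.float32), tf.int64),
    output_shapes=(([], []), []))

for features, label in toy_dataset.batch(2).take(1):
    print(features, label)  # two scalar feature tensors and one label tensor
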
def test_generate_fetch_size(self):
    driver = os.environ.get('SQLFLOW_TEST_DB')
    if driver == "mysql":
        database = "iris"
        user = os.environ.get('SQLFLOW_TEST_DB_MYSQL_USER') or "root"
        password = os.environ.get('SQLFLOW_TEST_DB_MYSQL_PASSWD') or "root"
        conn = connect(driver,
                       database,
                       user=user,
                       password=password,
                       host="127.0.0.1",
                       port="3306")
        column_name_to_type = {
            "sepal_length": {
                "feature_name": "sepal_length",
                "delimiter": "",
                "dtype": "float32",
                "is_sparse": False,
                "shape": []
            }
        }
        gen = db_generator(driver,
                           conn,
                           'SELECT * FROM iris.train limit 10',
                           ["sepal_length"],
                           "class",
                           column_name_to_type,
                           fetch_size=4)
        self.assertEqual(len([g for g in gen()]), 10)
def eval_input_fn(batch_size, cache=False):
    feature_types = []
    for name in feature_column_names:
        # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
        if feature_metas[name]["is_sparse"]:
            feature_types.append((tf.int64, tf.int32, tf.int64))
        else:
            feature_types.append(get_dtype(feature_metas[name]["dtype"]))

    gen = db_generator(conn.driver, conn, select, feature_column_names,
                       label_meta["feature_name"], feature_metas)
    dataset = tf.data.Dataset.from_generator(
        gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])))
    ds_mapper = functools.partial(
        parse_sparse_feature,
        feature_column_names=feature_column_names,
        feature_metas=feature_metas)
    dataset = dataset.map(ds_mapper).batch(batch_size)
    if cache:
        dataset = dataset.cache("cache/predict" if TF_VERSION_2 else "")
    return dataset
def eval_input_fn(batch_size, cache=False):
    feature_types = []
    for name in feature_column_names:
        # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
        if feature_metas[name]["is_sparse"]:
            feature_types.append((tf.int64, tf.int32, tf.int64))
        else:
            feature_types.append(get_dtype(feature_metas[name]["dtype"]))

    if is_pai:
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        gen = db.pai_maxcompute_db_generator(formatted_pai_table,
                                             feature_column_names, None,
                                             feature_metas)
    else:
        gen = db.db_generator(conn.driver, conn, select, feature_column_names,
                              None, feature_metas)
    dataset = tf.data.Dataset.from_generator(gen, (tuple(feature_types), ))
    ds_mapper = functools.partial(
        parse_sparse_feature_predict,
        feature_column_names=feature_column_names,
        feature_metas=feature_metas)
    dataset = dataset.map(ds_mapper).batch(batch_size)
    if cache:
        dataset = dataset.cache()
    return dataset
def test_generate_fetch_size(self):
    driver = os.environ.get('SQLFLOW_TEST_DB')
    if driver == "mysql":
        user, password, host, port, database = testing_mysql_cfg()
        conn = connect(driver,
                       database,
                       user=user,
                       password=password,
                       host=host,
                       port=port)
        column_name_to_type = {
            "sepal_length": {
                "feature_name": "sepal_length",
                "delimiter": "",
                "dtype": "float32",
                "is_sparse": False,
                "shape": []
            }
        }
        label_spec = {
            "feature_name": "label",
            "shape": [],
            "delimiter": ""
        }
        gen = db_generator(driver,
                           conn,
                           'SELECT * FROM iris.train limit 10',
                           ["sepal_length"],
                           label_spec,
                           column_name_to_type,
                           fetch_size=4)
        self.assertEqual(len([g for g in gen()]), 10)
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, label_meta,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass, is_pai, pai_table):
    classifier = estimator(**model_params)
    conn = connect_with_data_source(datasource)

    def fast_input_fn(generator):
        feature_types = []
        for name in feature_column_names:
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))

        def _inner_input_fn():
            if is_pai:
                dataset = pai_maxcompute_input_fn(pai_table, datasource,
                                                  feature_column_names,
                                                  feature_metas, label_meta)
            else:
                dataset = tf.data.Dataset.from_generator(
                    generator, (tuple(feature_types),
                                eval("tf.%s" % label_meta["dtype"])))
                ds_mapper = functools.partial(
                    parse_sparse_feature,
                    feature_column_names=feature_column_names,
                    feature_metas=feature_metas)
                dataset = dataset.map(ds_mapper)
            dataset = dataset.batch(1).cache()
            iterator = dataset.make_one_shot_iterator()
            features = iterator.get_next()
            return features

        return _inner_input_fn

    column_names = feature_column_names[:]
    column_names.append(label_meta["feature_name"])
    fast_predictor = FastPredict(classifier, fast_input_fn)

    with buffered_db_writer(conn.driver, conn, result_table, column_names,
                            100, hdfs_namenode_addr, hive_location, hdfs_user,
                            hdfs_pass) as w:
        for features in db_generator(conn.driver, conn, select,
                                     feature_column_names,
                                     label_meta["feature_name"],
                                     feature_metas)():
            result = fast_predictor.predict(features)
            row = []
            for idx, _ in enumerate(feature_column_names):
                val = features[0][idx][0]
                row.append(str(val))
            if "class_ids" in list(result)[0]:
                row.append(str(list(result)[0]["class_ids"][0]))
            else:
                # regression predictions
                row.append(str(list(result)[0]["predictions"][0]))
            w.write(row)
def xgb_dataset(conn, fn, dataset_sql, feature_column_name, label_name,
                feature_spec):
    gen = db_generator(conn.driver, conn, dataset_sql, feature_column_name,
                       label_name, feature_spec)
    with open(fn, 'w') as f:
        for item in gen():
            features, label = item
            row_data = [str(label[0])] + [
                "%d:%f" % (i, v) for i, v in enumerate(features)
            ]
            f.write("\t".join(row_data) + "\n")
    # TODO(yancey1989): generate group and weight text file if necessary
    return xgb.DMatrix(fn)
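
# The file written by xgb_dataset above is plain LibSVM-style text, one row per
# line: "<label>\t<index>:<value>\t...". A self-contained toy sketch of that format
# follows; the path and values are illustrative assumptions, and newer xgboost
# releases may require a "?format=libsvm" URI suffix when loading text files.
import xgboost as xgb

with open("/tmp/toy.libsvm", "w") as f:
    f.write("0\t0:5.100000\t1:3.500000\n")
    f.write("1\t0:6.200000\t1:2.900000\n")

# on xgboost >= 2.0 use "/tmp/toy.libsvm?format=libsvm"
dtrain = xgb.DMatrix("/tmp/toy.libsvm")
print(dtrain.num_row(), dtrain.num_col())  # 2 rows, 2 feature columns
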
def input_fn(select,
             datasource,
             feature_column_names,
             feature_metas,
             label_meta,
             is_pai=False,
             pai_table="",
             num_workers=1,
             worker_id=0):
    feature_types = []
    shapes = []
    for name in feature_column_names:
        # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
        if feature_metas[name]["is_sparse"]:
            feature_types.append((tf.int64, tf.int32, tf.int64))
            shapes.append((None, None, None))
        else:
            feature_types.append(get_dtype(feature_metas[name]["dtype"]))
            shapes.append(feature_metas[name]["shape"])

    if is_pai:
        pai_table = "odps://{}/tables/{}".format(*pai_table.split("."))
        return pai_dataset(pai_table,
                           feature_column_names,
                           label_meta,
                           feature_metas,
                           slice_id=worker_id,
                           slice_count=num_workers)
        # NOTE: unreachable after the return above
        selected_cols = db.pai_selected_cols(pai_table)
    else:
        conn = db.connect_with_data_source(datasource)
        gen = db.db_generator(conn.driver, conn, select, feature_column_names,
                              label_meta, feature_metas)
        selected_cols = db.selected_cols(conn.driver, conn, select)

    gen = tf_generator(gen, selected_cols, feature_column_names, feature_metas)

    # Clustering models do not have a label.
    if not label_meta or label_meta["feature_name"] == "":
        dataset = tf.data.Dataset.from_generator(gen,
                                                 (tuple(feature_types), ),
                                                 (tuple(shapes), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    else:
        dataset = tf.data.Dataset.from_generator(
            gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])),
            (tuple(shapes), label_meta["shape"]))
        ds_mapper = functools.partial(
            parse_sparse_feature,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    return dataset.map(ds_mapper)
def xgb_shap_dataset(datasource, select, feature_column_names, label_name,
                     feature_specs):
    conn = connect_with_data_source(datasource)
    stream = db_generator(conn.driver, conn, select, feature_column_names,
                          label_name, feature_specs)
    xs = pd.DataFrame(columns=feature_column_names)
    ys = pd.DataFrame(columns=[label_name])
    i = 0
    for row in stream():
        xs.loc[i] = [item[0] for item in row[0]]
        ys.loc[i] = row[1]
        i += 1
    return xs
def xgb_dataset(datasource,
                fn,
                dataset_sql,
                feature_specs,
                feature_column_names,
                label_spec,
                is_pai=False,
                pai_table="",
                pai_single_file=False,
                cache=False,
                batch_size=None,
                epoch=1,
                rank=0,
                nworkers=1):
    if is_pai:
        for dmatrix in pai_dataset(
                fn, feature_specs, feature_column_names, label_spec,
                "odps://{}/tables/{}".format(*pai_table.split(".")),
                pai_single_file, cache, rank, nworkers,
                batch_size=batch_size):
            yield dmatrix
        return

    conn = db.connect_with_data_source(datasource)
    gen = db.db_generator(conn.driver, conn, dataset_sql,
                          feature_column_names, label_spec, feature_specs)()
    selected_cols = db.selected_cols(conn.driver, conn, dataset_sql)
    for i in range(epoch):
        step = 0
        # the filename per batch is [filename]_[step]
        step_file_name = "%s_%d" % (fn, step)
        written_rows = dump_dmatrix(step_file_name, gen, feature_column_names,
                                    feature_specs, label_spec, selected_cols)
        while written_rows > 0:
            yield load_dmatrix('{0}#{0}.cache'.format(step_file_name)
                               if cache else step_file_name)
            os.remove(step_file_name)
            step += 1
            step_file_name = "%s_%d" % (fn, step)
            written_rows = dump_dmatrix(step_file_name, gen,
                                        feature_column_names, feature_specs,
                                        label_spec, selected_cols)
def input_fn(select,
             datasource,
             feature_column_names,
             feature_metas,
             label_meta,
             is_pai=False,
             pai_table=""):
    feature_types = []
    shapes = []
    for name in feature_column_names:
        # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
        if feature_metas[name]["is_sparse"]:
            feature_types.append((tf.int64, tf.int32, tf.int64))
            shapes.append((None, None, None))
        else:
            feature_types.append(get_dtype(feature_metas[name]["dtype"]))
            shapes.append(feature_metas[name]["shape"])

    if is_pai:
        pai_table_parts = pai_table.split(".")
        formated_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                      pai_table_parts[1])
        gen = pai_maxcompute_db_generator(formated_pai_table,
                                          feature_column_names,
                                          label_meta["feature_name"],
                                          feature_metas)
    else:
        conn = connect_with_data_source(datasource)
        gen = db_generator(conn.driver, conn, select, feature_column_names,
                           label_meta, feature_metas)

    # Clustering models do not have a label.
    if label_meta["feature_name"] == "":
        dataset = tf.data.Dataset.from_generator(gen,
                                                 (tuple(feature_types), ),
                                                 (tuple(shapes), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    else:
        dataset = tf.data.Dataset.from_generator(
            gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])),
            (tuple(shapes), label_meta["shape"]))
        ds_mapper = functools.partial(
            parse_sparse_feature,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    return dataset.map(ds_mapper)
def test_generator(self):
    driver = os.environ.get('SQLFLOW_TEST_DB')
    if driver == "mysql":
        database = "iris"
        user, password, host, port, database = testing_mysql_cfg()
        conn = connect(driver,
                       database,
                       user=user,
                       password=password,
                       host=host,
                       port=int(port))
        # prepare test data
        execute(driver, conn, self.drop_statement)
        execute(driver, conn, self.create_statement)
        execute(driver, conn, self.insert_statement)

        column_name_to_type = {
            "features": {
                "feature_name": "features",
                "delimiter": "",
                "dtype": "float32",
                "is_sparse": False,
                "shape": []
            }
        }
        label_spec = {
            "feature_name": "label",
            "shape": [],
            "delimiter": ""
        }
        gen = db_generator(driver, conn, "SELECT * FROM test_table_float_fea",
                           ["features"], label_spec, column_name_to_type)
        idx = 0
        for row, label in gen():
            features = read_features_from_row(row, ["features"], ["features"],
                                              column_name_to_type)
            d = (features, label)
            if idx == 0:
                self.assertEqual(d, (((1.0, ), ), 0))
            elif idx == 1:
                self.assertEqual(d, (((2.0, ), ), 1))
            idx += 1
        self.assertEqual(idx, 2)
def input_fn(datasetStr):
    feature_types = []
    for name in feature_column_names:
        # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
        if feature_metas[name]["is_sparse"]:
            feature_types.append((tf.int64, tf.int32, tf.int64))
        else:
            feature_types.append(get_dtype(feature_metas[name]["dtype"]))

    gen = db_generator(conn.driver, conn, datasetStr, feature_column_names,
                       label_meta["feature_name"], feature_metas)
    dataset = tf.data.Dataset.from_generator(
        gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])))
    ds_mapper = functools.partial(
        parse_sparse_feature,
        feature_column_names=feature_column_names,
        feature_metas=feature_metas)
    return dataset.map(ds_mapper)
def xgb_dataset(datasource,
                fn,
                dataset_sql,
                feature_specs,
                feature_column_names,
                label_spec,
                is_pai=False,
                pai_table="",
                pai_single_file=False):
    if is_pai:
        pai_dataset(fn, feature_specs, feature_column_names, label_spec,
                    "odps://{}/tables/{}".format(*pai_table.split(".")),
                    pai_single_file)
    else:
        conn = db.connect_with_data_source(datasource)
        gen = db.db_generator(conn.driver, conn, dataset_sql,
                              feature_column_names, label_spec, feature_specs)
        dump_dmatrix(fn, gen, label_spec)
    return xgb.DMatrix(fn)
def xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                     feature_specs, is_pai, pai_explain_table):
    label_column_name = label_spec["feature_name"]
    if is_pai:
        pai_table_parts = pai_explain_table.split(".")
        formated_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                      pai_table_parts[1])
        stream = pai_maxcompute_db_generator(formated_pai_table,
                                             feature_column_names,
                                             label_column_name, feature_specs)
    else:
        conn = connect_with_data_source(datasource)
        stream = db_generator(conn.driver, conn, select, feature_column_names,
                              label_spec, feature_specs)
    xs = pd.DataFrame(columns=feature_column_names)
    i = 0
    for row in stream():
        xs.loc[i] = [item[0] for item in row[0]]
        i += 1
    return xs
def xgb_dataset(datasource,
                fn,
                dataset_sql,
                feature_metas,
                feature_column_names,
                label_meta,
                is_pai=False,
                pai_table=""):
    if is_pai:
        pai_table_parts = pai_table.split(".")
        formated_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                      pai_table_parts[1])
        if label_meta:
            label_column_name = label_meta['feature_name']
        else:
            label_column_name = None
        gen = pai_maxcompute_db_generator(formated_pai_table,
                                          feature_column_names,
                                          label_column_name, feature_metas)
    else:
        conn = connect_with_data_source(datasource)
        gen = db_generator(conn.driver, conn, dataset_sql,
                           feature_column_names, label_meta, feature_metas)
    with open(fn, 'w') as f:
        for item in gen():
            if label_meta is None:
                row_data = [
                    "%d:%f" % (i, v[0]) for i, v in enumerate(item[0])
                ]
            else:
                features, label = item
                row_data = [str(label)] + [
                    "%d:%f" % (i, v[0]) for i, v in enumerate(features)
                ]
            f.write("\t".join(row_data) + "\n")
    # TODO(yancey1989): generate group and weight text file if necessary
    return xgb.DMatrix(fn)
def xgb_dataset(datasource,
                fn,
                dataset_sql,
                feature_specs,
                feature_column_names,
                label_spec,
                is_pai=False,
                pai_table="",
                pai_single_file=False,
                cache=False,
                batch_size=None,
                epoch=1,
                rank=0,
                nworkers=1,
                transform_fn=None,
                feature_column_code="",
                raw_data_dir=None):
    if raw_data_dir:
        # raw_data_dir is needed when predicting, because we should write the
        # raw data from the source db into the dest db instead of the
        # transformed data produced by `transform_fn(features)`. If
        # raw_data_dir is not None, the raw data from the source db is written
        # into another file.
        if os.path.exists(raw_data_dir):
            shutil.rmtree(raw_data_dir, ignore_errors=True)
        os.mkdir(raw_data_dir)

    if is_pai:
        for dmatrix in pai_dataset(
                fn, feature_specs, feature_column_names, label_spec,
                "odps://{}/tables/{}".format(*pai_table.split(".")),
                pai_single_file, cache, rank, nworkers,
                batch_size=batch_size,
                feature_column_code=feature_column_code,
                raw_data_dir=raw_data_dir):
            yield dmatrix
        return

    conn = db.connect_with_data_source(datasource)
    gen = db.db_generator(conn.driver, conn, dataset_sql,
                          feature_column_names, label_spec, feature_specs)()
    selected_cols = db.selected_cols(conn.driver, conn, dataset_sql)
    for _ in six.moves.range(epoch):
        step = 0
        # the filename per batch is [filename]_[step]
        step_file_name = "%s_%d" % (fn, step)
        written_rows = dump_dmatrix(step_file_name,
                                    gen,
                                    feature_column_names,
                                    feature_specs,
                                    label_spec,
                                    selected_cols,
                                    transform_fn=transform_fn,
                                    raw_data_dir=raw_data_dir)
        while written_rows > 0:
            yield load_dmatrix('{0}#{0}.cache'.format(step_file_name)
                               if cache else step_file_name)
            os.remove(step_file_name)
            step += 1
            step_file_name = "%s_%d" % (fn, step)
            written_rows = dump_dmatrix(step_file_name,
                                        gen,
                                        feature_column_names,
                                        feature_specs,
                                        label_spec,
                                        selected_cols,
                                        transform_fn=transform_fn,
                                        raw_data_dir=raw_data_dir)
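
# The generator above yields one DMatrix per dumped batch file; a typical consumer
# trains incrementally by passing the previous booster back through xgb_model. The
# self-contained sketch below uses synthetic batches as a stand-in for
# xgb_dataset(...); the batch shapes and booster parameters are illustrative
# assumptions, not SQLFlow defaults.
import numpy as np
import xgboost as xgb


def toy_dmatrix_batches(n_batches=3, rows=200, cols=4):
    # stand-in for xgb_dataset(...): yields one DMatrix per batch
    rng = np.random.RandomState(0)
    for _ in range(n_batches):
        x = rng.rand(rows, cols)
        y = (x.sum(axis=1) > 2.0).astype(int)
        yield xgb.DMatrix(x, label=y)


booster = None
for dmatrix in toy_dmatrix_batches():
    booster = xgb.train({"objective": "binary:logistic"}, dmatrix,
                        num_boost_round=5, xgb_model=booster)
print(len(booster.get_dump()))  # total trees accumulated across batches
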
def pred(is_keras_model,
         datasource,
         estimator,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_metas={},
         label_meta={},
         model_params={},
         save="",
         batch_size=1,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass=""):
    conn = connect_with_data_source(datasource)
    model_params.update(feature_columns)
    if not is_keras_model:
        model_params['model_dir'] = save
        classifier = estimator(**model_params)
    else:
        if not issubclass(estimator, tf.keras.Model):
            # functional model need field_metas parameter
            model_params["field_metas"] = feature_metas
        classifier = estimator(**model_params)
        classifier_pkg = sys.modules[estimator.__module__]

    if is_keras_model:

        def eval_input_fn(batch_size, cache=False):
            feature_types = []
            for name in feature_column_names:
                # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
                if feature_metas[name]["is_sparse"]:
                    feature_types.append((tf.int64, tf.int32, tf.int64))
                else:
                    feature_types.append(
                        get_dtype(feature_metas[name]["dtype"]))

            gen = db_generator(conn.driver, conn, select,
                               feature_column_names,
                               label_meta["feature_name"], feature_metas)
            dataset = tf.data.Dataset.from_generator(
                gen,
                (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])))
            ds_mapper = functools.partial(
                parse_sparse_feature,
                feature_column_names=feature_column_names,
                feature_metas=feature_metas)
            dataset = dataset.map(ds_mapper).batch(batch_size)
            if cache:
                dataset = dataset.cache()
            return dataset

        # NOTE: always use batch_size=1 when predicting to get the pairs of
        # features and predict results to insert into result table.
        pred_dataset = eval_input_fn(1)
        one_batch = pred_dataset.__iter__().next()
        # NOTE: must run predict one batch to initialize parameters
        # see: https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models
        classifier.predict_on_batch(one_batch[0])
        classifier.load_weights(save)
        del pred_dataset
        pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()
        buff_rows = []
        column_names = feature_column_names[:]
        column_names.append(label_meta["feature_name"])
        with buffered_db_writer(conn.driver, conn, result_table, column_names,
                                100, hdfs_namenode_addr, hive_location,
                                hdfs_user, hdfs_pass) as w:
            while True:
                try:
                    features = pred_dataset.get_next()
                except tf.errors.OutOfRangeError:
                    break
                result = classifier.predict_on_batch(features[0])
                result = classifier_pkg.prepare_prediction_column(result[0])
                row = []
                for idx, name in enumerate(feature_column_names):
                    val = features[0][name].numpy()[0]
                    row.append(str(val))
                row.append(str(result))
                w.write(row)
        del pred_dataset
    else:

        def fast_input_fn(generator):
            feature_types = []
            for name in feature_column_names:
                if feature_metas[name]["is_sparse"]:
                    feature_types.append((tf.int64, tf.int32, tf.int64))
                else:
                    feature_types.append(
                        get_dtype(feature_metas[name]["dtype"]))

            def _inner_input_fn():
                dataset = tf.data.Dataset.from_generator(
                    generator, (tuple(feature_types),
                                eval("tf.%s" % label_meta["dtype"])))
                ds_mapper = functools.partial(
                    parse_sparse_feature,
                    feature_column_names=feature_column_names,
                    feature_metas=feature_metas)
                dataset = dataset.map(ds_mapper).batch(1).cache()
                iterator = dataset.make_one_shot_iterator()
                features = iterator.get_next()
                return features

            return _inner_input_fn

        column_names = feature_column_names[:]
        column_names.append(label_meta["feature_name"])
        pred_gen = db_generator(conn.driver, conn, select,
                                feature_column_names,
                                label_meta["feature_name"], feature_metas)()
        fast_predictor = FastPredict(classifier, fast_input_fn)

        with buffered_db_writer(conn.driver, conn, result_table, column_names,
                                100, hdfs_namenode_addr, hive_location,
                                hdfs_user, hdfs_pass) as w:
            while True:
                try:
                    features = next(pred_gen)
                except StopIteration:
                    break
                result = fast_predictor.predict(features)
                row = []
                for idx, _ in enumerate(feature_column_names):
                    val = features[0][idx]
                    row.append(str(val))
                if "class_ids" in list(result)[0]:
                    row.append(str(list(result)[0]["class_ids"][0]))
                else:
                    # regression predictions
                    row.append(str(list(result)[0]["predictions"][0]))
                w.write(row)
        fast_predictor.close()

    print("Done predicting. Predict table : %s" % result_table)
def pred(is_keras_model,
         datasource,
         estimator,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_metas={},
         label_meta={},
         model_params={},
         save="",
         batch_size=1,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         is_pai=False,
         pai_table=""):
    global FLAGS
    define_tf_flags()
    if not is_pai:
        conn = connect_with_data_source(datasource)
    model_params.update(feature_columns)

    if is_keras_model:
        if not issubclass(estimator, tf.keras.Model):
            # functional model need field_metas parameter
            model_params["field_metas"] = feature_metas
        classifier = estimator(**model_params)
        classifier_pkg = sys.modules[estimator.__module__]

        def eval_input_fn(batch_size, cache=False):
            feature_types = []
            for name in feature_column_names:
                # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
                if feature_metas[name]["is_sparse"]:
                    feature_types.append((tf.int64, tf.int32, tf.int64))
                else:
                    feature_types.append(
                        get_dtype(feature_metas[name]["dtype"]))

            gen = db_generator(conn.driver, conn, select,
                               feature_column_names,
                               label_meta["feature_name"], feature_metas)
            dataset = tf.data.Dataset.from_generator(
                gen,
                (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])))
            ds_mapper = functools.partial(
                parse_sparse_feature,
                feature_column_names=feature_column_names,
                feature_metas=feature_metas)
            dataset = dataset.map(ds_mapper).batch(batch_size)
            if cache:
                dataset = dataset.cache()
            return dataset

        # NOTE: always use batch_size=1 when predicting to get the pairs of
        # features and predict results to insert into result table.
        pred_dataset = eval_input_fn(1)
        one_batch = pred_dataset.__iter__().next()
        # NOTE: must run predict one batch to initialize parameters
        # see: https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models
        classifier.predict_on_batch(one_batch[0])
        classifier.load_weights(save)
        del pred_dataset
        pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()
        buff_rows = []
        column_names = feature_column_names[:]
        column_names.append(label_meta["feature_name"])
        with buffered_db_writer(conn.driver, conn, result_table, column_names,
                                100, hdfs_namenode_addr, hive_location,
                                hdfs_user, hdfs_pass) as w:
            while True:
                try:
                    features = pred_dataset.get_next()
                except tf.errors.OutOfRangeError:
                    break
                result = classifier.predict_on_batch(features[0])
                result = classifier_pkg.prepare_prediction_column(result[0])
                row = []
                for idx, name in enumerate(feature_column_names):
                    val = features[0][name].numpy()[0]
                    row.append(str(val))
                row.append(str(result))
                w.write(row)
        del pred_dataset
    else:
        if is_pai:
            model_params["model_dir"] = FLAGS.checkpointDir
        else:
            model_params['model_dir'] = save
        classifier = estimator(**model_params)

        # FIXME(typhoonzero): copied from train.py
        def pai_maxcompute_input_fn():
            table_parts = pai_table.split(".")
            if len(table_parts) == 2:
                database, table_name = table_parts
            elif len(table_parts) == 1:
                table_name = pai_table
                driver, dsn = datasource.split("://")
                database = parseMaxComputeDSN(dsn)[-1]
            else:
                raise ValueError("error database.table format: %s" %
                                 pai_table)

            tables = ["odps://%s/tables/%s" % (database, table_name)]
            record_defaults = []
            for name in feature_column_names:
                dtype = get_dtype(feature_metas[name]["dtype"])
                record_defaults.append(
                    tf.constant(0,
                                dtype=dtype,
                                shape=feature_metas[name]["shape"]))
            dataset = tf.data.TableRecordDataset(
                tables,
                record_defaults=record_defaults,
                selected_cols=",".join(feature_column_names))

            def tensor_to_dict(*args):
                num_features = len(feature_column_names)
                features_dict = dict()
                for idx in range(num_features):
                    name = feature_column_names[idx]
                    features_dict[name] = tf.reshape(args[idx], [-1])
                return features_dict

            return dataset.map(tensor_to_dict)

        def fast_input_fn(generator):
            feature_types = []
            for name in feature_column_names:
                if feature_metas[name]["is_sparse"]:
                    feature_types.append((tf.int64, tf.int32, tf.int64))
                else:
                    feature_types.append(
                        get_dtype(feature_metas[name]["dtype"]))

            def _inner_input_fn():
                if is_pai:
                    dataset = pai_maxcompute_input_fn()
                else:
                    dataset = tf.data.Dataset.from_generator(
                        generator, (tuple(feature_types),
                                    eval("tf.%s" % label_meta["dtype"])))
                    ds_mapper = functools.partial(
                        parse_sparse_feature,
                        feature_column_names=feature_column_names,
                        feature_metas=feature_metas)
                    dataset = dataset.map(ds_mapper)
                dataset = dataset.batch(1).cache()
                iterator = dataset.make_one_shot_iterator()
                features = iterator.get_next()
                return features

            return _inner_input_fn

        column_names = feature_column_names[:]
        column_names.append(label_meta["feature_name"])
        pred_gen = db_generator(conn.driver, conn, select,
                                feature_column_names,
                                label_meta["feature_name"], feature_metas)()
        fast_predictor = FastPredict(classifier, fast_input_fn)

        with buffered_db_writer(conn.driver, conn, result_table, column_names,
                                100, hdfs_namenode_addr, hive_location,
                                hdfs_user, hdfs_pass) as w:
            while True:
                try:
                    features = next(pred_gen)
                except StopIteration:
                    break
                result = fast_predictor.predict(features)
                row = []
                for idx, _ in enumerate(feature_column_names):
                    val = features[0][idx]
                    row.append(str(val))
                if "class_ids" in list(result)[0]:
                    row.append(str(list(result)[0]["class_ids"][0]))
                else:
                    # regression predictions
                    row.append(str(list(result)[0]["predictions"][0]))
                w.write(row)
        fast_predictor.close()

    print("Done predicting. Predict table : %s" % result_table)
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_column_names_map,
                      feature_columns, feature_metas, result_col_name,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass, is_pai, pai_table):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)

    column_names = feature_column_names[:]
    column_names.append(result_col_name)

    if is_pai:
        driver = "pai_maxcompute"
        conn = None
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        predict_generator = db.pai_maxcompute_db_generator(
            formatted_pai_table, feature_column_names, None, feature_metas)()
    else:
        driver = conn.driver
        predict_generator = db.db_generator(conn.driver, conn, select,
                                            feature_column_names, None,
                                            feature_metas)()

    # load from the exported model
    with open("exported_path", "r") as fn:
        export_path = fn.read()
    if tf_is_version2():
        imported = tf.saved_model.load(export_path)
    else:
        imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["is_sparse"]:
                # NOTE(typhoonzero): sparse feature will get
                # (indices, values, shape) here, use indices only
                values = x[0][i][0].flatten()
            else:
                values = x[0][i].flatten()
            if dtype_str == "float32" or dtype_str == "float64":
                example.features.feature[
                    feature_name].float_list.value.extend(list(values))
            elif dtype_str == "int32" or dtype_str == "int64":
                example.features.feature[
                    feature_name].int64_list.value.extend(list(values))
        else:
            if "feature_columns" in feature_columns:
                idx = feature_column_names.index(feature_name)
                fc = feature_columns["feature_columns"][idx]
            else:
                # DNNLinearCombinedXXX have dnn_feature_columns and
                # linear_feature_columns param.
                idx = -1
                try:
                    idx = feature_column_names_map[
                        "dnn_feature_columns"].index(feature_name)
                    fc = feature_columns["dnn_feature_columns"][idx]
                except:
                    try:
                        idx = feature_column_names_map[
                            "linear_feature_columns"].index(feature_name)
                        fc = feature_columns["linear_feature_columns"][idx]
                    except:
                        pass
                if idx == -1:
                    raise ValueError(
                        "can not find feature %s in all feature columns" %
                        feature_name)
            if dtype_str == "float32" or dtype_str == "float64":
                # need to pass a tuple(float, )
                example.features.feature[
                    feature_name].float_list.value.extend(
                        (float(x[0][i][0]), ))
            elif dtype_str == "int32" or dtype_str == "int64":
                numeric_type = type(tf.feature_column.numeric_column("tmp"))
                if type(fc) == numeric_type:
                    example.features.feature[
                        feature_name].float_list.value.extend(
                            (float(x[0][i][0]), ))
                else:
                    example.features.feature[
                        feature_name].int64_list.value.extend(
                            (int(x[0][i][0]), ))
            elif dtype_str == "string":
                example.features.feature[
                    feature_name].bytes_list.value.extend(x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with db.buffered_db_writer(driver, conn, result_table, column_names, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for features in predict_generator:
            result = predict(features)
            row = []
            for idx, _ in enumerate(feature_column_names):
                per_feature = features[0][idx]
                if isinstance(per_feature, tuple) or isinstance(
                        per_feature, list):
                    # is sparse feature: tuple (indices, values, shape) or scalar
                    val = per_feature[0]
                elif isinstance(per_feature, np.ndarray):
                    val = per_feature
                    # val = features[0][idx][0]
                row.append(str(val))
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, result_col_name,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass, is_pai, pai_table):
    if not is_pai:
        conn = connect_with_data_source(datasource)

    column_names = feature_column_names[:]
    column_names.append(result_col_name)

    if is_pai:
        driver = "pai_maxcompute"
        conn = None
        pai_table_parts = pai_table.split(".")
        formated_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                      pai_table_parts[1])
        predict_generator = pai_maxcompute_db_generator(
            formated_pai_table, feature_column_names, None, feature_metas)()
    else:
        driver = conn.driver
        predict_generator = db_generator(conn.driver, conn, select,
                                         feature_column_names, None,
                                         feature_metas)()

    # load from the exported model
    if save.startswith("oss://"):
        with open("exported_path", "r") as fn:
            export_path = fn.read()
        parts = save.split("?")
        export_path_oss = parts[0] + export_path
        if TF_VERSION_2:
            imported = tf.saved_model.load(export_path_oss)
        else:
            imported = tf.saved_model.load_v2(export_path_oss)
    else:
        with open("exported_path", "r") as fn:
            export_path = fn.read()
        if TF_VERSION_2:
            imported = tf.saved_model.load(export_path)
        else:
            imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["is_sparse"]:
                # NOTE(typhoonzero): sparse feature will get
                # (indices, values, shape) here, use indices only
                values = x[0][i][0].flatten()
            else:
                values = x[0][i].flatten()
            if dtype_str == "float32" or dtype_str == "float64":
                example.features.feature[
                    feature_name].float_list.value.extend(list(values))
            elif dtype_str == "int32" or dtype_str == "int64":
                example.features.feature[
                    feature_name].int64_list.value.extend(list(values))
        else:
            if dtype_str == "float32" or dtype_str == "float64":
                # need to pass a tuple(float, )
                example.features.feature[
                    feature_name].float_list.value.extend(
                        (float(x[0][i][0]), ))
            elif dtype_str == "int32" or dtype_str == "int64":
                # FIXME(typhoonzero): figure out why int64 features need to
                # convert to float
                example.features.feature[
                    feature_name].float_list.value.extend(
                        (float(x[0][i][0]), ))
            elif dtype_str == "string":
                example.features.feature[
                    feature_name].bytes_list.value.extend(x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with buffered_db_writer(driver, conn, result_table, column_names, 100,
                            hdfs_namenode_addr, hive_location, hdfs_user,
                            hdfs_pass) as w:
        for features in predict_generator:
            result = predict(features)
            row = []
            for idx, _ in enumerate(feature_column_names):
                per_feature = features[0][idx]
                if isinstance(per_feature, tuple) or isinstance(
                        per_feature, list):
                    # is sparse feature: tuple (indices, values, shape) or scalar
                    val = per_feature[0]
                elif isinstance(per_feature, np.ndarray):
                    val = per_feature
                    # val = features[0][idx][0]
                row.append(str(val))
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
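
# add_to_example above packs a single row into a tf.train.Example before the
# serialized bytes are fed to the SavedModel's "predict" signature. A minimal,
# self-contained sketch of that packing step; the feature names and values are
# illustrative assumptions only.
import tensorflow as tf

example = tf.train.Example()
example.features.feature["sepal_length"].float_list.value.extend((5.1, ))
example.features.feature["petal_width"].float_list.value.extend((0.2, ))
example.features.feature["variety"].bytes_list.value.extend((b"Setosa", ))
serialized = example.SerializeToString()
# these serialized bytes are what get wrapped in tf.constant([...]) and passed
# as imported.signatures["predict"](examples=...)
print(len(serialized))
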
def xgb_shap_dataset(datasource,
                     select,
                     feature_column_names,
                     label_spec,
                     feature_specs,
                     is_pai,
                     pai_explain_table,
                     transform_fn=None,
                     feature_column_code=""):
    label_column_name = label_spec["feature_name"]
    if is_pai:
        pai_table_parts = pai_explain_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        stream = db.pai_maxcompute_db_generator(formatted_pai_table,
                                                feature_column_names,
                                                label_column_name,
                                                feature_specs)
        selected_cols = db.pai_selected_cols(formatted_pai_table)
    else:
        conn = db.connect_with_data_source(datasource)
        stream = db.db_generator(conn.driver, conn, select,
                                 feature_column_names, label_spec,
                                 feature_specs)
        selected_cols = db.selected_cols(conn.driver, conn, select)

    if transform_fn:
        column_names = transform_fn.get_column_names()
    else:
        column_names = feature_column_names

    # NOTE(sneaxiy): pandas.DataFrame does not support Tensor whose rank is
    # larger than 2. But `INDICATOR` would generate one hot vector for each
    # element, and pandas.DataFrame would not accept `INDICATOR` results as
    # its input. In a word, we do not support `TO EXPLAIN` when using
    # `INDICATOR`.
    xs = pd.DataFrame(columns=column_names)

    dtypes = []

    i = 0
    for row, label in stream():
        features = db.read_features_from_row(row, selected_cols,
                                             feature_column_names,
                                             feature_specs)
        if transform_fn:
            features = transform_fn(features)

        # TODO(sneaxiy): support sparse features in `TO EXPLAIN`
        features = [item[0] for item in features]
        xs.loc[i] = features

        if i == 0:
            for f in features:
                if isinstance(f, np.ndarray):
                    if f.dtype == np.float32 or f.dtype == np.float64:
                        dtypes.append('float32')
                    elif f.dtype == np.int32 or f.dtype == np.int64:
                        dtypes.append('int64')
                    else:
                        raise ValueError(
                            'Not supported data type {}'.format(f.dtype))
                elif isinstance(f, (np.float32, np.float64, float)):
                    dtypes.append('float32')
                elif isinstance(f, (np.int32, np.int64, six.integer_types)):
                    dtypes.append('int64')
                else:
                    raise ValueError(
                        'Not supported data type {}'.format(type(f)))

        i += 1
    # NOTE(typhoonzero): set dtype to the feature's actual type, or the dtype
    # may be "object". Use below code to reproduce:
    # import pandas as pd
    # feature_column_names=["a", "b"]
    # xs = pd.DataFrame(columns=feature_column_names)
    # for i in range(10):
    #     xs.loc[i] = [int(j) for j in range(2)]
    # print(xs.dtypes)
    for dtype, name in zip(dtypes, column_names):
        xs[name] = xs[name].astype(dtype)

    return xs
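
# The DataFrame returned by xgb_shap_dataset is what the SHAP explainer consumes.
# Below is a self-contained toy sketch of that downstream use; the synthetic data,
# column names, and booster parameters are illustrative assumptions, not SQLFlow
# defaults.
import numpy as np
import pandas as pd
import shap
import xgboost as xgb

rng = np.random.RandomState(0)
xs = pd.DataFrame(rng.rand(100, 3), columns=["a", "b", "c"])
y = (xs["a"] + xs["b"] > 1.0).astype(int)
booster = xgb.train({"objective": "binary:logistic"},
                    xgb.DMatrix(xs, label=y), num_boost_round=10)
shap_values = shap.TreeExplainer(booster).shap_values(xs)
print(np.asarray(shap_values).shape)  # (100, 3): one value per row and feature
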