def test_hive(self):
    driver = os.environ.get('SQLFLOW_TEST_DB')
    if driver == "hive":
        host = "127.0.0.1"
        port = "10000"
        conn = connect(driver,
                       "iris",
                       user="******",
                       password="******",
                       host=host,
                       port=port)
        self._do_test(driver,
                      conn,
                      hdfs_namenode_addr="127.0.0.1:8020",
                      hive_location="/sqlflow")
        conn.close()
        conn = connect_with_data_source(
            "hive://*****:*****@127.0.0.1:10000/iris")
        self._do_test(driver, conn)
        self._do_test_hive_specified_db(driver,
                                        conn,
                                        hdfs_namenode_addr="127.0.0.1:8020",
                                        hive_location="/sqlflow")
        conn.close()

def xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                     feature_specs, is_pai, pai_explain_table):
    label_column_name = label_spec["feature_name"]
    if is_pai:
        pai_table_parts = pai_explain_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        stream = db.pai_maxcompute_db_generator(formatted_pai_table,
                                                feature_column_names,
                                                label_column_name,
                                                feature_specs)
    else:
        conn = db.connect_with_data_source(datasource)
        stream = db.db_generator(conn.driver, conn, select,
                                 feature_column_names, label_spec,
                                 feature_specs)
    xs = pd.DataFrame(columns=feature_column_names)
    i = 0
    for row in stream():
        xs.loc[i] = [item[0] for item in row[0]]
        i += 1
    # NOTE(typhoonzero): set dtype to the feature's actual type, or the dtype
    # may be "object". Use the code below to reproduce:
    # import pandas as pd
    # feature_column_names = ["a", "b"]
    # xs = pd.DataFrame(columns=feature_column_names)
    # for i in range(10):
    #     xs.loc[i] = [int(j) for j in range(2)]
    # print(xs.dtypes)
    for fname in feature_column_names:
        dtype = feature_specs[fname]["dtype"]
        xs[fname] = xs[fname].astype(dtype)
    return xs

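# A runnable version of the repro in the NOTE above, with the fix applied: rows
# appended via DataFrame.loc leave columns with dtype "object" until they are
# cast explicitly. Self-contained sketch; the column names are illustrative only.
import pandas as pd


def demo_dataframe_dtype_pitfall():
    feature_column_names = ["a", "b"]
    xs = pd.DataFrame(columns=feature_column_names)
    for i in range(10):
        xs.loc[i] = [int(j) for j in range(2)]
    print(xs.dtypes)  # both columns report "object", not "int64"
    # Casting each column to its known type restores usable dtypes,
    # mirroring the astype() loop at the end of xgb_shap_dataset().
    for fname in feature_column_names:
        xs[fname] = xs[fname].astype("int64")
    print(xs.dtypes)
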
def explain_dnns(datasource, estimator, shap_dataset, plot_type, result_table,
                 feature_column_names, is_pai, pai_table, hdfs_namenode_addr,
                 hive_location, hdfs_user, hdfs_pass):
    def predict(d):
        def input_fn():
            return tf.data.Dataset.from_tensor_slices(
                dict(pd.DataFrame(d, columns=shap_dataset.columns))).batch(1)

        return np.array(
            [p['probabilities'][0] for p in estimator.predict(input_fn)])

    shap_values = shap.KernelExplainer(predict,
                                       shap_dataset).shap_values(shap_dataset)
    # Debug output: dump the raw SHAP values and per-row lengths.
    print(shap_values)
    for row in shap_values:
        print(list(row))
        print(len(list(row)))
    if result_table != "":
        if is_pai:
            write_shap_values(shap_values, "pai_maxcompute", None,
                              result_table, feature_column_names,
                              hdfs_namenode_addr, hive_location, hdfs_user,
                              hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_shap_values(shap_values, conn.driver, conn, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)
    else:
        explainer.plot_and_save(lambda: shap.summary_plot(
            shap_values, shap_dataset, show=False, plot_type=plot_type))

def train(datasource, select, model_params, train_params, feature_field_meta,
          label_field_meta, validation_select):
    conn = connect_with_data_source(datasource)
    # NOTE(tony): sorting is necessary to achieve consistent feature orders
    # between the training job and the prediction/analysis job.
    feature_column_name = [k["name"] for k in feature_field_meta]
    label_name = label_field_meta["name"]
    feature_spec = {k['name']: k for k in feature_field_meta}

    dtrain = xgb_dataset(conn, 'train.txt', select, feature_column_name,
                         label_name, feature_spec)
    watchlist = [(dtrain, "train")]
    if len(validation_select.strip()) > 0:
        dvalidate = xgb_dataset(conn, 'validate.txt', validation_select,
                                feature_column_name, label_name, feature_spec)
        watchlist.append((dvalidate, "validate"))

    re = dict()
    bst = xgb.train(model_params,
                    dtrain,
                    **train_params,
                    evals=watchlist,
                    evals_result=re)
    bst.save_model("my_model")
    print("Evaluation result: %s" % re)

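# Hedged usage sketch for train() above. The metadata layout (dicts keyed by
# "name") is inferred from how feature_field_meta and label_field_meta are
# consumed; the column names and XGBoost parameters are hypothetical
# placeholders, and the datasource URL mirrors the test_mysql configuration
# elsewhere in this collection rather than any value from this function.
def demo_train_call():
    feature_field_meta = [{"name": "sepal_length"}, {"name": "sepal_width"}]
    label_field_meta = {"name": "class"}
    train(datasource="mysql://root:root@tcp(127.0.0.1:3306)/iris",
          select="SELECT * FROM iris.train",
          model_params={"objective": "multi:softprob", "num_class": 3},
          train_params={"num_boost_round": 10},
          feature_field_meta=feature_field_meta,
          label_field_meta=label_field_meta,
          validation_select="SELECT * FROM iris.test")
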
def explain_dnns(datasource, estimator, shap_dataset, plot_type, result_table,
                 feature_column_names, is_pai, pai_table, hdfs_namenode_addr,
                 hive_location, hdfs_user, hdfs_pass):
    def predict(d):
        def input_fn():
            return tf.data.Dataset.from_tensor_slices(
                dict(pd.DataFrame(d,
                                  columns=shap_dataset.columns))).batch(1000)

        return np.array(
            [p['probabilities'][-1] for p in estimator.predict(input_fn)])

    if len(shap_dataset) > 100:
        # Reduce to 16 weighted samples to speed up
        shap_dataset_summary = shap.kmeans(shap_dataset, 16)
    else:
        shap_dataset_summary = shap_dataset
    shap_values = shap.KernelExplainer(
        predict, shap_dataset_summary).shap_values(shap_dataset, l1_reg="aic")
    if result_table != "":
        if is_pai:
            write_shap_values(shap_values, "pai_maxcompute", None,
                              result_table, feature_column_names,
                              hdfs_namenode_addr, hive_location, hdfs_user,
                              hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_shap_values(shap_values, conn.driver, conn, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)
    else:
        explainer.plot_and_save(lambda: shap.summary_plot(
            shap_values, shap_dataset, show=False, plot_type=plot_type))

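# A standalone sketch of the background-summarization trick used above:
# shap.KernelExplainer's cost grows with the size of the background set, so a
# k-means summary (16 weighted centroids, as in explain_dnns) stands in for
# the full data. The model and data below are synthetic stand-ins, not part
# of the source; shap.kmeans, KernelExplainer, and l1_reg="aic" are real
# shap APIs.
import numpy as np
import pandas as pd
import shap


def demo_kernel_explainer_with_kmeans():
    rng = np.random.RandomState(0)
    x = pd.DataFrame(rng.rand(200, 3), columns=["f0", "f1", "f2"])

    def predict(d):
        # Toy scoring function standing in for estimator.predict().
        return np.asarray(d).sum(axis=1)

    # Summarize the background set only when it is large, as above.
    background = shap.kmeans(x, 16) if len(x) > 100 else x
    shap_values = shap.KernelExplainer(predict, background).shap_values(
        x, l1_reg="aic")
    return shap_values
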
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, label_meta,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass, is_pai, pai_table):
    classifier = estimator(**model_params)
    conn = connect_with_data_source(datasource)

    def fast_input_fn(generator):
        feature_types = []
        for name in feature_column_names:
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))

        def _inner_input_fn():
            if is_pai:
                dataset = pai_maxcompute_input_fn(pai_table, datasource,
                                                  feature_column_names,
                                                  feature_metas, label_meta)
            else:
                dataset = tf.data.Dataset.from_generator(
                    generator, (tuple(feature_types),
                                eval("tf.%s" % label_meta["dtype"])))
            ds_mapper = functools.partial(
                parse_sparse_feature,
                feature_column_names=feature_column_names,
                feature_metas=feature_metas)
            dataset = dataset.map(ds_mapper)
            dataset = dataset.batch(1).cache()
            iterator = dataset.make_one_shot_iterator()
            features = iterator.get_next()
            return features

        return _inner_input_fn

    column_names = feature_column_names[:]
    column_names.append(label_meta["feature_name"])
    fast_predictor = FastPredict(classifier, fast_input_fn)

    with buffered_db_writer(conn.driver, conn, result_table, column_names,
                            100, hdfs_namenode_addr, hive_location, hdfs_user,
                            hdfs_pass) as w:
        for features in db_generator(conn.driver, conn, select,
                                     feature_column_names,
                                     label_meta["feature_name"],
                                     feature_metas)():
            result = fast_predictor.predict(features)
            row = []
            for idx, _ in enumerate(feature_column_names):
                val = features[0][idx][0]
                row.append(str(val))
            if "class_ids" in list(result)[0]:
                row.append(str(list(result)[0]["class_ids"][0]))
            else:
                # regression predictions
                row.append(str(list(result)[0]["predictions"][0]))
            w.write(row)

def pred(datasource,
         select,
         feature_metas,
         feature_column_names,
         label_meta,
         result_table,
         is_pai=False,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         pai_table=""):
    # TODO(typhoonzero): support running on PAI without MaxCompute AK/SK
    # connection.
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    label_name = label_meta["feature_name"]
    dpred = xgb_dataset(datasource, 'predict.txt', select, feature_metas,
                        feature_column_names, None, is_pai, pai_table, True)
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load model
    print("Start predicting XGBoost model...")
    preds = bst.predict(dpred)
    # TODO(Yancey1989): use the train parameters to decide between a
    # regression model and a classifier model.
    if len(preds.shape) == 2:
        # classifier result
        preds = np.argmax(np.array(preds), axis=1)

    feature_file_read = open("predict.txt", "r")
    # copy the list so the caller's feature_column_names is not mutated
    result_column_names = feature_column_names[:]
    result_column_names.append(label_name)
    line_no = 0
    if is_pai:
        driver = "pai_maxcompute"
        conn = None
    else:
        driver = conn.driver
    with db.buffered_db_writer(driver,
                               conn,
                               result_table,
                               result_column_names,
                               100,
                               hdfs_namenode_addr=hdfs_namenode_addr,
                               hive_location=hive_location,
                               hdfs_user=hdfs_user,
                               hdfs_pass=hdfs_pass) as w:
        while True:
            line = feature_file_read.readline()
            if not line:
                break
            row = [
                i.split(":")[1] for i in line.replace("\n", "").split("\t")
            ]
            row.append(str(preds[line_no]))
            w.write(row)
            line_no += 1
    print("Done predicting. Predict table : %s" % result_table)

def keras_predict(estimator, model_params, save, result_table,
                  feature_column_names, feature_metas, label_meta, datasource,
                  select, hdfs_namenode_addr, hive_location, hdfs_user,
                  hdfs_pass):
    classifier = estimator(**model_params)
    classifier_pkg = sys.modules[estimator.__module__]
    conn = connect_with_data_source(datasource)

    def eval_input_fn(batch_size, cache=False):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))
        gen = db_generator(conn.driver, conn, select, feature_column_names,
                           label_meta["feature_name"], feature_metas)
        dataset = tf.data.Dataset.from_generator(
            gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])))
        ds_mapper = functools.partial(
            parse_sparse_feature,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    # NOTE: always use batch_size=1 when predicting to get the pairs of
    # features and predict results to insert into the result table.
    pred_dataset = eval_input_fn(1)
    one_batch = next(iter(pred_dataset))
    # NOTE: must run predict on one batch to initialize parameters, see:
    # https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models
    classifier.predict_on_batch(one_batch[0])
    classifier.load_weights(save)
    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()
    buff_rows = []
    column_names = feature_column_names[:]
    column_names.append(label_meta["feature_name"])
    with buffered_db_writer(conn.driver, conn, result_table, column_names,
                            100, hdfs_namenode_addr, hive_location, hdfs_user,
                            hdfs_pass) as w:
        for features in pred_dataset:
            result = classifier.predict_on_batch(features[0])
            result = classifier_pkg.prepare_prediction_column(result[0])
            row = []
            for idx, name in enumerate(feature_column_names):
                val = features[0][name].numpy()[0][0]
                row.append(str(val))
            row.append(str(result))
            w.write(row)
    del pred_dataset

def keras_train_and_save(estimator, model_params, save, feature_column_names,
                         feature_metas, label_meta, datasource, select,
                         validate_select, batch_size, epochs, verbose):
    classifier = estimator(**model_params)
    classifier_pkg = sys.modules[estimator.__module__]
    if hasattr(classifier_pkg, "eval_metrics_fn"):
        metrics_functions = classifier_pkg.eval_metrics_fn()
        metrics = []
        for key, func in metrics_functions.items():
            func.__name__ = key
            metrics.append(func)
    else:
        metrics = ["accuracy"]

    conn = connect_with_data_source(datasource)
    # FIXME(typhoonzero): find a way to cache to local file and avoid cache
    # lockfile already exists issue.
    train_dataset = input_fn(select, conn, feature_column_names,
                             feature_metas, label_meta)
    train_dataset = train_dataset.shuffle(SHUFFLE_SIZE).batch(
        batch_size).cache()
    if validate_select != "":
        validate_dataset = input_fn(validate_select, conn,
                                    feature_column_names, feature_metas,
                                    label_meta).batch(batch_size).cache()

    classifier.compile(optimizer=classifier_pkg.optimizer(),
                       loss=classifier_pkg.loss,
                       metrics=metrics)
    if hasattr(classifier, 'sqlflow_train_loop'):
        classifier.sqlflow_train_loop(train_dataset)
    else:
        if label_meta["feature_name"] != "" and validate_select != "":
            history = classifier.fit(train_dataset,
                                     epochs=epochs if epochs else
                                     classifier.default_training_epochs(),
                                     validation_data=validate_dataset,
                                     verbose=verbose)
        else:
            history = classifier.fit(train_dataset,
                                     epochs=epochs if epochs else
                                     classifier.default_training_epochs(),
                                     verbose=verbose)
        train_keys = []
        val_keys = []
        for k in history.history.keys():
            if k.startswith("val_"):
                val_keys.append(k)
            else:
                train_keys.append(k)
        print("====== Result for training set: ======")
        for k in train_keys:
            print("%s: %s" % (k, history.history[k][-1]))
        print("====== Result for validation set: ======")
        for k in val_keys:
            print("%s: %s" % (k, history.history[k][-1]))
    classifier.save_weights(save, save_format="h5")

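# Sketch of why keras_train_and_save() assigns func.__name__ above: Keras
# labels a metric in training logs and in History.history by the metric
# function's __name__, so renaming the function controls the reported key.
# Self-contained; the eval_metrics_fn below is an illustrative stand-in for
# the one looked up on classifier_pkg.
import tensorflow as tf


def demo_named_metrics():
    def eval_metrics_fn():
        def metric(y_true, y_pred):
            return tf.reduce_mean(tf.abs(y_true - y_pred))

        return {"mean_abs_err": metric}

    metrics = []
    for key, func in eval_metrics_fn().items():
        func.__name__ = key  # history.history will use "mean_abs_err"
        metrics.append(func)
    return metrics
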
def _input_fn():
    if is_pai:
        dataset = pai_maxcompute_input_fn(pai_table, datasource,
                                          feature_column_names,
                                          feature_metas, label_meta)
    else:
        conn = connect_with_data_source(datasource)
        dataset = input_fn(select, conn, feature_column_names, feature_metas,
                           label_meta)
    return dataset.batch(1).cache()

def pred(datasource,
         select,
         feature_metas,
         feature_column_names,
         label_meta,
         result_table,
         is_pai=False,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         pai_table="",
         model_params=None,
         train_params=None,
         transform_fn=None,
         feature_column_code=""):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    else:
        conn = None
    label_name = label_meta["feature_name"]
    dpred = xgb_dataset(
        datasource=datasource,
        fn='predict.txt',
        dataset_sql=select,
        feature_specs=feature_metas,
        feature_column_names=feature_column_names,
        label_spec=None,
        is_pai=is_pai,
        pai_table=pai_table,
        pai_single_file=True,
        cache=True,
        batch_size=DEFAULT_PREDICT_BATCH_SIZE,
        transform_fn=transform_fn,
        feature_column_code=feature_column_code,
        raw_data_dir="predict.raw.dir")  # NOTE: default to use external memory
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load model
    print("Start predicting XGBoost model...")

    if is_pai:
        pai_table = "odps://{}/tables/{}".format(*pai_table.split("."))
        selected_cols = db.pai_selected_cols(pai_table)
    else:
        selected_cols = db.selected_cols(conn.driver, conn, select)

    feature_file_id = 0
    for pred_dmatrix in dpred:
        predict_and_store_result(bst, pred_dmatrix, feature_file_id,
                                 model_params, selected_cols, label_name,
                                 is_pai, conn, result_table,
                                 hdfs_namenode_addr, hive_location, hdfs_user,
                                 hdfs_pass)
        feature_file_id += 1
    print("Done predicting. Predict table : %s" % result_table)

def explain(datasource,
            select,
            feature_field_meta,
            feature_column_names,
            label_spec,
            summary_params,
            result_table="",
            is_pai=False,
            pai_explain_table="",
            hdfs_namenode_addr="",
            hive_location="",
            hdfs_user="",
            hdfs_pass="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None):
    x = xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                         feature_field_meta, is_pai, pai_explain_table)

    shap_values, shap_interaction_values, expected_value = xgb_shap_values(x)

    if result_table != "":
        if is_pai:
            # TODO(typhoonzero): the shape of shap_values is
            # (3, num_samples, num_features), use the first dimension here,
            # should find out how to use the other two.
            write_shap_values(shap_values[0], "pai_maxcompute", None,
                              result_table, feature_column_names,
                              hdfs_namenode_addr, hive_location, hdfs_user,
                              hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_shap_values(shap_values[0], conn.driver, conn, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)
        return

    if summary_params.get("plot_type") == "decision":
        explainer.plot_and_save(
            lambda: shap.decision_plot(expected_value,
                                       shap_interaction_values,
                                       x,
                                       show=False,
                                       feature_display_range=slice(
                                           None, -40, -1),
                                       alpha=1), is_pai, oss_dest, oss_ak,
            oss_sk, oss_endpoint, oss_bucket_name)
    else:
        explainer.plot_and_save(
            lambda: shap.summary_plot(
                shap_values, x, show=False, **summary_params), is_pai,
            oss_dest, oss_ak, oss_sk, oss_endpoint, oss_bucket_name)

def input_fn(select,
             datasource,
             feature_column_names,
             feature_metas,
             label_meta,
             is_pai=False,
             pai_table="",
             num_workers=1,
             worker_id=0):
    feature_types = []
    shapes = []
    for name in feature_column_names:
        # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
        if feature_metas[name]["is_sparse"]:
            feature_types.append((tf.int64, tf.int32, tf.int64))
            shapes.append((None, None, None))
        else:
            feature_types.append(get_dtype(feature_metas[name]["dtype"]))
            shapes.append(feature_metas[name]["shape"])

    if is_pai:
        pai_table = "odps://{}/tables/{}".format(*pai_table.split("."))
        return pai_dataset(pai_table,
                           feature_column_names,
                           label_meta,
                           feature_metas,
                           slice_id=worker_id,
                           slice_count=num_workers)

    conn = db.connect_with_data_source(datasource)
    gen = db.db_generator(conn.driver, conn, select, feature_column_names,
                          label_meta, feature_metas)
    selected_cols = db.selected_cols(conn.driver, conn, select)
    gen = tf_generator(gen, selected_cols, feature_column_names,
                       feature_metas)

    # Clustering models do not have a label.
    if not label_meta or label_meta["feature_name"] == "":
        dataset = tf.data.Dataset.from_generator(gen,
                                                 (tuple(feature_types), ),
                                                 (tuple(shapes), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    else:
        dataset = tf.data.Dataset.from_generator(
            gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])),
            (tuple(shapes), label_meta["shape"]))
        ds_mapper = functools.partial(
            parse_sparse_feature,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    return dataset.map(ds_mapper)

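# Minimal sketch of the Dataset.from_generator() contract relied on above:
# output types and shapes are passed as parallel (features, label) tuples,
# with one entry per column; sparse columns use a dtype triple and
# (None, None, None) shapes, dense scalar columns look like the demo below.
# The generator and column layout are synthetic stand-ins.
import tensorflow as tf


def demo_from_generator_dataset():
    def gen():
        for i in range(4):
            # ((feature columns...), label)
            yield ((float(i), float(i) * 2.0), float(i % 2))

    dataset = tf.data.Dataset.from_generator(
        gen,
        ((tf.float32, tf.float32), tf.float32),  # (features, label) dtypes
        (((), ()), ()))  # scalar shape per column and for the label
    return dataset.batch(2)
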
def validate_input_fn():
    if is_pai:
        validate_dataset = pai_maxcompute_input_fn(pai_val_table, datasource,
                                                   feature_column_names,
                                                   feature_metas, label_meta,
                                                   len(FLAGS.worker_hosts),
                                                   FLAGS.task_index)
    else:
        conn = connect_with_data_source(datasource)
        validate_dataset = input_fn(validate_select, conn,
                                    feature_column_names, feature_metas,
                                    label_meta)
    validate_dataset = validate_dataset.batch(batch_size)
    return validate_dataset

def pred(datasource,
         select,
         feature_field_meta,
         label_field_meta,
         result_table,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass=""):
    conn = connect_with_data_source(datasource)
    feature_column_names = [k["name"] for k in feature_field_meta]
    label_name = label_field_meta["name"]
    feature_specs = {k['name']: k for k in feature_field_meta}

    dpred = xgb_dataset(conn, 'predict.txt', select, feature_column_names,
                        label_name, feature_specs)
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load model
    preds = bst.predict(dpred)
    # TODO(Yancey1989): use the train parameters to decide between a
    # regression model and a classifier model.
    if len(preds.shape) == 2:
        # classifier result
        preds = np.argmax(np.array(preds), axis=1)

    feature_file_read = open("predict.txt", "r")
    # copy the list so feature_column_names itself is not mutated
    result_column_names = feature_column_names[:]
    result_column_names.append(label_name)
    line_no = 0
    with buffered_db_writer(conn.driver,
                            conn,
                            result_table,
                            result_column_names,
                            100,
                            hdfs_namenode_addr=hdfs_namenode_addr,
                            hive_location=hive_location,
                            hdfs_user=hdfs_user,
                            hdfs_pass=hdfs_pass) as w:
        while True:
            line = feature_file_read.readline()
            if not line:
                break
            row = [
                i.split(":")[1]
                for i in line.replace("\n", "").split("\t")[1:]
            ]
            row.append(str(preds[line_no]))
            w.write(row)
            line_no += 1
    print("Done predicting. Predict table : %s" % result_table)

def xgb_shap_dataset(datasource, select, feature_column_names, label_name,
                     feature_specs):
    conn = connect_with_data_source(datasource)
    stream = db_generator(conn.driver, conn, select, feature_column_names,
                          label_name, feature_specs)
    xs = pd.DataFrame(columns=feature_column_names)
    ys = pd.DataFrame(columns=[label_name])
    i = 0
    for row in stream():
        xs.loc[i] = [item[0] for item in row[0]]
        ys.loc[i] = row[1]
        i += 1
    # NOTE: ys is collected above but only the feature frame is returned.
    return xs

def test_mysql(self):
    driver = os.environ.get('SQLFLOW_TEST_DB')
    if driver == "mysql":
        user = os.environ.get('SQLFLOW_TEST_DB_MYSQL_USER') or "root"
        password = os.environ.get('SQLFLOW_TEST_DB_MYSQL_PASSWD') or "root"
        host = "127.0.0.1"
        port = "3306"
        database = "iris"
        conn = connect(driver,
                       database,
                       user=user,
                       password=password,
                       host=host,
                       port=port)
        self._do_test(driver, conn)
        conn = connect_with_data_source(
            "mysql://*****:*****@tcp(127.0.0.1:3306)/iris?maxAllowedPacket=0")
        self._do_test(driver, conn)

def xgb_dataset(datasource,
                fn,
                dataset_sql,
                feature_specs,
                feature_column_names,
                label_spec,
                is_pai=False,
                pai_table="",
                pai_single_file=False,
                cache=False,
                batch_size=None,
                epoch=1,
                rank=0,
                nworkers=1):
    if is_pai:
        for dmatrix in pai_dataset(fn,
                                   feature_specs,
                                   feature_column_names,
                                   label_spec,
                                   "odps://{}/tables/{}".format(
                                       *pai_table.split(".")),
                                   pai_single_file,
                                   cache,
                                   rank,
                                   nworkers,
                                   batch_size=batch_size):
            yield dmatrix
        return

    conn = db.connect_with_data_source(datasource)
    gen = db.db_generator(conn.driver, conn, dataset_sql,
                          feature_column_names, label_spec, feature_specs)()
    selected_cols = db.selected_cols(conn.driver, conn, dataset_sql)
    for i in range(epoch):
        step = 0
        # the filename per batch is [filename]_[step]
        step_file_name = "%s_%d" % (fn, step)
        written_rows = dump_dmatrix(step_file_name, gen, feature_column_names,
                                    feature_specs, label_spec, selected_cols)
        while written_rows > 0:
            yield load_dmatrix('{0}#{0}.cache'.format(step_file_name)
                               if cache else step_file_name)
            os.remove(step_file_name)
            step += 1
            step_file_name = "%s_%d" % (fn, step)
            written_rows = dump_dmatrix(step_file_name, gen,
                                        feature_column_names, feature_specs,
                                        label_spec, selected_cols)

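# Sketch of the external-memory syntax used in load_dmatrix above: appending
# "#<name>.cache" to a libsvm file path asks XGBoost to build an on-disk
# cache instead of holding the whole DMatrix in memory. The file name below
# is a hypothetical placeholder; the "path#cache" convention itself is
# standard XGBoost.
import xgboost as xgb


def demo_external_memory_dmatrix(step_file_name="train.txt_0", cache=True):
    path = ('{0}#{0}.cache'.format(step_file_name)
            if cache else step_file_name)
    return xgb.DMatrix(path)
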
def pred(datasource,
         estimator_string,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_column_names_map,
         result_col_name,
         feature_metas={},
         model_params={},
         save="",
         batch_size=1,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         is_pai=False,
         pai_table=""):
    # import custom model package
    model_import_name = sqlflow_submitter.import_model_def(estimator_string)
    estimator = eval(estimator_string)
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    model_params.update(feature_columns)

    is_estimator = issubclass(
        estimator,
        (tf.estimator.Estimator, tf.estimator.BoostedTreesClassifier,
         tf.estimator.BoostedTreesRegressor))
    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table, is_pai,
                      pai_table, feature_column_names, feature_metas,
                      result_col_name, datasource, select, hdfs_namenode_addr,
                      hive_location, hdfs_user, hdfs_pass)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(estimator, model_params, save, result_table,
                          feature_column_names, feature_column_names_map,
                          feature_columns, feature_metas, result_col_name,
                          datasource, select, hdfs_namenode_addr,
                          hive_location, hdfs_user, hdfs_pass, is_pai,
                          pai_table)

    print("Done predicting. Predict table : %s" % result_table)

def explain_dnns(datasource, estimator, shap_dataset, plot_type, result_table,
                 feature_column_names, is_pai, pai_table, hdfs_namenode_addr,
                 hive_location, hdfs_user, hdfs_pass, oss_dest, oss_ak,
                 oss_sk, oss_endpoint, oss_bucket_name):
    def predict(d):
        if len(d) == 1:
            # This is to make sure the progress bar of SHAP displays properly:
            # 1. The newline makes the progress bar string captured in pipe
            # 2. The ASCII control code moves the cursor up twice for alignment
            print("\033[A" * 2)

        def input_fn():
            return tf.data.Dataset.from_tensor_slices(
                dict(pd.DataFrame(d,
                                  columns=shap_dataset.columns))).batch(1000)

        if plot_type == 'bar':
            predictions = [
                p['logits'] if 'logits' in p else p['predictions']
                for p in estimator.predict(input_fn)
            ]
        else:
            predictions = [
                p['logits'][-1] if 'logits' in p else p['predictions'][-1]
                for p in estimator.predict(input_fn)
            ]
        return np.array(predictions)

    if len(shap_dataset) > 100:
        # Reduce to 16 weighted samples to speed up
        shap_dataset_summary = shap.kmeans(shap_dataset, 16)
    else:
        shap_dataset_summary = shap_dataset
    shap_values = shap.KernelExplainer(
        predict, shap_dataset_summary).shap_values(shap_dataset, l1_reg="aic")
    if result_table != "":
        if is_pai:
            write_shap_values(shap_values, "pai_maxcompute", None,
                              result_table, feature_column_names,
                              hdfs_namenode_addr, hive_location, hdfs_user,
                              hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_shap_values(shap_values, conn.driver, conn, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)

    explainer.plot_and_save(
        lambda: shap.summary_plot(
            shap_values, shap_dataset, show=False, plot_type=plot_type),
        is_pai, oss_dest, oss_ak, oss_sk, oss_endpoint, oss_bucket_name)

def test_mysql(self):
    driver = os.environ.get('SQLFLOW_TEST_DB')
    if driver == "mysql":
        user, password, host, port, database = testing_mysql_cfg()
        conn = connect(driver,
                       database,
                       user=user,
                       password=password,
                       host=host,
                       port=port)
        self._do_test(driver, conn)
        conn = connect_with_data_source(testing_mysql_db_url())
        self._do_test(driver, conn)

def train_input_fn():
    # FIXME(typhoonzero): find a way to cache to local file and avoid cache
    # lockfile already exists issue.
    if is_pai:
        train_dataset = pai_maxcompute_input_fn(pai_table, datasource,
                                                feature_column_names,
                                                feature_metas, label_meta,
                                                len(FLAGS.worker_hosts),
                                                FLAGS.task_index)
    else:
        conn = connect_with_data_source(datasource)
        train_dataset = input_fn(select, conn, feature_column_names,
                                 feature_metas, label_meta)
    train_dataset = train_dataset.shuffle(SHUFFLE_SIZE).batch(
        batch_size).cache().repeat(epochs if epochs else 1)
    return train_dataset

def pred(datasource,
         estimator,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         result_col_name,
         feature_metas={},
         model_params={},
         save="",
         batch_size=1,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         is_pai=False,
         pai_table=""):
    if not is_pai:
        conn = connect_with_data_source(datasource)
    model_params.update(feature_columns)

    is_estimator = issubclass(
        estimator,
        (tf.estimator.Estimator, tf.estimator.BoostedTreesClassifier,
         tf.estimator.BoostedTreesRegressor))
    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, result_col_name,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass)
    else:
        if is_pai:
            FLAGS = define_tf_flags()
            model_params["model_dir"] = FLAGS.checkpointDir
        else:
            model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(estimator, model_params, save, result_table,
                          feature_column_names, feature_metas,
                          result_col_name, datasource, select,
                          hdfs_namenode_addr, hive_location, hdfs_user,
                          hdfs_pass, is_pai, pai_table)

    print("Done predicting. Predict table : %s" % result_table)

def explain_boosted_trees(datasource, estimator, input_fn, plot_type,
                          result_table, feature_column_names,
                          hdfs_namenode_addr, hive_location, hdfs_user,
                          hdfs_pass):
    result = estimator.experimental_predict_with_explanations(input_fn)
    pred_dicts = list(result)
    df_dfc = pd.DataFrame([pred['dfc'] for pred in pred_dicts])
    dfc_mean = df_dfc.abs().mean()
    if result_table != "":
        conn = connect_with_data_source(datasource)
        gain = estimator.experimental_feature_importances(normalize=True)
        create_explain_result_table(conn, result_table)
        write_dfc_result(dfc_mean, gain, result_table, conn,
                         feature_column_names, hdfs_namenode_addr,
                         hive_location, hdfs_user, hdfs_pass)
    explainer.plot_and_save(lambda: eval(plot_type)(df_dfc))

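# Sketch of the directional feature contribution (DFC) aggregation performed
# above: each prediction dict carries a per-feature 'dfc' mapping, and the
# mean of absolute contributions yields a global importance ranking.
# pred_dicts below is a synthetic stand-in for
# list(estimator.experimental_predict_with_explanations(input_fn)).
import pandas as pd


def demo_mean_abs_dfc():
    pred_dicts = [
        {"dfc": {"age": 0.2, "fare": -0.5}},
        {"dfc": {"age": -0.1, "fare": 0.3}},
    ]
    df_dfc = pd.DataFrame([pred["dfc"] for pred in pred_dicts])
    # Rank features by mean absolute contribution, largest first.
    return df_dfc.abs().mean().sort_values(ascending=False)
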
def input_fn(select,
             datasource,
             feature_column_names,
             feature_metas,
             label_meta,
             is_pai=False,
             pai_table=""):
    feature_types = []
    shapes = []
    for name in feature_column_names:
        # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
        if feature_metas[name]["is_sparse"]:
            feature_types.append((tf.int64, tf.int32, tf.int64))
            shapes.append((None, None, None))
        else:
            feature_types.append(get_dtype(feature_metas[name]["dtype"]))
            shapes.append(feature_metas[name]["shape"])

    if is_pai:
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        gen = pai_maxcompute_db_generator(formatted_pai_table,
                                          feature_column_names,
                                          label_meta["feature_name"],
                                          feature_metas)
    else:
        conn = connect_with_data_source(datasource)
        gen = db_generator(conn.driver, conn, select, feature_column_names,
                           label_meta, feature_metas)

    # Clustering models do not have a label.
    if label_meta["feature_name"] == "":
        dataset = tf.data.Dataset.from_generator(gen,
                                                 (tuple(feature_types), ),
                                                 (tuple(shapes), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    else:
        dataset = tf.data.Dataset.from_generator(
            gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])),
            (tuple(shapes), label_meta["shape"]))
        ds_mapper = functools.partial(
            parse_sparse_feature,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    return dataset.map(ds_mapper)

def evaluate(datasource,
             select,
             feature_metas,
             feature_column_names,
             label_meta,
             result_table,
             validation_metrics=["accuracy_score"],
             is_pai=False,
             hdfs_namenode_addr="",
             hive_location="",
             hdfs_user="",
             hdfs_pass="",
             pai_table="",
             model_params=None,
             transform_fn=None,
             feature_column_code=""):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    else:
        conn = None
    dpred = xgb_dataset(datasource,
                        'predict.txt',
                        select,
                        feature_metas,
                        feature_column_names,
                        label_meta,
                        is_pai,
                        pai_table,
                        True,
                        True,
                        batch_size=DEFAULT_PREDICT_BATCH_SIZE,
                        transform_fn=transform_fn,
                        feature_column_code=feature_column_code
                        )  # NOTE: default to use external memory
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load model
    print("Start evaluating XGBoost model...")
    feature_file_id = 0
    for pred_dmatrix in dpred:
        evaluate_and_store_result(bst, pred_dmatrix, feature_file_id,
                                  validation_metrics, model_params,
                                  feature_column_names, label_meta, is_pai,
                                  conn, result_table, hdfs_namenode_addr,
                                  hive_location, hdfs_user, hdfs_pass)
        feature_file_id += 1
    print("Done evaluating. Result table : %s" % result_table)

def pred(datasource,
         select,
         feature_metas,
         feature_column_names,
         label_meta,
         result_table,
         is_pai=False,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         pai_table="",
         model_params=None,
         train_params=None):
    # TODO(typhoonzero): support running on PAI without MaxCompute AK/SK
    # connection.
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    else:
        conn = None
    label_name = label_meta["feature_name"]
    dpred = xgb_dataset(datasource,
                        'predict.txt',
                        select,
                        feature_metas,
                        feature_column_names,
                        None,
                        is_pai,
                        pai_table,
                        True,
                        True,
                        batch_size=DEFAULT_PREDICT_BATCH_SIZE
                        )  # NOTE: default to use external memory
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load model
    print("Start predicting XGBoost model...")
    feature_file_id = 0
    for pred_dmatrix in dpred:
        predict_and_store_result(bst, pred_dmatrix, feature_file_id,
                                 model_params, feature_column_names,
                                 label_name, is_pai, conn, result_table,
                                 hdfs_namenode_addr, hive_location, hdfs_user,
                                 hdfs_pass)
        feature_file_id += 1
    print("Done predicting. Predict table : %s" % result_table)

def pred(is_keras_model,
         datasource,
         estimator,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_metas={},
         label_meta={},
         model_params={},
         save="",
         batch_size=1,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         is_pai=False,
         pai_table=""):
    if not is_pai:
        conn = connect_with_data_source(datasource)
    model_params.update(feature_columns)

    if is_keras_model:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, label_meta,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass)
    else:
        if is_pai:
            FLAGS = define_tf_flags()
            model_params["model_dir"] = FLAGS.checkpointDir
        else:
            model_params['model_dir'] = save
        estimator_predict(estimator, model_params, save, result_table,
                          feature_column_names, feature_metas, label_meta,
                          datasource, select, hdfs_namenode_addr,
                          hive_location, hdfs_user, hdfs_pass, is_pai,
                          pai_table)

    print("Done predicting. Predict table : %s" % result_table)

def xgb_dataset(datasource,
                fn,
                dataset_sql,
                feature_specs,
                feature_column_names,
                label_spec,
                is_pai=False,
                pai_table="",
                pai_single_file=False):
    if is_pai:
        pai_dataset(fn, feature_specs, feature_column_names, label_spec,
                    "odps://{}/tables/{}".format(*pai_table.split(".")),
                    pai_single_file)
    else:
        conn = db.connect_with_data_source(datasource)
        gen = db.db_generator(conn.driver, conn, dataset_sql,
                              feature_column_names, label_spec, feature_specs)
        dump_dmatrix(fn, gen, label_spec)
    return xgb.DMatrix(fn)

def xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                     feature_specs, is_pai, pai_explain_table):
    label_column_name = label_spec["feature_name"]
    if is_pai:
        pai_table_parts = pai_explain_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        stream = pai_maxcompute_db_generator(formatted_pai_table,
                                             feature_column_names,
                                             label_column_name, feature_specs)
    else:
        conn = connect_with_data_source(datasource)
        stream = db_generator(conn.driver, conn, select, feature_column_names,
                              label_spec, feature_specs)
    xs = pd.DataFrame(columns=feature_column_names)
    i = 0
    for row in stream():
        xs.loc[i] = [item[0] for item in row[0]]
        i += 1
    return xs