def save_to_oss(self, oss_model_dir, local_dir=None):
    """
    Archive all the files under local_dir into a tarball and save it,
    together with the model metadata, into the OSS model directory.

    Args:
        oss_model_dir (str): the OSS model directory to save to. It is
            in the format of oss://bucket/path/to/dir/.
        local_dir (str): the local directory to save.

    Returns:
        None.
    """
    if local_dir is None:
        local_dir = os.getcwd()

    with temp_file.TemporaryDirectory() as tmp_dir:
        tarball = os.path.join(tmp_dir, TARBALL_NAME)
        self._zip(local_dir, tarball)
        oss.save_file(oss_model_dir, tarball, TARBALL_NAME)

    with temp_file.TemporaryDirectory() as tmp_dir:
        model_obj_file = os.path.join(tmp_dir, MODEL_OBJ_FILE_NAME)
        with open(model_obj_file, "w") as f:
            f.write(
                json.dumps(self._to_dict(),
                           cls=JSONEncoderWithFeatureColumn))
        oss.save_file(oss_model_dir, model_obj_file, MODEL_OBJ_FILE_NAME)

def test_train(self):
    ds = testing.get_datasource()
    original_sql = """SELECT * FROM iris.train
    TO TRAIN xgboost.gbtree
    WITH objective="multi:softmax", num_boost_round=20, num_class=3,
         validation.select="SELECT * FROM iris.test"
    INTO iris.xgboost_train_model_test;
    """
    select = "SELECT * FROM iris.train"
    val_select = "SELECT * FROM iris.test"
    train_params = {
        "num_boost_round": 20,
    }
    model_params = {"num_class": 3, "objective": "multi:softmax"}

    with temp_file.TemporaryDirectory(as_cwd=True):
        eval_result = train(ds, original_sql, select, val_select,
                            "xgboost.gbtree", "", None,
                            NumericColumn(FieldDesc(name="class")),
                            model_params, train_params, None,
                            "iris.xgboost_train_model_test", None)
        self.assertLess(eval_result['train']['merror'][-1], 0.01)
        self.assertLess(eval_result['validate']['merror'][-1], 0.01)

    with temp_file.TemporaryDirectory(as_cwd=True):
        pred_original_sql = """SELECT * FROM iris.test
        TO PREDICT iris.xgboost_pred_result.pred_val
        USING iris.xgboost_train_model_test;"""
        pred(ds, pred_original_sql, "SELECT * FROM iris.test",
             "iris.xgboost_train_model_test", "pred_val", model_params,
             "iris.xgboost_pred_result")

    with temp_file.TemporaryDirectory(as_cwd=True):
        explain_original_sql = """SELECT * FROM iris.test
        TO EXPLAIN iris.xgboost_train_model_test
        INTO iris.xgboost_explain_result;"""
        explain(ds, explain_original_sql, "SELECT * FROM iris.test",
                "iris.xgboost_train_model_test", model_params,
                "iris.xgboost_explain_result")

    with temp_file.TemporaryDirectory(as_cwd=True):
        evaluate_original_sql = """SELECT * FROM iris.test
        TO EVALUATE iris.xgboost_train_model_test
        WITH label_col=class
        INTO iris.xgboost_evaluate_result;"""
        evaluate(ds, evaluate_original_sql, "SELECT * FROM iris.test",
                 "class", "iris.xgboost_train_model_test", model_params,
                 "iris.xgboost_evaluate_result")

def test_save_load_db(self):
    table = "sqlflow_models.test_model"
    meta = {"model_params": {"n_classes": 3}}
    m = Model(EstimatorType.XGBOOST, meta)
    datasource = get_datasource()

    # save model
    with temp_file.TemporaryDirectory() as d:
        m.save_to_db(datasource, table, d)

    # load model
    with temp_file.TemporaryDirectory() as d:
        m = Model.load_from_db(datasource, table, d)
        self.assertEqual(m._meta, meta)

def load_from_oss(oss_model_dir, local_dir=None):
    """
    Load the saved model from OSS and unzip it into local_dir.

    Args:
        oss_model_dir (str): the OSS model directory to load from. It is
            in the format of oss://bucket/path/to/dir/.
        local_dir (str): the local directory to load into.

    Returns:
        Model: a Model object representing the model type and meta
        information.
    """
    if local_dir is None:
        local_dir = os.getcwd()

    with temp_file.TemporaryDirectory() as tmp_dir:
        tarball = os.path.join(tmp_dir, TARBALL_NAME)
        oss.load_file(oss_model_dir, tarball, TARBALL_NAME)
        Model._unzip(local_dir, tarball)

        model_obj_file = os.path.join(tmp_dir, MODEL_OBJ_FILE_NAME)
        oss.load_file(oss_model_dir, model_obj_file, MODEL_OBJ_FILE_NAME)
        with open(model_obj_file, "r") as f:
            d = json.loads(f.read(), cls=JSONDecoderWithFeatureColumn)

    model = Model._from_dict(d)
    return model

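# A hedged usage sketch of the save_to_oss/load_from_oss round trip above.
# The bucket path is a placeholder and `model` stands for an already-trained
# Model instance, so real OSS credentials are needed to actually run this;
# treat it as illustration, not the project's own test code.
def _oss_round_trip_sketch(model):
    oss_dir = "oss://my-bucket/models/iris/"  # placeholder OSS directory

    model.save_to_oss(oss_dir)          # archive cwd, upload tarball + meta
    loaded = Model.load_from_oss(oss_dir)  # download, unzip, rebuild Model
    assert loaded._to_dict() == model._to_dict()
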
def test_tar(self):
    with temp_file.TemporaryDirectory(as_cwd=True):
        # create the test file tree:
        #
        # |-sqlflow_tar
        #   |-sqlflow_sub_dir
        #     |-hello.py
        test_dir = "sqlflow_tar"
        test_sub_dir = "sqlflow_sub_dir"
        test_py_file = "hello.py"
        test_py_content = "print('hello SQLFlow!')"

        fullpath = os.path.join(test_dir, test_sub_dir)
        os.makedirs(fullpath)
        with open(os.path.join(fullpath, test_py_file), "w") as f:
            f.write(test_py_content)

        zip_dir(fullpath, "sqlflow.tar.gz")
        unzip_dir("sqlflow.tar.gz", "output")

        self.assertTrue(
            os.path.isdir("output/sqlflow_tar/sqlflow_sub_dir"))
        self.assertTrue(
            os.path.isfile("output/sqlflow_tar/sqlflow_sub_dir/hello.py"))
        # verify the content of the extracted file, not the original one
        extracted = os.path.join("output", fullpath, test_py_file)
        with open(extracted, "r") as f:
            self.assertEqual(f.read(), test_py_content)

def load_from_db(datasource, table, local_dir=None):
    """
    Load the saved model from the DBMS and unzip it into local_dir.

    Args:
        datasource (str): the connection string to the DBMS.
        table (str): the table name where the model is saved.
        local_dir (str): the local directory to load into.

    Returns:
        Model: a Model object representing the model type and meta
        information.
    """
    if local_dir is None:
        local_dir = os.getcwd()

    model_zoo_addr, table, tag = _decompose_model_name(table)
    if model_zoo_addr:
        gen, metadata = load_model_from_model_zoo(model_zoo_addr, table,
                                                  tag)
    else:
        gen, metadata = read_with_generator_and_metadata(datasource, table)

    with temp_file.TemporaryDirectory() as tmp_dir:
        tarball = os.path.join(tmp_dir, TARBALL_NAME)
        with open(tarball, "wb") as f:
            for data in gen():
                f.write(bytes(data))
        Model._unzip(local_dir, tarball, load_from_db=True)

    return Model._from_dict(metadata)

def save_to_db(self, datasource, table, local_dir=None):
    """
    Archive all the files under local_dir into a tarball and save it
    into the DBMS with the specified table name.

    Args:
        datasource (str): the connection string to the DBMS.
        table (str): the table name to save to.
        local_dir (str): the local directory to save.

    Returns:
        None.
    """
    if local_dir is None:
        local_dir = os.getcwd()

    with temp_file.TemporaryDirectory() as tmp_dir:
        tarball = os.path.join(tmp_dir, TARBALL_NAME)
        self._zip(local_dir, tarball)

        def _bytes_reader(filename, buf_size=8 * 32):
            def _gen():
                with open(filename, "rb") as f:
                    while True:
                        data = f.read(buf_size)
                        if data:
                            yield data
                        else:
                            break

            return _gen

        write_with_generator(datasource, table, _bytes_reader(tarball))

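# `_bytes_reader` above returns a generator *factory* rather than a
# generator, so the DB writer can restart the stream and pull the tarball
# in fixed-size chunks instead of loading it into memory at once. A
# minimal, self-contained sketch of the same pattern (the temp file here
# is purely for illustration):
import os
import tempfile


def bytes_reader(filename, buf_size=8 * 32):
    # each call to the returned factory re-opens the file and yields it
    # in chunks of at most buf_size bytes
    def _gen():
        with open(filename, "rb") as f:
            while True:
                data = f.read(buf_size)
                if not data:
                    break
                yield data

    return _gen


# usage: stream a file without holding it in memory all at once
fd, path = tempfile.mkstemp()
with os.fdopen(fd, "wb") as f:
    f.write(b"x" * 1000)
total = sum(len(chunk) for chunk in bytes_reader(path)())
assert total == 1000
os.remove(path)
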
def print_oss_image(oss_dest, oss_ak, oss_sk, oss_endpoint,
                    oss_bucket_name):
    auth = oss2.Auth(oss_ak, oss_sk)
    bucket = oss2.Bucket(auth, oss_endpoint, oss_bucket_name)
    with temp_file.TemporaryDirectory(as_cwd=True):
        local_file_name = "summary.png"
        bucket.get_object_to_file(oss_dest, local_file_name)
        print_image_as_base64_html(local_file_name)

def shap_explain(booster, datasource, dataset, summary_params,
                 result_table):
    tree_explainer = shap.TreeExplainer(booster)
    shap_values = tree_explainer.shap_values(dataset)
    if result_table:
        conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): shap_values may be a list of shape
        # [3, num_samples, num_features]; we use the first dimension
        # here and should find out when to use the other two. When
        # shap_values is not a list it can be used directly.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values
        columns = list(dataset.columns)
        dtypes = [DataType.to_db_field_type(conn.driver, DataType.FLOAT32)
                  ] * len(columns)
        _create_table(conn, result_table, columns, dtypes)
        with db.buffered_db_writer(conn, result_table, columns) as w:
            for row in to_write:
                w.write(list(row))
        conn.close()

    if summary_params.get("plot_type") == "decision":
        shap_interaction_values = tree_explainer.shap_interaction_values(
            dataset)
        expected_value = tree_explainer.expected_value
        if isinstance(shap_interaction_values, list):
            shap_interaction_values = shap_interaction_values[0]
        if isinstance(expected_value, list):
            expected_value = expected_value[0]

        plot_func = lambda: shap.decision_plot(  # noqa: E731
            expected_value,
            shap_interaction_values,
            dataset,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1)
    else:
        plot_func = lambda: shap.summary_plot(  # noqa: E731
            shap_values, dataset, show=False, **summary_params)

    filename = 'summary.png'
    with temp_file.TemporaryDirectory(as_cwd=True):
        explainer.plot_and_save(plot_func, filename=filename)
        with open(filename, 'rb') as f:
            img = f.read()
        img = base64.b64encode(img)
        if six.PY3:
            img = img.decode('utf-8')
        img = "<div align='center'><img src='data:image/png;base64,%s' /></div>" \
            % img
        print(img)

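# The final block above uses the standard trick for shipping an image
# through a text-only channel: read the PNG bytes, base64-encode them, and
# wrap them in a data URI inside an <img> tag. A minimal sketch of that
# round trip (the helper name is ours, not part of the codebase):
import base64


def image_to_html(filename):
    # raw bytes -> base64 string -> data URI the browser can render inline
    with open(filename, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")
    return ("<div align='center'>"
            "<img src='data:image/png;base64,%s' /></div>") % encoded


# usage: print(image_to_html("summary.png"))
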
def save_to_db(self, datasource, table, local_dir=None,
               oss_model_dir=None):
    """
    Archive all the files under local_dir into a tarball and save it
    into the DBMS with the specified table name.

    Args:
        datasource (str): the connection string to the DBMS.
        table (str): the table name to save to.
        local_dir (str): the local directory to save.
        oss_model_dir (str): if set, download the model files from this
            OSS directory into local_dir before archiving.

    Returns:
        str: the fully qualified table name the model was saved to.
    """
    if local_dir is None:
        local_dir = os.getcwd()

    conn = connect_with_data_source(datasource)
    if oss_model_dir:
        cur_dir = os.getcwd()
        os.chdir(local_dir)
        oss.load_dir(oss_model_dir)
        os.chdir(cur_dir)

    if "." not in table:
        project_name = conn.param("database")
        table = project_name + "." + table

    with temp_file.TemporaryDirectory() as tmp_dir:
        tarball = os.path.join(tmp_dir, TARBALL_NAME)
        self._zip(local_dir, tarball)

        def _bytes_reader(filename, buf_size=8 * 32):
            def _gen():
                with open(filename, "rb") as f:
                    while True:
                        data = f.read(buf_size)
                        if data:
                            yield data
                        else:
                            break

            return _gen

        write_with_generator_and_metadata(datasource, table,
                                          _bytes_reader(tarball),
                                          self._to_dict())

    conn.persist_table(table)
    conn.close()
    return table

def save_to_oss(self, oss_model_dir, local_dir=None):
    """
    Archive all the files under local_dir into a tarball and save it
    into the OSS model directory.

    Args:
        oss_model_dir (str): the OSS model directory to save to. It is
            in the format of oss://bucket/path/to/dir/.
        local_dir (str): the local directory to save.

    Returns:
        None.
    """
    if local_dir is None:
        local_dir = os.getcwd()

    with temp_file.TemporaryDirectory() as tmp_dir:
        tarball = os.path.join(tmp_dir, TARBALL_NAME)
        self._zip(local_dir, tarball)
        oss.save_file(oss_model_dir, tarball, TARBALL_NAME)

def load_from_oss(oss_model_dir, local_dir=None):
    """
    Load the saved model from OSS and unzip it into local_dir.

    Args:
        oss_model_dir (str): the OSS model directory to load from. It is
            in the format of oss://bucket/path/to/dir/.
        local_dir (str): the local directory to load into.

    Returns:
        Model: a Model object representing the model type and meta
        information.
    """
    if local_dir is None:
        local_dir = os.getcwd()

    with temp_file.TemporaryDirectory() as tmp_dir:
        tarball = os.path.join(tmp_dir, TARBALL_NAME)
        oss.load_file(oss_model_dir, tarball, TARBALL_NAME)
        return Model._unzip(local_dir, tarball)

def load_from_oss():
    with temp_file.TemporaryDirectory() as d:
        return Model.load_from_oss(oss_model_path, d)

def submit_pai_explain(datasource,
                       original_sql,
                       select,
                       model,
                       model_params,
                       result_table,
                       explainer="TreeExplainer",
                       user=""):
    """Pack the needed params and resources into a tarball and submit an
    explain task to PAI.

    Args:
        datasource: string
            Like: maxcompute://ak:sk@domain.com/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO EXPLAIN" statement.
        select: string
            SQL statement to get the explain data set.
        model: string
            Model to load and explain.
        model_params: dict
            Params from the WITH clause.
        result_table: string
            The table name to save the explain result.
        explainer: string
            The explainer to use, e.g. "TreeExplainer".
        user: string
            A string to identify the user, used to load the model from
            the user's directory.
    """
    params = dict(locals())

    # format resultTable name to "db.table" to let the codegen form a
    # submitting argument of format "odps://project/tables/table_name"
    project = table_ops.get_project(datasource)
    if result_table:
        if result_table.count(".") == 0:
            result_table = "%s.%s" % (project, result_table)
        params["result_table"] = result_table

    # used to save the explain image
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    params["oss_dest"] = "explain_images/%s/%s" % (user, timestamp)
    add_env_to_params(params, "SQLFLOW_OSS_AK", "oss_ak")
    add_env_to_params(params, "SQLFLOW_OSS_SK", "oss_sk")
    add_env_to_params(params, "SQLFLOW_OSS_ALISA_ENDPOINT", "oss_endpoint")
    add_env_to_params(params, "SQLFLOW_OSS_ALISA_BUCKET", "oss_bucket_name")

    meta = Model.load_metadata_from_db(datasource, model)
    model_type = meta.get_type()
    estimator = meta.get_meta("class_name")
    label_name = model_params.get("label_col")
    if label_name is None:
        label_column = meta.get_meta("label")
        if label_column is not None:
            label_name = label_column.get_field_desc()[0].name

    setup_explain_entry(params, model_type)

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    # TODO(typhoonzero): Do **NOT** create tmp table when the select
    # statement is like: "SELECT fields,... FROM table"
    with table_ops.create_tmp_tables_guard(select,
                                           datasource) as data_table:
        params["pai_table"] = data_table

        # create the explain result table
        if result_table:
            conn = db.connect_with_data_source(datasource)
            feature_columns = meta.get_meta("features")
            estimator_string = meta.get_meta("class_name")
            field_descs = get_ordered_field_descs(feature_columns)
            feature_column_names = [fd.name for fd in field_descs]
            create_explain_table(conn, meta.get_type(), explainer,
                                 estimator_string, result_table,
                                 feature_column_names)
            conn.close()

        if not try_pai_local_run(params, oss_model_path):
            with temp_file.TemporaryDirectory(prefix="sqlflow",
                                              dir="/tmp") as cwd:
                prepare_archive(cwd, estimator, oss_model_path, params)
                cmd = get_pai_explain_cmd(
                    datasource, project, oss_model_path, model, data_table,
                    result_table, model_type, model_params,
                    "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                    "file://" + os.path.join(cwd, PARAMS_FILE), label_name)
                submit_pai_task(cmd, datasource)

    if result_table:
        print('Saved result into: {}'.format(result_table))
    else:
        print_oss_image(params["oss_dest"], params["oss_ak"],
                        params["oss_sk"], params["oss_endpoint"],
                        params["oss_bucket_name"])

def test_main(self):
    ds = testing.get_datasource()
    original_sql = """SELECT * FROM iris.train
    TO TRAIN xgboost.gbtree
    WITH objective="multi:softprob", num_boost_round=20, num_class=3,
         validation.select="SELECT * FROM iris.test"
    LABEL class
    INTO iris.xgboost_train_model_test;
    """
    select = "SELECT * FROM iris.train"
    val_select = "SELECT * FROM iris.test"
    train_params = {"num_boost_round": 20}
    model_params = {"num_class": 3, "objective": "multi:softprob"}
    save_name = "iris.xgboost_train_model_test"
    class_name = "class"

    with temp_file.TemporaryDirectory(as_cwd=True):
        eval_result = train(datasource=ds,
                            original_sql=original_sql,
                            select=select,
                            validation_select=val_select,
                            estimator_string="xgboost.gbtree",
                            model_image="sqlflow:step",
                            feature_column_map=None,
                            label_column=NumericColumn(
                                FieldDesc(name=class_name)),
                            model_params=model_params,
                            train_params=train_params,
                            validation_params=None,
                            save=save_name,
                            load=None)
        self.assertLess(eval_result['train']['merror'][-1], 0.01)
        self.assertLess(eval_result['validate']['merror'][-1], 0.01)

    conn = db.connect_with_data_source(ds)

    pred_select = "SELECT * FROM iris.test"
    with temp_file.TemporaryDirectory(as_cwd=True):
        result_column_names, train_label_idx = create_predict_table(
            conn, select, "iris.predict_result_table",
            FieldDesc(name=class_name), "class")
        predict(ds, pred_select, "iris.predict_result_table",
                result_column_names, train_label_idx, save_name)

    self.assertEqual(
        self.get_table_row_count(conn, "iris.test"),
        self.get_table_row_count(conn, "iris.predict_result_table"))

    schema1 = self.get_table_schema(conn, "iris.test")
    schema2 = self.get_table_schema(conn, "iris.predict_result_table")
    self.assertEqual(len(schema1), len(schema2))
    for name in schema1:
        if name == 'class':
            self.assertEqual(schema2[name], "BIGINT")
            continue
        self.assertTrue(name in schema2)
        self.assertEqual(schema1[name], schema2[name])

    diff_schema = schema2.keys() - schema1.keys()
    self.assertEqual(len(diff_schema), 0)

    with temp_file.TemporaryDirectory(as_cwd=True):
        result_column_names = create_evaluate_table(
            conn, "iris.evaluate_result_table", ["accuracy_score"])
        evaluate(ds,
                 pred_select,
                 "iris.evaluate_result_table",
                 save_name,
                 label_name='class',
                 model_params={'validation.metrics': 'accuracy_score'},
                 result_column_names=result_column_names)

    eval_schema = self.get_table_schema(conn, "iris.evaluate_result_table")
    self.assertEqual(eval_schema.keys(), set(['loss', 'accuracy_score']))

    with temp_file.TemporaryDirectory(as_cwd=True):
        feature_column_names = [
            "petal_width", "petal_length", "sepal_width", "sepal_length"
        ]
        create_explain_table(conn, EstimatorType.XGBOOST, "TreeExplainer",
                             "xgboost.gbtree", "iris.explain_result_table",
                             feature_column_names)
        explain(ds, select, "TreeExplainer", {"plot_type": "decision"},
                "iris.explain_result_table", save_name)

    explain_schema = self.get_table_schema(conn,
                                           "iris.explain_result_table")
    self.assertEqual(explain_schema.keys(), set(feature_column_names))

    with temp_file.TemporaryDirectory(as_cwd=True):
        create_explain_table(conn, EstimatorType.XGBOOST,
                             "XGBoostExplainer", "xgboost.gbtree",
                             "iris.explain_result_table_2",
                             feature_column_names)
        explain(ds, select, "XGBoostExplainer", {},
                "iris.explain_result_table_2", save_name)

    explain_schema = self.get_table_schema(conn,
                                           "iris.explain_result_table_2")
    self.assertEqual(explain_schema.keys(),
                     set(['feature', 'fscore', 'gain']))

    conn.close()

def pred(datasource, select, result_table, pred_label_name, model):
    """
    Do prediction using a trained model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        pred_label_name (str): the output label name to predict.
        model (Model|str): the model object or where to load the model.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the
    # process of dumping the original data into the DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, pred_label_name)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=None,
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            raw_data_dir=raw_data_dir)  # NOTE: default to use external memory

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            feature_file_name = os.path.join(
                tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_predict_result(preds, result_table, result_column_names,
                                  train_label_idx, feature_file_name, conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()

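# The "external memory" note above refers to XGBoost's out-of-core mode. A
# minimal sketch of that mechanism, assuming an XGBoost 1.x build where
# LIBSVM is the default text input format: appending "#cachefile" to the
# data path makes the DMatrix page data through an on-disk cache instead of
# holding everything in RAM (the tiny file below is just for illustration).
import xgboost as xgb

with open("tiny.libsvm", "w") as f:
    f.write("1 0:0.5 1:1.2\n0 0:0.1 1:0.4\n")

# the part after '#' names the on-disk cache prefix, not an input file
dtrain = xgb.DMatrix("tiny.libsvm#tiny.cache")
print(dtrain.num_row(), dtrain.num_col())
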
def submit_pai_evaluate(datasource,
                        original_sql,
                        select,
                        label_name,
                        model,
                        model_params,
                        result_table,
                        user=""):
    """Submit a PAI evaluation task.

    Args:
        datasource: string
            Like: maxcompute://ak:sk@domain.com/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO EVALUATE" statement.
        select: string
            SQL statement to get the evaluation data set.
        label_name: string
            The label name to evaluate.
        model: string
            Model to load and evaluate.
        model_params: dict
            Params from the WITH clause.
        result_table: string
            The table name to save the evaluation result.
        user: string
            A string to identify the user, used to load the model from
            the user's directory.
    """
    params = dict(locals())

    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)
    params["result_table"] = result_table

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    model_type, estimator = pai_model.get_saved_model_type_and_estimator(
        datasource, model)
    if model_type == EstimatorType.PAIML:
        raise SQLFlowDiagnostic("PAI model evaluation is not supported yet.")

    if model_type == EstimatorType.XGBOOST:
        params["entry_type"] = "evaluate_xgb"
        validation_metrics = model_params.get("validation.metrics",
                                              "accuracy_score")
    else:
        params["entry_type"] = "evaluate_tf"
        validation_metrics = model_params.get("validation.metrics",
                                              "Accuracy")

    validation_metrics = [m.strip() for m in validation_metrics.split(",")]
    with db.connect_with_data_source(datasource) as conn:
        result_column_names = create_evaluate_table(conn, result_table,
                                                    validation_metrics)

    with table_ops.create_tmp_tables_guard(select,
                                           datasource) as data_table:
        params["pai_table"] = data_table
        params["result_column_names"] = result_column_names

        if try_pai_local_run(params, oss_model_path):
            return

        conf = cluster_conf.get_cluster_config(model_params)
        with temp_file.TemporaryDirectory(prefix="sqlflow",
                                          dir="/tmp") as cwd:
            prepare_archive(cwd, estimator, oss_model_path, params)
            cmd = get_pai_tf_cmd(
                conf, "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                "file://" + os.path.join(cwd, PARAMS_FILE), ENTRY_FILE,
                model, oss_model_path, data_table, "", result_table,
                project)
            submit_pai_task(cmd, datasource)

def predict(datasource,
            select,
            result_table,
            result_column_names,
            train_label_idx,
            model,
            extra_result_cols=[],
            pai_table=None):
    """Do prediction using a trained XGBoost model, either locally or on
    PAI."""
    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        # NOTE(typhoonzero): must run Model.load_from_db in a temp
        # directory; calling pyodps in the current directory on PAI
        # workers would cause paiio to fail.
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    model_params = model.get_meta("attributes")
    fc_map_ir = model.get_meta("features")
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    is_pai = True if pai_table else False
    if is_pai:
        conn = PaiIOConnection.from_table(pai_table)
    else:
        conn = db.connect_with_data_source(datasource)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(datasource=datasource,
                            fn=pred_fn,
                            dataset_sql=select,
                            feature_metas=feature_metas,
                            feature_column_names=feature_column_names,
                            label_meta=None,
                            cache=True,
                            batch_size=10000,
                            transform_fn=transform_fn,
                            raw_data_dir=raw_data_dir,
                            is_pai=is_pai,
                            pai_table=pai_table,
                            pai_single_file=True,
                            feature_column_code=fc_map_ir)

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            if is_pai:
                feature_file_name = os.path.join(tmp_dir_name,
                                                 "predict.txt.raw")
            else:
                feature_file_name = os.path.join(
                    tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_predict_result(preds, result_table, result_column_names,
                                  train_label_idx, feature_file_name, conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()

def submit_pai_predict(datasource,
                       original_sql,
                       select,
                       model,
                       label_name,
                       pred_params,
                       result_table,
                       user=""):
    """Pack the needed params and resources into a tarball and submit a
    prediction task to PAI.

    Args:
        datasource: string
            Like: maxcompute://ak:sk@domain.com/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO PREDICT" statement.
        select: string
            SQL statement to get the prediction data set.
        model: string
            Model to load and do prediction.
        label_name: string
            Name of the label column, if it does not exist in select.
        pred_params: dict
            Params for prediction, corresponding to the WITH clause.
        result_table: string
            The table name to save the prediction result.
        user: string
            A string to identify the user, used to load the model from
            the user's directory.
    """
    params = dict(locals())

    # format resultTable name to "db.table" to let the codegen form a
    # submitting argument of format "odps://project/tables/table_name"
    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)

    model_metas = Model.load_metadata_from_db(datasource, model)
    model_type = model_metas.get_type()
    estimator = model_metas.get_meta("class_name")
    setup_predict_entry(params, model_type)

    train_label = model_metas.get_meta("label")
    if train_label is not None:
        train_label_desc = train_label.get_field_desc()[0]
    else:
        train_label_desc = None

    if pred_params is None:
        extra_result_cols = []
    else:
        extra_result_cols = pred_params.get("predict.extra_outputs", "")
        extra_result_cols = [
            c.strip() for c in extra_result_cols.split(",") if c.strip()
        ]

    with db.connect_with_data_source(datasource) as conn:
        result_column_names, train_label_idx = create_predict_table(
            conn, select, result_table, train_label_desc, label_name,
            extra_result_cols)

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    # TODO(typhoonzero): Do **NOT** create tmp table when the select
    # statement is like: "SELECT fields,... FROM table"
    with table_ops.create_tmp_tables_guard(select,
                                           datasource) as data_table:
        del params["label_name"]
        params["pai_table"] = data_table
        params["result_column_names"] = result_column_names
        params["train_label_idx"] = train_label_idx
        params["extra_result_cols"] = extra_result_cols

        if try_pai_local_run(params, oss_model_path):
            return

        with temp_file.TemporaryDirectory(prefix="sqlflow",
                                          dir="/tmp") as cwd:
            prepare_archive(cwd, estimator, oss_model_path, params)
            cmd = get_pai_predict_cmd(
                datasource, project, oss_model_path, model, data_table,
                result_table, model_type, pred_params,
                "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                "file://" + os.path.join(cwd, PARAMS_FILE))
            submit_pai_task(cmd, datasource)

def save_to_oss():
    with temp_file.TemporaryDirectory() as d:
        m.save_to_oss(oss_model_path, d)

def evaluate(datasource,
             select,
             result_table,
             model,
             pred_label_name=None,
             model_params=None):
    """
    Evaluate a trained XGBoost model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to evaluate.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.
        pred_label_name (str): the label column name.
        model_params (dict): the parameters for evaluation.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    if model_params is None:
        model_params = {}
    validation_metrics = model_params.get("validation.metrics", "Accuracy")
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    if pred_label_name:
        train_label_desc.name = pred_label_name

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the
    # process of dumping the original data into the DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names = create_evaluate_table(conn, result_table,
                                                validation_metrics)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=train_label_desc.to_dict(dtype_to_string=True),
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn)

        for i, pred_dmatrix in enumerate(dpred):
            feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name,
                                   train_label_desc, result_table,
                                   result_column_names, validation_metrics,
                                   conn)

    conn.close()

def local_train(original_sql,
                model_image,
                estimator_string,
                datasource,
                select,
                validation_select,
                model_params,
                train_params,
                feature_metas,
                feature_column_names,
                feature_column_map,
                label_column,
                transform_fn,
                save,
                load="",
                is_pai=False,
                oss_model_dir=""):
    disk_cache = train_params.pop("disk_cache", False)
    batch_size = train_params.pop("batch_size", None)
    if batch_size is not None and batch_size < 0:
        batch_size = None
    epoch = train_params.pop("epoch", 1)
    num_workers = train_params.pop("num_workers", 1)
    label_meta_dict = label_column.get_field_desc()[0].to_dict(
        dtype_to_string=True)

    def build_dataset(fn, slct):
        return xgb_dataset(datasource,
                           fn,
                           slct,
                           feature_metas,
                           feature_column_names,
                           label_meta_dict,
                           cache=disk_cache,
                           batch_size=batch_size,
                           epoch=epoch,
                           transform_fn=transform_fn)

    file_name = "my_model"
    if load:
        Model.load_from_db(datasource, load)
        bst = xgb.Booster()
        bst.load_model(file_name)
    else:
        bst = None

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        train_fn = os.path.join(tmp_dir_name, 'train.txt')
        val_fn = os.path.join(tmp_dir_name, 'val.txt')
        train_dataset = build_dataset(train_fn, select)
        if validation_select:
            val_dataset = build_dataset(val_fn, validation_select)
        else:
            val_dataset = None

        eval_result = dict()
        watchlist = [None]
        if val_dataset:
            # The `xgboost.train` API only accepts the XGBoost DMatrix
            # object as the training or validation dataset, so we should
            # convert the generator to DMatrix.
            if isinstance(val_dataset, types.GeneratorType):
                val_dataset = list(val_dataset)[0]
            watchlist.append((val_dataset, "validate"))

        for per_batch_dmatrix in train_dataset:
            watchlist[0] = (per_batch_dmatrix, "train")
            bst = xgb.train(model_params,
                            per_batch_dmatrix,
                            evals=watchlist,
                            evals_result=eval_result,
                            xgb_model=bst,
                            **train_params)
            print("Evaluation result: %s" % eval_result)

    meta = collect_metadata(original_sql=original_sql,
                            select=select,
                            validation_select=validation_select,
                            model_repo_image=model_image,
                            class_name=estimator_string,
                            attributes=model_params,
                            features=feature_column_map,
                            label=label_column,
                            evaluation=eval_result,
                            num_workers=num_workers)

    save_model_to_local_file(bst, model_params, file_name)
    model = Model(EstimatorType.XGBOOST, meta)
    model.save_to_db(datasource, save)

    if is_pai and len(oss_model_dir) > 0:
        # TODO(typhoonzero): remove this since we are saving metas into
        # the db now.
        save_model(oss_model_dir, "my_model", model_params, train_params,
                   feature_metas, feature_column_names, label_meta_dict,
                   feature_column_map)

    return eval_result

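# The training loop above is incremental: each per-batch DMatrix continues
# boosting from the booster produced by the previous call, passed back in
# through xgb.train's xgb_model argument. A self-contained sketch of that
# pattern on synthetic data (shapes and parameters here are arbitrary):
import numpy as np
import xgboost as xgb

rng = np.random.RandomState(0)
params = {"objective": "binary:logistic", "max_depth": 3}

bst = None
for _ in range(3):  # three "batches" standing in for per-batch DMatrices
    X = rng.rand(100, 4)
    y = (X[:, 0] > 0.5).astype(int)
    batch = xgb.DMatrix(X, label=y)
    # xgb_model=bst (None on the first pass) continues prior training
    bst = xgb.train(params, batch, num_boost_round=5, xgb_model=bst)

print("boosted trees:", len(bst.get_dump()))  # 3 batches * 5 rounds = 15
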
def test_main(self):
    ds = testing.get_datasource()
    original_sql = """SELECT * FROM iris.train
    TO TRAIN xgboost.gbtree
    WITH objective="multi:softmax", num_boost_round=20, num_class=3,
         validation.select="SELECT * FROM iris.test"
    INTO iris.xgboost_train_model_test;
    """
    select = "SELECT * FROM iris.train"
    val_select = "SELECT * FROM iris.test"
    train_params = {"num_boost_round": 20}
    model_params = {"num_class": 3, "objective": "multi:softmax"}
    save_name = "iris.xgboost_train_model_test"
    class_name = "class"

    with temp_file.TemporaryDirectory(as_cwd=True):
        eval_result = train(original_sql=original_sql,
                            model_image="sqlflow:step",
                            estimator_string="xgboost.gbtree",
                            datasource=ds,
                            select=select,
                            validation_select=val_select,
                            model_params=model_params,
                            train_params=train_params,
                            feature_column_map=None,
                            label_column=NumericColumn(
                                FieldDesc(name=class_name)),
                            save=save_name)
        self.assertLess(eval_result['train']['merror'][-1], 0.01)
        self.assertLess(eval_result['validate']['merror'][-1], 0.01)

    conn = db.connect_with_data_source(ds)

    pred_select = "SELECT * FROM iris.test"
    pred(ds, pred_select, "iris.predict_result_table", class_name,
         save_name)

    self.assertEqual(
        self.get_table_row_count(conn, "iris.test"),
        self.get_table_row_count(conn, "iris.predict_result_table"))

    schema1 = self.get_table_schema(conn, "iris.test")
    schema2 = self.get_table_schema(conn, "iris.predict_result_table")
    self.assertEqual(len(schema1), len(schema2))
    for name in schema1:
        if name == 'class':
            self.assertEqual(schema2[name], "BIGINT")
            continue
        self.assertTrue(name in schema2)
        self.assertEqual(schema1[name], schema2[name])

    diff_schema = schema2.keys() - schema1.keys()
    self.assertEqual(len(diff_schema), 0)

    evaluate(ds, pred_select, "iris.evaluate_result_table", save_name,
             'class', ['accuracy_score'])

    eval_schema = self.get_table_schema(conn, "iris.evaluate_result_table")
    self.assertEqual(eval_schema.keys(), set(['loss', 'accuracy_score']))

def submit_pai_predict(datasource,
                       original_sql,
                       select,
                       model,
                       label_name,
                       model_params,
                       result_table,
                       user=""):
    """Pack the needed params and resources into a tarball and submit a
    prediction task to PAI.

    Args:
        datasource: string
            Like: maxcompute://ak:sk@domain.com/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO PREDICT" statement.
        select: string
            SQL statement to get the prediction data set.
        model: string
            Model to load and do prediction.
        label_name: string
            Name of the label column, if it does not exist in select.
        model_params: dict
            Params for prediction, corresponding to the WITH clause.
        result_table: string
            The table name to save the prediction result.
        user: string
            A string to identify the user, used to load the model from
            the user's directory.
    """
    params = dict(locals())

    # format resultTable name to "db.table" to let the codegen form a
    # submitting argument of format "odps://project/tables/table_name"
    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)

    model_type, estimator = \
        pai_model.get_saved_model_type_and_estimator(
            datasource, model)
    setup_predict_entry(params, model_type)

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    # TODO(typhoonzero): Do **NOT** create tmp table when the select
    # statement is like: "SELECT fields,... FROM table"
    with table_ops.create_tmp_tables_guard(select,
                                           datasource) as data_table:
        params["pai_table"] = data_table
        params["oss_model_path"] = oss_model_path
        params["model"] = ""

        if try_pai_local_run(params, oss_model_path):
            return

        with temp_file.TemporaryDirectory(prefix="sqlflow",
                                          dir="/tmp") as cwd:
            prepare_archive(cwd, estimator, oss_model_path, params)
            cmd = get_pai_predict_cmd(
                datasource, project, oss_model_path, model, data_table,
                result_table, model_type, model_params,
                "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                "file://" + os.path.join(cwd, PARAMS_FILE))
            submit_pai_task(cmd, datasource)

def shap_explain(booster, datasource, select, summary_params, result_table,
                 model):
    train_fc_map = model.get_meta("features")
    label_meta = model.get_meta("label").get_field_desc()[0].to_dict(
        dtype_to_string=True)

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the
    # process of dumping the original data into the DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    dataset = xgb_shap_dataset(datasource, select, feature_column_names,
                               label_meta, feature_metas, transform_fn)

    tree_explainer = shap.TreeExplainer(booster)
    shap_values = tree_explainer.shap_values(dataset)
    if result_table:
        conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): shap_values may be a list of shape
        # [3, num_samples, num_features]; we use the first dimension
        # here and should find out when to use the other two. When
        # shap_values is not a list it can be used directly.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values
        columns = list(dataset.columns)
        dtypes = [DataType.to_db_field_type(conn.driver, DataType.FLOAT32)
                  ] * len(columns)
        _create_table(conn, result_table, columns, dtypes)
        with db.buffered_db_writer(conn, result_table, columns) as w:
            for row in to_write:
                w.write(list(row))
        conn.close()

    if summary_params.get("plot_type") == "decision":
        shap_interaction_values = tree_explainer.shap_interaction_values(
            dataset)
        expected_value = tree_explainer.expected_value
        if isinstance(shap_interaction_values, list):
            shap_interaction_values = shap_interaction_values[0]
        if isinstance(expected_value, list):
            expected_value = expected_value[0]

        plot_func = lambda: shap.decision_plot(  # noqa: E731
            expected_value,
            shap_interaction_values,
            dataset,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1)
    else:
        plot_func = lambda: shap.summary_plot(  # noqa: E731
            shap_values, dataset, show=False, **summary_params)

    filename = 'summary.png'
    with temp_file.TemporaryDirectory(as_cwd=True):
        explainer.plot_and_save(plot_func, filename=filename)
        with open(filename, 'rb') as f:
            img = f.read()
        img = base64.b64encode(img)
        if six.PY3:
            img = img.decode('utf-8')
        img = "<div align='center'><img src='data:image/png;base64,%s' /></div>" \
            % img
        print(img)

def evaluate(datasource,
             select,
             result_table,
             model,
             label_name=None,
             model_params=None,
             result_column_names=[],
             pai_table=None):
    """Evaluate a trained XGBoost model, either locally or on PAI."""
    if model_params is None:
        model_params = {}
    validation_metrics = model_params.get("validation.metrics",
                                          "accuracy_score")
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    model_params = model.get_meta("attributes")
    fc_map_ir = model.get_meta("features")
    train_label = model.get_meta("label")
    train_label_desc = train_label.get_field_desc()[0]
    if label_name:
        train_label_desc.name = label_name

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    is_pai = True if pai_table else False
    if is_pai:
        conn = PaiIOConnection.from_table(pai_table)
    else:
        conn = db.connect_with_data_source(datasource)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=train_label_desc.to_dict(dtype_to_string=True),
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            is_pai=is_pai,
            pai_table=pai_table,
            pai_single_file=True,
            feature_column_code=fc_map_ir)

        for i, pred_dmatrix in enumerate(dpred):
            if is_pai:
                feature_file_name = pred_fn
            else:
                feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name,
                                   train_label_desc, result_table,
                                   result_column_names, validation_metrics,
                                   conn)

    conn.close()

def explain(datasource,
            select,
            explainer,
            model_params,
            result_table,
            model,
            pai_table="",
            oss_model_path="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None):
    """Explain a trained XGBoost model, either locally or on PAI."""
    if model_params is None:
        model_params = {}
    summary_params = dict()
    for k in model_params:
        if k.startswith("summary."):
            summary_key = k.replace("summary.", "")
            summary_params[summary_key] = model_params[k]

    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    fc_map_ir = model.get_meta("features")
    label_meta = model.get_meta("label").get_field_desc()[0].to_dict(
        dtype_to_string=True)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    is_pai = True if pai_table else False

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the
    # process of dumping the original data into the DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(fc_map_ir,
                                             EstimatorType.XGBOOST)
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    dataset = xgb_shap_dataset(datasource, select, feature_column_names,
                               label_meta, feature_metas, is_pai, pai_table,
                               transform_fn)

    if explainer == "XGBoostExplainer":
        xgb_native_explain(bst, datasource, result_table)
    else:
        # when explainer is "" or "TreeExplainer", use SHAP by default
        shap_explain(bst,
                     datasource,
                     dataset,
                     summary_params,
                     result_table,
                     is_pai=is_pai,
                     oss_dest=oss_dest,
                     oss_ak=oss_ak,
                     oss_sk=oss_sk,
                     oss_endpoint=oss_endpoint,
                     oss_bucket_name=oss_bucket_name)

def check_main_impl(self, estimator):
    if testing.get_driver() != "mysql":
        return

    ds = testing.get_datasource()
    original_sql = """SELECT * FROM iris.train
    TO TRAIN %s
    WITH model.hidden_units=[32,64], model.n_classes=3,
         validation.select="SELECT * FROM iris.test"
    LABEL class
    INTO iris.tensorflow_train_model_test;
    """ % estimator
    select = "SELECT * FROM iris.train"
    val_select = "SELECT * FROM iris.test"
    train_params = {"batch_size": 10}
    model_params = {"n_classes": 3, "hidden_units": [32, 64]}
    save_name = "iris.tensorflow_train_model_test"
    class_name = "class"

    with temp_file.TemporaryDirectory(as_cwd=True):
        train(original_sql=original_sql,
              model_image="sqlflow:step",
              estimator_string=estimator,
              datasource=ds,
              select=select,
              validation_select=val_select,
              model_params=model_params,
              train_params=train_params,
              validation_params=None,
              feature_column_map=None,
              label_column=NumericColumn(
                  FieldDesc(name=class_name, shape=[])),
              save=save_name,
              load=None)

    conn = db.connect_with_data_source(ds)

    pred_select = "SELECT * FROM iris.test"
    with temp_file.TemporaryDirectory(as_cwd=True):
        pred(ds, pred_select, "iris.predict_result_table", class_name,
             save_name)

    self.assertEqual(
        self.get_table_row_count(conn, "iris.test"),
        self.get_table_row_count(conn, "iris.predict_result_table"))

    schema1 = self.get_table_schema(conn, "iris.test")
    schema2 = self.get_table_schema(conn, "iris.predict_result_table")
    self.assertEqual(len(schema1), len(schema2))
    for name in schema1:
        if name == 'class':
            self.assertEqual(schema2[name], "BIGINT")
            continue
        self.assertTrue(name in schema2)
        self.assertEqual(schema1[name], schema2[name])

    diff_schema = schema2.keys() - schema1.keys()
    self.assertEqual(len(diff_schema), 0)

    with temp_file.TemporaryDirectory(as_cwd=True):
        evaluate(ds, select, "iris.evaluate_result_table", save_name,
                 class_name, {'validation.metrics': 'Accuracy'})

    eval_schema = self.get_table_schema(conn, "iris.evaluate_result_table")
    eval_schema = set([k.lower() for k in eval_schema.keys()])
    self.assertEqual(eval_schema, set(['loss', 'accuracy']))

    with temp_file.TemporaryDirectory(as_cwd=True):
        explain(ds, select, None, {"plot_type": "bar"},
                "iris.explain_result_table", save_name)

    explain_schema = self.get_table_schema(conn,
                                           "iris.explain_result_table")
    self.assertEqual(
        explain_schema.keys(),
        set(['petal_length', 'petal_width', 'sepal_length',
             'sepal_width']))

    conn.close()

def predict(datasource,
            select,
            result_table,
            label_name,
            model,
            pai_table="",
            oss_model_path=""):
    """PAI XGBoost prediction wrapper.

    This function does some preparation for local prediction, say,
    downloading the model from OSS, extracting metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        result_table: table to save the prediction result
        label_name: prediction label column
        model: the model object, or the table name to load the model from
        pai_table: tmp table which holds the data from select
        oss_model_path: the model path on OSS
    """
    is_pai = True if pai_table != "" else False
    if is_pai:
        # NOTE(typhoonzero): the xgboost model file "my_model" is hard
        # coded in xgboost/train.py
        oss.load_file(oss_model_path, "my_model")
        (estimator, model_params, train_params, feature_metas,
         feature_column_names, train_label_desc,
         fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")
    else:
        if isinstance(model, six.string_types):
            model = Model.load_from_db(datasource, model)
        else:
            assert isinstance(
                model, Model), "not supported model type %s" % type(model)

        model_params = model.get_meta("attributes")
        fc_map_ir = model.get_meta("features")
        train_label_desc = model.get_meta("label").get_field_desc()[0]

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, label_name)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=None,
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            raw_data_dir=raw_data_dir)  # NOTE: default to use external memory

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            feature_file_name = os.path.join(
                tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_predict_result(preds, result_table, result_column_names,
                                  train_label_idx, feature_file_name, conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()

def train(original_sql,
          model_image,
          estimator_string,
          datasource,
          select,
          validation_select,
          model_params,
          train_params,
          feature_column_map,
          label_column,
          save,
          load=None):
    """
    Train, evaluate and save the XGBoost model locally.

    Args:
        original_sql (str): the original SQL statement.
        model_image (str): the model repo docker image.
        estimator_string (str): the XGBoost booster type,
            e.g. xgboost.gbtree.
        datasource (str): the database connection URI.
        select (str): the SQL statement for training.
        validation_select (str): the SQL statement for evaluation.
        model_params (dict): the XGBoost model parameters.
        train_params (dict): the training parameters; may contain
            disk_cache (bool), batch_size (int) and epoch (int) settings.
        feature_column_map (dict): the feature column map to do
            derivation.
        label_column (FeatureColumn): the label column.
        save (str): the table name to save the trained model and meta.
        load (str): the table name to load the pretrained model.

    Returns:
        A dict which indicates the evaluation result.
    """
    conn = db.connect_with_data_source(datasource)
    fc_map_ir, fc_label_ir = infer_feature_columns(conn,
                                                   select,
                                                   feature_column_map,
                                                   label_column,
                                                   n=1000)
    fc_map = compile_ir_feature_columns(fc_map_ir, EstimatorType.XGBOOST)
    feature_column_list = fc_map["feature_columns"]

    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])
    label_meta = label_column.get_field_desc()[0].to_dict()

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the
    # process of dumping the original data into the DMatrix SVM file.
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *feature_column_list)

    disk_cache = False
    batch_size = None
    epoch = 1
    if "disk_cache" in train_params:
        disk_cache = train_params.pop("disk_cache")
    if "batch_size" in train_params:
        batch_size = train_params.pop("batch_size")
    if "epoch" in train_params:
        epoch = train_params.pop("epoch")

    def build_dataset(fn, slct):
        return xgb_dataset(datasource,
                           fn,
                           slct,
                           feature_metas,
                           feature_column_names,
                           label_meta,
                           cache=disk_cache,
                           batch_size=batch_size,
                           epoch=epoch,
                           transform_fn=transform_fn)

    file_name = "my_model"
    if load:
        Model.load_from_db(datasource, load)
        bst = xgb.Booster()
        bst.load_model(file_name)
    else:
        bst = None

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        train_fn = os.path.join(tmp_dir_name, 'train.txt')
        val_fn = os.path.join(tmp_dir_name, 'val.txt')
        train_dataset = build_dataset(train_fn, select)
        if validation_select:
            val_dataset = build_dataset(val_fn, validation_select)
        else:
            val_dataset = None

        eval_result = dict()
        watchlist = [None]
        if val_dataset:
            # The `xgboost.train` API only accepts the XGBoost DMatrix
            # object as the training or validation dataset, so we should
            # convert the generator to DMatrix.
            if isinstance(val_dataset, types.GeneratorType):
                val_dataset = list(val_dataset)[0]
            watchlist.append((val_dataset, "validate"))

        for per_batch_dmatrix in train_dataset:
            watchlist[0] = (per_batch_dmatrix, "train")
            bst = xgb.train(model_params,
                            per_batch_dmatrix,
                            evals=watchlist,
                            evals_result=eval_result,
                            xgb_model=bst,
                            **train_params)
            print("Evaluation result: %s" % eval_result)

    meta = collect_metadata(original_sql=original_sql,
                            select=select,
                            validation_select=validation_select,
                            model_repo_image=model_image,
                            class_name=estimator_string,
                            attributes=model_params,
                            features=fc_map_ir,
                            label=fc_label_ir,
                            evaluation=eval_result,
                            num_workers=1)

    save_model_to_local_file(bst, model_params, file_name)
    model = Model(EstimatorType.XGBOOST, meta)
    model.save_to_db(datasource, save)
    return eval_result