def submit_pai_explain(datasource, select, result_table, model_name,
                       model_params, user=""):
    """Pack the needed params and resources into a tarball and submit an
    explain task to PAI.

    Args:
        datasource: current datasource
        select: SQL statement to get the explain data set
        result_table: the table name to save the explain result
        model_name: model used to do the explanation
        model_params: dict, params for training, corresponding to the
            WITH clause
        user: a string to identify the user, used to load the model from
            the user's directory
    """
    params = dict(locals())
    cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp")
    # TODO(typhoonzero): Do **NOT** create tmp table when the select statement
    # is like: "SELECT fields,... FROM table"
    data_table = table_ops.create_tmp_table_from_select(select, datasource)
    params["data_table"] = data_table
    try:
        # format resultTable name to "db.table" to let the codegen form a
        # submitting argument of format "odps://project/tables/table_name"
        project = table_ops.get_project(datasource)
        if result_table.count(".") == 0:
            result_table = "%s.%s" % (project, result_table)
        oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                           model_name,
                                                           user=user)
        model_type, estimator = \
            pai_model.get_oss_saved_model_type_and_estimator(
                oss_model_path, project)
        params["oss_model_path"] = oss_model_path
        label_column = model_params.get("label_col")
        params["label_column"] = label_column
        create_explain_result_table(datasource, data_table, result_table,
                                    model_type, estimator, label_column)
        setup_explain_entry(params, model_type)
        prepare_archive(cwd, estimator, oss_model_path, params)
        cmd = get_pai_explain_cmd(
            datasource, project, oss_model_path, model_name, data_table,
            result_table, model_type, model_params,
            "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
            "file://" + os.path.join(cwd, PARAMS_FILE), label_column, cwd)
        submit_pai_task(cmd, datasource)
    finally:
        # Drop the temporary table even when archiving or submission
        # fails; the original code leaked it on any exception.
        table_ops.drop_tables([data_table], datasource)
def submit_pai_evaluate(datasource, model_name, select, result_table,
                        model_attrs, user=""):
    """Submit a PAI evaluation task.

    Args:
        datasource: current datasource
        model_name: model used to do the evaluation
        select: SQL statement to get the evaluate data set
        result_table: the table name to save the evaluation result
        model_attrs: dict, params for training, corresponding to the
            WITH clause
        user: a string to identify the user, used to load the model from
            the user's directory
    """
    params = dict(locals())
    cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp")
    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)
    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model_name,
                                                       user=user)
    params["oss_model_path"] = oss_model_path
    model_type, estimator = \
        pai_model.get_oss_saved_model_type_and_estimator(
            oss_model_path, project)
    if model_type == EstimatorType.PAIML:
        raise SQLFlowDiagnostic("PAI model evaluation is not supported yet.")
    data_table = table_ops.create_tmp_table_from_select(select, datasource)
    params["data_table"] = data_table
    try:
        metrics = get_evaluate_metrics(model_type, model_attrs)
        params["metrics"] = metrics
        create_evaluate_result_table(datasource, result_table, metrics)
        conf = cluster_conf.get_cluster_config(model_attrs)
        # XGBoost and TensorFlow models use different evaluation entries.
        if model_type == EstimatorType.XGBOOST:
            params["entry_type"] = "evaluate_xgb"
        else:
            params["entry_type"] = "evaluate_tf"
        prepare_archive(cwd, estimator, oss_model_path, params)
        cmd = get_pai_tf_cmd(conf,
                             "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                             "file://" + os.path.join(cwd, PARAMS_FILE),
                             ENTRY_FILE, model_name, oss_model_path,
                             data_table, "", result_table, project)
        submit_pai_task(cmd, datasource)
    finally:
        # Drop the temporary table even when archiving or submission
        # fails; the original code leaked it on any exception.
        table_ops.drop_tables([data_table], datasource)
def get_pai_train_cmd(datasource, estimator_string, model_name, train_table,
                      val_table, model_params, train_params, path_to_save,
                      job_file, params_file, cwd):
    """Build the command used to submit a PAI training task.

    Args:
        datasource: current datasource
        estimator_string: estimator name, Keras class name, or XGBoost
        model_name: the model name to train
        train_table: data table from which to load train data
        val_table: data table from which to load evaluate data
        model_params: params for training, corresponding to the WITH clause
        train_params: params for the training process
        path_to_save: path to save the model
        job_file: tar file including code and libs to execute on PAI
        params_file: extra params file
        cwd: current working dir

    Returns:
        The command to submit a PAI train task.
    """
    project = table_ops.get_project(datasource)
    conf = cluster_conf.get_cluster_config(model_params)
    # Dispatch on the (case-insensitive) estimator kind: the two PAI ML
    # built-ins get dedicated commands, everything else runs through the
    # generic TensorFlow entry.
    estimator_kind = estimator_string.lower()
    if estimator_kind == "randomforests":
        return get_train_random_forest_pai_cmd(
            model_name, train_table, model_params,
            train_params["feature_column_names"],
            train_params["label_meta"]["feature_name"])
    if estimator_kind == "kmeans":
        return get_train_kmeans_pai_cmd(datasource, model_name, train_table,
                                        model_params,
                                        train_params["feature_column_names"])
    return get_pai_tf_cmd(conf, job_file, params_file, ENTRY_FILE, model_name,
                          path_to_save, train_table, val_table, "", project)
def submit_pai_explain(datasource,
                       original_sql,
                       select,
                       model_name,
                       model_params,
                       result_table,
                       explainer="TreeExplainer",
                       user=""):
    """Pack the needed params and resources into a tarball and submit an
    explain task to PAI.

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
            curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO EXPLAIN" statement.
        select: string
            SQL statement to get the explain data set.
        model_name: string
            Model to load and explain.
        model_params: dict
            Params for training, corresponding to the WITH clause.
        result_table: string
            The table name to save the explain result.
        explainer: string
            The explainer to use, defaults to "TreeExplainer".
        user: string
            A string to identify the user, used to load model from the user's
            directory.
    """
    params = dict(locals())
    cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp")
    # TODO(typhoonzero): Do **NOT** create tmp table when the select statement
    # is like: "SELECT fields,... FROM table"
    data_table = table_ops.create_tmp_table_from_select(select, datasource)
    params["data_table"] = data_table
    params["explainer"] = explainer
    try:
        # format resultTable name to "db.table" to let the codegen form a
        # submitting argument of format "odps://project/tables/table_name"
        project = table_ops.get_project(datasource)
        if result_table.count(".") == 0:
            result_table = "%s.%s" % (project, result_table)
        params["result_table"] = result_table
        oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                           model_name,
                                                           user=user)
        params["oss_model_path"] = oss_model_path
        model_type, estimator = \
            pai_model.get_oss_saved_model_type_and_estimator(
                oss_model_path, project)
        params["load"] = model_name
        label_column = model_params.get("label_col")
        params["label_column"] = label_column
        create_explain_result_table(datasource, data_table, result_table,
                                    model_type, estimator, label_column)
        setup_explain_entry(params, model_type)
        prepare_archive(cwd, estimator, oss_model_path, params)
        cmd = get_pai_explain_cmd(
            datasource, project, oss_model_path, model_name, data_table,
            result_table, model_type, model_params,
            "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
            "file://" + os.path.join(cwd, PARAMS_FILE), label_column, cwd)
        submit_pai_task(cmd, datasource)
    finally:
        # Drop the temporary table even when archiving or submission
        # fails; the original code leaked it on any exception.
        table_ops.drop_tables([data_table], datasource)
def submit_pai_evaluate(datasource,
                        original_sql,
                        select,
                        label_name,
                        model,
                        model_params,
                        result_table,
                        user=""):
    """Submit a PAI model-evaluation task.

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
            curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO EVALUATE" statement.
        select: string
            SQL statement to get the evaluation data set.
        label_name: string
            The label name to evaluate.
        model: string
            Model to load and evaluate.
        model_params: dict
            Params for training, corresponding to the WITH clause.
        result_table: string
            The table name to save the evaluation result.
        user: string
            A string to identify the user, used to load model from the user's
            directory.
    """
    params = dict(locals())

    # Qualify the result table as "db.table" so the codegen can build a
    # submit argument like "odps://project/tables/table_name".
    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)
    params["result_table"] = result_table

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)
    model_type, estimator = pai_model.get_saved_model_type_and_estimator(
        datasource, model)
    if model_type == EstimatorType.PAIML:
        raise SQLFlowDiagnostic("PAI model evaluation is not supported yet.")

    # XGBoost and TensorFlow models use different evaluation entries and
    # different default metrics.
    is_xgb = model_type == EstimatorType.XGBOOST
    params["entry_type"] = "evaluate_xgb" if is_xgb else "evaluate_tf"
    default_metric = "accuracy_score" if is_xgb else "Accuracy"
    metric_spec = model_params.get("validation.metrics", default_metric)
    validation_metrics = [m.strip() for m in metric_spec.split(",")]

    with db.connect_with_data_source(datasource) as conn:
        result_column_names = create_evaluate_table(conn, result_table,
                                                    validation_metrics)

    with table_ops.create_tmp_tables_guard(select, datasource) as data_table:
        params["pai_table"] = data_table
        params["result_column_names"] = result_column_names

        # Run locally when possible; otherwise package everything up and
        # submit to PAI.
        if try_pai_local_run(params, oss_model_path):
            return

        conf = cluster_conf.get_cluster_config(model_params)
        with temp_file.TemporaryDirectory(prefix="sqlflow", dir="/tmp") as cwd:
            prepare_archive(cwd, estimator, oss_model_path, params)
            job_archive = "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE)
            params_path = "file://" + os.path.join(cwd, PARAMS_FILE)
            cmd = get_pai_tf_cmd(conf, job_archive, params_path, ENTRY_FILE,
                                 model, oss_model_path, data_table, "",
                                 result_table, project)
            submit_pai_task(cmd, datasource)
def submit_pai_evaluate(datasource,
                        original_sql,
                        select,
                        model_name,
                        model_params,
                        result_table,
                        user=""):
    """Submit a PAI evaluation task.

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
            curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO EVALUATE" statement.
        select: string
            SQL statement to get the evaluation data set.
        model_name: string
            Model to load and evaluate.
        model_params: dict
            Params for training, corresponding to the WITH clause.
        result_table: string
            The table name to save the evaluation result.
        user: string
            A string to identify the user, used to load model from the user's
            directory.
    """
    params = dict(locals())
    cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp")
    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)
    params["result_table"] = result_table
    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model_name,
                                                       user=user)
    params["oss_model_path"] = oss_model_path
    model_type, estimator = \
        pai_model.get_oss_saved_model_type_and_estimator(
            oss_model_path, project)
    if model_type == EstimatorType.PAIML:
        raise SQLFlowDiagnostic("PAI model evaluation is not supported yet.")
    data_table = table_ops.create_tmp_table_from_select(select, datasource)
    params["data_table"] = data_table
    try:
        metrics = get_evaluate_metrics(model_type, model_params)
        params["metrics"] = metrics
        create_evaluate_result_table(datasource, result_table, metrics)
        conf = cluster_conf.get_cluster_config(model_params)
        # XGBoost and TensorFlow models use different evaluation entries.
        if model_type == EstimatorType.XGBOOST:
            params["entry_type"] = "evaluate_xgb"
        else:
            params["entry_type"] = "evaluate_tf"
        prepare_archive(cwd, estimator, oss_model_path, params)
        cmd = get_pai_tf_cmd(conf,
                             "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                             "file://" + os.path.join(cwd, PARAMS_FILE),
                             ENTRY_FILE, model_name, oss_model_path,
                             data_table, "", result_table, project)
        submit_pai_task(cmd, datasource)
    finally:
        # Drop the temporary table even when archiving or submission
        # fails; the original code leaked it on any exception.
        table_ops.drop_tables([data_table], datasource)
def submit_pai_predict(datasource,
                       original_sql,
                       select,
                       model,
                       label_name,
                       model_params,
                       result_table,
                       user=""):
    """Pack the needed params and resources into a tarball and submit a
    prediction task to PAI.

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
            curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO PREDICT" statement.
        select: string
            SQL statement to get prediction data set.
        model: string
            Model to load and do prediction.
        label_name: string
            Name of the label column, if not exist in select.
        model_params: dict
            Params for training, corresponding to the WITH clause.
        result_table: string
            The table name to save prediction result.
        user: string
            A string to identify the user, used to load model from the user's
            directory.
    """
    params = dict(locals())

    # Qualify the result table as "db.table" so the codegen can build a
    # submit argument like "odps://project/tables/table_name".
    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)

    model_type, estimator = pai_model.get_saved_model_type_and_estimator(
        datasource, model)
    setup_predict_entry(params, model_type)

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    # TODO(typhoonzero): Do **NOT** create tmp table when the select statement
    # is like: "SELECT fields,... FROM table"
    with table_ops.create_tmp_tables_guard(select, datasource) as data_table:
        params["pai_table"] = data_table
        params["oss_model_path"] = oss_model_path
        params["model"] = ""

        # Run locally when possible; otherwise package everything up and
        # submit to PAI.
        if try_pai_local_run(params, oss_model_path):
            return

        with temp_file.TemporaryDirectory(prefix="sqlflow", dir="/tmp") as cwd:
            prepare_archive(cwd, estimator, oss_model_path, params)
            job_archive = "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE)
            params_path = "file://" + os.path.join(cwd, PARAMS_FILE)
            cmd = get_pai_predict_cmd(datasource, project, oss_model_path,
                                      model, data_table, result_table,
                                      model_type, model_params, job_archive,
                                      params_path)
            submit_pai_task(cmd, datasource)
def submit_pai_predict(datasource,
                       original_sql,
                       select,
                       model,
                       label_name,
                       pred_params,
                       result_table,
                       user=""):
    """Pack the needed params and resources into a tarball and submit a
    prediction task to PAI.

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
            curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO PREDICT" statement.
        select: string
            SQL statement to get prediction data set.
        model: string
            Model to load and do prediction.
        label_name: string
            Name of the label column, if not exist in select.
        pred_params: dict
            Params for training, corresponding to the WITH clause.
        result_table: string
            The table name to save prediction result.
        user: string
            A string to identify the user, used to load model from the user's
            directory.
    """
    params = dict(locals())

    # Qualify the result table as "db.table" so the codegen can build a
    # submit argument like "odps://project/tables/table_name".
    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)

    model_metas = Model.load_metadata_from_db(datasource, model)
    model_type = model_metas.get_type()
    estimator = model_metas.get_meta("class_name")
    setup_predict_entry(params, model_type)

    # The training label (if any) describes the column layout of the
    # prediction result table.
    train_label = model_metas.get_meta("label")
    train_label_desc = None
    if train_label is not None:
        train_label_desc = train_label.get_field_desc()[0]

    # "predict.extra_outputs" is a comma-separated list of additional
    # columns to copy into the result table.
    extra_result_cols = []
    if pred_params is not None:
        raw_cols = pred_params.get("predict.extra_outputs", "")
        extra_result_cols = [
            c.strip() for c in raw_cols.split(",") if c.strip()
        ]

    with db.connect_with_data_source(datasource) as conn:
        result_column_names, train_label_idx = create_predict_table(
            conn, select, result_table, train_label_desc, label_name,
            extra_result_cols)

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    # TODO(typhoonzero): Do **NOT** create tmp table when the select statement
    # is like: "SELECT fields,... FROM table"
    with table_ops.create_tmp_tables_guard(select, datasource) as data_table:
        del params["label_name"]
        params["pai_table"] = data_table
        params["result_column_names"] = result_column_names
        params["train_label_idx"] = train_label_idx
        params["extra_result_cols"] = extra_result_cols

        # Run locally when possible; otherwise package everything up and
        # submit to PAI.
        if try_pai_local_run(params, oss_model_path):
            return

        with temp_file.TemporaryDirectory(prefix="sqlflow", dir="/tmp") as cwd:
            prepare_archive(cwd, estimator, oss_model_path, params)
            job_archive = "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE)
            params_path = "file://" + os.path.join(cwd, PARAMS_FILE)
            cmd = get_pai_predict_cmd(datasource, project, oss_model_path,
                                      model, data_table, result_table,
                                      model_type, pred_params, job_archive,
                                      params_path)
            submit_pai_task(cmd, datasource)
def submit_pai_explain(datasource,
                       original_sql,
                       select,
                       model,
                       model_params,
                       result_table,
                       explainer="TreeExplainer",
                       user=""):
    """Pack the needed params and resources into a tarball and submit an
    explain task to PAI.

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
            curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO EXPLAIN" statement.
        select: string
            SQL statement to get the explain data set.
        model: string
            Model to load and explain.
        model_params: dict
            Params for training, corresponding to the WITH clause.
        result_table: string
            The table name to save the explain result; when empty, the
            explain image is written to OSS instead.
        explainer: string
            The explainer to use, defaults to "TreeExplainer".
        user: string
            A string to identify the user, used to load model from the user's
            directory.
    """
    params = dict(locals())

    # format resultTable name to "db.table" to let the codegen form a
    # submitting argument of format "odps://project/tables/table_name"
    project = table_ops.get_project(datasource)
    if result_table:
        if result_table.count(".") == 0:
            result_table = "%s.%s" % (project, result_table)
        params["result_table"] = result_table

    # used to save the explain image
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    params["oss_dest"] = "explain_images/%s/%s" % (user, timestamp)
    add_env_to_params(params, "SQLFLOW_OSS_AK", "oss_ak")
    add_env_to_params(params, "SQLFLOW_OSS_SK", "oss_sk")
    add_env_to_params(params, "SQLFLOW_OSS_ALISA_ENDPOINT", "oss_endpoint")
    add_env_to_params(params, "SQLFLOW_OSS_ALISA_BUCKET", "oss_bucket_name")

    meta = Model.load_metadata_from_db(datasource, model)
    model_type = meta.get_type()
    estimator = meta.get_meta("class_name")
    # Prefer an explicit "label_col" from the WITH clause, falling back to
    # the label recorded in the model metadata.
    label_name = model_params.get("label_col")
    if label_name is None:
        label_column = meta.get_meta("label")
        if label_column is not None:
            label_name = label_column.get_field_desc()[0].name

    setup_explain_entry(params, model_type)

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    # TODO(typhoonzero): Do **NOT** create tmp table when the select statement
    # is like: "SELECT fields,... FROM table"
    with table_ops.create_tmp_tables_guard(select, datasource) as data_table:
        params["pai_table"] = data_table

        # Create explain result table. Use a `with` block so the
        # connection is closed even if table creation fails (the original
        # code leaked it on exception).
        if result_table:
            with db.connect_with_data_source(datasource) as conn:
                feature_columns = meta.get_meta("features")
                estimator_string = meta.get_meta("class_name")
                field_descs = get_ordered_field_descs(feature_columns)
                feature_column_names = [fd.name for fd in field_descs]
                create_explain_table(conn, meta.get_type(), explainer,
                                     estimator_string, result_table,
                                     feature_column_names)

        if not try_pai_local_run(params, oss_model_path):
            with temp_file.TemporaryDirectory(prefix="sqlflow",
                                              dir="/tmp") as cwd:
                prepare_archive(cwd, estimator, oss_model_path, params)
                cmd = get_pai_explain_cmd(
                    datasource, project, oss_model_path, model, data_table,
                    result_table, model_type, model_params,
                    "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                    "file://" + os.path.join(cwd, PARAMS_FILE), label_name)
                submit_pai_task(cmd, datasource)

    if result_table:
        print('Saved result into: {}'.format(result_table))
    else:
        print_oss_image(params["oss_dest"], params["oss_ak"],
                        params["oss_sk"], params["oss_endpoint"],
                        params["oss_bucket_name"])