def submit_pai_explain(datasource, select, result_table, model_name, model_params, user=""): """This function pack need params and resource to a tarball and submit a explain task to PAI Args: datasource: current datasource select: sql statement to get explain data set result_table: the table name to save result model_name: model used to do prediction model_params: dict, Params for training, crossponding to WITH clause """ params = dict(locals()) cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp") # TODO(typhoonzero): Do **NOT** create tmp table when the select statement # is like: "SELECT fields,... FROM table" data_table = table_ops.create_tmp_table_from_select(select, datasource) params["data_table"] = data_table # format resultTable name to "db.table" to let the codegen form a # submitting argument of format "odps://project/tables/table_name" project = table_ops.get_project(datasource) if result_table.count(".") == 0: result_table = "%s.%s" % (project, result_table) oss_model_path = pai_model.get_oss_model_save_path(datasource, model_name, user=user) model_type, estimator = pai_model.get_oss_saved_model_type_and_estimator( oss_model_path, project) params["oss_model_path"] = oss_model_path label_column = model_params.get("label_col") params["label_column"] = label_column create_explain_result_table(datasource, data_table, result_table, model_type, estimator, label_column) setup_explain_entry(params, model_type) prepare_archive(cwd, estimator, oss_model_path, params) cmd = get_pai_explain_cmd(datasource, project, oss_model_path, model_name, data_table, result_table, model_type, model_params, "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE), "file://" + os.path.join(cwd, PARAMS_FILE), label_column, cwd) submit_pai_task(cmd, datasource) table_ops.drop_tables([data_table], datasource)
def submit_pai_evaluate(datasource, model_name, select, result_table, model_attrs, user=""): """Submit a PAI evaluation task Args: datasource: current datasource model_name: model used to do evaluation select: sql statement to get evaluate data set result_table: the table name to save result model_params: dict, Params for training, crossponding to WITH claus """ params = dict(locals()) cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp") project = table_ops.get_project(datasource) if result_table.count(".") == 0: result_table = "%s.%s" % (project, result_table) oss_model_path = pai_model.get_oss_model_save_path(datasource, model_name, user=user) params["oss_model_path"] = oss_model_path model_type, estimator = pai_model.get_oss_saved_model_type_and_estimator( oss_model_path, project) if model_type == EstimatorType.PAIML: raise SQLFlowDiagnostic("PAI model evaluation is not supported yet.") data_table = table_ops.create_tmp_table_from_select(select, datasource) params["data_table"] = data_table metrics = get_evaluate_metrics(model_type, model_attrs) params["metrics"] = metrics create_evaluate_result_table(datasource, result_table, metrics) conf = cluster_conf.get_cluster_config(model_attrs) if model_type == EstimatorType.XGBOOST: params["entry_type"] = "evaluate_xgb" else: params["entry_type"] = "evaluate_tf" prepare_archive(cwd, estimator, oss_model_path, params) cmd = get_pai_tf_cmd(conf, "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE), "file://" + os.path.join(cwd, PARAMS_FILE), ENTRY_FILE, model_name, oss_model_path, data_table, "", result_table, project) submit_pai_task(cmd, datasource) table_ops.drop_tables([data_table], datasource)

def create_evaluate_result_table(datasource, result_table, metrics):
    """Create a table to hold the evaluation result.

    Args:
        datasource: current datasource
        result_table: the table name to save the result
        metrics: list of evaluation metric names
    """
    table_ops.drop_tables([result_table], datasource)
    # always add loss
    ext_metrics = ["loss"]
    if isinstance(metrics, list):
        ext_metrics.extend(metrics)
    fields = ["%s STRING" % m for m in ext_metrics]
    sql = "CREATE TABLE IF NOT EXISTS %s (%s);" % (result_table,
                                                   ",".join(fields))
    conn = db.connect_with_data_source(datasource)
    conn.execute(sql)
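
# A minimal usage sketch of create_evaluate_result_table. The datasource URI,
# table name, and metric list below are hypothetical placeholders. With
# metrics=["accuracy", "auc"], the generated DDL would be:
#   CREATE TABLE IF NOT EXISTS test_ci.eval_result
#       (loss STRING,accuracy STRING,auc STRING);
# Kept commented out because it would drop and recreate a real table.
#
# create_evaluate_result_table(
#     "maxcompute://ak:[email protected]/api?curr_project=test_ci&scheme=http",
#     "test_ci.eval_result",
#     ["accuracy", "auc"])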
def submit_pai_explain(datasource, original_sql, select, model_name, model_params, result_table, explainer="TreeExplainer", user=""): """This function pack need params and resource to a tarball and submit a explain task to PAI Args: datasource: string Like: maxcompute://ak:[email protected]/api? curr_project=test_ci&scheme=http original_sql: string Original "TO PREDICT" statement. select: string SQL statement to get prediction data set. model_name: string Model to load and do prediction. model_params: dict Params for training, crossponding to WITH clause. result_table: string The table name to save prediction result. user: string A string to identify the user, used to load model from the user's directory. """ params = dict(locals()) cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp") # TODO(typhoonzero): Do **NOT** create tmp table when the select statement # is like: "SELECT fields,... FROM table" data_table = table_ops.create_tmp_table_from_select(select, datasource) params["data_table"] = data_table params["explainer"] = explainer # format resultTable name to "db.table" to let the codegen form a # submitting argument of format "odps://project/tables/table_name" project = table_ops.get_project(datasource) if result_table.count(".") == 0: result_table = "%s.%s" % (project, result_table) params["result_table"] = result_table oss_model_path = pai_model.get_oss_model_save_path(datasource, model_name, user=user) params["oss_model_path"] = oss_model_path model_type, estimator = pai_model.get_oss_saved_model_type_and_estimator( oss_model_path, project) params["load"] = model_name label_column = model_params.get("label_col") params["label_column"] = label_column create_explain_result_table(datasource, data_table, result_table, model_type, estimator, label_column) setup_explain_entry(params, model_type) prepare_archive(cwd, estimator, oss_model_path, params) cmd = get_pai_explain_cmd(datasource, project, oss_model_path, model_name, data_table, result_table, model_type, model_params, "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE), "file://" + os.path.join(cwd, PARAMS_FILE), label_column, cwd) submit_pai_task(cmd, datasource) table_ops.drop_tables([data_table], datasource)
def submit_pai_train(datasource, original_sql, select, validation_select, estimator_string, model_image, feature_column_map, label_column, model_params, train_params, save, load, user=""): """This function submit PAI-TF train task to the PAI platform. Args: datasource: string Like: maxcompute://ak:[email protected]/api? curr_project=test_ci&scheme=http original_sql: string Original statement used for generate train code. select: string The SQL statement for selecting data for train. validation_select: string Ths SQL statement for selecting data for validation. estimator_string: string TensorFlow estimator name, Keras class name, or XGBoost. model_image: string Docker image that is used to train the model. If it's empty, use default image sqlflow/sqlflow:step feature_column_map: dict A dict, key is the Estimator/Keras Model param name, value is runtime.feature.column. label_column: runtime.feature.column.FeatureColumn FeatureColumn describing the label. model_params: dict Params to construct the estimator/Keras Model. train_params: dict Params used to run the training. save: string Model name to save. load: string The pre-trained model name to load before training. user: string A string to identify the user, used to store models in the user's directory. """ # prepare params for to call runtime.pai.xxx_submitter.train_step(...), # the params will be pickled into train_params.pkl params = dict(locals()) if estimator_string.lower().startswith("xgboost"): params["entry_type"] = "train_xgb" else: params["entry_type"] = "train_tf" cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp") train_table, val_table = table_ops.create_train_and_eval_tmp_table( select, validation_select, datasource) params["pai_table"], params["pai_val_table"] = train_table, val_table # clean target dir oss_path_to_save = pai_model.get_oss_model_save_path(datasource, save, user=user) oss_path_to_load = pai_model.get_oss_model_save_path(datasource, load, user=user) if oss_path_to_load == "" or oss_path_to_load != oss_path_to_save: pai_model.clean_oss_model_path(oss_path_to_save + "/") train_params["oss_path_to_load"] = oss_path_to_load # zip all required resource to a tarball prepare_archive(cwd, estimator_string, oss_path_to_save, params) # submit pai task to execute the training cmd = get_pai_train_cmd(datasource, estimator_string, save, train_table, val_table, model_params, train_params, oss_path_to_save, "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE), "file://" + os.path.join(cwd, PARAMS_FILE), cwd) submit_pai_task(cmd, datasource) table_ops.drop_tables([train_table, val_table], datasource)

def submit_pai_train(datasource, estimator_string, select, validation_select,
                     model_params, save, load, **train_params):
    """This function submits a PAI-TF train task to the PAI platform.

    Args:
        datasource: string
            Like: odps://access_id:[email protected]/api?
                  curr_project=test_ci&scheme=http
        estimator_string: string
            TensorFlow estimator name, Keras class name, or XGBoost.
        select: string
            The SQL statement for selecting the training data.
        validation_select: string
            The SQL statement for selecting the validation data.
        model_params: dict
            Params for training, corresponding to the WITH clause.
        save: string
            Model name to save.
        load: string
            The pre-trained model name to load.
        train_params: dict
            Extra train params, will be passed to runtime.tensorflow.train.
    """
    # prepare params for the TensorFlow train entry,
    # the params will be pickled into train_params.pkl
    params = dict(locals())
    del params["train_params"]
    params.update(train_params)

    if estimator_string.lower().startswith("xgboost"):
        params["entry_type"] = "train_xgb"
    else:
        params["entry_type"] = "train_tf"

    cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp")

    train_table, val_table = table_ops.create_train_and_eval_tmp_table(
        select, validation_select, datasource)
    params["pai_table"], params["pai_val_table"] = train_table, val_table

    # FIXME(typhoonzero): get user from session
    user = ""
    if "user" in params:
        user = params["user"]

    # clean target dir
    path_to_save = pai_model.get_oss_model_save_path(datasource,
                                                     save,
                                                     user=user)
    path_to_load = pai_model.get_oss_model_save_path(datasource,
                                                     load,
                                                     user=user)
    params["oss_model_dir"] = path_to_save

    if path_to_load == "" or path_to_load != path_to_save:
        pai_model.clean_oss_model_path(path_to_save + "/")

    # zip all required resources into a tarball
    prepare_archive(cwd, estimator_string, path_to_save, params)

    # submit a PAI task to execute the training
    cmd = get_pai_train_cmd(datasource, estimator_string, save, train_table,
                            val_table, model_params, train_params,
                            path_to_save,
                            "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                            "file://" + os.path.join(cwd, PARAMS_FILE), cwd)

    submit_pai_task(cmd, datasource)
    # save trained model to sqlfs
    pai_model.save_model_to_sqlfs(datasource, path_to_save, save)
    table_ops.drop_tables([train_table, val_table], datasource)
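
# A hypothetical sketch for this older signature: extra keyword arguments are
# collected into **train_params, merged into the pickled params dict, and
# forwarded to get_pai_train_cmd. All values are placeholders; kept commented
# out to avoid submitting a real job.
#
# submit_pai_train(
#     "maxcompute://ak:[email protected]/api?curr_project=test_ci&scheme=http",
#     "DNNClassifier",
#     "SELECT * FROM iris.train",
#     "SELECT * FROM iris.test",
#     {"hidden_units": [10, 20], "n_classes": 3},
#     "my_dnn_model",
#     "",
#     epoch=10,        # becomes train_params["epoch"]
#     batch_size=4)    # becomes train_params["batch_size"]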
def submit_pai_evaluate(datasource, original_sql, select, model_name, model_params, result_table, user=""): """Submit a PAI evaluation task Args: datasource: string Like: maxcompute://ak:[email protected]/api? curr_project=test_ci&scheme=http original_sql: string Original "TO PREDICT" statement. select: string SQL statement to get prediction data set. model_name: string Model to load and do prediction. model_params: dict Params for training, crossponding to WITH clause. result_table: string The table name to save prediction result. user: string A string to identify the user, used to load model from the user's directory. """ params = dict(locals()) cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp") project = table_ops.get_project(datasource) if result_table.count(".") == 0: result_table = "%s.%s" % (project, result_table) params["result_table"] = result_table oss_model_path = pai_model.get_oss_model_save_path(datasource, model_name, user=user) params["oss_model_path"] = oss_model_path model_type, estimator = pai_model.get_oss_saved_model_type_and_estimator( oss_model_path, project) if model_type == EstimatorType.PAIML: raise SQLFlowDiagnostic("PAI model evaluation is not supported yet.") data_table = table_ops.create_tmp_table_from_select(select, datasource) params["data_table"] = data_table metrics = get_evaluate_metrics(model_type, model_params) params["metrics"] = metrics create_evaluate_result_table(datasource, result_table, metrics) conf = cluster_conf.get_cluster_config(model_params) if model_type == EstimatorType.XGBOOST: params["entry_type"] = "evaluate_xgb" else: params["entry_type"] = "evaluate_tf" prepare_archive(cwd, estimator, oss_model_path, params) cmd = get_pai_tf_cmd(conf, "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE), "file://" + os.path.join(cwd, PARAMS_FILE), ENTRY_FILE, model_name, oss_model_path, data_table, "", result_table, project) submit_pai_task(cmd, datasource) table_ops.drop_tables([data_table], datasource)