예제 #1
0
def submit_pai_explain(datasource,
                       select,
                       result_table,
                       model_name,
                       model_params,
                       user=""):
    """This function pack need params and resource to a tarball
    and submit a explain task to PAI

    Args:
        datasource: current datasource
        select: sql statement to get explain data set
        result_table: the table name to save result
        model_name: model used to do prediction
        model_params: dict, Params for training, crossponding to WITH clause
    """
    params = dict(locals())

    cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp")
    # TODO(typhoonzero): Do **NOT** create tmp table when the select statement
    # is like: "SELECT fields,... FROM table"
    data_table = table_ops.create_tmp_table_from_select(select, datasource)
    params["data_table"] = data_table

    # format resultTable name to "db.table" to let the codegen form a
    # submitting argument of format "odps://project/tables/table_name"
    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model_name,
                                                       user=user)
    model_type, estimator = pai_model.get_oss_saved_model_type_and_estimator(
        oss_model_path, project)
    params["oss_model_path"] = oss_model_path

    label_column = model_params.get("label_col")
    params["label_column"] = label_column
    create_explain_result_table(datasource, data_table, result_table,
                                model_type, estimator, label_column)

    setup_explain_entry(params, model_type)
    prepare_archive(cwd, estimator, oss_model_path, params)

    cmd = get_pai_explain_cmd(datasource, project, oss_model_path, model_name,
                              data_table, result_table, model_type,
                              model_params,
                              "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                              "file://" + os.path.join(cwd, PARAMS_FILE),
                              label_column, cwd)

    submit_pai_task(cmd, datasource)
    table_ops.drop_tables([data_table], datasource)
예제 #2
0
def submit_pai_evaluate(datasource,
                        model_name,
                        select,
                        result_table,
                        model_attrs,
                        user=""):
    """Submit a PAI evaluation task

    Args:
        datasource: current datasource
        model_name: model used to do evaluation
        select: sql statement to get evaluate data set
        result_table: the table name to save result
        model_params: dict, Params for training, crossponding to WITH claus
    """

    params = dict(locals())
    cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp")

    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)
    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model_name,
                                                       user=user)
    params["oss_model_path"] = oss_model_path

    model_type, estimator = pai_model.get_oss_saved_model_type_and_estimator(
        oss_model_path, project)
    if model_type == EstimatorType.PAIML:
        raise SQLFlowDiagnostic("PAI model evaluation is not supported yet.")

    data_table = table_ops.create_tmp_table_from_select(select, datasource)
    params["data_table"] = data_table

    metrics = get_evaluate_metrics(model_type, model_attrs)
    params["metrics"] = metrics
    create_evaluate_result_table(datasource, result_table, metrics)

    conf = cluster_conf.get_cluster_config(model_attrs)

    if model_type == EstimatorType.XGBOOST:
        params["entry_type"] = "evaluate_xgb"
    else:
        params["entry_type"] = "evaluate_tf"
    prepare_archive(cwd, estimator, oss_model_path, params)
    cmd = get_pai_tf_cmd(conf, "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                         "file://" + os.path.join(cwd, PARAMS_FILE),
                         ENTRY_FILE, model_name, oss_model_path, data_table,
                         "", result_table, project)
    submit_pai_task(cmd, datasource)
    table_ops.drop_tables([data_table], datasource)
예제 #3
0
def submit_pai_explain(datasource,
                       original_sql,
                       select,
                       model_name,
                       model_params,
                       result_table,
                       explainer="TreeExplainer",
                       user=""):
    """This function pack need params and resource to a tarball
    and submit a explain task to PAI

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO PREDICT" statement.
        select: string
            SQL statement to get prediction data set.
        model_name: string
            Model to load and do prediction.
        model_params: dict
            Params for training, crossponding to WITH clause.
        result_table: string
            The table name to save prediction result.
        user: string
            A string to identify the user, used to load model from the user's
            directory.
    """
    params = dict(locals())

    cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp")
    # TODO(typhoonzero): Do **NOT** create tmp table when the select statement
    # is like: "SELECT fields,... FROM table"
    data_table = table_ops.create_tmp_table_from_select(select, datasource)
    params["data_table"] = data_table
    params["explainer"] = explainer

    # format resultTable name to "db.table" to let the codegen form a
    # submitting argument of format "odps://project/tables/table_name"
    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)
    params["result_table"] = result_table

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model_name,
                                                       user=user)
    params["oss_model_path"] = oss_model_path
    model_type, estimator = pai_model.get_oss_saved_model_type_and_estimator(
        oss_model_path, project)
    params["load"] = model_name

    label_column = model_params.get("label_col")
    params["label_column"] = label_column
    create_explain_result_table(datasource, data_table, result_table,
                                model_type, estimator, label_column)

    setup_explain_entry(params, model_type)
    prepare_archive(cwd, estimator, oss_model_path, params)

    cmd = get_pai_explain_cmd(datasource, project, oss_model_path, model_name,
                              data_table, result_table, model_type,
                              model_params,
                              "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                              "file://" + os.path.join(cwd, PARAMS_FILE),
                              label_column, cwd)

    submit_pai_task(cmd, datasource)
    table_ops.drop_tables([data_table], datasource)
예제 #4
0
def submit_pai_train(datasource,
                     original_sql,
                     select,
                     validation_select,
                     estimator_string,
                     model_image,
                     feature_column_map,
                     label_column,
                     model_params,
                     train_params,
                     save,
                     load,
                     user=""):
    """This function submit PAI-TF train task to the PAI platform.

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original statement used for generate train code.
        select: string
            The SQL statement for selecting data for train.
        validation_select: string
            Ths SQL statement for selecting data for validation.
        estimator_string: string
            TensorFlow estimator name, Keras class name, or XGBoost.
        model_image: string
            Docker image that is used to train the model. If it's empty,
            use default image sqlflow/sqlflow:step
        feature_column_map: dict
            A dict, key is the Estimator/Keras Model param name, value
            is runtime.feature.column.
        label_column: runtime.feature.column.FeatureColumn
            FeatureColumn describing the label.
        model_params: dict
            Params to construct the estimator/Keras Model.
        train_params: dict
            Params used to run the training.
        save: string
            Model name to save.
        load: string
            The pre-trained model name to load before training.
        user: string
            A string to identify the user, used to store models in the user's
            directory.
    """
    # prepare params for to call runtime.pai.xxx_submitter.train_step(...),
    # the params will be pickled into train_params.pkl
    params = dict(locals())

    if estimator_string.lower().startswith("xgboost"):
        params["entry_type"] = "train_xgb"
    else:
        params["entry_type"] = "train_tf"

    cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp")

    train_table, val_table = table_ops.create_train_and_eval_tmp_table(
        select, validation_select, datasource)
    params["pai_table"], params["pai_val_table"] = train_table, val_table

    # clean target dir
    oss_path_to_save = pai_model.get_oss_model_save_path(datasource,
                                                         save,
                                                         user=user)
    oss_path_to_load = pai_model.get_oss_model_save_path(datasource,
                                                         load,
                                                         user=user)
    if oss_path_to_load == "" or oss_path_to_load != oss_path_to_save:
        pai_model.clean_oss_model_path(oss_path_to_save + "/")
    train_params["oss_path_to_load"] = oss_path_to_load

    # zip all required resource to a tarball
    prepare_archive(cwd, estimator_string, oss_path_to_save, params)

    # submit pai task to execute the training
    cmd = get_pai_train_cmd(datasource, estimator_string, save, train_table,
                            val_table, model_params, train_params,
                            oss_path_to_save,
                            "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                            "file://" + os.path.join(cwd, PARAMS_FILE), cwd)

    submit_pai_task(cmd, datasource)
    table_ops.drop_tables([train_table, val_table], datasource)
def submit_pai_evaluate(datasource,
                        original_sql,
                        select,
                        label_name,
                        model,
                        model_params,
                        result_table,
                        user=""):
    """Submit a PAI evaluation task

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO PREDICT" statement.
        select: string
            SQL statement to get prediction data set.
        model: string
            Model to load and do prediction.
        label_name: string
            The label name to evaluate.
        model_params: dict
            Params for training, crossponding to WITH clause.
        result_table: string
            The table name to save prediction result.
        user: string
            A string to identify the user, used to load model from the user's
            directory.
    """

    params = dict(locals())
    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)
    params["result_table"] = result_table

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    model_type, estimator = pai_model.get_saved_model_type_and_estimator(
        datasource, model)
    if model_type == EstimatorType.PAIML:
        raise SQLFlowDiagnostic("PAI model evaluation is not supported yet.")

    if model_type == EstimatorType.XGBOOST:
        params["entry_type"] = "evaluate_xgb"
        validation_metrics = model_params.get("validation.metrics",
                                              "accuracy_score")
    else:
        params["entry_type"] = "evaluate_tf"
        validation_metrics = model_params.get("validation.metrics", "Accuracy")

    validation_metrics = [m.strip() for m in validation_metrics.split(",")]
    with db.connect_with_data_source(datasource) as conn:
        result_column_names = create_evaluate_table(conn, result_table,
                                                    validation_metrics)

    with table_ops.create_tmp_tables_guard(select, datasource) as data_table:
        params["pai_table"] = data_table
        params["result_column_names"] = result_column_names

        if try_pai_local_run(params, oss_model_path):
            return

        conf = cluster_conf.get_cluster_config(model_params)
        with temp_file.TemporaryDirectory(prefix="sqlflow", dir="/tmp") as cwd:
            prepare_archive(cwd, estimator, oss_model_path, params)
            cmd = get_pai_tf_cmd(
                conf, "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                "file://" + os.path.join(cwd, PARAMS_FILE), ENTRY_FILE, model,
                oss_model_path, data_table, "", result_table, project)
            submit_pai_task(cmd, datasource)
예제 #6
0
def submit_pai_train(datasource, estimator_string, select, validation_select,
                     model_params, save, load, **train_params):
    """This function submit PAI-TF train task to PAI platform

    Args:
        datasource: string
            Like: odps://access_id:[email protected]/api?
                         curr_project=test_ci&scheme=http
        estimator_string: string
            TensorFlow estimator name, Keras class name, or XGBoost
        select: string
            The SQL statement for selecting data for train
        validation_select: string
            Ths SQL statement for selecting data for validation
        model_params: dict
            Params for training, crossponding to WITH clause
        load: string
            The pre-trained model name to load
        train_params: dict
            Extra train params, will be passed to runtime.tensorflow.train.
    """
    # prepare params for tensorflow train,
    # the params will be pickled into train_params.pkl
    params = dict(locals())
    del params["train_params"]
    params.update(train_params)

    if estimator_string.lower().startswith("xgboost"):
        params["entry_type"] = "train_xgb"
    else:
        params["entry_type"] = "train_tf"

    cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp")

    train_table, val_table = table_ops.create_train_and_eval_tmp_table(
        select, validation_select, datasource)
    params["pai_table"], params["pai_val_table"] = train_table, val_table

    # FIXME(typhoonzero): get user from session
    user = ""
    if "user" in params:
        user = params["user"]

    # clean target dir
    path_to_save = pai_model.get_oss_model_save_path(datasource,
                                                     save,
                                                     user=user)
    path_to_load = pai_model.get_oss_model_save_path(datasource,
                                                     load,
                                                     user=user)
    params["oss_model_dir"] = path_to_save

    if path_to_load == "" or path_to_load != path_to_save:
        pai_model.clean_oss_model_path(path_to_save + "/")

    # zip all required resource to a tarball
    prepare_archive(cwd, estimator_string, path_to_save, params)

    # submit pai task to execute the training
    cmd = get_pai_train_cmd(datasource, estimator_string, save, train_table,
                            val_table, model_params, train_params,
                            path_to_save,
                            "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                            "file://" + os.path.join(cwd, PARAMS_FILE), cwd)

    submit_pai_task(cmd, datasource)

    # save trained model to sqlfs
    pai_model.save_model_to_sqlfs(datasource, path_to_save, save)
    table_ops.drop_tables([train_table, val_table], datasource)
예제 #7
0
def submit_pai_evaluate(datasource,
                        original_sql,
                        select,
                        model_name,
                        model_params,
                        result_table,
                        user=""):
    """Submit a PAI evaluation task

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO PREDICT" statement.
        select: string
            SQL statement to get prediction data set.
        model_name: string
            Model to load and do prediction.
        model_params: dict
            Params for training, crossponding to WITH clause.
        result_table: string
            The table name to save prediction result.
        user: string
            A string to identify the user, used to load model from the user's
            directory.
    """

    params = dict(locals())
    cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp")

    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)
    params["result_table"] = result_table

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model_name,
                                                       user=user)
    params["oss_model_path"] = oss_model_path

    model_type, estimator = pai_model.get_oss_saved_model_type_and_estimator(
        oss_model_path, project)
    if model_type == EstimatorType.PAIML:
        raise SQLFlowDiagnostic("PAI model evaluation is not supported yet.")

    data_table = table_ops.create_tmp_table_from_select(select, datasource)
    params["data_table"] = data_table

    metrics = get_evaluate_metrics(model_type, model_params)
    params["metrics"] = metrics
    create_evaluate_result_table(datasource, result_table, metrics)

    conf = cluster_conf.get_cluster_config(model_params)

    if model_type == EstimatorType.XGBOOST:
        params["entry_type"] = "evaluate_xgb"
    else:
        params["entry_type"] = "evaluate_tf"
    prepare_archive(cwd, estimator, oss_model_path, params)
    cmd = get_pai_tf_cmd(conf, "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                         "file://" + os.path.join(cwd, PARAMS_FILE),
                         ENTRY_FILE, model_name, oss_model_path, data_table,
                         "", result_table, project)
    submit_pai_task(cmd, datasource)
    table_ops.drop_tables([data_table], datasource)
예제 #8
0
def submit_pai_predict(datasource,
                       original_sql,
                       select,
                       model,
                       label_name,
                       model_params,
                       result_table,
                       user=""):
    """This function pack needed params and resource to a tarball
    and submit a prediction task to PAI

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO PREDICT" statement.
        select: string
            SQL statement to get prediction data set.
        model: string
            Model to load and do prediction.
        label_name: string
            Name of the label column, if not exist in select.
        model_params: dict
            Params for training, crossponding to WITH clause.
        result_table: string
            The table name to save prediction result.
        user: string
            A string to identify the user, used to load model from the user's
            directory.
    """
    params = dict(locals())

    # format resultTable name to "db.table" to let the codegen form a
    # submitting argument of format "odps://project/tables/table_name"
    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)

    model_type, estimator = \
        pai_model.get_saved_model_type_and_estimator(
            datasource, model)
    setup_predict_entry(params, model_type)

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    # TODO(typhoonzero): Do **NOT** create tmp table when the select statement
    # is like: "SELECT fields,... FROM table"
    with table_ops.create_tmp_tables_guard(select, datasource) as data_table:
        params["pai_table"] = data_table
        params["oss_model_path"] = oss_model_path
        params["model"] = ""

        if try_pai_local_run(params, oss_model_path):
            return

        with temp_file.TemporaryDirectory(prefix="sqlflow", dir="/tmp") as cwd:
            prepare_archive(cwd, estimator, oss_model_path, params)

            cmd = get_pai_predict_cmd(
                datasource, project, oss_model_path, model, data_table,
                result_table, model_type, model_params,
                "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                "file://" + os.path.join(cwd, PARAMS_FILE))
            submit_pai_task(cmd, datasource)
예제 #9
0
def submit_pai_predict(datasource,
                       original_sql,
                       select,
                       model,
                       label_name,
                       pred_params,
                       result_table,
                       user=""):
    """This function pack needed params and resource to a tarball
    and submit a prediction task to PAI

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO PREDICT" statement.
        select: string
            SQL statement to get prediction data set.
        model: string
            Model to load and do prediction.
        label_name: string
            Name of the label column, if not exist in select.
        pred_params: dict
            Params for training, crossponding to WITH clause.
        result_table: string
            The table name to save prediction result.
        user: string
            A string to identify the user, used to load model from the user's
            directory.
    """
    params = dict(locals())

    # format resultTable name to "db.table" to let the codegen form a
    # submitting argument of format "odps://project/tables/table_name"
    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)

    model_metas = Model.load_metadata_from_db(datasource, model)
    model_type = model_metas.get_type()
    estimator = model_metas.get_meta("class_name")
    setup_predict_entry(params, model_type)

    train_label = model_metas.get_meta("label")
    if train_label is not None:
        train_label_desc = train_label.get_field_desc()[0]
    else:
        train_label_desc = None

    if pred_params is None:
        extra_result_cols = []
    else:
        extra_result_cols = pred_params.get("predict.extra_outputs", "")
        extra_result_cols = [
            c.strip() for c in extra_result_cols.split(",") if c.strip()
        ]

    with db.connect_with_data_source(datasource) as conn:
        result_column_names, train_label_idx = create_predict_table(
            conn, select, result_table, train_label_desc, label_name,
            extra_result_cols)

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    # TODO(typhoonzero): Do **NOT** create tmp table when the select statement
    # is like: "SELECT fields,... FROM table"
    with table_ops.create_tmp_tables_guard(select, datasource) as data_table:
        del params["label_name"]
        params["pai_table"] = data_table
        params["result_column_names"] = result_column_names
        params["train_label_idx"] = train_label_idx
        params["extra_result_cols"] = extra_result_cols

        if try_pai_local_run(params, oss_model_path):
            return

        with temp_file.TemporaryDirectory(prefix="sqlflow", dir="/tmp") as cwd:
            prepare_archive(cwd, estimator, oss_model_path, params)

            cmd = get_pai_predict_cmd(
                datasource, project, oss_model_path, model, data_table,
                result_table, model_type, pred_params,
                "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                "file://" + os.path.join(cwd, PARAMS_FILE))
            submit_pai_task(cmd, datasource)
예제 #10
0
def submit_pai_explain(datasource,
                       original_sql,
                       select,
                       model,
                       model_params,
                       result_table,
                       explainer="TreeExplainer",
                       user=""):
    """This function pack need params and resource to a tarball
    and submit a explain task to PAI

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO PREDICT" statement.
        select: string
            SQL statement to get prediction data set.
        model: string
            Model to load and do prediction.
        model_params: dict
            Params for training, crossponding to WITH clause.
        result_table: string
            The table name to save prediction result.
        user: string
            A string to identify the user, used to load model from the user's
            directory.
    """
    params = dict(locals())

    # format resultTable name to "db.table" to let the codegen form a
    # submitting argument of format "odps://project/tables/table_name"
    project = table_ops.get_project(datasource)
    if result_table:
        if result_table.count(".") == 0:
            result_table = "%s.%s" % (project, result_table)
        params["result_table"] = result_table

    # used to save the explain image
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    params["oss_dest"] = "explain_images/%s/%s" % (user, timestamp)
    add_env_to_params(params, "SQLFLOW_OSS_AK", "oss_ak")
    add_env_to_params(params, "SQLFLOW_OSS_SK", "oss_sk")
    add_env_to_params(params, "SQLFLOW_OSS_ALISA_ENDPOINT", "oss_endpoint")
    add_env_to_params(params, "SQLFLOW_OSS_ALISA_BUCKET", "oss_bucket_name")

    meta = Model.load_metadata_from_db(datasource, model)
    model_type = meta.get_type()
    estimator = meta.get_meta("class_name")
    label_name = model_params.get("label_col")
    if label_name is None:
        label_column = meta.get_meta("label")
        if label_column is not None:
            label_name = label_column.get_field_desc()[0].name

    setup_explain_entry(params, model_type)

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    # TODO(typhoonzero): Do **NOT** create tmp table when the select statement
    # is like: "SELECT fields,... FROM table"
    with table_ops.create_tmp_tables_guard(select, datasource) as data_table:
        params["pai_table"] = data_table

        # Create explain result table
        if result_table:
            conn = db.connect_with_data_source(datasource)
            feature_columns = meta.get_meta("features")
            estimator_string = meta.get_meta("class_name")
            field_descs = get_ordered_field_descs(feature_columns)
            feature_column_names = [fd.name for fd in field_descs]
            create_explain_table(conn, meta.get_type(), explainer,
                                 estimator_string, result_table,
                                 feature_column_names)
            conn.close()

        if not try_pai_local_run(params, oss_model_path):
            with temp_file.TemporaryDirectory(prefix="sqlflow",
                                              dir="/tmp") as cwd:
                prepare_archive(cwd, estimator, oss_model_path, params)
                cmd = get_pai_explain_cmd(
                    datasource, project, oss_model_path, model, data_table,
                    result_table, model_type, model_params,
                    "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                    "file://" + os.path.join(cwd, PARAMS_FILE), label_name)
                submit_pai_task(cmd, datasource)

    if result_table:
        print('Saved result into: {}'.format(result_table))
    else:
        print_oss_image(params["oss_dest"], params["oss_ak"], params["oss_sk"],
                        params["oss_endpoint"], params["oss_bucket_name"])