Пример #1
0
def predict_step(datasource, select, data_table, result_table, label_column,
                 oss_model_path):
    """PAI TensorFlow prediction wrapper
    This function do some preparation for the local prediction, say,
    download the model from OSS, extract metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        label_column: prediction label column
        oss_model_path: the model path on OSS
    """

    try:
        tf.enable_eager_execution()
    except:  # noqa: E722
        pass

    (estimator, feature_column_names, feature_column_names_map, feature_metas,
     label_meta, model_params,
     feature_columns_code) = oss.load_metas(oss_model_path,
                                            "tensorflow_model_desc")

    fc_map_ir = feature_columns_code
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.TENSORFLOW)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])

    is_estimator = is_tf_estimator(import_model(estimator))

    # Keras single node is using h5 format to save the model, no need to deal
    # with export model format. Keras distributed mode will use estimator, so
    # this is also needed.
    model_local_dir = oss_model_path.split("/")[-1]
    if is_estimator:
        oss.load_file(oss_model_path, "exported_path")
        # NOTE(typhoonzero): directory "model_save" is hardcoded in
        # codegen/tensorflow/codegen.go
        oss.load_dir("%s/%s" % (oss_model_path, model_local_dir))
    else:
        oss.load_dir(os.path.join(oss_model_path, "model_save"))

    _predict(datasource=datasource,
             estimator_string=estimator,
             select=select,
             result_table=result_table,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_column_names_map=feature_column_names_map,
             train_label_name=label_meta["feature_name"],
             result_col_name=label_column,
             feature_metas=feature_metas,
             model_params=model_params,
             save=model_local_dir,
             batch_size=1,
             pai_table=data_table)
Пример #2
0
def pai_download_table_data_worker(dname, feature_metas, feature_column_names,
                                   label_meta, pai_table, slice_id,
                                   slice_count, feature_column_code,
                                   raw_data_dir):
    import runtime.xgboost as xgboost_extended
    if isinstance(feature_column_code, dict):
        # NOTE(typhoonzero): feature_column_code is a dict of
        # runtime.feature.column in refactored step code.
        feature_column_transformers = compile_ir_feature_columns(
            feature_column_code, EstimatorType.XGBOOST)
        transform_fn = \
            xgboost_extended.feature_column.ComposedColumnTransformer(
                feature_column_names,
                *feature_column_transformers["feature_columns"])
    else:
        feature_column_transformers = eval('[{}]'.format(feature_column_code))
        transform_fn = \
            xgboost_extended.feature_column.ComposedColumnTransformer(
                feature_column_names, *feature_column_transformers)

    conn = PaiIOConnection.from_table(pai_table, slice_id, slice_count)
    gen = db.db_generator(conn, None, label_meta=label_meta)()
    selected_cols = db.selected_cols(conn, None)
    filename = "{}/{}.txt".format(dname, slice_id)
    dump_dmatrix(filename,
                 gen,
                 feature_column_names,
                 feature_metas,
                 label_meta,
                 selected_cols,
                 transform_fn=transform_fn,
                 raw_data_dir=raw_data_dir)
Пример #3
0
def evaluate_step(datasource, select, data_table, result_table, oss_model_path,
                  metrics):
    """PAI TensorFlow evaluate wrapper
    This function do some preparation for the local evaluation, say,
    download the model from OSS, extract metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        oss_model_path: the model path on OSS
        metrics: metrics to evaluate
    """

    (estimator, feature_column_names, feature_column_names_map, feature_metas,
     label_meta, model_params,
     feature_columns_code) = oss.load_metas(oss_model_path,
                                            "tensorflow_model_desc")

    fc_map_ir = feature_columns_code
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.TENSORFLOW)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])

    # NOTE(typhoonzero): No need to eval model_params["optimizer"] and
    # model_params["loss"] because predicting do not need these parameters.

    is_estimator = is_tf_estimator(import_model(estimator))

    # Keras single node is using h5 format to save the model, no need to deal
    # with export model format. Keras distributed mode will use estimator, so
    # this is also needed.
    model_name = oss_model_path.split("/")[-1]
    if is_estimator:
        oss.load_file(oss_model_path, "exported_path")
        # NOTE(typhoonzero): directory "model_save" is hardcoded in
        # codegen/tensorflow/codegen.go
        oss.load_dir("%s/%s" % (oss_model_path, model_name))
    else:
        oss.load_dir(os.path.join(oss_model_path, "model_save"))

    _evaluate(datasource=datasource,
              estimator_string=estimator,
              select=select,
              result_table=result_table,
              feature_columns=feature_columns,
              feature_column_names=feature_column_names,
              feature_metas=feature_metas,
              label_meta=label_meta,
              model_params=model_params,
              validation_metrics=metrics,
              save="model_save",
              batch_size=1,
              validation_steps=None,
              verbose=0,
              pai_table=data_table)
Пример #4
0
 def compile_fc(self, fc, model_type):
     fc_dict = {"feature_columns": [fc]}
     rt_fc_dict = compile_ir_feature_columns(fc_dict, model_type)
     self.assertEqual(len(rt_fc_dict), 1)
     self.assertTrue("feature_columns" in rt_fc_dict)
     fc_list = rt_fc_dict.get("feature_columns")
     self.assertEqual(len(fc_list), 1)
     return fc_list[0]
Пример #5
0
def evaluate_step(datasource,
                  select,
                  result_table,
                  model,
                  label_name,
                  model_params,
                  pai_table=None):
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    if model_params is None:
        model_params = {}

    validation_metrics = model_params.get("validation.metrics", "Accuracy")
    validation_metrics = [m.strip() for m in validation_metrics.split(',')]
    validation_steps = model_params.get("validation.steps", None)
    batch_size = model_params.get("validation.batch_size", 1)
    verbose = model_params.get("validation.verbose", 0)

    conn = db.connect_with_data_source(datasource)
    create_evaluate_table(conn, result_table, validation_metrics)
    conn.close()

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())
    train_label_desc.name = label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    _evaluate(datasource=datasource,
              estimator_string=estimator_string,
              select=select,
              result_table=result_table,
              feature_columns=feature_columns,
              feature_column_names=feature_column_names,
              feature_metas=feature_metas,
              label_meta=label_meta,
              model_params=model_params,
              validation_metrics=validation_metrics,
              save=save,
              batch_size=batch_size,
              validation_steps=validation_steps,
              verbose=verbose,
              pai_table=pai_table)
Пример #6
0
def explain_step(datasource, select, data_table, result_table, label_column,
                 oss_model_path):
    try:
        tf.enable_eager_execution()
    except Exception as e:
        sys.stderr.write("warning: failed to enable_eager_execution: %s" % e)
        pass

    (estimator, feature_column_names, feature_column_names_map, feature_metas,
     label_meta, model_params,
     feature_columns_code) = oss.load_metas(oss_model_path,
                                            "tensorflow_model_desc")

    fc_map_ir = feature_columns_code
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.TENSORFLOW)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])

    # NOTE(typhoonzero): No need to eval model_params["optimizer"] and
    # model_params["loss"] because predicting do not need these parameters.

    is_estimator = is_tf_estimator(import_model(estimator))

    # Keras single node is using h5 format to save the model, no need to deal
    # with export model format. Keras distributed mode will use estimator, so
    # this is also needed.
    model_name = oss_model_path.split("/")[-1]
    if is_estimator:
        oss.load_file(oss_model_path, "exported_path")
        # NOTE(typhoonzero): directory "model_save" is hardcoded in
        # codegen/tensorflow/codegen.go
        oss.load_dir("%s/%s" % (oss_model_path, model_name))
    else:
        oss.load_dir(os.path.join(oss_model_path, "model_save"))

    # (TODO: lhw) use oss to store result image
    _explain(datasource=datasource,
             estimator_string=estimator,
             select=select,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_metas=feature_metas,
             label_meta=label_meta,
             model_params=model_params,
             save="model_save",
             result_table=result_table,
             pai_table=data_table,
             oss_dest=None,
             oss_ak=None,
             oss_sk=None,
             oss_endpoint=None,
             oss_bucket_name=None)
Пример #7
0
def explain_step(datasource, select, data_table, explainer, result_table,
                 label_column, oss_model_path):
    """Do XGBoost model explanation, this function use selected data to
    explain the model stored at oss_model_path

    Args:
        datasource: The datasource to load explain data
        select: SQL statement to get the data set
        data_table: tmp table to save the explain data
        result_table: table to store the explanation result
        label_column: name of the label column
        oss_model_path: path to the model to be explained
    """
    # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
    # in xgboost/train.py
    oss.load_file(oss_model_path, "my_model")

    (estimator, model_params, train_params, feature_field_meta,
     feature_column_names, label_field_meta,
     fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    summary_params = dict()
    for k in model_params:
        if k.startswith("summary."):
            summary_key = k.replace("summary.", "")
            summary_params[summary_key] = model_params[k]
    explain_xgb(
        datasource=datasource,
        select=select,
        feature_field_meta=feature_field_meta,
        feature_column_names=feature_column_names,
        label_meta=label_field_meta,
        summary_params=summary_params,
        explainer=explainer,
        result_table=result_table,
        is_pai=True,
        pai_explain_table=data_table,
        # (TODO:lhw) save/load explain result storage info into/from FLAGS
        oss_dest="",
        oss_ak="",
        oss_sk="",
        oss_endpoint="",
        oss_bucket_name="",
        transform_fn=transform_fn,
        feature_column_code=fc_map_ir)
Пример #8
0
def predict_step(datasource, select, data_table, result_table, label_column,
                 oss_model_path):
    """PAI XGBoost prediction wrapper
    This function do some preparation for the local prediction, say,
    download the model from OSS, extract metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        label_column: prediction label column
        oss_model_path: the model path on OSS
    """
    # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
    # in xgboost/train.py
    oss.load_file(oss_model_path, "my_model")
    (estimator, model_params, train_params, feature_metas,
     feature_column_names, label_meta,
     fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")

    pred_label_meta = copy.copy(label_meta)
    pred_label_meta["feature_name"] = label_column

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    pred(datasource=datasource,
         select=select,
         feature_metas=feature_metas,
         feature_column_names=feature_column_names,
         train_label_meta=label_meta,
         pred_label_meta=label_meta,
         result_table=result_table,
         is_pai=True,
         pai_table=data_table,
         model_params=model_params,
         train_params=train_params,
         transform_fn=transform_fn,
         feature_column_code=fc_map_ir)
Пример #9
0
def predict(datasource,
            select,
            result_table,
            result_column_names,
            train_label_idx,
            model,
            extra_result_cols=[],
            pai_table=None):
    """TBD
    """
    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        # NOTE(typhoonzero): must run Model.load_from_db in a temp
        # directory, calling pyodps in current directory on PAI
        # workers will cause paiio fails.
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    model_params = model.get_meta("attributes")
    fc_map_ir = model.get_meta("features")
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    is_pai = True if pai_table else False
    if is_pai:
        conn = PaiIOConnection.from_table(pai_table)
    else:
        conn = db.connect_with_data_source(datasource)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(datasource=datasource,
                            fn=pred_fn,
                            dataset_sql=select,
                            feature_metas=feature_metas,
                            feature_column_names=feature_column_names,
                            label_meta=None,
                            cache=True,
                            batch_size=10000,
                            transform_fn=transform_fn,
                            raw_data_dir=raw_data_dir,
                            is_pai=is_pai,
                            pai_table=pai_table,
                            pai_single_file=True,
                            feature_column_code=fc_map_ir)

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            if is_pai:
                feature_file_name = os.path.join(tmp_dir_name,
                                                 "predict.txt.raw")
            else:
                feature_file_name = os.path.join(
                    tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_predict_result(preds, result_table, result_column_names,
                                  train_label_idx, feature_file_name, conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()
Пример #10
0
def train(original_sql,
          model_image,
          estimator_string,
          datasource,
          select,
          validation_select,
          model_params,
          train_params,
          feature_column_map,
          label_column,
          save,
          load=None):
    """
    Train, evaluate and save the XGBoost model locally.

    Args:
        original_sql (str): the original SQL statement.
        model_image (str): the model repo docker image.
        estimator (str): the XGBoost booster type like xgboost.gbtree.
        datasource (str): the database connection URI.
        select (str): the SQL statement for training.
        validation_select (str): the SQL statement for evaluation.
        model_params (dict): the XGBoost model parameters.
        train_params (dict): the training parameters, can have
                             disk_cache(bool), batch_size(int), epoch(int)
                             settings in the dict.
        feature_column_map (dict): the feature column map to do derivation.
        label_column (FeatureColumn): the label column.
        save (str): the table name to save the trained model and meta.
        load (str): the table name to load the pretrained model.

    Returns:
        A dict which indicates the evaluation result.
    """
    conn = db.connect_with_data_source(datasource)
    fc_map_ir, fc_label_ir = infer_feature_columns(conn,
                                                   select,
                                                   feature_column_map,
                                                   label_column,
                                                   n=1000)
    fc_map = compile_ir_feature_columns(fc_map_ir, EstimatorType.XGBOOST)

    feature_column_list = fc_map["feature_columns"]
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])
    label_meta = label_column.get_field_desc()[0].to_dict()

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *feature_column_list)

    disk_cache = False
    batch_size = None
    epoch = 1
    if "disk_cache" in train_params:
        disk_cache = train_params.pop("disk_cache")
    if "batch_size" in train_params:
        batch_size = train_params.pop("batch_size")
    if "epoch" in train_params:
        epoch = train_params.pop("epoch")

    def build_dataset(fn, slct):
        return xgb_dataset(datasource,
                           fn,
                           slct,
                           feature_metas,
                           feature_column_names,
                           label_meta,
                           cache=disk_cache,
                           batch_size=batch_size,
                           epoch=epoch,
                           transform_fn=transform_fn)

    file_name = "my_model"
    if load:
        Model.load_from_db(datasource, load)
        bst = xgb.Booster()
        bst.load_model(file_name)
    else:
        bst = None

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        train_fn = os.path.join(tmp_dir_name, 'train.txt')
        val_fn = os.path.join(tmp_dir_name, 'val.txt')
        train_dataset = build_dataset(train_fn, select)
        if validation_select:
            val_dataset = build_dataset(val_fn, validation_select)
        else:
            val_dataset = None

        eval_result = dict()
        watchlist = [None]
        if val_dataset:
            # The `xgboost.train` API only accepts the XGBoost DMatrix
            # object as the training or validation dataset, so we should
            # convert the generator to DMatrix.
            if isinstance(val_dataset, types.GeneratorType):
                val_dataset = list(val_dataset)[0]
            watchlist.append((val_dataset, "validate"))

        for per_batch_dmatrix in train_dataset:
            watchlist[0] = (per_batch_dmatrix, "train")
            bst = xgb.train(model_params,
                            per_batch_dmatrix,
                            evals=watchlist,
                            evals_result=eval_result,
                            xgb_model=bst,
                            **train_params)
            print("Evaluation result: %s" % eval_result)

    meta = collect_metadata(original_sql=original_sql,
                            select=select,
                            validation_select=validation_select,
                            model_repo_image=model_image,
                            class_name=estimator_string,
                            attributes=model_params,
                            features=fc_map_ir,
                            label=fc_label_ir,
                            evaluation=eval_result,
                            num_workers=1)

    save_model_to_local_file(bst, model_params, file_name)
    model = Model(EstimatorType.XGBOOST, meta)
    model.save_to_db(datasource, save)
    return eval_result
Пример #11
0
def explain_step(datasource,
                 select,
                 explainer,
                 model_params,
                 result_table,
                 model,
                 pai_table=None,
                 oss_dest=None,
                 oss_ak=None,
                 oss_sk=None,
                 oss_endpoint=None,
                 oss_bucket_name=None):
    """
    Do explanation to a trained TensorFlow model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        explainer (str): the explainer to explain the model.
                         Not used in TensorFlow models.
        model_params (dict): the parameters for evaluation.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    plot_type = model_params.get("summary.plot_type", "bar")

    train_attributes = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    label_name = model_params.get("label_col", train_label_desc.name)
    train_label_desc.name = label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    if pai_table:
        assert oss_dest, "oss_dest must be given when submit to PAI"
    else:
        assert oss_dest is None

    if os.environ.get('DISPLAY', '') == '':
        print('no display found. Using non-interactive Agg backend')
        matplotlib.use('Agg')

    _explain(datasource=datasource,
             estimator_string=estimator_string,
             select=select,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_metas=feature_metas,
             label_meta=label_meta,
             model_params=train_attributes,
             save=save,
             pai_table=pai_table,
             plot_type=plot_type,
             result_table=result_table,
             oss_dest=oss_dest,
             oss_ak=oss_ak,
             oss_sk=oss_sk,
             oss_endpoint=oss_endpoint,
             oss_bucket_name=oss_bucket_name)

    print_image_as_base64_html('summary.png')
Пример #12
0
def evaluate(datasource,
             select,
             result_table,
             model,
             pred_label_name=None,
             model_params=None):
    """
    Do evaluation to a trained TensorFlow model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.
        pred_label_name (str): the label column name.
        model_params (dict): the parameters for evaluation.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    validation_metrics = model_params.get("validation.metrics", "Accuracy")
    validation_metrics = [m.strip() for m in validation_metrics.split(',')]
    validation_steps = model_params.get("validation.steps", None)
    batch_size = model_params.get("validation.batch_size", 1)
    verbose = model_params.get("validation.verbose", 0)

    conn = db.connect_with_data_source(datasource)
    create_evaluate_table(conn, result_table, validation_metrics)
    conn.close()

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())
    train_label_desc.name = pred_label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    _evaluate(datasource=datasource,
              estimator_string=estimator_string,
              select=select,
              result_table=result_table,
              feature_columns=feature_columns,
              feature_column_names=feature_column_names,
              feature_metas=feature_metas,
              label_meta=label_meta,
              model_params=model_params,
              validation_metrics=validation_metrics,
              save=save,
              batch_size=batch_size,
              validation_steps=validation_steps,
              verbose=verbose)
Пример #13
0
def evaluate(datasource,
             select,
             result_table,
             model,
             pred_label_name=None,
             model_params=None):
    """
    Do evaluation to a trained XGBoost model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.
        pred_label_name (str): the label column name.
        model_params (dict): the parameters for evaluation.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    if model_params is None:
        model_params = {}

    validation_metrics = model_params.get("validation.metrics", "Accuracy")
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    if pred_label_name:
        train_label_desc.name = pred_label_name

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")
    conn = db.connect_with_data_source(datasource)

    result_column_names = create_evaluate_table(conn, result_table,
                                                validation_metrics)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=train_label_desc.to_dict(dtype_to_string=True),
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn)

        for i, pred_dmatrix in enumerate(dpred):
            feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name, train_label_desc,
                                   result_table, result_column_names,
                                   validation_metrics, conn)

    conn.close()
Пример #14
0
def train(original_sql,
          model_image,
          estimator_string,
          datasource,
          select,
          validation_select,
          model_params,
          train_params,
          validation_params,
          feature_column_map,
          label_column,
          save,
          load=None,
          pai_table="",
          pai_val_table=""):
    is_pai = True if pai_table != "" else False
    is_dist_train = False
    FLAGS = None
    oss_model_dir = ""

    if is_pai:
        FLAGS = define_tf_flags()
        num_workers = len(FLAGS.worker_hosts.split(","))
        is_dist_train = num_workers > 1
        oss_model_dir = FLAGS.sqlflow_oss_modeldir
        try:
            oss_path_to_load = train_params.pop("oss_path_to_load")
            if load:
                oss.load_file(oss_path_to_load, "my_model")
        except:  # noqa: E722
            pass

    feature_columns = compile_ir_feature_columns(feature_column_map,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(feature_column_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    label_meta = label_column.get_field_desc()[0].to_dict(dtype_to_string=True)

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    batch_size = train_params.pop("batch_size", None)
    epoch = train_params.pop("epoch", 1)
    load_pretrained_model = True if load else False
    disk_cache = train_params.pop("disk_cache", False)

    if is_dist_train:
        # NOTE(typhoonzero): dist_train returns None
        dist_train(flags=FLAGS,
                   datasource=datasource,
                   select=select,
                   model_params=model_params,
                   train_params=train_params,
                   feature_metas=feature_metas,
                   feature_column_names=feature_column_names,
                   label_meta=label_meta,
                   validation_select=validation_select,
                   disk_cache=disk_cache,
                   batch_size=batch_size,
                   epoch=epoch,
                   load_pretrained_model=load_pretrained_model,
                   is_pai=True,
                   pai_train_table=pai_table,
                   pai_validate_table=pai_val_table,
                   oss_model_dir=oss_model_dir,
                   transform_fn=transform_fn,
                   feature_column_code=feature_column_map,
                   model_repo_image=model_image,
                   original_sql=original_sql)
    else:
        return local_train(original_sql,
                           model_image,
                           estimator_string,
                           datasource,
                           select,
                           validation_select,
                           model_params,
                           train_params,
                           feature_metas,
                           feature_column_names,
                           feature_column_map,
                           label_column,
                           transform_fn,
                           save,
                           load=load,
                           is_pai=is_pai,
                           oss_model_dir=oss_model_dir)
Пример #15
0
def pred(datasource, select, result_table, pred_label_name, model):
    """
    Do prediction using a trained model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        pred_label_name (str): the output label name to predict.
        model (Model|str): the model object or where to load the model.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, pred_label_name)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=None,
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            raw_data_dir=raw_data_dir)  # NOTE: default to use external memory

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            feature_file_name = os.path.join(
                tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_predict_result(preds, result_table, result_column_names,
                                  train_label_idx, feature_file_name, conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()
Пример #16
0
def predict_step(datasource,
                 select,
                 result_table,
                 label_name,
                 model,
                 pai_table=None):
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    label_meta = model.get_meta("label")
    train_label_desc = label_meta.get_field_desc()[0] if label_meta else None
    train_label_name = train_label_desc.name if train_label_desc else None
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    is_pai = True if pai_table else False
    if is_pai:
        select = "SELECT * FROM %s" % pai_table

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, label_name)

    if is_pai:
        conn.close()
        conn = PaiIOConnection.from_table(pai_table)
        select = None

    selected_cols = result_column_names[0:-1]
    if train_label_idx >= 0:
        selected_cols = selected_cols[0:train_label_idx] + [
            train_label_name
        ] + selected_cols[train_label_idx:]

    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)
    predict_generator = db.db_generator(conn, select)

    pop_optimizer_and_loss(model_params)
    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional model need field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      label_name, conn, predict_generator, selected_cols)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, label_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
    conn.close()
Пример #17
0
def train_step(original_sql,
               model_image,
               estimator_string,
               datasource,
               select,
               validation_select,
               model_params,
               train_params,
               validation_params,
               feature_column_map,
               label_column,
               save,
               load=None,
               pai_table=None,
               pai_val_table=None):
    if model_params is None:
        model_params = {}

    if train_params is None:
        train_params = {}

    if validation_params is None:
        validation_params = {}

    if load:
        Model.load_from_db(datasource, load)
        load = "model_save"
    else:
        load = None

    is_pai = True if pai_table else False

    fc_map = compile_ir_feature_columns(feature_column_map,
                                        EstimatorType.TENSORFLOW)
    field_descs = get_ordered_field_descs(feature_column_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # no label for clustering model
    label_meta = None
    if label_column:
        label_meta = label_column.get_field_desc()[0].to_dict(
            dtype_to_string=True)

    feature_column_names_map = dict()
    for target in feature_column_map:
        fclist = feature_column_map[target]
        feature_column_names_map[target] = [
            fc.get_field_desc()[0].name for fc in fclist
        ]

    # Construct optimizer objects to pass to model initializer.
    # The original model_params is serializable (do not have tf.xxx objects).
    model_params_constructed = copy.deepcopy(model_params)
    for optimizer_arg in ["optimizer", "dnn_optimizer", "linear_optimizer"]:
        if optimizer_arg in model_params_constructed:
            model_params_constructed[optimizer_arg] = get_tf_optimizer(
                model_params_constructed[optimizer_arg])

    if "loss" in model_params_constructed:
        model_params_constructed["loss"] = get_tf_loss(
            model_params_constructed["loss"])

    # extract params for training.
    verbose = train_params.get("verbose", 1)
    batch_size = train_params.get("batch_size", 1)
    epoch = train_params.get("epoch", 1)
    save_checkpoints_steps = train_params.get("save_checkpoints_steps", 100)
    max_steps = train_params.get("max_steps", None)
    if max_steps is not None and max_steps <= 0:
        max_steps = None

    validation_metrics = validation_params.get("metrics", "Accuracy")
    validation_metrics = [v.strip() for v in validation_metrics.split(",")]
    validation_steps = validation_params.get("steps", 1)
    validation_start_delay_secs = validation_params.get("start_delay_secs", 0)
    validation_throttle_secs = validation_params.get("throttle_secs", 0)

    estimator = import_model(estimator_string)
    is_estimator = is_tf_estimator(estimator)

    # always use verbose == 1 when using PAI to get more logs
    if verbose < 1:
        verbose = 1
    set_log_level(verbose, is_estimator)

    model_params_constructed.update(fc_map)

    FLAGS = define_tf_flags()
    set_oss_environs(FLAGS)
    num_workers = len(FLAGS.worker_hosts.split(","))
    worker_id = FLAGS.task_index

    train_dataset_fn = get_dataset_fn(select,
                                      datasource,
                                      feature_column_names,
                                      feature_metas,
                                      label_meta,
                                      is_pai,
                                      pai_table,
                                      batch_size,
                                      epochs=epoch,
                                      shuffle_size=1000,
                                      num_workers=num_workers,
                                      worker_id=worker_id)
    val_dataset_fn = None
    if validation_select or pai_val_table:
        val_dataset_fn = get_dataset_fn(validation_select, datasource,
                                        feature_column_names, feature_metas,
                                        label_meta, is_pai, pai_val_table,
                                        batch_size)

    model_meta = collect_metadata(original_sql=original_sql,
                                  select=select,
                                  validation_select=validation_select,
                                  model_repo_image=model_image,
                                  class_name=estimator_string,
                                  attributes=model_params,
                                  features=feature_column_map,
                                  label=label_column)

    # FIXME(typhoonzero): avoid save model_meta twice, keras_train_and_save,
    # estimator_train_and_save also dumps model_meta to a file under cwd.
    # should only keep the model.save_to_db part.
    save_dir = "model_save"
    if not is_estimator:
        if isinstance(estimator, types.FunctionType):
            # functional model need field_metas parameter
            model_params_constructed["field_metas"] = feature_metas
        keras_train_and_save(estimator, model_params_constructed, save_dir,
                             FLAGS, train_dataset_fn, val_dataset_fn,
                             label_meta, epoch, verbose, validation_metrics,
                             validation_steps, load, model_meta, is_pai)
    else:
        estimator_train_and_save(estimator, model_params_constructed, save_dir,
                                 FLAGS, train_dataset_fn, val_dataset_fn,
                                 max_steps, validation_start_delay_secs,
                                 validation_throttle_secs,
                                 save_checkpoints_steps, validation_metrics,
                                 load, model_meta)

    # save model to DB/OSS
    model = Model(EstimatorType.TENSORFLOW, model_meta)
    if num_workers == 1 or worker_id == 0:
        saved = model.save_to_db(datasource,
                                 save,
                                 oss_model_dir=FLAGS.sqlflow_oss_modeldir)
        print("Model saved to DB: %s" % saved)

    print("Done training")
Пример #18
0
def evaluate(datasource,
             select,
             result_table,
             model,
             label_name=None,
             model_params=None,
             result_column_names=[],
             pai_table=None):
    """TBD
    """
    if model_params is None:
        model_params = {}
    validation_metrics = model_params.get("validation.metrics",
                                          "accuracy_score")
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    model_params = model.get_meta("attributes")
    fc_map_ir = model.get_meta("features")
    train_label = model.get_meta("label")
    train_label_desc = train_label.get_field_desc()[0]

    if label_name:
        train_label_desc.name = label_name

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    is_pai = True if pai_table else False
    if is_pai:
        conn = PaiIOConnection.from_table(pai_table)
    else:
        conn = db.connect_with_data_source(datasource)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=train_label_desc.to_dict(dtype_to_string=True),
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            is_pai=is_pai,
            pai_table=pai_table,
            pai_single_file=True,
            feature_column_code=fc_map_ir)

        for i, pred_dmatrix in enumerate(dpred):
            if is_pai:
                feature_file_name = pred_fn
            else:
                feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name, train_label_desc,
                                   result_table, result_column_names,
                                   validation_metrics, conn)

    conn.close()
Пример #19
0
def explain(datasource, select, explainer, model_params, result_table, model):
    """
    Do explanation to a trained TensorFlow model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        explainer (str): the explainer to explain the model.
                         Not used in TensorFlow models.
        model_params (dict): the parameters for evaluation.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    plot_type = model_params.get("summary.plot_type", "bar")

    train_attributes = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    label_name = model_params.get("label_col", train_label_desc.name)
    train_label_desc.name = label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    if result_table:
        conn = db.connect_with_data_source(datasource)
        if estimator_string.startswith("BoostedTrees"):
            column_defs = [
                "feature %s" %
                DataType.to_db_field_type(conn.driver, DataType.STRING),
                "dfc %s" %
                DataType.to_db_field_type(conn.driver, DataType.FLOAT32),
                "gain %s" %
                DataType.to_db_field_type(conn.driver, DataType.FLOAT32),
            ]
        else:
            selected_cols = db.selected_cols(conn, select)
            if label_name in selected_cols:
                selected_cols.remove(label_name)

            name_to_shape = dict([(fd.name, fd.shape) for fd in field_descs])
            column_defs = []
            float_field_type = DataType.to_db_field_type(
                conn.driver, DataType.FLOAT32)
            for name in selected_cols:
                shape = name_to_shape.get(name, None)
                if shape is None:
                    raise ValueError("cannot find column %s" % name)

                size = int(np.prod(shape))
                if size == 1:
                    column_def = "%s %s" % (name, float_field_type)
                    column_defs.append(column_def)
                else:
                    for i in six.moves.range(size):
                        column_def = "%s_%d %s" % (name, i, float_field_type)
                        column_defs.append(column_def)

        drop_sql = "DROP TABLE IF EXISTS %s;" % result_table
        create_sql = "CREATE TABLE %s (%s);" % (result_table,
                                                ",".join(column_defs))
        conn.execute(drop_sql)
        conn.execute(create_sql)
        conn.close()

    _explain(datasource=datasource,
             estimator_string=estimator_string,
             select=select,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_metas=feature_metas,
             label_meta=label_meta,
             model_params=train_attributes,
             save=save,
             plot_type=plot_type,
             result_table=result_table)

    with open('summary.png', 'rb') as f:
        img = f.read()

    img = base64.b64encode(img)
    if six.PY3:
        img = img.decode('utf-8')
    img = "<div align='center'><img src='data:image/png;base64,%s' /></div>" \
          % img
    print(img)
Пример #20
0
def explain(datasource,
            select,
            explainer,
            model_params,
            result_table,
            model,
            pai_table="",
            oss_model_path=""):
    """Do XGBoost model explanation, this function use selected data to
    explain the model stored at oss_model_path

    Args:
        datasource: The datasource to load explain data
        select: SQL statement to get the data set
        data_table: tmp table to save the explain data
        result_table: table to store the explanation result
        label_column: name of the label column
        oss_model_path: path to the model to be explained
    """
    if model_params is None:
        model_params = {}

    summary_params = dict()
    for k in model_params:
        if k.startswith("summary."):
            summary_key = k.replace("summary.", "")
            summary_params[summary_key] = model_params[k]

    is_pai = True if pai_table != "" else False
    if is_pai:
        # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
        # in xgboost/train.py
        oss.load_file(oss_model_path, "my_model")

        (estimator, model_params, train_params, feature_field_meta,
         feature_column_names, label_desc,
         fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")
        label_meta = label_desc.to_dict(dtype_to_string=True)
    else:
        if isinstance(model, six.string_types):
            model = Model.load_from_db(datasource, model)
        else:
            assert isinstance(
                model, Model), "not supported model type %s" % type(model)
        fc_map_ir = model.get_meta("features")
        label_meta = model.get_meta("label").get_field_desc()[0].to_dict(
            dtype_to_string=True)

    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(fc_map_ir, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    dataset = xgb_shap_dataset(datasource, select, feature_column_names,
                               label_meta, feature_metas, transform_fn)

    bst = xgb.Booster()
    bst.load_model("my_model")

    if explainer == "XGBoostExplainer":
        xgb_native_explain(bst, datasource, result_table)
    else:
        # when explainer is "" or "TreeExplainer" use SHAP by default.
        shap_explain(bst, datasource, dataset, summary_params, result_table)
Пример #21
0
def train_step(original_sql,
               model_image,
               estimator_string,
               datasource,
               select,
               validation_select,
               pai_table,
               pai_val_table,
               model_params,
               train_params,
               feature_column_map,
               label_column,
               save,
               load=None):
    FLAGS = define_tf_flags()
    num_workers = len(FLAGS.worker_hosts.split(","))
    is_dist_train = num_workers > 1
    oss_model_dir = FLAGS.sqlflow_oss_modeldir

    oss_path_to_load = train_params.pop("oss_path_to_load")
    if load:
        oss.load_file(oss_path_to_load, "my_model")

    conn = db.connect_with_data_source(datasource)
    fc_map_ir, fc_label_ir = infer_feature_columns(conn,
                                                   select,
                                                   feature_column_map,
                                                   label_column,
                                                   n=1000)
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])
    label_meta = label_column.get_field_desc()[0].to_dict()

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    batch_size = train_params.pop("batch_size", None)
    epoch = train_params.pop("epoch", 1)
    load_pretrained_model = True if load else False
    disk_cache = train_params.pop("disk_cache", False)

    if is_dist_train:
        dist_train(flags=FLAGS,
                   datasource=datasource,
                   select=select,
                   model_params=model_params,
                   train_params=train_params,
                   feature_metas=feature_metas,
                   feature_column_names=feature_column_names,
                   label_meta=label_meta,
                   validation_select=validation_select,
                   disk_cache=disk_cache,
                   batch_size=batch_size,
                   epoch=epoch,
                   load_pretrained_model=load_pretrained_model,
                   is_pai=True,
                   pai_train_table=pai_table,
                   pai_validate_table=pai_val_table,
                   oss_model_dir=oss_model_dir,
                   transform_fn=transform_fn,
                   feature_column_code=fc_map_ir,
                   model_repo_image=model_image,
                   original_sql=original_sql)
    else:
        local_train(datasource=datasource,
                    select=select,
                    model_params=model_params,
                    train_params=train_params,
                    feature_metas=feature_metas,
                    feature_column_names=feature_column_names,
                    label_meta=label_meta,
                    validation_select=validation_select,
                    disk_cache=disk_cache,
                    batch_size=batch_size,
                    epoch=epoch,
                    load_pretrained_model=load_pretrained_model,
                    is_pai=True,
                    pai_train_table=pai_table,
                    pai_validate_table=pai_val_table,
                    rank=0,
                    nworkers=1,
                    oss_model_dir=oss_model_dir,
                    transform_fn=transform_fn,
                    feature_column_code=fc_map_ir,
                    model_repo_image=model_image,
                    original_sql=original_sql)
Пример #22
0
def explain(datasource,
            select,
            explainer,
            model_params,
            result_table,
            model,
            pai_table="",
            oss_model_path="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None):
    """TBD
    """
    if model_params is None:
        model_params = {}

    summary_params = dict()
    for k in model_params:
        if k.startswith("summary."):
            summary_key = k.replace("summary.", "")
            summary_params[summary_key] = model_params[k]

    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    fc_map_ir = model.get_meta("features")
    label_meta = model.get_meta("label").get_field_desc()[0].to_dict(
        dtype_to_string=True)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    is_pai = True if pai_table else False
    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(fc_map_ir, EstimatorType.XGBOOST)
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    dataset = xgb_shap_dataset(datasource, select, feature_column_names,
                               label_meta, feature_metas, is_pai, pai_table,
                               transform_fn)

    if explainer == "XGBoostExplainer":
        xgb_native_explain(bst, datasource, result_table)
    else:
        # when explainer is "" or "TreeExplainer" use SHAP by default.
        shap_explain(bst,
                     datasource,
                     dataset,
                     summary_params,
                     result_table,
                     is_pai=is_pai,
                     oss_dest=oss_dest,
                     oss_ak=oss_ak,
                     oss_sk=oss_sk,
                     oss_endpoint=oss_endpoint,
                     oss_bucket_name=oss_bucket_name)
Пример #23
0
def shap_explain(booster, datasource, select, summary_params, result_table,
                 model):
    train_fc_map = model.get_meta("features")
    label_meta = model.get_meta("label").get_field_desc()[0].to_dict(
        dtype_to_string=True)

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    dataset = xgb_shap_dataset(datasource, select, feature_column_names,
                               label_meta, feature_metas, transform_fn)

    tree_explainer = shap.TreeExplainer(booster)
    shap_values = tree_explainer.shap_values(dataset)
    if result_table:
        conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): the shap_values is may be a
        # list of shape [3, num_samples, num_features],
        # use the first dimension here, should find out
        # when to use the other two. When shap_values is
        # not a list it can be directly used.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values

        columns = list(dataset.columns)
        dtypes = [DataType.to_db_field_type(conn.driver, DataType.FLOAT32)
                  ] * len(columns)
        _create_table(conn, result_table, columns, dtypes)
        with db.buffered_db_writer(conn, result_table, columns) as w:
            for row in to_write:
                w.write(list(row))

        conn.close()

    if summary_params.get("plot_type") == "decision":
        shap_interaction_values = tree_explainer.shap_interaction_values(
            dataset)
        expected_value = tree_explainer.expected_value
        if isinstance(shap_interaction_values, list):
            shap_interaction_values = shap_interaction_values[0]
        if isinstance(expected_value, list):
            expected_value = expected_value[0]

        plot_func = lambda: shap.decision_plot(  # noqa: E731
            expected_value,
            shap_interaction_values,
            dataset,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1)
    else:
        plot_func = lambda: shap.summary_plot(  # noqa: E731
            shap_values, dataset, show=False, **summary_params)

    filename = 'summary.png'
    with temp_file.TemporaryDirectory(as_cwd=True):
        explainer.plot_and_save(plot_func, filename=filename)
        with open(filename, 'rb') as f:
            img = f.read()

    img = base64.b64encode(img)
    if six.PY3:
        img = img.decode('utf-8')
    img = "<div align='center'><img src='data:image/png;base64,%s' /></div>" \
          % img
    print(img)
Пример #24
0
def predict(datasource,
            select,
            result_table,
            label_name,
            model,
            pai_table="",
            oss_model_path=""):
    """PAI XGBoost prediction wrapper
    This function do some preparation for the local prediction, say,
    download the model from OSS, extract metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        label_name: prediction label column
        oss_model_path: the model path on OSS
    """
    is_pai = True if pai_table != "" else False
    if is_pai:
        # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
        # in xgboost/train.py
        oss.load_file(oss_model_path, "my_model")
        (estimator, model_params, train_params, feature_metas,
         feature_column_names, train_label_desc,
         fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")
    else:
        if isinstance(model, six.string_types):
            model = Model.load_from_db(datasource, model)
        else:
            assert isinstance(
                model, Model), "not supported model type %s" % type(model)

        model_params = model.get_meta("attributes")
        fc_map_ir = model.get_meta("features")
        train_label_desc = model.get_meta("label").get_field_desc()[0]

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, label_name)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=None,
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            raw_data_dir=raw_data_dir)  # NOTE: default to use external memory

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            feature_file_name = os.path.join(
                tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_predict_result(preds, result_table, result_column_names,
                                  train_label_idx, feature_file_name, conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()
Пример #25
0
def evaluate(datasource,
             select,
             result_table,
             load,
             pred_label_name=None,
             validation_metrics=["accuracy_score"]):
    """
    Do evaluation to a trained XGBoost model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        load (str): where the trained model stores.
        pred_label_name (str): the label column name.
        validation_metrics (list[str]): the evaluation metric names.

    Returns:
        None.
    """
    model = Model.load_from_db(datasource, load)
    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    if pred_label_name:
        train_label_desc.name = pred_label_name

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")
    conn = db.connect_with_data_source(datasource)

    result_column_names = _create_evaluate_table(conn, result_table,
                                                 validation_metrics)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")

        dpred = xgb_dataset(datasource=datasource,
                            fn=pred_fn,
                            dataset_sql=select,
                            feature_metas=feature_metas,
                            feature_column_names=feature_column_names,
                            label_meta=train_label_desc.to_dict(),
                            cache=True,
                            batch_size=10000,
                            transform_fn=transform_fn)

        for i, pred_dmatrix in enumerate(dpred):
            feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name, train_label_desc,
                                   result_table, result_column_names,
                                   validation_metrics, conn)

    conn.close()