Example #1
def xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                     feature_specs, is_pai, pai_explain_table):
    label_column_name = label_spec["feature_name"]
    if is_pai:
        pai_table_parts = pai_explain_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        stream = db.pai_maxcompute_db_generator(formatted_pai_table,
                                                feature_column_names,
                                                label_column_name,
                                                feature_specs)
    else:
        conn = db.connect_with_data_source(datasource)
        stream = db.db_generator(conn.driver, conn, select,
                                 feature_column_names, label_spec,
                                 feature_specs)

    xs = pd.DataFrame(columns=feature_column_names)
    i = 0
    for row in stream():
        xs.loc[i] = [item[0] for item in row[0]]
        i += 1
    # NOTE(typhoonzero): set dtype to the feature's actual type, or the dtype
    # may be "object". Use the code below to reproduce:
    # import pandas as pd
    # feature_column_names=["a", "b"]
    # xs = pd.DataFrame(columns=feature_column_names)
    # for i in range(10):
    #     xs.loc[i] = [int(j) for j in range(2)]
    # print(xs.dtypes)
    for fname in feature_column_names:
        dtype = feature_specs[fname]["dtype"]
        xs[fname] = xs[fname].astype(dtype)
    return xs
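
The dtype issue described in the NOTE above can be reproduced standalone; a minimal sketch (column names and types are illustrative) showing that row-wise `.loc` assignment leaves every column as dtype "object" until it is cast back with `astype`:

import pandas as pd

feature_column_names = ["a", "b"]
xs = pd.DataFrame(columns=feature_column_names)
for i in range(10):
    # row-wise assignment leaves both columns with dtype "object"
    xs.loc[i] = [int(j) for j in range(2)]
print(xs.dtypes)  # a: object, b: object

# cast each column back to its actual feature type, as xgb_shap_dataset does
for fname in feature_column_names:
    xs[fname] = xs[fname].astype("int64")
print(xs.dtypes)  # a: int64, b: int64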
Example #2
    def eval_input_fn(batch_size, cache=False):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))

        if is_pai:
            pai_table_parts = pai_table.split(".")
            formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                           pai_table_parts[1])
            gen = db.pai_maxcompute_db_generator(formatted_pai_table,
                                                 feature_column_names, None,
                                                 feature_metas)
        else:
            gen = db.db_generator(conn.driver, conn, select,
                                  feature_column_names, None, feature_metas)
        dataset = tf.data.Dataset.from_generator(gen, (tuple(feature_types), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset
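
The `from_generator` plumbing above is easier to see with a toy in-memory generator in place of `db.db_generator`; a minimal sketch for dense columns only, where the mapper standing in for `parse_sparse_feature_predict` is assumed to build a name-to-tensor dict:

import tensorflow as tf

feature_column_names = ["f1", "f2"]
feature_types = (tf.float32, tf.float32)

def gen():
    # mirror the generator protocol: each element is a 1-tuple of column values
    for row in [(1.0, 2.0), (3.0, 4.0), (5.0, 6.0)]:
        yield (row, )

dataset = tf.data.Dataset.from_generator(gen, (feature_types, ))

def to_feature_dict(features):
    # toy stand-in for parse_sparse_feature_predict (dense columns only)
    return dict(zip(feature_column_names, features))

dataset = dataset.map(to_feature_dict).batch(2)
for batch in dataset:
    print(batch)  # {"f1": <batched tensor>, "f2": <batched tensor>}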
Example #3
def pai_download_table_data_worker(dname, feature_specs, feature_column_names,
                                   label_spec, pai_table, slice_id,
                                   slice_count, feature_column_code,
                                   raw_data_dir):
    import sqlflow_submitter.xgboost as xgboost_extended
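    # feature_column_code is generated Python source describing the feature
    # column transformers; eval it to build the transformer list.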
    feature_column_transformers = eval('[{}]'.format(feature_column_code))
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *feature_column_transformers)

    label_column_name = label_spec['feature_name'] if label_spec else None
    gen = db.pai_maxcompute_db_generator(pai_table,
                                         feature_column_names,
                                         label_column_name,
                                         feature_specs,
                                         slice_id=slice_id,
                                         slice_count=slice_count)()
    selected_cols = db.pai_selected_cols(pai_table)
    filename = "{}/{}.txt".format(dname, slice_id)
    dump_dmatrix(filename,
                 gen,
                 feature_column_names,
                 feature_specs,
                 label_spec,
                 selected_cols,
                 transform_fn=transform_fn,
                 raw_data_dir=raw_data_dir)
Example #4
def pai_download_table_data_worker(dname, feature_specs, feature_column_names,
                                   label_spec, pai_table, slice_id):
    label_column_name = label_spec['feature_name'] if label_spec else None
    gen = db.pai_maxcompute_db_generator(pai_table,
                                         feature_column_names,
                                         label_column_name,
                                         feature_specs,
                                         slice_id=slice_id,
                                         slice_count=SLICE_NUM)()
    filename = "{}/{}.txt".format(dname, slice_id)
    dump_dmatrix(filename, gen, feature_column_names, feature_specs,
                 label_spec)
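
Both slice workers above download one horizontal slice of the MaxCompute table into <dname>/<slice_id>.txt; a driver has to fan them out across processes. A minimal sketch of such a driver, assuming the hypothetical download_table name and the module-level SLICE_NUM from the example above:

from multiprocessing import Process

def download_table(dname, feature_specs, feature_column_names, label_spec,
                   pai_table):
    # one worker process per slice; each writes dname/<slice_id>.txt
    workers = []
    for slice_id in range(SLICE_NUM):
        p = Process(target=pai_download_table_data_worker,
                    args=(dname, feature_specs, feature_column_names,
                          label_spec, pai_table, slice_id))
        p.start()
        workers.append(p)
    for p in workers:
        p.join()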
Example #5
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_column_names_map,
                      feature_columns, feature_metas, result_col_name,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass, is_pai, pai_table):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)

    column_names = feature_column_names[:]
    column_names.append(result_col_name)

    if is_pai:
        driver = "pai_maxcompute"
        conn = None
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        predict_generator = db.pai_maxcompute_db_generator(
            formatted_pai_table, feature_column_names, None, feature_metas)()
    else:
        driver = conn.driver
        predict_generator = db.db_generator(conn.driver, conn, select,
                                            feature_column_names, None,
                                            feature_metas)()
    # load from the exported model
    with open("exported_path", "r") as fn:
        export_path = fn.read()
    if tf_is_version2():
        imported = tf.saved_model.load(export_path)
    else:
        imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["is_sparse"]:
                # NOTE(typhoonzero): sparse feature will get (indices,values,shape) here, use indices only
                values = x[0][i][0].flatten()
            else:
                values = x[0][i].flatten()
            if dtype_str == "float32" or dtype_str == "float64":
                example.features.feature[feature_name].float_list.value.extend(
                    list(values))
            elif dtype_str == "int32" or dtype_str == "int64":
                example.features.feature[feature_name].int64_list.value.extend(
                    list(values))
        else:
            if "feature_columns" in feature_columns:
                idx = feature_column_names.index(feature_name)
                fc = feature_columns["feature_columns"][idx]
            else:
                # DNNLinearCombinedXXX have dnn_feature_columns and linear_feature_columns param.
                idx = -1
                try:
                    idx = feature_column_names_map[
                        "dnn_feature_columns"].index(feature_name)
                    fc = feature_columns["dnn_feature_columns"][idx]
                except:
                    try:
                        idx = feature_column_names_map[
                            "linear_feature_columns"].index(feature_name)
                        fc = feature_columns["linear_feature_columns"][idx]
                    except:
                        pass
                if idx == -1:
                    raise ValueError(
                        "cannot find feature %s in all feature columns" %
                        feature_name)
            if dtype_str == "float32" or dtype_str == "float64":
                # need to pass a tuple(float, )
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "int32" or dtype_str == "int64":
                numeric_type = type(tf.feature_column.numeric_column("tmp"))
                if type(fc) == numeric_type:
                    example.features.feature[
                        feature_name].float_list.value.extend(
                            (float(x[0][i][0]), ))
                else:
                    example.features.feature[
                        feature_name].int64_list.value.extend(
                            (int(x[0][i][0]), ))
            elif dtype_str == "string":
                example.features.feature[feature_name].bytes_list.value.extend(
                    x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with db.buffered_db_writer(driver, conn, result_table, column_names, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for features in predict_generator:
            result = predict(features)
            row = []
            for idx, _ in enumerate(feature_column_names):
                per_feature = features[0][idx]
                if isinstance(per_feature, (tuple, list)):
                    # sparse feature: (indices, values, shape); keep indices only
                    val = per_feature[0]
                else:
                    # dense feature: numpy array or scalar value
                    val = per_feature
                row.append(str(val))
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
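
Outside of the database plumbing, the core of `estimator_predict` is serializing a `tf.train.Example` and feeding it to the SavedModel's "predict" signature. A minimal sketch (feature names, values, and the model path are illustrative; `class_ids` applies to classifiers, regressors expose `predictions` instead):

import tensorflow as tf

# assemble a tf.train.Example the same way add_to_example does for dense columns
example = tf.train.Example()
example.features.feature["sepal_length"].float_list.value.extend((5.1, ))
example.features.feature["petal_count"].int64_list.value.extend((4, ))
serialized = example.SerializeToString()

# query the exported Estimator through its "predict" signature
imported = tf.saved_model.load("/path/to/exported_model")
result = imported.signatures["predict"](examples=tf.constant([serialized]))
print(result["class_ids"].numpy()[0][0])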
Example #6
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, result_col_name,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass, is_pai, pai_table):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)

    column_names = feature_column_names[:]
    column_names.append(result_col_name)

    if is_pai:
        driver = "pai_maxcompute"
        conn = None
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        predict_generator = db.pai_maxcompute_db_generator(
            formatted_pai_table, feature_column_names, None, feature_metas)()
    else:
        driver = conn.driver
        predict_generator = db.db_generator(conn.driver, conn, select,
                                            feature_column_names, None,
                                            feature_metas)()
    # load from the exported model
    if save.startswith("oss://"):
        with open("exported_path", "r") as fn:
            export_path = fn.read()
        parts = save.split("?")
        export_path_oss = parts[0] + export_path
        if TF_VERSION_2:
            imported = tf.saved_model.load(export_path_oss)
        else:
            imported = tf.saved_model.load_v2(export_path_oss)
    else:
        with open("exported_path", "r") as fn:
            export_path = fn.read()
        if TF_VERSION_2:
            imported = tf.saved_model.load(export_path)
        else:
            imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["is_sparse"]:
                # NOTE(typhoonzero): sparse feature will get (indices,values,shape) here, use indices only
                values = x[0][i][0].flatten()
            else:
                values = x[0][i].flatten()
            if dtype_str == "float32" or dtype_str == "float64":
                example.features.feature[feature_name].float_list.value.extend(
                    list(values))
            elif dtype_str == "int32" or dtype_str == "int64":
                example.features.feature[feature_name].int64_list.value.extend(
                    list(values))
        else:
            if dtype_str == "float32" or dtype_str == "float64":
                # need to pass a tuple(float, )
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "int32" or dtype_str == "int64":
                # FIXME(typhoonzero): figure out why int64 features need to convert to float
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "string":
                example.features.feature[feature_name].bytes_list.value.extend(
                    x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with db.buffered_db_writer(driver, conn, result_table, column_names, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for features in predict_generator:
            result = predict(features)
            row = []
            for idx, _ in enumerate(feature_column_names):
                per_feature = features[0][idx]
                if isinstance(per_feature, (tuple, list)):
                    # sparse feature: (indices, values, shape); keep indices only
                    val = per_feature[0]
                else:
                    # dense feature: numpy array or scalar value
                    val = per_feature
                row.append(str(val))
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
Example #7
def xgb_shap_dataset(datasource,
                     select,
                     feature_column_names,
                     label_spec,
                     feature_specs,
                     is_pai,
                     pai_explain_table,
                     transform_fn=None,
                     feature_column_code=""):
    label_column_name = label_spec["feature_name"]
    if is_pai:
        pai_table_parts = pai_explain_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        stream = db.pai_maxcompute_db_generator(formatted_pai_table,
                                                feature_column_names,
                                                label_column_name,
                                                feature_specs)
        selected_cols = db.pai_selected_cols(formatted_pai_table)
    else:
        conn = db.connect_with_data_source(datasource)
        stream = db.db_generator(conn.driver, conn, select,
                                 feature_column_names, label_spec,
                                 feature_specs)
        selected_cols = db.selected_cols(conn.driver, conn, select)

    if transform_fn:
        column_names = transform_fn.get_column_names()
    else:
        column_names = feature_column_names

    # NOTE(sneaxiy): pandas.DataFrame does not support tensors whose rank is
    # larger than 2, but `INDICATOR` generates a one-hot vector for each
    # element, and pandas.DataFrame would not accept `INDICATOR` results as
    # its input. In short, we do not support `TO EXPLAIN` when using
    # `INDICATOR`.
    xs = pd.DataFrame(columns=column_names)

    dtypes = []

    i = 0
    for row, label in stream():
        features = db.read_features_from_row(row, selected_cols,
                                             feature_column_names,
                                             feature_specs)
        if transform_fn:
            features = transform_fn(features)

        # TODO(sneaxiy): support sparse features in `TO EXPLAIN`
        features = [item[0] for item in features]
        xs.loc[i] = features

        if i == 0:
            for f in features:
                if isinstance(f, np.ndarray):
                    if f.dtype == np.float32 or f.dtype == np.float64:
                        dtypes.append('float32')
                    elif f.dtype == np.int32 or f.dtype == np.int64:
                        dtypes.append('int64')
                    else:
                        raise ValueError('Unsupported data type {}'.format(
                            f.dtype))
                elif isinstance(f, (np.float32, np.float64, float)):
                    dtypes.append('float32')
                elif isinstance(f, (np.int32, np.int64, six.integer_types)):
                    dtypes.append('int64')
                else:
                    raise ValueError('Unsupported data type {}'.format(
                        type(f)))

        i += 1
    # NOTE(typhoonzero): set dtype to the feature's actual type, or the dtype
    # may be "object". Use the code below to reproduce:
    # import pandas as pd
    # feature_column_names=["a", "b"]
    # xs = pd.DataFrame(columns=feature_column_names)
    # for i in range(10):
    #     xs.loc[i] = [int(j) for j in range(2)]
    # print(xs.dtypes)
    for dtype, name in zip(dtypes, column_names):
        xs[name] = xs[name].astype(dtype)
    return xs
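
The DataFrame returned by `xgb_shap_dataset` is typically handed to a SHAP tree explainer together with the trained booster; a minimal sketch with synthetic data standing in for the real table (model parameters and column names are illustrative):

import numpy as np
import pandas as pd
import shap
import xgboost

# stand-in for the DataFrame produced by xgb_shap_dataset
xs = pd.DataFrame({
    "f1": np.random.rand(100).astype("float32"),
    "f2": np.random.randint(0, 5, 100).astype("int64"),
})
y = np.random.randint(0, 2, 100)

booster = xgboost.train({"objective": "binary:logistic"},
                        xgboost.DMatrix(xs, label=y),
                        num_boost_round=10)
explainer = shap.TreeExplainer(booster)
shap_values = explainer.shap_values(xs)
shap.summary_plot(shap_values, xs)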