Example #1
    def _do_test(self, driver, conn, hdfs_namenode_addr="", hive_location=""):
        table_name = "test_db"
        table_schema = ["label", "features"]
        values = [(1, '5,6,1,2')] * 10

        execute(driver, conn, self.drop_statement)

        if driver == "hive":
            execute(driver, conn, self.hive_create_statement)
        else:
            execute(driver, conn, self.create_statement)
        with buffered_db_writer(driver,
                                conn,
                                table_name,
                                table_schema,
                                buff_size=10,
                                hdfs_namenode_addr=hdfs_namenode_addr,
                                hive_location=hive_location) as w:
            for row in values:
                w.write(row)

        field_names, data = execute(driver, conn, self.select_statement)

        expect_features = ['5,6,1,2'] * 10
        expect_labels = [1] * 10

        self.assertEqual(field_names, ['features', 'label'])
        self.assertEqual(expect_features, data[0])
        self.assertEqual(expect_labels, data[1])
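
Every example on this page follows the same pattern: open a connection, then stream rows through buffered_db_writer used as a context manager so that buffered rows are flushed when the with block exits. Below is a minimal sketch of that pattern, assuming the newer buffered_db_writer(conn, table, columns, buff_size) signature used by several of the later examples; the import path, table name, and columns are illustrative assumptions, not code from the project.

from runtime import db  # import path is an assumption


def write_demo_rows(datasource):
    # open a connection from a datasource URI (see connect_with_data_source
    # in the examples below)
    conn = db.connect_with_data_source(datasource)
    # "demo_tbl" and its columns ["id", "value"] are hypothetical
    with db.buffered_db_writer(conn, "demo_tbl", ["id", "value"],
                               buff_size=100) as w:
        for i in range(10):
            w.write([i, i * i])  # rows are buffered and flushed in batches
    conn.close()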
Example #2
def save_solved_result_in_db(solved_result, data_frame, variables,
                             result_value_name, datasource, result_table):
    column_names = []
    for col in data_frame.columns:
        found = False
        for var in variables:
            if var.lower() == col.lower():
                found = True
                break

        if found:
            column_names.append(col)

    data_frame = data_frame[[*column_names]]

    if len(variables) == 1 and variables[0].lower() == result_value_name.lower(
    ):
        result_value_name += "_value"

    column_names.append(result_value_name)
    data_frame[result_value_name] = solved_result

    conn = db.connect_with_data_source(datasource)
    with db.buffered_db_writer(conn.driver, conn, result_table,
                               column_names) as w:
        for i in six.moves.range(len(data_frame)):
            rows = list(data_frame.loc[i])
            w.write(rows)

    print('Solved result is:')
    print(data_frame)
    print('Saved in {}.'.format(result_table))
Example #3
def write_dfc_result(dfc_mean, gain, result_table, driver, conn,
                     feature_column_names, hdfs_namenode_addr, hive_location,
                     hdfs_user, hdfs_pass):
    with buffered_db_writer(conn, result_table, ["feature", "dfc", "gain"],
                            100) as w:
        for row_name in feature_column_names:
            w.write([row_name, dfc_mean.loc[row_name], gain[row_name]])
Example #4
    def _do_test_hive_specified_db(self,
                                   driver,
                                   conn,
                                   hdfs_namenode_addr="",
                                   hive_location=""):
        create_db = '''create database if not exists test_db'''
        create_tbl = '''create table test_db.tbl (features string, label int)
                        ROW FORMAT DELIMITED FIELDS TERMINATED BY "\001"'''
        drop_tbl = '''drop table if exists test_db.tbl'''
        select_tbl = '''select * from test_db.tbl'''
        table_schema = ["label", "features"]
        values = [(1, '5,6,1,2')] * 10
        execute(driver, conn, create_db)
        execute(driver, conn, drop_tbl)
        execute(driver, conn, create_tbl)
        with buffered_db_writer(driver,
                                conn,
                                "test_db.tbl",
                                table_schema,
                                buff_size=10,
                                hdfs_namenode_addr=hdfs_namenode_addr,
                                hive_location=hive_location) as w:
            for row in values:
                w.write(row)

        field_names, data = execute(driver, conn, select_tbl)

        expect_features = ['5,6,1,2'] * 10
        expect_labels = [1] * 10

        self.assertEqual(field_names, ['features', 'label'])
        self.assertEqual(expect_features, data[0])
        self.assertEqual(expect_labels, data[1])
Example #5
    def _do_test_hive_specified_db(self, conn):
        create_db = '''create database if not exists test_db'''
        create_tbl = '''create table test_db.tbl (features string, label int)
                        ROW FORMAT DELIMITED FIELDS TERMINATED BY "\001"'''
        drop_tbl = '''drop table if exists test_db.tbl'''
        select_tbl = '''select * from test_db.tbl'''
        table_schema = ["label", "features"]
        values = [(1, '5,6,1,2')] * 10
        self.assertTrue(conn.execute(create_db))
        self.assertTrue(conn.execute(drop_tbl))
        self.assertTrue(conn.execute(create_tbl))

        with buffered_db_writer(conn,
                                "test_db.tbl",
                                table_schema,
                                buff_size=10) as w:
            for row in values:
                w.write(row)

        field_names, data = execute(conn, select_tbl)

        expect_result = [('5,6,1,2', 1)] * 10

        self.assertEqual(field_names, ['features', 'label'])
        self.assertEqual(expect_result, data)
Example #6
def shap_explain(booster,
                 datasource,
                 dataset,
                 summary_params,
                 result_table="",
                 is_pai=False,
                 oss_dest=None,
                 oss_ak=None,
                 oss_sk=None,
                 oss_endpoint=None,
                 oss_bucket_name=None):
    tree_explainer = shap.TreeExplainer(booster)
    shap_values = tree_explainer.shap_values(dataset)
    if result_table:
        if is_pai:
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): shap_values may be a list of shape
        # [3, num_samples, num_features]; use the first dimension here
        # and find out later when to use the other two. When shap_values
        # is not a list it can be used directly.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values

        columns = list(dataset.columns)
        with db.buffered_db_writer(conn, result_table, columns) as w:
            for row in to_write:
                w.write(list(row))
        conn.close()

    if summary_params.get("plot_type") == "decision":
        shap_interaction_values = tree_explainer.shap_interaction_values(
            dataset)
        expected_value = tree_explainer.expected_value
        if isinstance(shap_interaction_values, list):
            shap_interaction_values = shap_interaction_values[0]
        if isinstance(expected_value, list):
            expected_value = expected_value[0]
        plot_func = lambda: shap.decision_plot(  # noqa: E731
            expected_value,
            shap_interaction_values,
            dataset,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1)
    else:
        plot_func = lambda: shap.summary_plot(  # noqa: E731
            shap_values, dataset, show=False, **summary_params)

    explainer.plot_and_save(plot_func,
                            oss_dest=oss_dest,
                            oss_ak=oss_ak,
                            oss_sk=oss_sk,
                            oss_endpoint=oss_endpoint,
                            oss_bucket_name=oss_bucket_name,
                            filename='summary')
Example #7
def _store_predict_result(preds, result_table, result_column_names,
                          train_label_idx, feature_file_name, conn):
    """
    Save the prediction result in the table.

    Args:
        preds: the prediction result to save.
        result_table (str): the result table name.
        result_column_names (list[str]): the result column names.
        train_label_idx (int): the index where the trained label is inside
            result_column_names.
        feature_file_name (str): the file path where the feature dumps.
        conn: the database connection object.

    Returns:
        None.
    """
    with db.buffered_db_writer(conn, result_table, result_column_names) as w:
        with open(feature_file_name, "r") as feature_file_read:
            line_no = 0
            for line in feature_file_read.readlines():
                if not line:
                    break

                row = [
                    item for i, item in enumerate(line.strip().split(
                        DMATRIX_FILE_SEP)) if i != train_label_idx
                ]
                row.append(str(preds[line_no]))
                w.write(row)
                line_no += 1
Example #8
def predict_and_store_result(bst, dpred, feature_file_id, model_params,
                             selected_cols, train_label_name, pred_label_name,
                             feature_column_names, feature_metas, is_pai, conn,
                             result_table, hdfs_namenode_addr, hive_location,
                             hdfs_user, hdfs_pass):
    preds = bst.predict(dpred)

    # TODO(yancey1989): should save train_params and model_params
    # not only on PAI submitter
    # TODO(yancey1989): output the original result for various
    # objective functions.
    if model_params:
        obj = model_params["objective"]
        if obj.startswith("binary:"):
            preds = (preds > 0.5).astype(int)
        elif obj.startswith("multi:"):
            preds = np.argmax(np.array(preds), axis=1)
        else:
            # using the original prediction result of predict API by default
            pass
    else:
        # The prediction output of a multi-class job has two dimensions. This
        # is a temporary workaround; remove this else branch once the model
        # meta can be loaded outside of the PAI submitter as well.
        if len(preds.shape) == 2:
            preds = np.argmax(np.array(preds), axis=1)

    if is_pai:
        feature_file_read = open("predict.txt.raw", "r")
    else:
        feature_file_read = open(
            "predict.raw.dir/predict.txt_%d" % feature_file_id, "r")

    result_column_names = selected_cols[:]
    # Remove train_label_name from the result columns. If train_label_name is
    # "" or is not among the selected columns, the index will be -1.
    try:
        train_label_index = selected_cols.index(train_label_name)
    except ValueError:
        train_label_index = -1
    if train_label_index != -1:
        del result_column_names[train_label_index]
    result_column_names.append(pred_label_name)

    line_no = 0
    with db.buffered_db_writer(conn, result_table, result_column_names,
                               100) as w:
        while True:
            line = feature_file_read.readline()
            if not line:
                break
            # FIXME(typhoonzero): how to output columns that are not used
            # as features, like ids?
            row = [
                item for i, item in enumerate(line.strip().split("/"))
                if i != train_label_index
            ]
            row.append(str(preds[line_no]))
            w.write(row)
            line_no += 1
Example #9
File: db.py Project: redskycry/sqlflow
def write_with_generator(datasource, table, gen):
    """Write data into a table, the written data
    comes from the input generator.

    Args:
        datasource: string
            The connection string to connectDBMS.
        table: string
            The table name written.
        gen: Generator
            The generator to generte the data to insert
            into table.
    """
    conn = connect_with_data_source(datasource)
    _drop_table_if_exists(conn, table)
    _create_table(conn, table)
    idx = 0

    with buffered_db_writer(conn, table, ["id", "block"]) as w:
        for d in gen():
            block = base64.b64encode(d)
            row = [idx, block]
            w.write(row)
            idx += 1

    conn.close()
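
A hypothetical caller sketch for write_with_generator: the gen argument is called as gen() inside, so the generator function itself is passed rather than an already-created generator. The datasource URI and table name below are made up for illustration.

def block_gen():
    # yield raw bytes; write_with_generator base64-encodes each block before
    # writing it as a row of (id, block)
    for chunk in (b"hello", b"world"):
        yield chunk


write_with_generator("mysql://user:pass@127.0.0.1:3306/db",  # illustrative URI
                     "sqlflow_models.my_model_blocks",
                     block_gen)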
Example #10
def xgb_native_explain(booster, datasource, result_table):
    if not result_table:
        raise ValueError(
            "XGBoostExplainer must be used with INTO to output the result "
            "to a table.")

    gain_map = booster.get_score(importance_type="gain")
    fscore_map = booster.get_fscore()
    conn = db.connect_with_data_source(datasource)

    all_feature_keys = list(gain_map.keys())
    all_feature_keys.sort()
    columns = ["feature", "fscore", "gain"]
    dtypes = [
        DataType.to_db_field_type(conn.driver, DataType.STRING),
        DataType.to_db_field_type(conn.driver, DataType.FLOAT32),
        DataType.to_db_field_type(conn.driver, DataType.FLOAT32),
    ]
    _create_table(conn, result_table, columns, dtypes)

    with db.buffered_db_writer(conn, result_table, columns) as w:
        for fkey in all_feature_keys:
            row = [fkey, fscore_map[fkey], gain_map[fkey]]
            w.write(list(row))

    conn.close()
Example #11
File: explain.py Project: vmnet04/sqlflow
def write_shap_values(shap_values, driver, conn, result_table,
                      feature_column_names, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass):
    with db.buffered_db_writer(driver, conn, result_table,
                               feature_column_names, 100, hdfs_namenode_addr,
                               hive_location, hdfs_user, hdfs_pass) as w:
        for row in shap_values:
            w.write(list(row))
Example #12
    def __init__(self, conn, table):
        _drop_table_if_exists(conn, table)
        _create_table(conn, table)

        self.context_manager = buffered_db_writer(conn, table, ["id", "block"])
        self.writer = self.context_manager.__enter__()
        self.row_idx = 0
        self.buffer = b''
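
Since __enter__ is called by hand here, the matching __exit__ also has to be called by hand once writing is finished; a hedged sketch of the cleanup such a class would need (the method name close is an assumption):

    def close(self):
        # flush any remaining buffered rows and release the writer; without
        # this, the manually entered context manager is never exited
        self.context_manager.__exit__(None, None, None)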
Example #13
def write_shap_values(shap_values, conn, result_table, feature_column_names):
    with db.buffered_db_writer(conn, result_table, feature_column_names,
                               100) as w:
        for row in shap_values:
            # NOTE(typhoonzero): assume all shap explain value are float, and
            # there's no INT or other types of values yet.
            row_float = [float(c) for c in row]
            w.write(list(row_float))
Example #14
def shap_explain(booster, datasource, dataset, summary_params, result_table):

    tree_explainer = shap.TreeExplainer(booster)
    shap_values = tree_explainer.shap_values(dataset)
    if result_table:
        conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): shap_values may be a list of shape
        # [3, num_samples, num_features]; use the first dimension here
        # and find out later when to use the other two. When shap_values
        # is not a list it can be used directly.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values

        columns = list(dataset.columns)
        dtypes = [DataType.to_db_field_type(conn.driver, DataType.FLOAT32)
                  ] * len(columns)
        _create_table(conn, result_table, columns, dtypes)
        with db.buffered_db_writer(conn, result_table, columns) as w:
            for row in to_write:
                w.write(list(row))

        conn.close()

    if summary_params.get("plot_type") == "decision":
        shap_interaction_values = tree_explainer.shap_interaction_values(
            dataset)
        expected_value = tree_explainer.expected_value
        if isinstance(shap_interaction_values, list):
            shap_interaction_values = shap_interaction_values[0]
        if isinstance(expected_value, list):
            expected_value = expected_value[0]

        plot_func = lambda: shap.decision_plot(  # noqa: E731
            expected_value,
            shap_interaction_values,
            dataset,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1)
    else:
        plot_func = lambda: shap.summary_plot(  # noqa: E731
            shap_values, dataset, show=False, **summary_params)

    filename = 'summary.png'
    with temp_file.TemporaryDirectory(as_cwd=True):
        explainer.plot_and_save(plot_func, filename=filename)
        with open(filename, 'rb') as f:
            img = f.read()

    img = base64.b64encode(img)
    if six.PY3:
        img = img.decode('utf-8')
    img = "<div align='center'><img src='data:image/png;base64,%s' /></div>" \
          % img
    print(img)
Example #15
File: evaluate.py Project: zlb1028/sqlflow
def write_result_metrics(result_metrics, metric_name_list, result_table, conn):
    # NOTE: assume that the result table is already created with columns:
    # loss | metric_names ...
    column_names = metric_name_list
    with buffered_db_writer(conn, result_table, column_names, 100) as w:
        row = []
        for key in metric_name_list:
            row.append(result_metrics[key])
        w.write(row)
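
A hypothetical call, assuming result_table was created beforehand with one column per metric name as the NOTE above requires; the metric values and table name are made up.

metrics = {"accuracy_score": 0.93, "mean_absolute_error": 0.12}
write_result_metrics(metrics,
                     ["accuracy_score", "mean_absolute_error"],
                     "iris.evaluate_result",  # illustrative table name
                     conn)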
Example #16
def explain(datasource,
            select,
            feature_field_meta,
            feature_column_names,
            label_meta,
            summary_params,
            explainer="TreeExplainer",
            result_table="",
            is_pai=False,
            pai_explain_table="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None,
            transform_fn=None,
            feature_column_code=""):
    if explainer == "XGBoostExplainer":
        if result_table == "":
            raise ValueError("""XGBoostExplainer must use with INTO to output
result to a table.""")
        bst = xgb.Booster()
        bst.load_model("my_model")
        gain_map = bst.get_score(importance_type="gain")
        fscore_map = bst.get_fscore()
        if is_pai:
            from runtime.dbapi.paiio import PaiIOConnection
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)

        all_feature_keys = list(gain_map.keys())
        all_feature_keys.sort()
        with db.buffered_db_writer(conn, result_table,
                                   ["feature", "fscore", "gain"], 100) as w:
            for fkey in all_feature_keys:
                row = [fkey, fscore_map[fkey], gain_map[fkey]]
                w.write(list(row))
    else:
        # when explainer is "" or "TreeExplainer" use SHAP by default.
        shap_explain(datasource,
                     select,
                     feature_field_meta,
                     feature_column_names,
                     label_meta,
                     summary_params,
                     result_table=result_table,
                     is_pai=is_pai,
                     pai_explain_table=pai_explain_table,
                     oss_dest=oss_dest,
                     oss_ak=oss_ak,
                     oss_sk=oss_sk,
                     oss_endpoint=oss_endpoint,
                     oss_bucket_name=oss_bucket_name,
                     transform_fn=transform_fn,
                     feature_column_code=feature_column_code)
Example #17
def evaluate_and_store_result(bst, dpred, feature_file_id, validation_metrics,
                              model_params, feature_column_names, label_meta,
                              is_pai, conn, result_table):
    preds = bst.predict(dpred)
    if model_params:
        obj = model_params["objective"]
        # binary:hinge output class labels
        if obj.startswith("binary:logistic"):
            preds = (preds > 0.5).astype(int)
        # multi:softmax output class labels
        elif obj.startswith("multi:softprob"):
            preds = np.argmax(np.array(preds), axis=1)
        # TODO(typhoonzero): deal with binary:logitraw when needed.
    else:
        # The prediction output of a multi-class job has two dimensions. This
        # is a temporary workaround; remove this else branch once the model
        # meta can be loaded outside of the PAI submitter as well.
        if len(preds.shape) == 2:
            preds = np.argmax(np.array(preds), axis=1)

    if is_pai:
        feature_file_read = open("predict.txt", "r")
    else:
        feature_file_read = open("predict.txt_%d" % feature_file_id, "r")

    y_test_list = []
    for line in feature_file_read:
        row = [i for i in line.strip().split(DMATRIX_FILE_SEP)]
        # DMatrix stores the label in the first column
        if label_meta["dtype"] == "float32" or label_meta[
                "dtype"] == DataType.FLOAT32:
            label = float(row[0])
        elif label_meta["dtype"] == "int64" or label_meta[
                "dtype"] == "int32" or label_meta["dtype"] == DataType.INT64:
            label = int(row[0])
        else:
            raise ValueError("unsupported label dtype: %s" %
                             label_meta["dtype"])
        y_test_list.append(label)
    y_test = np.array(y_test_list)

    evaluate_results = dict()
    for metric_name in validation_metrics:
        if metric_name not in SKLEARN_METRICS:
            raise ValueError("unsupported metric: %s" % metric_name)
        metric_func = getattr(sklearn.metrics, metric_name)
        metric_value = metric_func(y_test, preds)
        evaluate_results[metric_name] = metric_value

    # write evaluation result to result table
    result_columns = ["loss"] + validation_metrics
    with db.buffered_db_writer(conn, result_table, result_columns, 100) as w:
        row = ["0.0"]
        for mn in validation_metrics:
            row.append(str(evaluate_results[mn]))
        w.write(row)
Example #18
def evaluate_and_store_result(bst, dpred, feature_file_id, validation_metrics,
                              model_params, feature_column_names, label_meta,
                              is_pai, conn, result_table, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass):
    preds = bst.predict(dpred)
    # FIXME(typhoonzero): copied from predict.py
    if model_params:
        obj = model_params["objective"]
        if obj.startswith("binary:"):
            preds = (preds > 0.5).astype(int)
        elif obj.startswith("multi:"):
            preds = np.argmax(np.array(preds), axis=1)
        else:
            # using the original prediction result of predict API by default
            pass
    else:
        # The prediction output of a multi-class job has two dimensions. This
        # is a temporary workaround; remove this else branch once the model
        # meta can be loaded outside of the PAI submitter as well.
        if len(preds.shape) == 2:
            preds = np.argmax(np.array(preds), axis=1)

    if is_pai:
        feature_file_read = open("predict.txt", "r")
    else:
        feature_file_read = open("predict.txt_%d" % feature_file_id, "r")

    y_test_list = []
    for line in feature_file_read:
        row = [i for i in line.strip().split("\t")]
        # DMatrix stores the label in the first column
        if label_meta["dtype"] == "float32":
            label = float(row[0])
        elif label_meta["dtype"] == "int64" or label_meta["dtype"] == "int32":
            label = int(row[0])
        else:
            raise ValueError("unsupported label dtype: %s" %
                             label_meta["dtype"])
        y_test_list.append(label)
    y_test = np.array(y_test_list)

    evaluate_results = dict()
    for metric_name in validation_metrics:
        if metric_name not in SKLEARN_METRICS:
            raise ValueError("unsupported metric: %s" % metric_name)
        metric_func = getattr(sklearn.metrics, metric_name)
        metric_value = metric_func(y_test, preds)
        evaluate_results[metric_name] = metric_value

    # write evaluation result to result table
    result_columns = ["loss"] + validation_metrics
    with db.buffered_db_writer(conn, result_table, result_columns, 100) as w:
        row = ["0.0"]
        for mn in validation_metrics:
            row.append(str(evaluate_results[mn]))
        w.write(row)
Example #19
def write_result_metrics(result_metrics, metric_name_list, result_table,
                         driver, conn, hdfs_namenode_addr, hive_location,
                         hdfs_user, hdfs_pass):
    # NOTE: assume that the result table is already created with columns:
    # loss | metric_names ...
    column_names = metric_name_list
    with buffered_db_writer(driver, conn, result_table, column_names, 100,
                            hdfs_namenode_addr, hive_location, hdfs_user,
                            hdfs_pass) as w:
        row = []
        for key in metric_name_list:
            row.append(result_metrics[key])
        w.write(row)
Example #20
File: local.py Project: zlb1028/sqlflow
def save_solved_result_in_db(solved_result, data_frame, variables,
                             result_value_name, datasource, result_table):
    """
    Save the solved result of the Pyomo model into the database.

    Args:
        solved_result (tuple(numpy.ndarray, float)): a numpy array
            which indicates the solved x, and a float value which
            indicates the objective function value.
        data_frame (pandas.DataFrame): the input table data.
        variables (list[str]): the variable names to be optimized.
        result_value_name (str): the result value name to be optimized.
        datasource (str): the database connection URI.
        result_table (str): the table name to save the solved results.

    Returns:
        None
    """
    column_names = []
    for col in data_frame.columns:
        found = False
        for var in variables:
            if var.lower() == col.lower():
                found = True
                break

        if found:
            column_names.append(col)

    data_frame = data_frame[[*column_names]]

    result_value_name = generate_unique_result_value_name(
        columns=data_frame.columns,
        result_value_name=result_value_name,
        variables=variables)

    column_names.append(result_value_name)
    data_frame[result_value_name] = solved_result[0]

    conn = db.connect_with_data_source(datasource)
    with db.buffered_db_writer(conn, result_table, column_names) as w:
        for i in six.moves.range(len(data_frame)):
            rows = list(data_frame.loc[i])
            w.write(rows)

    print('Solved result is:')
    print(data_frame)
    print('Saved in {}.'.format(result_table))
    print('Objective value is {}'.format(solved_result[1]))
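
A hypothetical call of save_solved_result_in_db with made-up values: only the optimized variable columns plus the solved result column are written to result_table, and the objective value is printed at the end.

import pandas as pd

df = pd.DataFrame({"plants": ["p1", "p2"],
                   "markets": ["m1", "m2"],
                   "shipment": [0, 0]})
save_solved_result_in_db(solved_result=([3, 7], 250.0),  # (solved x, objective)
                         data_frame=df,
                         variables=["shipment"],
                         result_value_name="shipment",
                         datasource="mysql://root:root@127.0.0.1:3306/",  # made up
                         result_table="optimize.shipment_result")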
Example #21
File: evaluate.py Project: hsjung6/sqlflow
def _store_evaluate_result(preds, feature_file_name, label_desc, result_table,
                           result_column_names, validation_metrics, conn):
    """
    Save the evaluation result in the table.

    Args:
        preds: the prediction result.
        feature_file_name (str): the file path where the feature dumps.
        label_desc (FieldDesc): the label FieldDesc object.
        result_table (str): the result table name.
        result_column_names (list[str]): the result column names.
        validation_metrics (list[str]): the evaluation metric names.
        conn: the database connection object.

    Returns:
        None.
    """
    y_test = []
    with open(feature_file_name, 'r') as f:
        for line in f.readlines():
            row = [i for i in line.strip().split("\t")]
            # DMatrix stores the label in the first column
            if label_desc.dtype == DataType.INT64:
                y_test.append(int(row[0]))
            elif label_desc.dtype == DataType.FLOAT32:
                y_test.append(float(row[0]))
            else:
                raise TypeError("unsupported data type {}".format(
                    label_desc.dtype))

    y_test = np.array(y_test)

    evaluate_results = dict()
    for metric_name in validation_metrics:
        metric_name = metric_name.strip()
        if metric_name not in SKLEARN_METRICS:
            raise ValueError("unsupported metrics %s" % metric_name)
        metric_func = getattr(sklearn.metrics, metric_name)
        metric_value = metric_func(y_test, preds)
        evaluate_results[metric_name] = metric_value

    # write evaluation result to result table
    with db.buffered_db_writer(conn, result_table, result_column_names) as w:
        row = ["0.0"]
        for mn in validation_metrics:
            row.append(str(evaluate_results[mn]))
        w.write(row)
Example #22
    def _do_test(self, conn):
        table_name = "test_db"
        table_schema = ["features", "label"]
        values = [('5,6,1,2', 1)] * 10

        conn.execute(self.drop_statement)

        if conn.driver == "hive":
            conn.execute(self.hive_create_statement)
        else:
            conn.execute(self.create_statement)
        with buffered_db_writer(conn, table_name, table_schema,
                                buff_size=10) as w:
            for row in values:
                w.write(row)

        field_names, data = execute(conn, self.select_statement)

        self.assertEqual(table_schema, field_names)
        self.assertEqual(values, data)
Example #23
def keras_predict(estimator, model_params, save, result_table, is_pai,
                  pai_table, feature_column_names, feature_metas,
                  train_label_name, result_col_name, datasource, select,
                  hdfs_namenode_addr, hive_location, hdfs_user, hdfs_pass):

    classifier = init_model_with_feature_column(estimator, model_params)
    classifier_pkg = sys.modules[estimator.__module__]
    conn = None
    if is_pai:
        driver = "pai_maxcompute"
    else:
        conn = db.connect_with_data_source(datasource)
        driver = conn.driver

    if is_pai:
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        gen = db.pai_maxcompute_db_generator(formatted_pai_table)
        selected_cols = feature_column_names
    else:
        gen = db.db_generator(conn, select)
        selected_cols = db.selected_cols(conn, select)

    def eval_input_fn(batch_size, cache=False):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))
        tf_gen = tf_generator(gen, selected_cols, feature_column_names,
                              feature_metas)
        dataset = tf.data.Dataset.from_generator(tf_gen,
                                                 (tuple(feature_types), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    # NOTE: always use batch_size=1 when predicting to get the pairs of
    #       features and predict results to insert into result table.
    pred_dataset = eval_input_fn(1)
    one_batch = next(iter(pred_dataset))
    # NOTE: must run predict one batch to initialize parameters
    # see: https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models
    classifier.predict_on_batch(one_batch)
    classifier.load_weights(save)
    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()

    column_names = selected_cols[:]
    train_label_index = selected_cols.index(train_label_name)
    if train_label_index != -1:
        del column_names[train_label_index]
    column_names.append(result_col_name)

    with db.buffered_db_writer(driver, conn, result_table, column_names, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for features in pred_dataset:
            result = classifier.predict_on_batch(features)
            # FIXME(typhoonzero): determine whether the prediction result is a
            # classification by summing it and checking whether the sum is
            # close to 1.0.
            if len(result[0]) == 1:  # regression result
                result = result[0][0]
            else:
                sum = 0
                for i in result[0]:
                    sum += i
                if np.isclose(sum, 1.0):  # classification result
                    result = result[0].argmax(axis=-1)
                else:
                    result = result[0]  # multiple regression result
            row = []
            for idx, name in enumerate(feature_column_names):
                val = features[name].numpy()[0][0]
                row.append(str(val))
            if isinstance(result, np.ndarray):
                if len(result) > 1:
                    # NOTE(typhoonzero): if the output dimension > 1, format output tensor
                    # using a comma separated string. Only available for keras models.
                    row.append(",".join([str(i) for i in result]))
                else:
                    row.append(str(result[0]))
            else:
                row.append(str(result))
            w.write(row)
    del pred_dataset
Example #24
def shap_explain(booster, datasource, select, summary_params, result_table,
                 model):
    train_fc_map = model.get_meta("features")
    label_meta = model.get_meta("label").get_field_desc()[0].to_dict(
        dtype_to_string=True)

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    dataset = xgb_shap_dataset(datasource, select, feature_column_names,
                               label_meta, feature_metas, transform_fn)

    tree_explainer = shap.TreeExplainer(booster)
    shap_values = tree_explainer.shap_values(dataset)
    if result_table:
        conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): shap_values may be a list of shape
        # [3, num_samples, num_features]; use the first dimension here
        # and find out later when to use the other two. When shap_values
        # is not a list it can be used directly.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values

        columns = list(dataset.columns)
        dtypes = [DataType.to_db_field_type(conn.driver, DataType.FLOAT32)
                  ] * len(columns)
        _create_table(conn, result_table, columns, dtypes)
        with db.buffered_db_writer(conn, result_table, columns) as w:
            for row in to_write:
                w.write(list(row))

        conn.close()

    if summary_params.get("plot_type") == "decision":
        shap_interaction_values = tree_explainer.shap_interaction_values(
            dataset)
        expected_value = tree_explainer.expected_value
        if isinstance(shap_interaction_values, list):
            shap_interaction_values = shap_interaction_values[0]
        if isinstance(expected_value, list):
            expected_value = expected_value[0]

        plot_func = lambda: shap.decision_plot(  # noqa: E731
            expected_value,
            shap_interaction_values,
            dataset,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1)
    else:
        plot_func = lambda: shap.summary_plot(  # noqa: E731
            shap_values, dataset, show=False, **summary_params)

    filename = 'summary.png'
    with temp_file.TemporaryDirectory(as_cwd=True):
        explainer.plot_and_save(plot_func, filename=filename)
        with open(filename, 'rb') as f:
            img = f.read()

    img = base64.b64encode(img)
    if six.PY3:
        img = img.decode('utf-8')
    img = "<div align='center'><img src='data:image/png;base64,%s' /></div>" \
          % img
    print(img)
Example #25
File: predict.py Project: zlb1028/sqlflow
def estimator_predict(result_table, feature_column_names, feature_metas,
                      train_label_name, result_col_name, conn,
                      predict_generator, selected_cols):
    write_cols = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except ValueError:
        train_label_index = -1
    if train_label_index != -1:
        del write_cols[train_label_index]
    write_cols.append(result_col_name)

    # load from the exported model
    with open("exported_path", "r") as fn:
        export_path = fn.read()
    if tf_is_version2():
        imported = tf.saved_model.load(export_path)
    else:
        imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["delimiter_kv"] != "":
                keys = x[0][i][0].flatten()
                weights = x[0][i][1].flatten()
                weight_dtype_str = feature_metas[feature_name]["dtype_weight"]
                if (dtype_str == "float32" or dtype_str == "float64"
                        or dtype_str == DataType.FLOAT32):
                    raise ValueError(
                        "not supported key-value feature with key type float")
                elif (dtype_str == "int32" or dtype_str == "int64"
                      or dtype_str == DataType.INT64):
                    example.features.feature[
                        feature_name].int64_list.value.extend(list(keys))
                elif (dtype_str == "string" or dtype_str == DataType.STRING):
                    example.features.feature[
                        feature_name].bytes_list.value.extend(list(keys))
                if (weight_dtype_str == "float32"
                        or weight_dtype_str == "float64"
                        or weight_dtype_str == DataType.FLOAT32):
                    example.features.feature["_".join(
                        [feature_name,
                         "weight"])].float_list.value.extend(list(weights))
                else:
                    raise ValueError(
                        "not supported key value column weight data type: %s" %
                        weight_dtype_str)
            else:
                # NOTE(typhoonzero): sparse feature will get
                # (indices,values,shape) here, use indices only
                values = x[0][i][0].flatten()
                if (dtype_str == "float32" or dtype_str == "float64"
                        or dtype_str == DataType.FLOAT32):
                    example.features.feature[
                        feature_name].float_list.value.extend(list(values))
                elif (dtype_str == "int32" or dtype_str == "int64"
                      or dtype_str == DataType.INT64):
                    example.features.feature[
                        feature_name].int64_list.value.extend(list(values))
        else:
            if (dtype_str == "float32" or dtype_str == "float64"
                    or dtype_str == DataType.FLOAT32):
                # need to pass a tuple(float, )
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif (dtype_str == "int32" or dtype_str == "int64"
                  or dtype_str == DataType.INT64):
                example.features.feature[feature_name].int64_list.value.extend(
                    (int(x[0][i][0]), ))
            elif dtype_str == "string" or dtype_str == DataType.STRING:
                example.features.feature[feature_name].bytes_list.value.extend(
                    x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with db.buffered_db_writer(conn, result_table, write_cols, 100) as w:
        for row, _ in predict_generator():
            features = db.read_features_from_row(row,
                                                 selected_cols,
                                                 feature_column_names,
                                                 feature_metas,
                                                 is_xgboost=False)
            result = predict((features, ))
            if train_label_index != -1 and len(row) > train_label_index:
                del row[train_label_index]
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
Example #26
File: predict.py Project: zlb1028/sqlflow
def keras_predict(estimator, model_params, save, result_table,
                  feature_column_names, feature_metas, train_label_name,
                  result_col_name, conn, predict_generator, selected_cols,
                  extra_result_cols):
    pop_optimizer_and_loss(model_params)
    classifier = init_model_with_feature_column(estimator, model_params)

    def eval_input_fn(batch_size, cache=False):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))
        tf_gen = tf_generator(predict_generator, selected_cols,
                              feature_column_names, feature_metas)
        dataset = tf.data.Dataset.from_generator(tf_gen,
                                                 (tuple(feature_types), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    def to_feature_sample(row, selected_cols):
        features = {}
        for name in feature_column_names:
            row_val = row[selected_cols.index(name)]
            if feature_metas[name].get("delimiter_kv", "") != "":
                # kv list that should be parsed to two features.
                if feature_metas[name]["is_sparse"]:
                    features[name] = tf.SparseTensor(
                        row_val[0], tf.ones_like(tf.reshape(row_val[0], [-1])),
                        row_val[2])
                    features["_".join([name,
                                       "weight"])] = tf.SparseTensor(*row_val)
                else:
                    raise ValueError(
                        "not supported DENSE column with key:value"
                        "list format.")
            else:
                if feature_metas[name]["is_sparse"]:
                    features[name] = tf.SparseTensor(*row_val)
                else:
                    features[name] = tf.constant(([row_val], ))
        return features

    if not hasattr(classifier, 'sqlflow_predict_one'):
        # NOTE: load_weights should be called by keras models only.
        # NOTE: always use batch_size=1 when predicting to get the pairs of
        #       features and predict results to insert into result table.
        pred_dataset = eval_input_fn(1)
        one_batch = next(iter(pred_dataset))
        # NOTE: must run predict one batch to initialize parameters. See:
        # https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models  # noqa: E501
        classifier.predict_on_batch(one_batch)
        load_keras_model_weights(classifier, save)
    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()

    column_names = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except:  # noqa: E722
        train_label_index = -1
    if train_label_index != -1:
        del column_names[train_label_index]
    column_names.append(result_col_name)

    column_names.extend(extra_result_cols)

    with db.buffered_db_writer(conn, result_table, column_names, 100) as w:
        for row, _ in predict_generator():
            features = to_feature_sample(row, column_names)
            if hasattr(classifier, 'sqlflow_predict_one'):
                result = classifier.sqlflow_predict_one(features)
            else:
                result = classifier.predict_on_batch(features)

            if extra_result_cols:
                assert isinstance(
                    result, tuple
                ), "TO PREDICT must return a " \
                   "tuple when predict.extra_outputs is not empty"
                assert len(extra_result_cols) + 1 <= len(
                    result
                ), "TO PREDICT must return at least " \
                   "%d items instead of %d" % (len(extra_result_cols) + 1,
                                               len(result))
                extra_pred_outputs = result[1:len(extra_result_cols) + 1]
                result = result[0:1]
            else:
                extra_pred_outputs = None

            # FIXME(typhoonzero): determine whether the prediction result is a
            # classification by summing it and checking whether the sum is
            # close to 1.0.
            if len(result[0]) == 1:  # regression result
                result = result[0][0]
            else:
                sum = 0
                for i in result[0]:
                    sum += i
                if np.isclose(sum, 1.0):  # classification result
                    result = result[0].argmax(axis=-1)
                else:
                    result = result[0]  # multiple regression result

            row.append(encode_pred_result(result))
            if extra_pred_outputs is not None:
                row.extend([encode_pred_result(p) for p in extra_pred_outputs])
            if train_label_index != -1 and len(row) > train_label_index:
                del row[train_label_index]
            w.write(row)
    del pred_dataset
Example #27
def keras_predict(estimator, model_params, save, result_table,
                  feature_column_names, feature_metas, train_label_name,
                  result_col_name, driver, conn, predict_generator,
                  selected_cols, hdfs_namenode_addr, hive_location, hdfs_user,
                  hdfs_pass):

    classifier = init_model_with_feature_column(estimator, model_params)

    def eval_input_fn(batch_size, cache=False):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))
        tf_gen = tf_generator(predict_generator, selected_cols,
                              feature_column_names, feature_metas)
        dataset = tf.data.Dataset.from_generator(tf_gen,
                                                 (tuple(feature_types), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    if not hasattr(classifier, 'sqlflow_predict_one'):
        # NOTE: load_weights should be called by keras models only.
        # NOTE: always use batch_size=1 when predicting to get the pairs of
        #       features and predict results to insert into result table.
        pred_dataset = eval_input_fn(1)
        one_batch = next(iter(pred_dataset))
        # NOTE: must run predict one batch to initialize parameters. See:
        # https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models  # noqa: E501
        classifier.predict_on_batch(one_batch)
        classifier.load_weights(save)
    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()

    column_names = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except:  # noqa: E722
        train_label_index = -1
    if train_label_index != -1:
        del column_names[train_label_index]
    column_names.append(result_col_name)

    with db.buffered_db_writer(driver, conn, result_table, column_names, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for features in pred_dataset:
            if hasattr(classifier, 'sqlflow_predict_one'):
                result = classifier.sqlflow_predict_one(features)
            else:
                result = classifier.predict_on_batch(features)
            # FIXME(typhoonzero): determine whether the prediction result is a
            # classification by summing it and checking whether the sum is
            # close to 1.0.
            if len(result[0]) == 1:  # regression result
                result = result[0][0]
            else:
                sum = 0
                for i in result[0]:
                    sum += i
                if np.isclose(sum, 1.0):  # classification result
                    result = result[0].argmax(axis=-1)
                else:
                    result = result[0]  # multiple regression result
            row = []
            for idx, name in enumerate(feature_column_names):
                val = features[name].numpy()[0][0]
                row.append(str(val))
            if isinstance(result, np.ndarray):
                if len(result) > 1:
                    # NOTE(typhoonzero): if the output dimension > 1, format
                    # output tensor using a comma separated string. Only
                    # available for keras models.
                    row.append(",".join([str(i) for i in result]))
                else:
                    row.append(str(result[0]))
            else:
                row.append(str(result))
            w.write(row)
    del pred_dataset
Example #28
File: explain.py Project: hsjung6/sqlflow
def write_dfc_result(dfc_mean, gain, result_table, conn, feature_column_names):
    with buffered_db_writer(conn, result_table, ["feature", "dfc", "gain"],
                            100) as w:
        for row_name in feature_column_names:
            w.write([row_name, dfc_mean.loc[row_name], gain[row_name]])
Example #29
def write_shap_values(shap_values, conn, result_table, feature_column_names):
    with db.buffered_db_writer(conn, result_table, feature_column_names,
                               100) as w:
        for row in shap_values:
            w.write(list(row))
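
A hypothetical call: shap_values is expected to be a 2-D array with one row per explained sample and one column per feature, matching feature_column_names; the values, connection, and table name are illustrative.

shap_values = [[0.12, -0.30, 0.05],
               [0.08, -0.25, 0.11]]
write_shap_values(shap_values,
                  conn,  # an open connection, e.g. from connect_with_data_source
                  "iris.explain_result",
                  ["sepal_length", "sepal_width", "petal_length"])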
Example #30
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_column_names_map,
                      feature_columns, feature_metas, train_label_name,
                      result_col_name, driver, conn, predict_generator,
                      selected_cols, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass):
    write_cols = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except ValueError:
        train_label_index = -1
    if train_label_index != -1:
        del write_cols[train_label_index]
    write_cols.append(result_col_name)

    # load from the exported model
    with open("exported_path", "r") as fn:
        export_path = fn.read()
    if tf_is_version2():
        imported = tf.saved_model.load(export_path)
    else:
        imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["is_sparse"]:
                # NOTE(typhoonzero): sparse feature will get
                # (indices,values,shape) here, use indices only
                values = x[0][i][0].flatten()
            else:
                values = x[0][i].flatten()
            if dtype_str == "float32" or dtype_str == "float64":
                example.features.feature[feature_name].float_list.value.extend(
                    list(values))
            elif dtype_str == "int32" or dtype_str == "int64":
                example.features.feature[feature_name].int64_list.value.extend(
                    list(values))
        else:
            if "feature_columns" in feature_columns:
                idx = feature_column_names.index(feature_name)
                fc = feature_columns["feature_columns"][idx]
            else:
                # DNNLinearCombinedXXX have dnn_feature_columns and
                # linear_feature_columns param.
                idx = -1
                try:
                    idx = feature_column_names_map[
                        "dnn_feature_columns"].index(feature_name)
                    fc = feature_columns["dnn_feature_columns"][idx]
                except:  # noqa: E722
                    try:
                        idx = feature_column_names_map[
                            "linear_feature_columns"].index(feature_name)
                        fc = feature_columns["linear_feature_columns"][idx]
                    except:  # noqa: E722
                        pass
                if idx == -1:
                    raise ValueError(
                        "cannot find feature %s in any feature column" %
                        feature_name)
            if dtype_str == "float32" or dtype_str == "float64":
                # need to pass a tuple(float, )
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "int32" or dtype_str == "int64":
                numeric_type = type(tf.feature_column.numeric_column("tmp"))
                if type(fc) == numeric_type:
                    example.features.feature[
                        feature_name].float_list.value.extend(
                            (float(x[0][i][0]), ))
                else:
                    example.features.feature[
                        feature_name].int64_list.value.extend(
                            (int(x[0][i][0]), ))
            elif dtype_str == "string":
                example.features.feature[feature_name].bytes_list.value.extend(
                    x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with db.buffered_db_writer(driver, conn, result_table, write_cols, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for row, _ in predict_generator():
            features = db.read_features_from_row(row, selected_cols,
                                                 feature_column_names,
                                                 feature_metas)
            result = predict((features, ))
            if train_label_index != -1 and len(row) > train_label_index:
                del row[train_label_index]
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)