Example #1
    def test_generator(self):
        conn = connect(testing.get_datasource())
        # prepare test data
        conn.execute(self.drop_statement)
        conn.execute(self.create_statement)
        conn.execute(self.insert_statement)

        column_name_to_type = {
            "features": {
                "feature_name": "features",
                "delimiter": "",
                "dtype": "float32",
                "is_sparse": False,
                "shape": []
            }
        }
        label_meta = {"feature_name": "label", "shape": [], "delimiter": ""}
        gen = db_generator(conn, "SELECT * FROM test_table_float_fea",
                           label_meta)
        idx = 0
        for row, label in gen():
            features = read_features_from_row(row, ["features"], ["features"],
                                              column_name_to_type)
            d = (features, label)
            if idx == 0:
                self.assertEqual(d, (((1.0, ), ), 0))
            elif idx == 1:
                self.assertEqual(d, (((2.0, ), ), 1))
            idx += 1
        self.assertEqual(idx, 2)
Example #2
    def reader():
        for row, label in gen():
            features = db.read_features_from_row(row, selected_cols,
                                                 feature_column_names,
                                                 feature_metas)
            if label is None:
                yield (features, )
            else:
                yield (features, label)
Example #3
    def reader():
        for row, label in gen():
            features = db.read_features_from_row(row, selected_cols,
                                                 feature_column_names,
                                                 feature_metas)
            features = list(features)
            for i, f in enumerate(features):
                if len(f) == 1 and isinstance(f[0], np.ndarray):
                    features[i] = f[0]
            features = tuple(features)

            if label is None:
                yield (features, )
            else:
                yield (features, label)
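
For context, here is a minimal sketch of how a reader like the ones above could feed TensorFlow's tf.data input pipeline. The synthetic gen and the fixed output signature are assumptions made for illustration; they are not part of the original code.

import numpy as np
import tensorflow as tf

def gen():
    # synthetic stand-in for db_generator output: (features, label) pairs
    yield (np.array([1.0], dtype=np.float32), ), 0
    yield (np.array([2.0], dtype=np.float32), ), 1

def reader():
    for features, label in gen():
        yield features, label

dataset = tf.data.Dataset.from_generator(
    reader, output_types=((tf.float32, ), tf.int64))
for features, label in dataset:
    print(features[0].numpy(), label.numpy())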
Example #4
    def test_generator(self):
        driver = os.environ.get('SQLFLOW_TEST_DB')
        if driver == "mysql":
            database = "iris"
            user, password, host, port, database = testing_mysql_cfg()
            conn = connect(driver,
                           database,
                           user=user,
                           password=password,
                           host=host,
                           port=int(port))
            # prepare test data
            execute(driver, conn, self.drop_statement)
            execute(driver, conn, self.create_statement)
            execute(driver, conn, self.insert_statement)

            column_name_to_type = {
                "features": {
                    "feature_name": "features",
                    "delimiter": "",
                    "dtype": "float32",
                    "is_sparse": False,
                    "shape": []
                }
            }
            label_meta = {
                "feature_name": "label",
                "shape": [],
                "delimiter": ""
            }
            gen = db_generator(conn, "SELECT * FROM test_table_float_fea",
                               label_meta)
            idx = 0
            for row, label in gen():
                features = read_features_from_row(row, ["features"],
                                                  ["features"],
                                                  column_name_to_type)
                d = (features, label)
                if idx == 0:
                    self.assertEqual(d, (((1.0, ), ), 0))
                elif idx == 1:
                    self.assertEqual(d, (((2.0, ), ), 1))
                idx += 1
            self.assertEqual(idx, 2)
Example #5
def xgb_shap_dataset(datasource,
                     select,
                     feature_column_names,
                     label_meta,
                     feature_metas,
                     is_pai,
                     pai_explain_table,
                     transform_fn=None,
                     feature_column_code=""):
    if is_pai:
        # TODO(lhw): we may specify pai_explain_table in the datasource
        # and discard the condition statement here
        conn = PaiIOConnection.from_table(pai_explain_table)
        stream = db.db_generator(conn, None, label_meta)
    else:
        conn = db.connect_with_data_source(datasource)
        stream = db.db_generator(conn, select, label_meta)
    selected_cols = db.selected_cols(conn, select)

    if transform_fn:
        feature_names = transform_fn.get_feature_column_names()
    else:
        feature_names = feature_column_names

    xs = None
    dtypes = []
    sizes = []
    offsets = []

    i = 0
    for row, label in stream():
        features = db.read_features_from_row(row,
                                             selected_cols,
                                             feature_column_names,
                                             feature_metas,
                                             is_xgboost=True)
        if transform_fn:
            features = transform_fn(features)

        flatten_features = []
        for j, feature in enumerate(features):
            if len(feature) == 3:  # convert sparse to dense
                col_indices, values, dense_shape = feature
                size = int(np.prod(dense_shape))
                row_indices = np.zeros(shape=[col_indices.size])
                sparse_matrix = scipy.sparse.csr_matrix(
                    (values, (row_indices, col_indices)), shape=[1, size])
                values = sparse_matrix.toarray()
            else:
                values = feature[0]

            if isinstance(values, np.ndarray):
                flatten_features.extend(values.flatten().tolist())
                if i == 0:
                    sizes.append(values.size)
                    dtypes.append(infer_dtype(values))
            else:
                flatten_features.append(values)
                if i == 0:
                    sizes.append(1)
                    dtypes.append(infer_dtype(values))

        # Create the column name according to the feature number
        # of each column.
        #
        # If the column "c" contains only 1 feature, the result
        # column name would be "c" too.
        #
        # If the column "c" contains 3 features,
        # the result column name would be "c_0", "c_1" and "c_2"
        if i == 0:
            offsets = np.cumsum([0] + sizes)
            column_names = []
            for j in six.moves.range(len(offsets) - 1):
                start = offsets[j]
                end = offsets[j + 1]
                if end - start == 1:
                    column_names.append(feature_names[j])
                else:
                    for k in six.moves.range(start, end):
                        column_names.append('{}_{}'.format(
                            feature_names[j], k))

            xs = pd.DataFrame(columns=column_names)

        xs.loc[i] = flatten_features

        i += 1
    # NOTE(typhoonzero): set dtype to the feature's actual type, or the dtype
    # may be "object". Use below code to reproduce:
    # import pandas as pd
    # feature_column_names=["a", "b"]
    # xs = pd.DataFrame(columns=feature_column_names)
    # for i in range(10):
    #     xs.loc[i] = [int(j) for j in range(2)]
    # print(xs.dtypes)
    columns = xs.columns
    for i, dtype in enumerate(dtypes):
        for j in six.moves.range(offsets[i], offsets[i + 1]):
            xs[columns[j]] = xs[columns[j]].astype(dtype)

    return xs
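
The comment block inside the loop above describes how result column names are derived. The following self-contained sketch replays that rule with made-up feature names and sizes; note that the numeric suffix is the index into the flattened row, so it starts at 0 only for the first column.

import numpy as np

feature_names = ["a", "b"]
sizes = [1, 3]  # "a" yields one value per row, "b" yields three

offsets = np.cumsum([0] + sizes)  # [0, 1, 4]
column_names = []
for j in range(len(offsets) - 1):
    start, end = offsets[j], offsets[j + 1]
    if end - start == 1:
        column_names.append(feature_names[j])
    else:
        for k in range(start, end):
            column_names.append("{}_{}".format(feature_names[j], k))

print(column_names)  # ['a', 'b_1', 'b_2', 'b_3']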
Example #6
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_column_names_map,
                      feature_columns, feature_metas, train_label_name,
                      result_col_name, driver, conn, predict_generator,
                      selected_cols, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass):
    write_cols = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except ValueError:
        train_label_index = -1
    if train_label_index != -1:
        del write_cols[train_label_index]
    write_cols.append(result_col_name)

    # load from the exported model
    with open("exported_path", "r") as fn:
        export_path = fn.read()
    if tf_is_version2():
        imported = tf.saved_model.load(export_path)
    else:
        imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["is_sparse"]:
                # NOTE(typhoonzero): sparse feature will get
                # (indices,values,shape) here, use indices only
                values = x[0][i][0].flatten()
            else:
                values = x[0][i].flatten()
            if dtype_str == "float32" or dtype_str == "float64":
                example.features.feature[feature_name].float_list.value.extend(
                    list(values))
            elif dtype_str == "int32" or dtype_str == "int64":
                example.features.feature[feature_name].int64_list.value.extend(
                    list(values))
        else:
            if "feature_columns" in feature_columns:
                idx = feature_column_names.index(feature_name)
                fc = feature_columns["feature_columns"][idx]
            else:
                # DNNLinearCombinedXXX have dnn_feature_columns and
                # linear_feature_columns param.
                idx = -1
                try:
                    idx = feature_column_names_map[
                        "dnn_feature_columns"].index(feature_name)
                    fc = feature_columns["dnn_feature_columns"][idx]
                except:  # noqa: E722
                    try:
                        idx = feature_column_names_map[
                            "linear_feature_columns"].index(feature_name)
                        fc = feature_columns["linear_feature_columns"][idx]
                    except:  # noqa: E722
                        pass
                if idx == -1:
                    raise ValueError(
                        "cannot find feature %s in all feature columns" %
                        feature_name)
            if dtype_str == "float32" or dtype_str == "float64":
                # need to pass a tuple(float, )
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "int32" or dtype_str == "int64":
                numeric_type = type(tf.feature_column.numeric_column("tmp"))
                if type(fc) == numeric_type:
                    example.features.feature[
                        feature_name].float_list.value.extend(
                            (float(x[0][i][0]), ))
                else:
                    example.features.feature[
                        feature_name].int64_list.value.extend(
                            (int(x[0][i][0]), ))
            elif dtype_str == "string":
                example.features.feature[feature_name].bytes_list.value.extend(
                    x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with db.buffered_db_writer(driver, conn, result_table, write_cols, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for row, _ in predict_generator():
            features = db.read_features_from_row(row, selected_cols,
                                                 feature_column_names,
                                                 feature_metas)
            result = predict((features, ))
            if train_label_index != -1 and len(row) > train_label_index:
                del row[train_label_index]
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
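
For reference, a minimal sketch of the tf.train.Example round trip that add_to_example and the "predict" signature rely on; the feature names and values below are invented for illustration.

import tensorflow as tf

example = tf.train.Example()
example.features.feature["f_float"].float_list.value.extend((1.5, ))
example.features.feature["f_int"].int64_list.value.extend((2, ))
example.features.feature["f_str"].bytes_list.value.extend([b"a"])

serialized = example.SerializeToString()
parsed = tf.io.parse_single_example(
    serialized, {
        "f_float": tf.io.FixedLenFeature([1], tf.float32),
        "f_int": tf.io.FixedLenFeature([1], tf.int64),
        "f_str": tf.io.FixedLenFeature([1], tf.string),
    })
print(parsed)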
Example #7
def estimator_predict(result_table, feature_column_names, feature_metas,
                      train_label_name, result_col_name, conn,
                      predict_generator, selected_cols):
    write_cols = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except ValueError:
        train_label_index = -1
    if train_label_index != -1:
        del write_cols[train_label_index]
    write_cols.append(result_col_name)

    # load from the exported model
    with open("exported_path", "r") as fn:
        export_path = fn.read()
    if tf_is_version2():
        imported = tf.saved_model.load(export_path)
    else:
        imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["delimiter_kv"] != "":
                keys = x[0][i][0].flatten()
                weights = x[0][i][1].flatten()
                weight_dtype_str = feature_metas[feature_name]["dtype_weight"]
                if (dtype_str == "float32" or dtype_str == "float64"
                        or dtype_str == DataType.FLOAT32):
                    raise ValueError(
                        "not supported key-value feature with key type float")
                elif (dtype_str == "int32" or dtype_str == "int64"
                      or dtype_str == DataType.INT64):
                    example.features.feature[
                        feature_name].int64_list.value.extend(list(keys))
                elif (dtype_str == "string" or dtype_str == DataType.STRING):
                    example.features.feature[
                        feature_name].bytes_list.value.extend(list(keys))
                if (weight_dtype_str == "float32"
                        or weight_dtype_str == "float64"
                        or weight_dtype_str == DataType.FLOAT32):
                    example.features.feature["_".join(
                        [feature_name,
                         "weight"])].float_list.value.extend(list(weights))
                else:
                    raise ValueError(
                        "not supported key value column weight data type: %s" %
                        weight_dtype_str)
            else:
                # NOTE(typhoonzero): sparse feature will get
                # (indices,values,shape) here, use indices only
                values = x[0][i][0].flatten()
                if (dtype_str == "float32" or dtype_str == "float64"
                        or dtype_str == DataType.FLOAT32):
                    example.features.feature[
                        feature_name].float_list.value.extend(list(values))
                elif (dtype_str == "int32" or dtype_str == "int64"
                      or dtype_str == DataType.INT64):
                    example.features.feature[
                        feature_name].int64_list.value.extend(list(values))
        else:
            if (dtype_str == "float32" or dtype_str == "float64"
                    or dtype_str == DataType.FLOAT32):
                # need to pass a tuple(float, )
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif (dtype_str == "int32" or dtype_str == "int64"
                  or dtype_str == DataType.INT64):
                example.features.feature[feature_name].int64_list.value.extend(
                    (int(x[0][i][0]), ))
            elif dtype_str == "string" or dtype_str == DataType.STRING:
                example.features.feature[feature_name].bytes_list.value.extend(
                    x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with db.buffered_db_writer(conn, result_table, write_cols, 100) as w:
        for row, _ in predict_generator():
            features = db.read_features_from_row(row,
                                                 selected_cols,
                                                 feature_column_names,
                                                 feature_metas,
                                                 is_xgboost=False)
            result = predict((features, ))
            if train_label_index != -1 and len(row) > train_label_index:
                del row[train_label_index]
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
Example #8
def dump_dmatrix(filename,
                 generator,
                 feature_column_names,
                 feature_metas,
                 has_label,
                 selected_cols,
                 batch_size=None,
                 transform_fn=None,
                 raw_data_dir=None):
    # TODO(yancey1989): generate group and weight text file if necessary
    row_id = 0

    if raw_data_dir:
        index = filename.rindex('/') + 1 if '/' in filename else 0
        raw_data_fid = open(os.path.join(raw_data_dir, filename[index:]), 'a')
    else:
        raw_data_fid = None

    with open(filename, 'a') as f:
        for row, label in generator:
            features = db.read_features_from_row(row,
                                                 selected_cols,
                                                 feature_column_names,
                                                 feature_metas,
                                                 is_xgboost=True)

            if raw_data_fid is not None:
                raw_data_fid.write(
                    DMATRIX_FILE_SEP.join([str(r) for r in row]) + "\n")

            if transform_fn:
                features = transform_fn(features)

            row_data = []
            offset = 0
            for i, v in enumerate(features):
                if len(v) == 1:  # dense feature
                    value = v[0]
                    if isinstance(value, np.ndarray):
                        value = value.reshape((-1, ))
                        # use "k" rather than reusing "i", which would leak
                        # out of the comprehension under Python 2 and clobber
                        # the enclosing feature index
                        row_data.extend([
                            "{}:{}".format(k + offset, item)
                            for k, item in enumerate(value)
                        ])
                        offset += value.size
                    else:
                        row_data.append("{}:{}".format(offset, value))
                        offset += 1
                else:  # sparse feature
                    indices = v[0]
                    value = v[1].reshape((-1, ))
                    dense_size = np.prod(v[2])
                    row_data.extend([
                        "{}:{}".format(k + offset, item)
                        for k, item in six.moves.zip(indices, value)
                    ])
                    offset += dense_size

            if has_label:
                row_data = [str(label)] + row_data
            f.write(DMATRIX_FILE_SEP.join(row_data) + "\n")
            row_id += 1
            # batch_size == None means use all data in generator
            if batch_size is None:
                continue
            if row_id >= batch_size:
                break
    # return rows written
    if raw_data_fid is not None:
        raw_data_fid.close()

    return row_id
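
A sketch of loading a file produced by dump_dmatrix. It assumes has_label=True and a whitespace-compatible DMATRIX_FILE_SEP, so each line follows the LibSVM "<label> <index>:<value> ..." layout; depending on the XGBoost version, the explicit "?format=libsvm" suffix may or may not be required.

import os
import tempfile

import xgboost as xgb

# two rows in LibSVM layout: label index:value ...
path = os.path.join(tempfile.mkdtemp(), "train.txt")
with open(path, "w") as f:
    f.write("0 0:1.0 1:2.0\n1 0:3.0 1:4.0\n")

dtrain = xgb.DMatrix(path + "?format=libsvm")
print(dtrain.num_row(), dtrain.num_col())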
Example #9
    def test_generator(self):
        conn = connect(testing.get_datasource())
        # prepare test data
        conn.execute(self.drop_statement)
        conn.execute(self.create_statement)
        conn.execute(self.insert_statement)

        column_name_to_type = {
            "f1": {
                "feature_name": "f1",
                "delimiter": "",
                "dtype": "float32",
                "is_sparse": False,
                "shape": []
            },
            "f2": {
                "feature_name": "f2",
                "delimiter": "",
                "dtype": "int64",
                "is_sparse": False,
                "shape": []
            },
            "f3str": {
                "feature_name": "f3str",
                "delimiter": "",
                "dtype": "string",
                "is_sparse": False,
                "shape": []
            },
            "f4sparse": {
                "feature_name": "f4sparse",
                "delimiter": "",
                "dtype": "float32",
                "is_sparse": True,
                "shape": [],
                "format": "kv"
            },
            "f5dense": {
                "feature_name": "f5dense",
                "delimiter": ",",
                "dtype": "int64",
                "is_sparse": False,
                "shape": [3]
            }
        }
        label_meta = {"feature_name": "label", "shape": [], "delimiter": ""}
        gen = db_generator(conn, "SELECT * FROM test_table_float_fea",
                           label_meta)
        idx = 0
        for row, label in gen():
            if idx == 0:
                features = read_features_from_row(
                    row, ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                    ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                    column_name_to_type)
                self.assertEqual(1.0, features[0][0])
                self.assertEqual(1, features[1][0])
                self.assertEqual('a', features[2][0])
                self.assertTrue(
                    np.array_equal(np.array([[1], [2]]), features[3][0]))
                self.assertTrue(
                    np.array_equal(np.array([1., 2.], dtype=np.float32),
                                   features[3][1]))
                self.assertTrue(
                    np.array_equal(np.array([1, 2, 3]), features[4][0]))
                self.assertEqual(0, label)
            elif idx == 1:
                # without is_xgboost=True, reading this row must raise
                # ValueError; assertRaises fails the test if it does not
                with self.assertRaises(ValueError):
                    read_features_from_row(
                        row, ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                        ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                        column_name_to_type)
                features = read_features_from_row(
                    row, ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                    ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                    column_name_to_type,
                    is_xgboost=True)
                self.assertEqual(XGBOOST_NULL_MAGIC, features[0][0])
                self.assertEqual(int(XGBOOST_NULL_MAGIC), features[1][0])
                self.assertEqual("", features[2][0])
                self.assertTrue(np.array_equal(np.array([]), features[3][0]))
                self.assertTrue(np.array_equal(np.array([]), features[3][1]))
                self.assertTrue(
                    np.array_equal(np.array([1, 2, 3]), features[4][0]))
                self.assertEqual(1, label)
            idx += 1
        self.assertEqual(idx, 2)
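
To make the sparse assertions above concrete, here is a simplified illustration (not SQLFlow's actual parser) of how a key-value encoded cell such as "1:1.0 2:2.0" maps onto the (indices, values) pair the test checks in features[3]:

import numpy as np

def parse_kv(raw):
    # split "1:1.0 2:2.0" into the (indices, values) pair shape
    # asserted for f4sparse above
    pairs = [item.split(":") for item in raw.split()]
    indices = np.array([[int(k)] for k, _ in pairs])
    values = np.array([float(v) for _, v in pairs], dtype=np.float32)
    return indices, values

indices, values = parse_kv("1:1.0 2:2.0")
print(indices)  # [[1] [2]]
print(values)   # [1. 2.]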