Example #1
File: db_test.py Project: wangjili/sqlflow
    def test_generator(self):
        driver = os.environ.get('SQLFLOW_TEST_DB')
        if driver == "mysql":
            database = "iris"
            user = os.environ.get('SQLFLOW_TEST_DB_MYSQL_USER') or "root"
            password = os.environ.get('SQLFLOW_TEST_DB_MYSQL_PASSWD') or "root"
            conn = connect(driver, database, user=user, password=password, host="127.0.0.1", port="3306")
            # prepare test data
            execute(driver, conn, self.drop_statement)
            execute(driver, conn, self.create_statement)
            execute(driver, conn, self.insert_statement)

            column_name_to_type = {"features": {
                "feature_name": "features",
                "delimiter": "",
                "dtype": "float32",
                "is_sparse": False,
                "shape": []
            }}
            gen = db_generator(driver, conn, "SELECT * FROM test_table_float_fea",
                               ["features"], "label", column_name_to_type)
            idx = 0
            for d in gen():
                if idx == 0:
                    self.assertEqual(d, ((1.0,), [0]))
                elif idx == 1:
                    self.assertEqual(d, ((2.0,), [1]))
                idx += 1
            self.assertEqual(idx, 2)
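
The assertions above show the calling convention used throughout these examples: db_generator returns a zero-argument generator factory, and iterating over gen() yields one (features, label) pair per row of the SELECT result. A minimal, self-contained sketch of that factory pattern, with hard-coded rows standing in for the database (fake_db_generator is a hypothetical name, not part of SQLFlow):

def fake_db_generator(rows):
    # Return a zero-argument callable, mirroring how db_generator is used:
    # the caller invokes gen() to obtain a fresh generator over the rows.
    def reader():
        for features, label in rows:
            yield features, label
    return reader

gen = fake_db_generator([((1.0,), [0]), ((2.0,), [1])])
for features, label in gen():
    print(features, label)  # ((1.0,), [0]) then ((2.0,), [1])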
Example #2
def xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                     feature_specs, is_pai, pai_explain_table):
    label_column_name = label_spec["feature_name"]
    if is_pai:
        pai_table_parts = pai_explain_table.split(".")
        formated_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                      pai_table_parts[1])
        stream = db.pai_maxcompute_db_generator(formated_pai_table,
                                                feature_column_names,
                                                label_column_name,
                                                feature_specs)
    else:
        conn = db.connect_with_data_source(datasource)
        stream = db.db_generator(conn.driver, conn, select,
                                 feature_column_names, label_spec,
                                 feature_specs)

    xs = pd.DataFrame(columns=feature_column_names)
    i = 0
    for row in stream():
        xs.loc[i] = [item[0] for item in row[0]]
        i += 1
    # NOTE(typhoonzero): set dtype to the feature's actual type, or the dtype
    # may be "object". Use below code to reproduce:
    # import pandas as pd
    # feature_column_names=["a", "b"]
    # xs = pd.DataFrame(columns=feature_column_names)
    # for i in range(10):
    #     xs.loc[i] = [int(j) for j in range(2)]
    # print(xs.dtypes)
    for fname in feature_column_names:
        dtype = feature_specs[fname]["dtype"]
        xs[fname] = xs[fname].astype(dtype)
    return xs
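
The NOTE comment above explains why the final astype loop is needed: rows assigned with xs.loc[i] leave every column with dtype object. A runnable version of the reproduction from that comment, followed by the cast that xgb_shap_dataset applies (assumes pandas is installed):

import pandas as pd

feature_column_names = ["a", "b"]
xs = pd.DataFrame(columns=feature_column_names)
for i in range(10):
    xs.loc[i] = [int(j) for j in range(2)]
print(xs.dtypes)  # both columns are reported as "object"

# Cast each column to its actual dtype, as xgb_shap_dataset does above.
for fname in feature_column_names:
    xs[fname] = xs[fname].astype("int64")
print(xs.dtypes)  # now int64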
Example #3
def input_fn(select, conn, feature_column_names, feature_metas, label_meta):
    feature_types = []
    shapes = []
    for name in feature_column_names:
        # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
        if feature_metas[name]["is_sparse"]:
            feature_types.append((tf.int64, tf.int32, tf.int64))
            shapes.append((None, None, None))
        else:
            feature_types.append(get_dtype(feature_metas[name]["dtype"]))
            shapes.append(feature_metas[name]["shape"])

    gen = db_generator(conn.driver, conn, select, feature_column_names,
                       label_meta["feature_name"], feature_metas)
    # Clustering models do not have a label
    if label_meta["feature_name"] == "":
        dataset = tf.data.Dataset.from_generator(gen, (tuple(feature_types), ),
                                                 (tuple(shapes), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    else:
        dataset = tf.data.Dataset.from_generator(
            gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])),
            (tuple(shapes), label_meta["shape"]))
        ds_mapper = functools.partial(
            parse_sparse_feature,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    return dataset.map(ds_mapper)
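
For reference, a self-contained sketch of the tf.data.Dataset.from_generator call that input_fn builds above, using a toy in-memory generator in place of db_generator and assuming TensorFlow 2.x eager execution (toy_gen and its literal rows are illustrative only):

import tensorflow as tf

def toy_gen():
    # Yield (features, label) pairs with the same nesting as db_generator:
    # a tuple of dense feature values plus a scalar label.
    for features, label in [((1.0, 2.0), 0), ((3.0, 4.0), 1)]:
        yield features, label

dataset = tf.data.Dataset.from_generator(
    toy_gen,
    output_types=((tf.float32, tf.float32), tf.int64),
    output_shapes=(([], []), []))

for features, label in dataset:
    print(features, label.numpy())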
Example #4
    def test_generate_fetch_size(self):
        driver = os.environ.get('SQLFLOW_TEST_DB')
        if driver == "mysql":
            database = "iris"
            user = os.environ.get('SQLFLOW_TEST_DB_MYSQL_USER') or "root"
            password = os.environ.get('SQLFLOW_TEST_DB_MYSQL_PASSWD') or "root"
            conn = connect(driver,
                           database,
                           user=user,
                           password=password,
                           host="127.0.0.1",
                           port="3306")
            column_name_to_type = {
                "sepal_length": {
                    "feature_name": "sepal_length",
                    "delimiter": "",
                    "dtype": "float32",
                    "is_sparse": False,
                    "shape": []
                }
            }
            gen = db_generator(driver,
                               conn,
                               'SELECT * FROM iris.train limit 10',
                               ["sepal_length"],
                               "class",
                               column_name_to_type,
                               fetch_size=4)
            self.assertEqual(len([g for g in gen()]), 10)
Example #5
        def eval_input_fn(batch_size, cache=False):
            feature_types = []
            for name in feature_column_names:
                # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
                if feature_metas[name]["is_sparse"]:
                    feature_types.append((tf.int64, tf.int32, tf.int64))
                else:
                    feature_types.append(
                        get_dtype(feature_metas[name]["dtype"]))

            gen = db_generator(conn.driver, conn, select, feature_column_names,
                               label_meta["feature_name"], feature_metas)

            dataset = tf.data.Dataset.from_generator(
                gen,
                (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])))
            ds_mapper = functools.partial(
                parse_sparse_feature,
                feature_column_names=feature_column_names,
                feature_metas=feature_metas)
            dataset = dataset.map(ds_mapper).batch(batch_size)
            if cache:
                dataset = dataset.cache(
                    "cache/predict" if TF_VERSION_2 else "")
            return dataset
Example #6
    def eval_input_fn(batch_size, cache=False):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))

        if is_pai:
            pai_table_parts = pai_table.split(".")
            formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                           pai_table_parts[1])
            gen = db.pai_maxcompute_db_generator(formatted_pai_table,
                                                 feature_column_names, None,
                                                 feature_metas)
        else:
            gen = db.db_generator(conn.driver, conn, select,
                                  feature_column_names, None, feature_metas)
        dataset = tf.data.Dataset.from_generator(gen, (tuple(feature_types), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset
Example #7
    def test_generate_fetch_size(self):
        driver = os.environ.get('SQLFLOW_TEST_DB')
        if driver == "mysql":
            user, password, host, port, database = testing_mysql_cfg()
            conn = connect(driver,
                           database,
                           user=user,
                           password=password,
                           host=host,
                           port=port)
            column_name_to_type = {
                "sepal_length": {
                    "feature_name": "sepal_length",
                    "delimiter": "",
                    "dtype": "float32",
                    "is_sparse": False,
                    "shape": []
                }
            }
            label_spec = {
                "feature_name": "label",
                "shape": [],
                "delimiter": ""
            }
            gen = db_generator(driver,
                               conn,
                               'SELECT * FROM iris.train limit 10',
                               ["sepal_length"],
                               label_spec,
                               column_name_to_type,
                               fetch_size=4)
            self.assertEqual(len([g for g in gen()]), 10)
Example #8
File: predict.py Project: af3dgce/sqlflow
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, label_meta,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass, is_pai, pai_table):
    classifier = estimator(**model_params)
    conn = connect_with_data_source(datasource)

    def fast_input_fn(generator):
        feature_types = []
        for name in feature_column_names:
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))

        def _inner_input_fn():
            if is_pai:
                dataset = pai_maxcompute_input_fn(pai_table, datasource,
                                                  feature_column_names,
                                                  feature_metas, label_meta)
            else:
                dataset = tf.data.Dataset.from_generator(
                    generator,
                    (tuple(feature_types), eval(
                        "tf.%s" % label_meta["dtype"])))
                ds_mapper = functools.partial(
                    parse_sparse_feature,
                    feature_column_names=feature_column_names,
                    feature_metas=feature_metas)
                dataset = dataset.map(ds_mapper)
            dataset = dataset.batch(1).cache()
            iterator = dataset.make_one_shot_iterator()
            features = iterator.get_next()
            return features

        return _inner_input_fn

    column_names = feature_column_names[:]
    column_names.append(label_meta["feature_name"])
    fast_predictor = FastPredict(classifier, fast_input_fn)

    with buffered_db_writer(conn.driver, conn, result_table, column_names, 100,
                            hdfs_namenode_addr, hive_location, hdfs_user,
                            hdfs_pass) as w:
        for features in db_generator(conn.driver, conn, select,
                                     feature_column_names,
                                     label_meta["feature_name"],
                                     feature_metas)():
            result = fast_predictor.predict(features)
            row = []
            for idx, _ in enumerate(feature_column_names):
                val = features[0][idx][0]
                row.append(str(val))
            if "class_ids" in list(result)[0]:
                row.append(str(list(result)[0]["class_ids"][0]))
            else:
                # regression predictions
                row.append(str(list(result)[0]["predictions"][0]))
            w.write(row)
Example #9
File: train.py Project: ChenPufeng/sqlflow
def xgb_dataset(conn, fn, dataset_sql, feature_column_name, label_name, feature_spec):
    gen = db_generator(conn.driver, conn, dataset_sql, feature_column_name, label_name, feature_spec)
    with open(fn, 'w') as f:
        for item in gen():
            features, label = item
            row_data = [str(label[0])] + ["%d:%f" % (i, v) for i, v in enumerate(features)]
            f.write("\t".join(row_data) + "\n")
    # TODO(yancey1989): generate group and weight text file if necessary
    return xgb.DMatrix(fn)
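
The loop above writes one line per row in a LibSVM-like layout: the label first, then index:value pairs, joined by tabs. A small standalone sketch of that dump step with hard-coded rows in place of db_generator output (the file name and rows are illustrative):

rows = [((5.1, 3.5), [0]), ((6.2, 2.9), [1])]

with open("train.txt", "w") as f:
    for features, label in rows:
        row_data = [str(label[0])] + [
            "%d:%f" % (i, v) for i, v in enumerate(features)
        ]
        f.write("\t".join(row_data) + "\n")

# The file can then be loaded with xgb.DMatrix("train.txt") as above;
# recent xgboost releases may expect an explicit "train.txt?format=libsvm" URI.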
Example #10
File: input_fn.py Project: lzj000/sqlflow
def input_fn(select,
             datasource,
             feature_column_names,
             feature_metas,
             label_meta,
             is_pai=False,
             pai_table="",
             num_workers=1,
             worker_id=0):
    feature_types = []
    shapes = []
    for name in feature_column_names:
        # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
        if feature_metas[name]["is_sparse"]:
            feature_types.append((tf.int64, tf.int32, tf.int64))
            shapes.append((None, None, None))
        else:
            feature_types.append(get_dtype(feature_metas[name]["dtype"]))
            shapes.append(feature_metas[name]["shape"])
    if is_pai:
        pai_table = "odps://{}/tables/{}".format(*pai_table.split("."))
        return pai_dataset(pai_table,
                           feature_column_names,
                           label_meta,
                           feature_metas,
                           slice_id=worker_id,
                           slice_count=num_workers)
    else:
        conn = db.connect_with_data_source(datasource)
        gen = db.db_generator(conn.driver, conn, select, feature_column_names,
                              label_meta, feature_metas)
        selected_cols = db.selected_cols(conn.driver, conn, select)

    gen = tf_generator(gen, selected_cols, feature_column_names, feature_metas)

    # Clustering models do not have a label
    if not label_meta or label_meta["feature_name"] == "":
        dataset = tf.data.Dataset.from_generator(gen, (tuple(feature_types), ),
                                                 (tuple(shapes), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    else:
        dataset = tf.data.Dataset.from_generator(
            gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])),
            (tuple(shapes), label_meta["shape"]))
        ds_mapper = functools.partial(
            parse_sparse_feature,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    return dataset.map(ds_mapper)
Example #11
def xgb_shap_dataset(datasource, select, feature_column_names, label_name,
                     feature_specs):
    conn = connect_with_data_source(datasource)
    stream = db_generator(conn.driver, conn, select, feature_column_names,
                          label_name, feature_specs)
    xs = pd.DataFrame(columns=feature_column_names)
    ys = pd.DataFrame(columns=[label_name])
    i = 0
    for row in stream():
        xs.loc[i] = [item[0] for item in row[0]]
        ys.loc[i] = row[1]
        i += 1
    return xs
Example #12
def xgb_dataset(datasource,
                fn,
                dataset_sql,
                feature_specs,
                feature_column_names,
                label_spec,
                is_pai=False,
                pai_table="",
                pai_single_file=False,
                cache=False,
                batch_size=None,
                epoch=1,
                rank=0,
                nworkers=1):
    if is_pai:
        for dmatrix in pai_dataset(
                fn,
                feature_specs,
                feature_column_names,
                label_spec,
                "odps://{}/tables/{}".format(*pai_table.split(".")),
                pai_single_file,
                cache,
                rank,
                nworkers,
                batch_size=batch_size):
            yield dmatrix
        return

    conn = db.connect_with_data_source(datasource)
    gen = db.db_generator(conn.driver, conn, dataset_sql, feature_column_names,
                          label_spec, feature_specs)()

    selected_cols = db.selected_cols(conn.driver, conn, dataset_sql)
    for i in range(epoch):
        step = 0
        # the filename per batch is [filename]_[step]
        step_file_name = "%s_%d" % (fn, step)
        written_rows = dump_dmatrix(step_file_name, gen, feature_column_names,
                                    feature_specs, label_spec, selected_cols)

        while written_rows > 0:
            yield load_dmatrix('{0}#{0}.cache'.format(step_file_name)
                               if cache else step_file_name)
            os.remove(step_file_name)

            step += 1
            step_file_name = "%s_%d" % (fn, step)
            written_rows = dump_dmatrix(step_file_name, gen,
                                        feature_column_names, feature_specs,
                                        label_spec, selected_cols)
Example #13
def input_fn(select,
             datasource,
             feature_column_names,
             feature_metas,
             label_meta,
             is_pai=False,
             pai_table=""):
    feature_types = []
    shapes = []
    for name in feature_column_names:
        # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
        if feature_metas[name]["is_sparse"]:
            feature_types.append((tf.int64, tf.int32, tf.int64))
            shapes.append((None, None, None))
        else:
            feature_types.append(get_dtype(feature_metas[name]["dtype"]))
            shapes.append(feature_metas[name]["shape"])
    if is_pai:
        pai_table_parts = pai_table.split(".")
        formated_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                      pai_table_parts[1])
        gen = pai_maxcompute_db_generator(formated_pai_table,
                                          feature_column_names,
                                          label_meta["feature_name"],
                                          feature_metas)
    else:
        conn = connect_with_data_source(datasource)
        gen = db_generator(conn.driver, conn, select, feature_column_names,
                           label_meta, feature_metas)
    # Clustering models do not have a label
    if label_meta["feature_name"] == "":
        dataset = tf.data.Dataset.from_generator(gen, (tuple(feature_types), ),
                                                 (tuple(shapes), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    else:
        dataset = tf.data.Dataset.from_generator(
            gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])),
            (tuple(shapes), label_meta["shape"]))
        ds_mapper = functools.partial(
            parse_sparse_feature,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    return dataset.map(ds_mapper)
Example #14
    def test_generator(self):
        driver = os.environ.get('SQLFLOW_TEST_DB')
        if driver == "mysql":
            database = "iris"
            user, password, host, port, database = testing_mysql_cfg()
            conn = connect(driver,
                           database,
                           user=user,
                           password=password,
                           host=host,
                           port=int(port))
            # prepare test data
            execute(driver, conn, self.drop_statement)
            execute(driver, conn, self.create_statement)
            execute(driver, conn, self.insert_statement)

            column_name_to_type = {
                "features": {
                    "feature_name": "features",
                    "delimiter": "",
                    "dtype": "float32",
                    "is_sparse": False,
                    "shape": []
                }
            }
            label_spec = {
                "feature_name": "label",
                "shape": [],
                "delimiter": ""
            }
            gen = db_generator(driver, conn,
                               "SELECT * FROM test_table_float_fea",
                               ["features"], label_spec, column_name_to_type)
            idx = 0
            for row, label in gen():
                features = read_features_from_row(row, ["features"],
                                                  ["features"],
                                                  column_name_to_type)
                d = (features, label)
                if idx == 0:
                    self.assertEqual(d, (((1.0, ), ), 0))
                elif idx == 1:
                    self.assertEqual(d, (((2.0, ), ), 1))
                idx += 1
            self.assertEqual(idx, 2)
Example #15
    def input_fn(datasetStr):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))

        gen = db_generator(conn.driver, conn, datasetStr, feature_column_names,
                           label_meta["feature_name"], feature_metas)
        dataset = tf.data.Dataset.from_generator(
            gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])))
        ds_mapper = functools.partial(
            parse_sparse_feature,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        return dataset.map(ds_mapper)
Example #16
File: dataset.py Project: xhcom-ui/sqlflow
def xgb_dataset(datasource,
                fn,
                dataset_sql,
                feature_specs,
                feature_column_names,
                label_spec,
                is_pai=False,
                pai_table="",
                pai_single_file=False):

    if is_pai:
        pai_dataset(fn, feature_specs, feature_column_names, label_spec,
                    "odps://{}/tables/{}".format(*pai_table.split(".")),
                    pai_single_file)
    else:
        conn = db.connect_with_data_source(datasource)
        gen = db.db_generator(conn.driver, conn, dataset_sql,
                              feature_column_names, label_spec, feature_specs)
        dump_dmatrix(fn, gen, label_spec)
    return xgb.DMatrix(fn)
Example #17
def xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                     feature_specs, is_pai, pai_explain_table):
    label_column_name = label_spec["feature_name"]
    if is_pai:
        pai_table_parts = pai_explain_table.split(".")
        formated_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                      pai_table_parts[1])
        stream = pai_maxcompute_db_generator(formated_pai_table,
                                             feature_column_names,
                                             label_column_name, feature_specs)
    else:
        conn = connect_with_data_source(datasource)
        stream = db_generator(conn.driver, conn, select, feature_column_names,
                              label_spec, feature_specs)

    xs = pd.DataFrame(columns=feature_column_names)
    i = 0
    for row in stream():
        xs.loc[i] = [item[0] for item in row[0]]
        i += 1
    return xs
Example #18
def xgb_dataset(datasource,
                fn,
                dataset_sql,
                feature_metas,
                feature_column_names,
                label_meta,
                is_pai=False,
                pai_table=""):

    if is_pai:
        pai_table_parts = pai_table.split(".")
        formated_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                      pai_table_parts[1])
        if label_meta:
            label_column_name = label_meta['feature_name']
        else:
            label_column_name = None
        gen = pai_maxcompute_db_generator(formated_pai_table,
                                          feature_column_names,
                                          label_column_name, feature_metas)
    else:
        conn = connect_with_data_source(datasource)
        gen = db_generator(conn.driver, conn, dataset_sql,
                           feature_column_names, label_meta, feature_metas)
    with open(fn, 'w') as f:
        for item in gen():
            if label_meta is None:
                row_data = ["%d:%f" % (i, v[0]) for i, v in enumerate(item[0])]
            else:
                features, label = item
                row_data = [str(label)] + [
                    "%d:%f" % (i, v[0]) for i, v in enumerate(features)
                ]
            f.write("\t".join(row_data) + "\n")
    # TODO(yancey1989): generate group and weight text file if necessary
    return xgb.DMatrix(fn)
Example #19
def xgb_dataset(datasource,
                fn,
                dataset_sql,
                feature_specs,
                feature_column_names,
                label_spec,
                is_pai=False,
                pai_table="",
                pai_single_file=False,
                cache=False,
                batch_size=None,
                epoch=1,
                rank=0,
                nworkers=1,
                transform_fn=None,
                feature_column_code="",
                raw_data_dir=None):
    if raw_data_dir:
        # raw_data_dir is needed when predicting, because we should
        # write the raw data from the source db into the dest db,
        # not the transformed data produced by `transform_fn(features)`.
        # If raw_data_dir is not None, the raw data from the source db
        # is written into a separate file.
        if os.path.exists(raw_data_dir):
            shutil.rmtree(raw_data_dir, ignore_errors=True)

        os.mkdir(raw_data_dir)

    if is_pai:
        for dmatrix in pai_dataset(
                fn,
                feature_specs,
                feature_column_names,
                label_spec,
                "odps://{}/tables/{}".format(*pai_table.split(".")),
                pai_single_file,
                cache,
                rank,
                nworkers,
                batch_size=batch_size,
                feature_column_code=feature_column_code,
                raw_data_dir=raw_data_dir):
            yield dmatrix
        return

    conn = db.connect_with_data_source(datasource)
    gen = db.db_generator(conn.driver, conn, dataset_sql, feature_column_names,
                          label_spec, feature_specs)()

    selected_cols = db.selected_cols(conn.driver, conn, dataset_sql)
    for _ in six.moves.range(epoch):
        step = 0
        # the filename per batch is [filename]_[step]
        step_file_name = "%s_%d" % (fn, step)
        written_rows = dump_dmatrix(step_file_name,
                                    gen,
                                    feature_column_names,
                                    feature_specs,
                                    label_spec,
                                    selected_cols,
                                    transform_fn=transform_fn,
                                    raw_data_dir=raw_data_dir)

        while written_rows > 0:
            yield load_dmatrix('{0}#{0}.cache'.format(step_file_name)
                               if cache else step_file_name)
            os.remove(step_file_name)

            step += 1
            step_file_name = "%s_%d" % (fn, step)
            written_rows = dump_dmatrix(step_file_name,
                                        gen,
                                        feature_column_names,
                                        feature_specs,
                                        label_spec,
                                        selected_cols,
                                        transform_fn=transform_fn,
                                        raw_data_dir=raw_data_dir)
Example #20
File: predict.py Project: wangjili/sqlflow
def pred(is_keras_model,
         datasource,
         estimator,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_metas={},
         label_meta={},
         model_params={},
         save="",
         batch_size=1,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass=""):
    conn = connect_with_data_source(datasource)
    model_params.update(feature_columns)
    if not is_keras_model:
        model_params['model_dir'] = save
        classifier = estimator(**model_params)
    else:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        classifier = estimator(**model_params)
        classifier_pkg = sys.modules[estimator.__module__]

    if is_keras_model:

        def eval_input_fn(batch_size, cache=False):
            feature_types = []
            for name in feature_column_names:
                # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
                if feature_metas[name]["is_sparse"]:
                    feature_types.append((tf.int64, tf.int32, tf.int64))
                else:
                    feature_types.append(
                        get_dtype(feature_metas[name]["dtype"]))

            gen = db_generator(conn.driver, conn, select, feature_column_names,
                               label_meta["feature_name"], feature_metas)

            dataset = tf.data.Dataset.from_generator(
                gen,
                (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])))
            ds_mapper = functools.partial(
                parse_sparse_feature,
                feature_column_names=feature_column_names,
                feature_metas=feature_metas)
            dataset = dataset.map(ds_mapper).batch(batch_size)
            if cache:
                dataset = dataset.cache()
            return dataset

        # NOTE: always use batch_size=1 when predicting to get the pairs of features and predict results
        #       to insert into result table.
        pred_dataset = eval_input_fn(1)
        one_batch = pred_dataset.__iter__().next()
        # NOTE: must run predict one batch to initialize parameters
        # see: https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models
        classifier.predict_on_batch(one_batch[0])
        classifier.load_weights(save)
        del pred_dataset
        pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()
        buff_rows = []
        column_names = feature_column_names[:]
        column_names.append(label_meta["feature_name"])
        with buffered_db_writer(conn.driver, conn, result_table, column_names,
                                100, hdfs_namenode_addr, hive_location,
                                hdfs_user, hdfs_pass) as w:
            while True:
                try:
                    features = pred_dataset.get_next()
                except tf.errors.OutOfRangeError:
                    break
                result = classifier.predict_on_batch(features[0])
                result = classifier_pkg.prepare_prediction_column(result[0])
                row = []
                for idx, name in enumerate(feature_column_names):
                    val = features[0][name].numpy()[0]
                    row.append(str(val))
                row.append(str(result))
                w.write(row)
        del pred_dataset

    else:

        def fast_input_fn(generator):
            feature_types = []
            for name in feature_column_names:
                if feature_metas[name]["is_sparse"]:
                    feature_types.append((tf.int64, tf.int32, tf.int64))
                else:
                    feature_types.append(
                        get_dtype(feature_metas[name]["dtype"]))

            def _inner_input_fn():
                dataset = tf.data.Dataset.from_generator(
                    generator,
                    (tuple(feature_types), eval(
                        "tf.%s" % label_meta["dtype"])))
                ds_mapper = functools.partial(
                    parse_sparse_feature,
                    feature_column_names=feature_column_names,
                    feature_metas=feature_metas)
                dataset = dataset.map(ds_mapper).batch(1).cache()
                iterator = dataset.make_one_shot_iterator()
                features = iterator.get_next()
                return features

            return _inner_input_fn

        column_names = feature_column_names[:]
        column_names.append(label_meta["feature_name"])
        pred_gen = db_generator(conn.driver, conn, select,
                                feature_column_names,
                                label_meta["feature_name"], feature_metas)()
        fast_predictor = FastPredict(classifier, fast_input_fn)

        with buffered_db_writer(conn.driver, conn, result_table, column_names,
                                100, hdfs_namenode_addr, hive_location,
                                hdfs_user, hdfs_pass) as w:
            while True:
                try:
                    features = next(pred_gen)
                except StopIteration:
                    break
                result = fast_predictor.predict(features)
                row = []
                for idx, _ in enumerate(feature_column_names):
                    val = features[0][idx]
                    row.append(str(val))
                if "class_ids" in list(result)[0]:
                    row.append(str(list(result)[0]["class_ids"][0]))
                else:
                    # regression predictions
                    row.append(str(list(result)[0]["predictions"][0]))
                w.write(row)
        fast_predictor.close()

    print("Done predicting. Predict table : %s" % result_table)
Example #21
def pred(is_keras_model,
         datasource,
         estimator,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_metas={},
         label_meta={},
         model_params={},
         save="",
         batch_size=1,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         is_pai=False,
         pai_table=""):
    global FLAGS
    define_tf_flags()
    if not is_pai:
        conn = connect_with_data_source(datasource)
    model_params.update(feature_columns)

    if is_keras_model:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        classifier = estimator(**model_params)
        classifier_pkg = sys.modules[estimator.__module__]

        def eval_input_fn(batch_size, cache=False):
            feature_types = []
            for name in feature_column_names:
                # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
                if feature_metas[name]["is_sparse"]:
                    feature_types.append((tf.int64, tf.int32, tf.int64))
                else:
                    feature_types.append(get_dtype(feature_metas[name]["dtype"]))

            gen = db_generator(conn.driver, conn, select, feature_column_names,
                               label_meta["feature_name"], feature_metas)

            dataset = tf.data.Dataset.from_generator(gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])))
            ds_mapper = functools.partial(parse_sparse_feature, feature_column_names=feature_column_names, feature_metas=feature_metas)
            dataset = dataset.map(ds_mapper).batch(batch_size)
            if cache:
                dataset = dataset.cache()
            return dataset

        # NOTE: always use batch_size=1 when predicting to get the pairs of features and predict results
        #       to insert into result table.
        pred_dataset = eval_input_fn(1)
        one_batch = pred_dataset.__iter__().next()
        # NOTE: must run predict one batch to initialize parameters
        # see: https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models
        classifier.predict_on_batch(one_batch[0])
        classifier.load_weights(save)
        del pred_dataset
        pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()
        buff_rows = []
        column_names = feature_column_names[:]
        column_names.append(label_meta["feature_name"])
        with buffered_db_writer(conn.driver, conn, result_table, column_names, 100, hdfs_namenode_addr, hive_location, hdfs_user, hdfs_pass) as w:
            while True:
                try:
                    features = pred_dataset.get_next()
                except tf.errors.OutOfRangeError:
                    break
                result = classifier.predict_on_batch(features[0])
                result = classifier_pkg.prepare_prediction_column(result[0])
                row = []
                for idx, name in enumerate(feature_column_names):
                    val = features[0][name].numpy()[0]
                    row.append(str(val))
                row.append(str(result))
                w.write(row)
        del pred_dataset

    else:
        if is_pai:
            model_params["model_dir"] = FLAGS.checkpointDir
        else:
            model_params['model_dir'] = save
        classifier = estimator(**model_params)

        # FIXME(typhoonzero): copied from train.py
        def pai_maxcompute_input_fn():
            table_parts = pai_table.split(".")
            if len(table_parts) == 2:
                database, table_name = table_parts
            elif len(table_parts) == 1:
                table_name = pai_table
                driver, dsn = datasource.split("://")
                database = parseMaxComputeDSN(dsn)[-1]
            else:
                raise ValueError("error database.table format: %s" % pai_table)

            tables = ["odps://%s/tables/%s" % (database, table_name)]
            record_defaults = []
            for name in feature_column_names:
                dtype = get_dtype(feature_metas[name]["dtype"])
                record_defaults.append(tf.constant(0, dtype=dtype, shape=feature_metas[name]["shape"]))

            dataset = tf.data.TableRecordDataset(tables,
                                        record_defaults=record_defaults,
                                        selected_cols=",".join(feature_column_names))
            def tensor_to_dict(*args):
                num_features = len(feature_column_names)
                features_dict = dict()
                for idx in range(num_features):
                    name = feature_column_names[idx]
                    features_dict[name] = tf.reshape(args[idx], [-1])
                return features_dict

            return dataset.map(tensor_to_dict)

        def fast_input_fn(generator):
            feature_types = []
            for name in feature_column_names:
                if feature_metas[name]["is_sparse"]:
                    feature_types.append((tf.int64, tf.int32, tf.int64))
                else:
                    feature_types.append(get_dtype(feature_metas[name]["dtype"]))

            def _inner_input_fn():
                if is_pai:
                    dataset = pai_maxcompute_input_fn()
                else:
                    dataset = tf.data.Dataset.from_generator(generator, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])))
                    ds_mapper = functools.partial(parse_sparse_feature, feature_column_names=feature_column_names, feature_metas=feature_metas)
                    dataset = dataset.map(ds_mapper)
                dataset = dataset.batch(1).cache()
                iterator = dataset.make_one_shot_iterator()
                features = iterator.get_next()
                return features

            return _inner_input_fn


        column_names = feature_column_names[:]
        column_names.append(label_meta["feature_name"])
        pred_gen = db_generator(conn.driver, conn, select, feature_column_names, label_meta["feature_name"], feature_metas)()
        fast_predictor = FastPredict(classifier, fast_input_fn)

        with buffered_db_writer(conn.driver, conn, result_table, column_names, 100, hdfs_namenode_addr, hive_location, hdfs_user, hdfs_pass) as w:
            while True:
                try:
                    features = next(pred_gen)
                except StopIteration:
                    break
                result = fast_predictor.predict(features)
                row = []
                for idx, _ in enumerate(feature_column_names):
                    val = features[0][idx]
                    row.append(str(val))
                if "class_ids" in list(result)[0]:
                    row.append(str(list(result)[0]["class_ids"][0]))
                else:
                    # regression predictions
                    row.append(str(list(result)[0]["predictions"][0]))
                w.write(row)
        fast_predictor.close()

    print("Done predicting. Predict table : %s" % result_table)
Example #22
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_column_names_map,
                      feature_columns, feature_metas, result_col_name,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass, is_pai, pai_table):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)

    column_names = feature_column_names[:]
    column_names.append(result_col_name)

    if is_pai:
        driver = "pai_maxcompute"
        conn = None
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        predict_generator = db.pai_maxcompute_db_generator(
            formatted_pai_table, feature_column_names, None, feature_metas)()
    else:
        driver = conn.driver
        predict_generator = db.db_generator(conn.driver, conn, select,
                                            feature_column_names, None,
                                            feature_metas)()
    # load from the exported model
    with open("exported_path", "r") as fn:
        export_path = fn.read()
    if tf_is_version2():
        imported = tf.saved_model.load(export_path)
    else:
        imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["is_sparse"]:
                # NOTE(typhoonzero): sparse feature will get (indices,values,shape) here, use indices only
                values = x[0][i][0].flatten()
            else:
                values = x[0][i].flatten()
            if dtype_str == "float32" or dtype_str == "float64":
                example.features.feature[feature_name].float_list.value.extend(
                    list(values))
            elif dtype_str == "int32" or dtype_str == "int64":
                example.features.feature[feature_name].int64_list.value.extend(
                    list(values))
        else:
            if "feature_columns" in feature_columns:
                idx = feature_column_names.index(feature_name)
                fc = feature_columns["feature_columns"][idx]
            else:
                # DNNLinearCombinedXXX have dnn_feature_columns and linear_feature_columns param.
                idx = -1
                try:
                    idx = feature_column_names_map[
                        "dnn_feature_columns"].index(feature_name)
                    fc = feature_columns["dnn_feature_columns"][idx]
                except:
                    try:
                        idx = feature_column_names_map[
                            "linear_feature_columns"].index(feature_name)
                        fc = feature_columns["linear_feature_columns"][idx]
                    except:
                        pass
                if idx == -1:
                    raise ValueError(
                        "cannot find feature %s in all feature columns" %
                        feature_name)
            if dtype_str == "float32" or dtype_str == "float64":
                # need to pass a tuple(float, )
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "int32" or dtype_str == "int64":
                numeric_type = type(tf.feature_column.numeric_column("tmp"))
                if type(fc) == numeric_type:
                    example.features.feature[
                        feature_name].float_list.value.extend(
                            (float(x[0][i][0]), ))
                else:
                    example.features.feature[
                        feature_name].int64_list.value.extend(
                            (int(x[0][i][0]), ))
            elif dtype_str == "string":
                example.features.feature[feature_name].bytes_list.value.extend(
                    x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with db.buffered_db_writer(driver, conn, result_table, column_names, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for features in predict_generator:
            result = predict(features)
            row = []
            for idx, _ in enumerate(feature_column_names):
                per_feature = features[0][idx]
                if isinstance(per_feature, tuple) or isinstance(
                        per_feature, list):
                    # is sparse feature: tuple (indices, values, shape) or scalar
                    val = per_feature[0]
                elif isinstance(per_feature, np.ndarray):
                    val = per_feature
                # val = features[0][idx][0]
                row.append(str(val))
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
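
The add_to_example helper above fills a tf.train.Example protocol buffer field by field, and the serialized bytes are then fed to the exported model's predict signature. A minimal standalone sketch of that proto-building step (feature names and values are made up for illustration):

import tensorflow as tf

# Pick the float_list / int64_list / bytes_list slot that matches the dtype
# and extend it with the raw values, as add_to_example does.
example = tf.train.Example()
example.features.feature["sepal_length"].float_list.value.extend((5.1,))
example.features.feature["class"].int64_list.value.extend((0,))
example.features.feature["name"].bytes_list.value.extend((b"iris",))

serialized = example.SerializeToString()
# The exported estimator's "predict" signature then takes a batch of these:
# imported.signatures["predict"](examples=tf.constant([serialized]))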
Example #23
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, result_col_name,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass, is_pai, pai_table):
    if not is_pai:
        conn = connect_with_data_source(datasource)

    column_names = feature_column_names[:]
    column_names.append(result_col_name)

    if is_pai:
        driver = "pai_maxcompute"
        conn = None
        pai_table_parts = pai_table.split(".")
        formated_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                      pai_table_parts[1])
        predict_generator = pai_maxcompute_db_generator(
            formated_pai_table, feature_column_names, None, feature_metas)()
    else:
        driver = conn.driver
        predict_generator = db_generator(conn.driver, conn, select,
                                         feature_column_names, None,
                                         feature_metas)()
    # load from the exported model
    if save.startswith("oss://"):
        with open("exported_path", "r") as fn:
            export_path = fn.read()
        parts = save.split("?")
        export_path_oss = parts[0] + export_path
        if TF_VERSION_2:
            imported = tf.saved_model.load(export_path_oss)
        else:
            imported = tf.saved_model.load_v2(export_path_oss)
    else:
        with open("exported_path", "r") as fn:
            export_path = fn.read()
        if TF_VERSION_2:
            imported = tf.saved_model.load(export_path)
        else:
            imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["is_sparse"]:
                # NOTE(typhoonzero): sparse feature will get (indices,values,shape) here, use indices only
                values = x[0][i][0].flatten()
            else:
                values = x[0][i].flatten()
            if dtype_str == "float32" or dtype_str == "float64":
                example.features.feature[feature_name].float_list.value.extend(
                    list(values))
            elif dtype_str == "int32" or dtype_str == "int64":
                example.features.feature[feature_name].int64_list.value.extend(
                    list(values))
        else:
            if dtype_str == "float32" or dtype_str == "float64":
                # need to pass a tuple(float, )
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "int32" or dtype_str == "int64":
                # FIXME(typhoonzero): figure out why int64 features need to convert to float
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "string":
                example.features.feature[feature_name].bytes_list.value.extend(
                    x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with buffered_db_writer(driver, conn, result_table, column_names, 100,
                            hdfs_namenode_addr, hive_location, hdfs_user,
                            hdfs_pass) as w:
        for features in predict_generator:
            result = predict(features)
            row = []
            for idx, _ in enumerate(feature_column_names):
                per_feature = features[0][idx]
                if isinstance(per_feature, tuple) or isinstance(
                        per_feature, list):
                    # is sparse feature: tuple (indices, values, shape) or scalar
                    val = per_feature[0]
                elif isinstance(per_feature, np.ndarray):
                    val = per_feature
                # val = features[0][idx][0]
                row.append(str(val))
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
Example #24
def xgb_shap_dataset(datasource,
                     select,
                     feature_column_names,
                     label_spec,
                     feature_specs,
                     is_pai,
                     pai_explain_table,
                     transform_fn=None,
                     feature_column_code=""):
    label_column_name = label_spec["feature_name"]
    if is_pai:
        pai_table_parts = pai_explain_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        stream = db.pai_maxcompute_db_generator(formatted_pai_table,
                                                feature_column_names,
                                                label_column_name,
                                                feature_specs)
        selected_cols = db.pai_selected_cols(formatted_pai_table)
    else:
        conn = db.connect_with_data_source(datasource)
        stream = db.db_generator(conn.driver, conn, select,
                                 feature_column_names, label_spec,
                                 feature_specs)
        selected_cols = db.selected_cols(conn.driver, conn, select)

    if transform_fn:
        column_names = transform_fn.get_column_names()
    else:
        column_names = feature_column_names

    # NOTE(sneaxiy): pandas.DataFrame does not support Tensor whose rank is larger than 2.
    # But `INDICATOR` would generate one hot vector for each element, and pandas.DataFrame
    # would not accept `INDICATOR` results as its input. In a word, we do not support
    # `TO EXPLAIN` when using `INDICATOR`.
    xs = pd.DataFrame(columns=column_names)

    dtypes = []

    i = 0
    for row, label in stream():
        features = db.read_features_from_row(row, selected_cols,
                                             feature_column_names,
                                             feature_specs)
        if transform_fn:
            features = transform_fn(features)

        # TODO(sneaxiy): support sparse features in `TO EXPLAIN`
        features = [item[0] for item in features]
        xs.loc[i] = features

        if i == 0:
            for f in features:
                if isinstance(f, np.ndarray):
                    if f.dtype == np.float32 or f.dtype == np.float64:
                        dtypes.append('float32')
                    elif f.dtype == np.int32 or f.dtype == np.int64:
                        dtypes.append('int64')
                    else:
                        raise ValueError('Not supported data type {}'.format(
                            f.dtype))
                elif isinstance(f, (np.float32, np.float64, float)):
                    dtypes.append('float32')
                elif isinstance(f, (np.int32, np.int64, six.integer_types)):
                    dtypes.append('int64')
                else:
                    raise ValueError('Not supported data type {}'.format(
                        type(f)))

        i += 1
    # NOTE(typhoonzero): set dtype to the feature's actual type, or the dtype
    # may be "object". Use below code to reproduce:
    # import pandas as pd
    # feature_column_names=["a", "b"]
    # xs = pd.DataFrame(columns=feature_column_names)
    # for i in range(10):
    #     xs.loc[i] = [int(j) for j in range(2)]
    # print(xs.dtypes)
    for dtype, name in zip(dtypes, column_names):
        xs[name] = xs[name].astype(dtype)
    return xs