Example #1
def pred(datasource,
         estimator_string,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_column_names_map,
         train_label_name,
         result_col_name,
         feature_metas={},
         model_params={},
         pred_params={},
         save="",
         batch_size=1,
         pai_table=""):
    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)

    if pai_table != "":
        conn = PaiIOConnection.from_table(pai_table)
        selected_cols = db.selected_cols(conn, None)
        predict_generator = db.db_generator(conn, None)
    else:
        conn = db.connect_with_data_source(datasource)
        selected_cols = db.selected_cols(conn, select)
        predict_generator = db.db_generator(conn, select)

    pop_optimizer_and_loss(model_params)

    if pred_params is None:
        extra_result_cols = []
    else:
        extra_result_cols = pred_params.get("extra_outputs", "")
        extra_result_cols = [
            c.strip() for c in extra_result_cols.split(",") if c.strip()
        ]

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      result_col_name, conn, predict_generator, selected_cols,
                      extra_result_cols)
    else:
        # TODO(sneaxiy): support extra_result_cols for estimator
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, result_col_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
Example #2
def pai_download_table_data_worker(dname, feature_metas, feature_column_names,
                                   label_meta, pai_table, slice_id,
                                   slice_count, feature_column_code,
                                   raw_data_dir):
    import runtime.xgboost as xgboost_extended
    if isinstance(feature_column_code, dict):
        # NOTE(typhoonzero): feature_column_code is a dict of
        # runtime.feature.column in refactored step code.
        feature_column_transformers = compile_ir_feature_columns(
            feature_column_code, EstimatorType.XGBOOST)
        transform_fn = \
            xgboost_extended.feature_column.ComposedColumnTransformer(
                feature_column_names,
                *feature_column_transformers["feature_columns"])
    else:
        feature_column_transformers = eval('[{}]'.format(feature_column_code))
        transform_fn = \
            xgboost_extended.feature_column.ComposedColumnTransformer(
                feature_column_names, *feature_column_transformers)

    conn = PaiIOConnection.from_table(pai_table, slice_id, slice_count)
    gen = db.db_generator(conn, None, label_meta=label_meta)()
    selected_cols = db.selected_cols(conn, None)
    filename = "{}/{}.txt".format(dname, slice_id)
    dump_dmatrix(filename,
                 gen,
                 feature_column_names,
                 feature_metas,
                 label_meta,
                 selected_cols,
                 transform_fn=transform_fn,
                 raw_data_dir=raw_data_dir)
Example #3
 def test_generate_fetch_size(self):
     label_meta = {"feature_name": "label", "shape": [], "delimiter": ""}
     gen = db_generator(testing.get_singleton_db_connection(),
                        'SELECT * FROM iris.train limit 10',
                        label_meta,
                        fetch_size=4)
     self.assertEqual(len([g for g in gen()]), 10)
Example #4
 def test_generate_fetch_size(self):
     driver = os.environ.get('SQLFLOW_TEST_DB')
     if driver == "mysql":
         user, password, host, port, database = testing_mysql_cfg()
         conn = connect(driver,
                        database,
                        user=user,
                        password=password,
                        host=host,
                        port=port)
         column_name_to_type = {
             "sepal_length": {
                 "feature_name": "sepal_length",
                 "delimiter": "",
                 "dtype": "float32",
                 "is_sparse": False,
                 "shape": []
             }
         }
         label_meta = {
             "feature_name": "label",
             "shape": [],
             "delimiter": ""
         }
         gen = db_generator(conn,
                            'SELECT * FROM iris.train limit 10',
                            label_meta,
                            fetch_size=4)
         self.assertEqual(len([g for g in gen()]), 10)
Example #5
    def test_generator(self):
        conn = connect(testing.get_datasource())
        # prepare test data
        conn.execute(self.drop_statement)
        conn.execute(self.create_statement)
        conn.execute(self.insert_statement)

        column_name_to_type = {
            "features": {
                "feature_name": "features",
                "delimiter": "",
                "dtype": "float32",
                "is_sparse": False,
                "shape": []
            }
        }
        label_meta = {"feature_name": "label", "shape": [], "delimiter": ""}
        gen = db_generator(conn, "SELECT * FROM test_table_float_fea",
                           label_meta)
        idx = 0
        for row, label in gen():
            features = read_features_from_row(row, ["features"], ["features"],
                                              column_name_to_type)
            d = (features, label)
            if idx == 0:
                self.assertEqual(d, (((1.0, ), ), 0))
            elif idx == 1:
                self.assertEqual(d, (((2.0, ), ), 1))
            idx += 1
        self.assertEqual(idx, 2)
Example #6
def input_fn(select,
             datasource,
             feature_column_names,
             feature_metas,
             label_meta,
             is_pai=False,
             pai_table="",
             num_workers=1,
             worker_id=0):
    feature_types = []
    shapes = []
    for name in feature_column_names:
        # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
        if feature_metas[name]["is_sparse"]:
            if feature_metas[name]["delimiter_kv"]:
                # extract two features from generator data.
                feature_types.append(
                    (get_dtype(feature_metas[name]["dtype"]),
                     get_dtype(feature_metas[name]["dtype_weight"]), tf.int64))
                shapes.append((None, None, None))
            else:
                feature_types.append((tf.int64, tf.int32, tf.int64))
                shapes.append((None, None, None))
        else:
            feature_types.append(get_dtype(feature_metas[name]["dtype"]))
            shapes.append(feature_metas[name]["shape"])
    if is_pai:
        pai_table = "odps://{}/tables/{}".format(*pai_table.split("."))
        return pai_dataset(pai_table,
                           feature_column_names,
                           label_meta,
                           feature_metas,
                           slice_id=worker_id,
                           slice_count=num_workers)
    else:
        conn = db.connect_with_data_source(datasource)
        gen = db.db_generator(conn, select, label_meta)
        selected_cols = db.selected_cols(conn, select)

    gen = tf_generator(gen, selected_cols, feature_column_names, feature_metas)

    # Clustering models do not have a label
    if not label_meta or label_meta["feature_name"] == "":
        dataset = tf.data.Dataset.from_generator(gen, (tuple(feature_types), ),
                                                 (tuple(shapes), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    else:
        dataset = tf.data.Dataset.from_generator(
            gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])),
            (tuple(shapes), label_meta["shape"]))
        ds_mapper = functools.partial(
            parse_sparse_feature,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    return dataset.map(ds_mapper)
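
For reference, here is a hypothetical set of metadata dictionaries shaped the way input_fn reads them; only the keys the function itself accesses are shown, and the names and values are illustrative rather than taken from the original code:

feature_column_names = ["sepal_length", "tag_ids"]
feature_metas = {
    "sepal_length": {"dtype": "float32", "is_sparse": False, "shape": [1]},
    "tag_ids": {"dtype": "int64", "dtype_weight": "float32",
                "delimiter_kv": "", "is_sparse": True, "shape": [None]},
}
label_meta = {"feature_name": "class", "dtype": "int64", "shape": [1]}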
Example #7
def pred(datasource,
         estimator_string,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_column_names_map,
         train_label_name,
         result_col_name,
         feature_metas={},
         model_params={},
         save="",
         batch_size=1,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass=""):
    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)

    conn = db.connect_with_data_source(datasource)
    driver = conn.driver
    predict_generator = db.db_generator(conn, select)
    selected_cols = db.selected_cols(conn, select)

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      result_col_name, driver, conn, predict_generator,
                      selected_cols, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(estimator, model_params, save, result_table,
                          feature_column_names, feature_column_names_map,
                          feature_columns, feature_metas, train_label_name,
                          result_col_name, driver, conn, predict_generator,
                          selected_cols, hdfs_namenode_addr, hive_location,
                          hdfs_user, hdfs_pass)

    print("Done predicting. Predict table : %s" % result_table)
Example #8
    def test_generator(self):
        driver = os.environ.get('SQLFLOW_TEST_DB')
        if driver == "mysql":
            database = "iris"
            user, password, host, port, database = testing_mysql_cfg()
            conn = connect(driver,
                           database,
                           user=user,
                           password=password,
                           host=host,
                           port=int(port))
            # prepare test data
            execute(driver, conn, self.drop_statement)
            execute(driver, conn, self.create_statement)
            execute(driver, conn, self.insert_statement)

            column_name_to_type = {
                "features": {
                    "feature_name": "features",
                    "delimiter": "",
                    "dtype": "float32",
                    "is_sparse": False,
                    "shape": []
                }
            }
            label_meta = {
                "feature_name": "label",
                "shape": [],
                "delimiter": ""
            }
            gen = db_generator(conn, "SELECT * FROM test_table_float_fea",
                               label_meta)
            idx = 0
            for row, label in gen():
                features = read_features_from_row(row, ["features"],
                                                  ["features"],
                                                  column_name_to_type)
                d = (features, label)
                if idx == 0:
                    self.assertEqual(d, (((1.0, ), ), 0))
                elif idx == 1:
                    self.assertEqual(d, (((2.0, ), ), 1))
                idx += 1
            self.assertEqual(idx, 2)
Example #9
def pai_download_table_data_worker(dname, feature_metas, feature_column_names,
                                   label_meta, pai_table, slice_id,
                                   slice_count, feature_column_code,
                                   raw_data_dir):
    import runtime.xgboost as xgboost_extended
    feature_column_transformers = eval('[{}]'.format(feature_column_code))
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *feature_column_transformers)

    conn = PaiIOConnection.from_table(pai_table, slice_id, slice_count)
    gen = db.db_generator(conn, None, label_meta=label_meta)()
    selected_cols = db.selected_cols(conn, None)
    filename = "{}/{}.txt".format(dname, slice_id)
    dump_dmatrix(filename,
                 gen,
                 feature_column_names,
                 feature_metas,
                 label_meta,
                 selected_cols,
                 transform_fn=transform_fn,
                 raw_data_dir=raw_data_dir)
Example #10
def _predict(datasource,
             estimator_string,
             select,
             result_table,
             feature_columns,
             feature_column_names,
             feature_column_names_map,
             train_label_name,
             result_col_name,
             feature_metas={},
             model_params={},
             save="",
             batch_size=1,
             pai_table=""):
    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)

    conn = PaiIOConnection.from_table(pai_table)
    selected_cols = db.selected_cols(conn, None)
    predict_generator = db.db_generator(conn, None)

    pop_optimizer_and_loss(model_params)

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      result_col_name, conn, predict_generator, selected_cols)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, result_col_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
Example #11
def fetch_samples(conn, query, n=1):
    '''
    Fetch n sample(s) at most according to the query statement.

    Args:
        conn: the connection object.
        query (str): the select SQL statement.
        n (int): the maximum sample number to query. Query all samples
            if n < 0.

    Returns:
        A generator which yields each row of the data.
    '''

    query = db.limit_select(query, n)
    gen = db.db_generator(conn, query)

    # Note: gen.field_names and gen.field_types are only available after
    # the iteration begins, so we fetch the first element from the
    # generator here and attach field_names and field_types to the
    # returned reader.
    gen_iter = iter(gen())
    rows = next(gen_iter, None)

    if rows is None:
        # No data was fetched, just return None
        return None

    def reader():
        r = rows
        while r is not None:
            # r = (row_data, label_data), and label_data is None here
            yield r[0]
            r = next(gen_iter, None)

    reader.field_names = gen.field_names
    reader.field_types = gen.field_types
    return reader
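
A hedged usage sketch for fetch_samples follows; it assumes the db helpers used by the other examples here and an existing datasource string, and the query and sample count are illustrative:

# Assumes `db`, `fetch_samples` and a `datasource` string are available,
# as in the surrounding runtime code; the query below is illustrative.
conn = db.connect_with_data_source(datasource)
reader = fetch_samples(conn, "SELECT * FROM iris.train", n=5)
if reader is not None:
    print(reader.field_names, reader.field_types)
    for row in reader():
        print(row)  # one tuple of column values per sample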
Example #12
def xgb_shap_dataset(datasource,
                     select,
                     feature_column_names,
                     label_meta,
                     feature_metas,
                     is_pai,
                     pai_explain_table,
                     transform_fn=None,
                     feature_column_code=""):
    if is_pai:
        # (TODO: lhw) we may specify pai_explain_table in datasource
        # and discard the condition statement here
        conn = PaiIOConnection.from_table(pai_explain_table)
        stream = db.db_generator(conn, None, label_meta)
    else:
        conn = db.connect_with_data_source(datasource)
        stream = db.db_generator(conn, select, label_meta)
    selected_cols = db.selected_cols(conn, select)

    if transform_fn:
        feature_names = transform_fn.get_feature_column_names()
    else:
        feature_names = feature_column_names

    xs = None
    dtypes = []
    sizes = []
    offsets = []

    i = 0
    for row, label in stream():
        features = db.read_features_from_row(row,
                                             selected_cols,
                                             feature_column_names,
                                             feature_metas,
                                             is_xgboost=True)
        if transform_fn:
            features = transform_fn(features)

        flatten_features = []
        for j, feature in enumerate(features):
            if len(feature) == 3:  # convert sparse to dense
                col_indices, values, dense_shape = feature
                size = int(np.prod(dense_shape))
                row_indices = np.zeros(shape=[col_indices.size])
                sparse_matrix = scipy.sparse.csr_matrix(
                    (values, (row_indices, col_indices)), shape=[1, size])
                values = sparse_matrix.toarray()
            else:
                values = feature[0]

            if isinstance(values, np.ndarray):
                flatten_features.extend(values.flatten().tolist())
                if i == 0:
                    sizes.append(values.size)
                    dtypes.append(infer_dtype(values))
            else:
                flatten_features.append(values)
                if i == 0:
                    sizes.append(1)
                    dtypes.append(infer_dtype(values))

        # Create the column name according to the feature number
        # of each column.
        #
        # If the column "c" contains only 1 feature, the result
        # column name would be "c" too.
        #
        # If the column "c" contains 3 features,
        # the result column name would be "c_0", "c_1" and "c_2"
        if i == 0:
            offsets = np.cumsum([0] + sizes)
            column_names = []
            for j in six.moves.range(len(offsets) - 1):
                start = offsets[j]
                end = offsets[j + 1]
                if end - start == 1:
                    column_names.append(feature_names[j])
                else:
                    for k in six.moves.range(start, end):
                        column_names.append('{}_{}'.format(
                            feature_names[j], k))

            xs = pd.DataFrame(columns=column_names)

        xs.loc[i] = flatten_features

        i += 1
    # NOTE(typhoonzero): set dtype to the feature's actual type, or the dtype
    # may be "object". Use below code to reproduce:
    # import pandas as pd
    # feature_column_names=["a", "b"]
    # xs = pd.DataFrame(columns=feature_column_names)
    # for i in range(10):
    #     xs.loc[i] = [int(j) for j in range(2)]
    # print(xs.dtypes)
    columns = xs.columns
    for i, dtype in enumerate(dtypes):
        for j in six.moves.range(offsets[i], offsets[i + 1]):
            xs[columns[j]] = xs[columns[j]].astype(dtype)

    return xs
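
The sparse-to-dense step above can be reproduced standalone; the indices, values, and dense shape below are illustrative:

import numpy as np
import scipy.sparse

# A sparse feature arrives as (col_indices, values, dense_shape) and is
# expanded into a single dense row, as in the loop above.
col_indices = np.array([1, 4])
values = np.array([0.5, 2.0], dtype=np.float32)
dense_shape = [8]

size = int(np.prod(dense_shape))
row_indices = np.zeros(col_indices.size, dtype=np.int64)
dense = scipy.sparse.csr_matrix(
    (values, (row_indices, col_indices)), shape=[1, size]).toarray()
print(dense)  # [[0.  0.5 0.  0.  2.  0.  0.  0. ]]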
Example #13
def predict_step(datasource,
                 select,
                 result_table,
                 label_name,
                 model,
                 pai_table=None):
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    label_meta = model.get_meta("label")
    train_label_desc = label_meta.get_field_desc()[0] if label_meta else None
    train_label_name = train_label_desc.name if train_label_desc else None
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    is_pai = True if pai_table else False
    if is_pai:
        select = "SELECT * FROM %s" % pai_table

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, label_name)

    if is_pai:
        conn.close()
        conn = PaiIOConnection.from_table(pai_table)
        select = None

    selected_cols = result_column_names[0:-1]
    if train_label_idx >= 0:
        selected_cols = selected_cols[0:train_label_idx] + [
            train_label_name
        ] + selected_cols[train_label_idx:]

    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)
    predict_generator = db.db_generator(conn, select)

    pop_optimizer_and_loss(model_params)
    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      label_name, conn, predict_generator, selected_cols)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, label_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
    conn.close()
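
The selected_cols adjustment above is plain list slicing; a standalone sketch of the mechanics with illustrative values:

# Illustrative values only: drop the appended result column, then put the
# training label name back at its original index when train_label_idx >= 0.
result_column_names = ["sepal_length", "sepal_width", "pred_class"]
train_label_name = "class"
train_label_idx = 1

selected_cols = result_column_names[0:-1]
if train_label_idx >= 0:
    selected_cols = (selected_cols[0:train_label_idx] + [train_label_name] +
                     selected_cols[train_label_idx:])
print(selected_cols)  # ['sepal_length', 'class', 'sepal_width']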
Example #14
def xgb_dataset(datasource,
                fn,
                dataset_sql,
                feature_metas,
                feature_column_names,
                label_meta,
                is_pai=False,
                pai_table="",
                pai_single_file=False,
                cache=False,
                batch_size=None,
                epoch=1,
                rank=0,
                nworkers=1,
                transform_fn=None,
                feature_column_code="",
                raw_data_dir=None):
    if raw_data_dir:
        # raw_data_dir is needed when predicting. Because we
        # should write the raw data from the source db into
        # the dest db, instead of the transformed data after
        # `transform_fn(features)` . If raw_data_dir is not
        # None, the raw data from the source db would be written
        # into another file.
        if os.path.exists(raw_data_dir):
            shutil.rmtree(raw_data_dir, ignore_errors=True)

        os.mkdir(raw_data_dir)

    if is_pai:
        for dmatrix in pai_dataset(fn,
                                   feature_metas,
                                   feature_column_names,
                                   label_meta,
                                   pai_table,
                                   pai_single_file,
                                   cache,
                                   rank,
                                   nworkers,
                                   batch_size=batch_size,
                                   feature_column_code=feature_column_code,
                                   raw_data_dir=raw_data_dir):
            yield dmatrix
        return

    conn = db.connect_with_data_source(datasource)
    gen = db.db_generator(conn, dataset_sql, label_meta)()

    selected_cols = db.selected_cols(conn, dataset_sql)
    for _ in six.moves.range(epoch):
        step = 0
        # the filename per batch is [filename]_[step]
        step_file_name = "%s_%d" % (fn, step)
        written_rows = dump_dmatrix(step_file_name,
                                    gen,
                                    feature_column_names,
                                    feature_metas,
                                    label_meta,
                                    selected_cols,
                                    transform_fn=transform_fn,
                                    raw_data_dir=raw_data_dir)

        while written_rows > 0:
            yield load_dmatrix('{0}#{0}.cache'.format(step_file_name)
                               if cache else step_file_name)
            os.remove(step_file_name)

            step += 1
            step_file_name = "%s_%d" % (fn, step)
            written_rows = dump_dmatrix(step_file_name,
                                        gen,
                                        feature_column_names,
                                        feature_metas,
                                        label_meta,
                                        selected_cols,
                                        transform_fn=transform_fn,
                                        raw_data_dir=raw_data_dir)
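
A hedged usage sketch of xgb_dataset: it yields one DMatrix per dumped batch file, so a caller could train incrementally on each chunk. The datasource, SQL, metadata variables, and hyperparameters below are assumptions for illustration:

import xgboost as xgb

# Assumes datasource, feature_metas, feature_column_names and label_meta
# are already prepared as in the other examples here.
booster = None
for dmatrix in xgb_dataset(datasource, "/tmp/train_data",
                           "SELECT * FROM iris.train", feature_metas,
                           feature_column_names, label_meta,
                           batch_size=1024):
    booster = xgb.train({"objective": "multi:softprob", "num_class": 3},
                        dmatrix, num_boost_round=10, xgb_model=booster)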
Example #15
    def test_generator(self):
        conn = connect(testing.get_datasource())
        # prepare test data
        conn.execute(self.drop_statement)
        conn.execute(self.create_statement)
        conn.execute(self.insert_statement)

        column_name_to_type = {
            "f1": {
                "feature_name": "f1",
                "delimiter": "",
                "dtype": "float32",
                "is_sparse": False,
                "shape": []
            },
            "f2": {
                "feature_name": "f2",
                "delimiter": "",
                "dtype": "int64",
                "is_sparse": False,
                "shape": []
            },
            "f3str": {
                "feature_name": "f3str",
                "delimiter": "",
                "dtype": "string",
                "is_sparse": False,
                "shape": []
            },
            "f4sparse": {
                "feature_name": "f4sparse",
                "delimiter": "",
                "dtype": "float32",
                "is_sparse": True,
                "shape": [],
                "format": "kv"
            },
            "f5dense": {
                "feature_name": "f5dense",
                "delimiter": ",",
                "dtype": "int64",
                "is_sparse": False,
                "shape": [3]
            }
        }
        label_meta = {"feature_name": "label", "shape": [], "delimiter": ""}
        gen = db_generator(conn, "SELECT * FROM test_table_float_fea",
                           label_meta)
        idx = 0
        for row, label in gen():
            if idx == 0:
                features = read_features_from_row(
                    row, ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                    ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                    column_name_to_type)
                self.assertEqual(1.0, features[0][0])
                self.assertEqual(1, features[1][0])
                self.assertEqual('a', features[2][0])
                self.assertTrue(
                    np.array_equal(np.array([[1], [2]]), features[3][0]))
                self.assertTrue(
                    np.array_equal(np.array([1., 2.], dtype=np.float32),
                                   features[3][1]))
                self.assertTrue(
                    np.array_equal(np.array([1, 2, 3]), features[4][0]))
                self.assertEqual(0, label)
            elif idx == 1:
                try:
                    features = read_features_from_row(
                        row, ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                        ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                        column_name_to_type)
                except Exception as e:
                    self.assertTrue(isinstance(e, ValueError))
                features = read_features_from_row(
                    row, ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                    ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                    column_name_to_type,
                    is_xgboost=True)
                self.assertEqual(XGBOOST_NULL_MAGIC, features[0][0])
                self.assertEqual(int(XGBOOST_NULL_MAGIC), features[1][0])
                self.assertEqual("", features[2][0])
                self.assertTrue(np.array_equal(np.array([]), features[3][0]))
                self.assertTrue(np.array_equal(np.array([]), features[3][1]))
                self.assertTrue(
                    np.array_equal(np.array([1, 2, 3]), features[4][0]))
                self.assertEqual(1, label)
            idx += 1
        self.assertEqual(idx, 2)
Example #16
def keras_predict(estimator, model_params, save, result_table, is_pai,
                  pai_table, feature_column_names, feature_metas,
                  train_label_name, result_col_name, datasource, select,
                  hdfs_namenode_addr, hive_location, hdfs_user, hdfs_pass):

    classifier = init_model_with_feature_column(estimator, model_params)
    classifier_pkg = sys.modules[estimator.__module__]
    conn = None
    if is_pai:
        driver = "pai_maxcompute"
    else:
        conn = db.connect_with_data_source(datasource)
        driver = conn.driver

    if is_pai:
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        gen = db.pai_maxcompute_db_generator(formatted_pai_table)
        selected_cols = feature_column_names
    else:
        gen = db.db_generator(conn, select)
        selected_cols = db.selected_cols(conn, select)

    def eval_input_fn(batch_size, cache=False):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))
        tf_gen = tf_generator(gen, selected_cols, feature_column_names,
                              feature_metas)
        dataset = tf.data.Dataset.from_generator(tf_gen,
                                                 (tuple(feature_types), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    # NOTE: always use batch_size=1 when predicting to get the pairs of
    #       features and predict results to insert into result table.
    pred_dataset = eval_input_fn(1)
    one_batch = next(iter(pred_dataset))
    # NOTE: must run predict one batch to initialize parameters
    # see: https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models
    classifier.predict_on_batch(one_batch)
    classifier.load_weights(save)
    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()

    column_names = selected_cols[:]
    train_label_index = selected_cols.index(train_label_name)
    if train_label_index != -1:
        del column_names[train_label_index]
    column_names.append(result_col_name)

    with db.buffered_db_writer(driver, conn, result_table, column_names, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for features in pred_dataset:
            result = classifier.predict_on_batch(features)
            # FIXME(typhoonzero): decide whether the result is a
            # classification by summing the outputs and checking whether
            # the sum is close to 1.0.
            if len(result[0]) == 1:  # regression result
                result = result[0][0]
            else:
                total = 0
                for i in result[0]:
                    total += i
                if np.isclose(total, 1.0):  # classification result
                    result = result[0].argmax(axis=-1)
                else:
                    result = result[0]  # multiple regression result
            row = []
            for idx, name in enumerate(feature_column_names):
                val = features[name].numpy()[0][0]
                row.append(str(val))
            if isinstance(result, np.ndarray):
                if len(result) > 1:
                    # NOTE(typhoonzero): if the output dimension > 1, format output tensor
                    # using a comma separated string. Only available for keras models.
                    row.append(",".join([str(i) for i in result]))
                else:
                    row.append(str(result[0]))
            else:
                row.append(str(result))
            w.write(row)
    del pred_dataset
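
The classification-versus-regression heuristic above can be shown standalone with illustrative values:

import numpy as np

softmax_like = np.array([0.1, 0.7, 0.2])
print(np.isclose(softmax_like.sum(), 1.0))  # True -> treat as class probabilities
print(softmax_like.argmax(axis=-1))         # predicted class id: 1

regression_like = np.array([3.2, 15.8])
print(np.isclose(regression_like.sum(), 1.0))  # False -> keep raw regression outputs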
Example #17
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_column_names_map,
                      feature_columns, feature_metas, train_label_name,
                      result_col_name, datasource, select, hdfs_namenode_addr,
                      hive_location, hdfs_user, hdfs_pass, is_pai, pai_table):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)

    if is_pai:
        driver = "pai_maxcompute"
        conn = None
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        selected_cols = db.pai_selected_cols(formatted_pai_table)
        predict_generator = db.pai_maxcompute_db_generator(
            formatted_pai_table)()

    else:
        driver = conn.driver

        # bypass all selected cols to the prediction result table
        selected_cols = db.selected_cols(conn, select)
        predict_generator = db.db_generator(conn, select)()

    write_cols = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except ValueError:
        train_label_index = -1
    if train_label_index != -1:
        del write_cols[train_label_index]
    write_cols.append(result_col_name)

    # load from the exported model
    with open("exported_path", "r") as fn:
        export_path = fn.read()
    if tf_is_version2():
        imported = tf.saved_model.load(export_path)
    else:
        imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["is_sparse"]:
                # NOTE(typhoonzero): sparse feature will get (indices,values,shape) here, use indices only
                values = x[0][i][0].flatten()
            else:
                values = x[0][i].flatten()
            if dtype_str == "float32" or dtype_str == "float64":
                example.features.feature[feature_name].float_list.value.extend(
                    list(values))
            elif dtype_str == "int32" or dtype_str == "int64":
                example.features.feature[feature_name].int64_list.value.extend(
                    list(values))
        else:
            if "feature_columns" in feature_columns:
                idx = feature_column_names.index(feature_name)
                fc = feature_columns["feature_columns"][idx]
            else:
                # DNNLinearCombinedXXX models have both dnn_feature_columns
                # and linear_feature_columns params.
                idx = -1
                try:
                    idx = feature_column_names_map[
                        "dnn_feature_columns"].index(feature_name)
                    fc = feature_columns["dnn_feature_columns"][idx]
                except:
                    try:
                        idx = feature_column_names_map[
                            "linear_feature_columns"].index(feature_name)
                        fc = feature_columns["linear_feature_columns"][idx]
                    except:
                        pass
                if idx == -1:
                    raise ValueError(
                        "cannot find feature %s in all feature columns" %
                        feature_name)
            if dtype_str == "float32" or dtype_str == "float64":
                # need to pass a tuple(float, )
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "int32" or dtype_str == "int64":
                numeric_type = type(tf.feature_column.numeric_column("tmp"))
                if type(fc) == numeric_type:
                    example.features.feature[
                        feature_name].float_list.value.extend(
                            (float(x[0][i][0]), ))
                else:
                    example.features.feature[
                        feature_name].int64_list.value.extend(
                            (int(x[0][i][0]), ))
            elif dtype_str == "string":
                example.features.feature[feature_name].bytes_list.value.extend(
                    x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with db.buffered_db_writer(driver, conn, result_table, write_cols, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for row, _ in predict_generator:
            features = db.read_features_from_row(row, selected_cols,
                                                 feature_column_names,
                                                 feature_metas)
            result = predict((features, ))
            if train_label_index != -1 and len(row) > train_label_index:
                del row[train_label_index]
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
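
For reference, a minimal standalone sketch of the tf.train.Example construction that add_to_example performs above; the feature names and values are illustrative:

import tensorflow as tf

example = tf.train.Example()
example.features.feature["sepal_length"].float_list.value.extend((5.1, ))
example.features.feature["petal_count"].int64_list.value.extend((3, ))
example.features.feature["species"].bytes_list.value.extend((b"setosa", ))
serialized = example.SerializeToString()
# The serialized proto is what the SavedModel's "predict" signature is fed:
# imported.signatures["predict"](examples=tf.constant([serialized]))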
Example #18
def load_db_data_to_data_frame(datasource,
                               select=None,
                               odps_table=None,
                               load_schema_only=False):
    if odps_table is None:
        conn = db.connect_with_data_source(datasource)
        selected_cols = db.selected_cols(conn, select)
        if load_schema_only:
            return pd.DataFrame(columns=selected_cols)

        generator = db.db_generator(conn, select)
    else:
        project, table = odps_table.split('.')
        conn = db.connect_with_data_source(datasource)
        schema = conn.get_table(table).schema
        selected_cols = [column.name for column in schema]
        if load_schema_only:
            return pd.DataFrame(columns=selected_cols)

        select_sql = "SELECT * FROM {}".format(table)
        instance = conn.execute_sql(select_sql)

        if not instance.is_successful():
            raise ValueError('cannot get data from table {}.{}'.format(
                project, table))

        def generator_func():
            from odps import tunnel
            compress = tunnel.CompressOption.CompressAlgorithm.ODPS_ZLIB
            with instance.open_reader(tunnel=False,
                                      compress=compress) as reader:
                for record in reader:
                    row_value = [
                        record[i] for i in six.moves.range(len(selected_cols))
                    ]
                    yield row_value, None

        generator = generator_func

    dtypes = [None] * len(selected_cols)
    values = [[] for _ in six.moves.range(len(selected_cols))]
    for row_value, _ in generator():
        for i, item in enumerate(row_value):
            if dtypes[i] == str:
                values[i].append(item)
                continue

            float_value = None
            try:
                float_value = float(item)
            except:
                pass

            if float_value is None:  # cannot convert to float value
                dtypes[i] = str
            else:
                item = float_value
                int_value = long(item) if six.PY2 else int(item)
                if int_value != item:
                    dtypes[i] = np.float64

            values[i].append(item)

    numpy_dict = collections.OrderedDict()
    for col_name, dtype, value in six.moves.zip(selected_cols, dtypes, values):
        if dtype is None:
            dtype = np.int64

        numpy_dict[col_name] = np.array(value, dtype=dtype)

    df = pd.DataFrame(data=numpy_dict)
    return df
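
The final DataFrame construction above can be shown standalone; the column names, values, and inferred dtypes below are illustrative:

import collections

import numpy as np
import pandas as pd

# Columns default to int64, become float64 when a fractional value appears,
# and str (stored as object) when a value cannot be parsed as a number.
numpy_dict = collections.OrderedDict([
    ("id", np.array([1, 2], dtype=np.int64)),
    ("score", np.array([0.5, 1.25], dtype=np.float64)),
    ("name", np.array(["a", "b"], dtype=str)),
])
df = pd.DataFrame(data=numpy_dict)
print(df.dtypes)  # id: int64, score: float64, name: object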