Example #1
    def _do_test_hive_specified_db(self,
                                   driver,
                                   conn,
                                   hdfs_namenode_addr="",
                                   hive_location=""):
        create_db = '''create database test_db'''
        create_tbl = '''create table test_db.tbl (features string, label int) ROW FORMAT DELIMITED FIELDS TERMINATED BY "\001"'''
        drop_tbl = '''drop table if exists test_db.tbl'''
        select_tbl = '''select * from test_db.tbl'''
        table_schema = ["label", "features"]
        values = [(1, '5,6,1,2')] * 10
        execute(driver, conn, create_db)
        execute(driver, conn, drop_tbl)
        execute(driver, conn, create_tbl)
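        # write the test rows through the buffered writer; buff_size sets how
        # many rows are buffered before each batch insert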
        with buffered_db_writer(driver,
                                conn,
                                "test_db.tbl",
                                table_schema,
                                buff_size=10,
                                hdfs_namenode_addr=hdfs_namenode_addr,
                                hive_location=hive_location) as w:
            for row in values:
                w.write(row)

        field_names, data = execute(driver, conn, select_tbl)

        expect_features = ['5,6,1,2'] * 10
        expect_labels = [1] * 10

        self.assertEqual(field_names, ['features', 'label'])
        self.assertEqual(expect_features, data[0])
        self.assertEqual(expect_labels, data[1])
Example #2
    def _do_test(self, driver, conn, hdfs_namenode_addr="", hive_location=""):
        table_name = "test_db"
        table_schema = ["label", "features"]
        values = [(1, '5,6,1,2')] * 10

        execute(driver, conn, self.drop_statement)

        if driver == "hive":
            execute(driver, conn, self.hive_create_statement)
        else:
            execute(driver, conn, self.create_statement)
        with buffered_db_writer(driver,
                                conn,
                                table_name,
                                table_schema,
                                buff_size=10,
                                hdfs_namenode_addr=hdfs_namenode_addr,
                                hive_location=hive_location) as w:
            for row in values:
                w.write(row)

        field_names, data = execute(driver, conn, self.select_statement)

        expect_features = ['5,6,1,2'] * 10
        expect_labels = [1] * 10

        self.assertEqual(field_names, ['features', 'label'])
        self.assertEqual(expect_features, data[0])
        self.assertEqual(expect_labels, data[1])
Example #3
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, label_meta,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass, is_pai, pai_table):
    classifier = estimator(**model_params)
    conn = connect_with_data_source(datasource)

    def fast_input_fn(generator):
        feature_types = []
        for name in feature_column_names:
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))

        def _inner_input_fn():
            if is_pai:
                dataset = pai_maxcompute_input_fn(pai_table, datasource,
                                                  feature_column_names,
                                                  feature_metas, label_meta)
            else:
                dataset = tf.data.Dataset.from_generator(
                    generator,
                    (tuple(feature_types), eval(
                        "tf.%s" % label_meta["dtype"])))
                ds_mapper = functools.partial(
                    parse_sparse_feature,
                    feature_column_names=feature_column_names,
                    feature_metas=feature_metas)
                dataset = dataset.map(ds_mapper)
            dataset = dataset.batch(1).cache()
            iterator = dataset.make_one_shot_iterator()
            features = iterator.get_next()
            return features

        return _inner_input_fn

    column_names = feature_column_names[:]
    column_names.append(label_meta["feature_name"])
    fast_predictor = FastPredict(classifier, fast_input_fn)

    with buffered_db_writer(conn.driver, conn, result_table, column_names, 100,
                            hdfs_namenode_addr, hive_location, hdfs_user,
                            hdfs_pass) as w:
        for features in db_generator(conn.driver, conn, select,
                                     feature_column_names,
                                     label_meta["feature_name"],
                                     feature_metas)():
            result = fast_predictor.predict(features)
            row = []
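            # db_generator yields (feature_values, label) pairs; record each
            # feature's value as a string before appending the prediction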
            for idx, _ in enumerate(feature_column_names):
                val = features[0][idx][0]
                row.append(str(val))
            if "class_ids" in list(result)[0]:
                row.append(str(list(result)[0]["class_ids"][0]))
            else:
                # regression predictions
                row.append(str(list(result)[0]["predictions"][0]))
            w.write(row)
Example #4
def write_dfc_result(dfc_mean, gain, result_table, conn, feature_column_names,
                     hdfs_namenode_addr, hive_location, hdfs_user, hdfs_pass):
    with buffered_db_writer(conn.driver, conn, result_table,
                            ["feature", "dfc", "gain"], 100,
                            hdfs_namenode_addr, hive_location, hdfs_user,
                            hdfs_pass) as w:
        for row_name in feature_column_names:
            w.write([row_name, dfc_mean.loc[row_name], gain[row_name]])
Example #5
def write_shap_values(shap_values, driver, conn, result_table,
                      feature_column_names, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass):
    with buffered_db_writer(driver, conn, result_table, feature_column_names,
                            100, hdfs_namenode_addr, hive_location, hdfs_user,
                            hdfs_pass) as w:
        for row in shap_values:
            w.write(list(row))
Example #6
def pred(datasource,
         select,
         feature_metas,
         feature_column_names,
         label_meta,
         result_table,
         is_pai=False,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         pai_table=""):
    # TODO(typhoonzero): support running on PAI without MaxCompute AK/SK connection.
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    label_name = label_meta["feature_name"]

    dpred = xgb_dataset(datasource, 'predict.txt', select, feature_metas,
                        feature_column_names, None, is_pai, pai_table, True)

    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load the trained model
    print("Start predicting XGBoost model...")
    preds = bst.predict(dpred)

    # TODO(Yancey1989): use the train parameters to decide between a regression and a classification model
    if len(preds.shape) == 2:
        # classifier result
        preds = np.argmax(np.array(preds), axis=1)
    feature_file_read = open("predict.txt", "r")

    result_column_names = feature_column_names
    result_column_names.append(label_name)
    line_no = 0
    if is_pai:
        driver = "pai_maxcompute"
        conn = None
    else:
        driver = conn.driver
    with db.buffered_db_writer(driver,
                               conn,
                               result_table,
                               result_column_names,
                               100,
                               hdfs_namenode_addr=hdfs_namenode_addr,
                               hive_location=hive_location,
                               hdfs_user=hdfs_user,
                               hdfs_pass=hdfs_pass) as w:
        while True:
            line = feature_file_read.readline()
            if not line:
                break
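            # each line holds tab-separated "index:value" pairs; keep only the
            # values and append this line's prediction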
            row = [i.split(":")[1] for i in line.replace("\n", "").split("\t")]
            row.append(str(preds[line_no]))
            w.write(row)
            line_no += 1
    print("Done predicting. Predict table : %s" % result_table)
Example #7
def keras_predict(estimator, model_params, save, result_table,
                  feature_column_names, feature_metas, label_meta, datasource,
                  select, hdfs_namenode_addr, hive_location, hdfs_user,
                  hdfs_pass):
    classifier = estimator(**model_params)
    classifier_pkg = sys.modules[estimator.__module__]

    conn = connect_with_data_source(datasource)

    def eval_input_fn(batch_size, cache=False):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))

        gen = db_generator(conn.driver, conn, select, feature_column_names,
                           label_meta["feature_name"], feature_metas)
        dataset = tf.data.Dataset.from_generator(
            gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])))
        ds_mapper = functools.partial(
            parse_sparse_feature,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    # NOTE: always use batch_size=1 when predicting to get the pairs of features and predict results
    #       to insert into result table.
    pred_dataset = eval_input_fn(1)
    one_batch = next(iter(pred_dataset))
    # NOTE: must run predict one batch to initialize parameters
    # see: https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models
    classifier.predict_on_batch(one_batch[0])
    classifier.load_weights(save)
    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()
    buff_rows = []
    column_names = feature_column_names[:]
    column_names.append(label_meta["feature_name"])
    with buffered_db_writer(conn.driver, conn, result_table, column_names, 100,
                            hdfs_namenode_addr, hive_location, hdfs_user,
                            hdfs_pass) as w:
        for features in pred_dataset:
            result = classifier.predict_on_batch(features[0])
            result = classifier_pkg.prepare_prediction_column(result[0])
            row = []
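            # features[0] is the dict of feature tensors for this one-row
            # batch; record each feature value before the prepared prediction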
            for idx, name in enumerate(feature_column_names):
                val = features[0][name].numpy()[0][0]
                row.append(str(val))
            row.append(str(result))
            w.write(row)
    del pred_dataset
Example #8
def pred(datasource,
         select,
         feature_field_meta,
         label_field_meta,
         result_table,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass=""):
    conn = connect_with_data_source(datasource)

    feature_column_names = [k["name"] for k in feature_field_meta]
    label_name = label_field_meta["name"]

    feature_specs = {k['name']: k for k in feature_field_meta}

    dpred = xgb_dataset(conn, 'predict.txt', select, feature_column_names,
                        label_name, feature_specs)

    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load the trained model
    preds = bst.predict(dpred)

    # TODO(Yancey1989): use the train parameters to decide between a regression and a classification model
    if len(preds.shape) == 2:
        # classifier result
        preds = np.argmax(np.array(preds), axis=1)
    feature_file_read = open("predict.txt", "r")

    result_column_names = feature_column_names
    result_column_names.append(label_name)
    line_no = 0
    with buffered_db_writer(conn.driver,
                            conn,
                            result_table,
                            result_column_names,
                            100,
                            hdfs_namenode_addr=hdfs_namenode_addr,
                            hive_location=hive_location,
                            hdfs_user=hdfs_user,
                            hdfs_pass=hdfs_pass) as w:
        while True:
            line = feature_file_read.readline()
            if not line:
                break
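            # skip the first tab-separated column (the label stored by the
            # DMatrix file) and keep only the "index:value" feature values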
            row = [
                i.split(":")[1] for i in line.replace("\n", "").split("\t")[1:]
            ]
            row.append(str(preds[line_no]))
            w.write(row)
            line_no += 1
    print("Done predicting. Predict table : %s" % result_table)
Example #9
def write_result_metrics(result_metrics, metric_name_list, result_table,
                         driver, conn, hdfs_namenode_addr, hive_location,
                         hdfs_user, hdfs_pass):
    # NOTE: assume that the result table is already created with columns:
    # loss | metric_names ...
    column_names = metric_name_list
    with buffered_db_writer(driver, conn, result_table, column_names, 100,
                            hdfs_namenode_addr, hive_location, hdfs_user,
                            hdfs_pass) as w:
        row = []
        for key in metric_name_list:
            row.append(result_metrics[key])
        w.write(row)
Example #10
def predict_and_store_result(bst, dpred, feature_file_id, model_params,
                             feature_column_names, label_name, is_pai, conn,
                             result_table, hdfs_namenode_addr, hive_location,
                             hdfs_user, hdfs_pass):
    preds = bst.predict(dpred)

    # TODO(yancey1989): save train_params and model_params on all submitters, not only on the PAI submitter
    # TODO(yancey1989): output the original result for various objective functions
    if model_params:
        obj = model_params["objective"]
        if obj.startswith("binary:"):
            preds = (preds > 0.5).astype(int)
        elif obj.startswith("multi:"):
            preds = np.argmax(np.array(preds), axis=1)
        else:
            # using the original prediction result of predict API by default
            pass
    else:
        # prediction output for a multi-class job has two dimensions; this is a
        # temporary workaround that can be removed once the model meta can be
        # loaded outside the PAI submitter.
        if len(preds.shape) == 2:
            preds = np.argmax(np.array(preds), axis=1)
    if is_pai:
        feature_file_read = open("predict.txt", "r")
    else:
        feature_file_read = open("predict.txt_%d" % feature_file_id, "r")

    result_column_names = feature_column_names
    result_column_names.append(label_name)
    line_no = 0
    if is_pai:
        driver = "pai_maxcompute"
    else:
        driver = conn.driver
    with db.buffered_db_writer(driver,
                               conn,
                               result_table,
                               result_column_names,
                               100,
                               hdfs_namenode_addr=hdfs_namenode_addr,
                               hive_location=hive_location,
                               hdfs_user=hdfs_user,
                               hdfs_pass=hdfs_pass) as w:
        while True:
            line = feature_file_read.readline()
            if not line:
                break
            row = [i.split(":")[1] for i in line.replace("\n", "").split("\t")]
            row.append(str(preds[line_no]))
            w.write(row)
            line_no += 1
Example #11
def keras_predict(estimator, model_params, save, result_table, is_pai,
                  pai_table, feature_column_names, feature_metas,
                  result_col_name, datasource, select, hdfs_namenode_addr,
                  hive_location, hdfs_user, hdfs_pass):
    classifier = estimator(**model_params)
    classifier_pkg = sys.modules[estimator.__module__]

    conn = db.connect_with_data_source(datasource)

    def eval_input_fn(batch_size, cache=False):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))

        if is_pai:
            pai_table_parts = pai_table.split(".")
            formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                           pai_table_parts[1])
            gen = db.pai_maxcompute_db_generator(formatted_pai_table,
                                                 feature_column_names, None,
                                                 feature_metas)
        else:
            gen = db.db_generator(conn.driver, conn, select,
                                  feature_column_names, None, feature_metas)
        dataset = tf.data.Dataset.from_generator(gen, (tuple(feature_types), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    # NOTE: always use batch_size=1 when predicting to get the pairs of features and predict results
    #       to insert into result table.
    pred_dataset = eval_input_fn(1)
    one_batch = next(iter(pred_dataset))
    # NOTE: must run predict one batch to initialize parameters
    # see: https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models
    classifier.predict_on_batch(one_batch)
    classifier.load_weights(save)
    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()
    buff_rows = []
    column_names = feature_column_names[:]
    column_names.append(result_col_name)
    with db.buffered_db_writer(conn.driver, conn, result_table, column_names,
                               100, hdfs_namenode_addr, hive_location,
                               hdfs_user, hdfs_pass) as w:
        for features in pred_dataset:
            result = classifier.predict_on_batch(features)
            result = classifier_pkg.prepare_prediction_column(result[0])
            row = []
            for idx, name in enumerate(feature_column_names):
                val = features[name].numpy()[0][0]
                row.append(str(val))
            if isinstance(result, np.ndarray):
                if len(result) > 1:
                    # NOTE(typhoonzero): if the output dimension > 1, format output tensor
                    # using a comma separated string. Only available for keras models.
                    row.append(",".join([str(i) for i in result]))
                else:
                    row.append(str(result[0]))
            else:
                row.append(str(result))
            w.write(row)
    del pred_dataset
Example #12
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_column_names_map,
                      feature_columns, feature_metas, result_col_name,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass, is_pai, pai_table):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)

    column_names = feature_column_names[:]
    column_names.append(result_col_name)

    if is_pai:
        driver = "pai_maxcompute"
        conn = None
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        predict_generator = db.pai_maxcompute_db_generator(
            formatted_pai_table, feature_column_names, None, feature_metas)()
    else:
        driver = conn.driver
        predict_generator = db.db_generator(conn.driver, conn, select,
                                            feature_column_names, None,
                                            feature_metas)()
    # load from the exported model
    with open("exported_path", "r") as fn:
        export_path = fn.read()
    if tf_is_version2():
        imported = tf.saved_model.load(export_path)
    else:
        imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["is_sparse"]:
                # NOTE(typhoonzero): sparse feature will get (indices,values,shape) here, use indices only
                values = x[0][i][0].flatten()
            else:
                values = x[0][i].flatten()
            if dtype_str == "float32" or dtype_str == "float64":
                example.features.feature[feature_name].float_list.value.extend(
                    list(values))
            elif dtype_str == "int32" or dtype_str == "int64":
                example.features.feature[feature_name].int64_list.value.extend(
                    list(values))
        else:
            if "feature_columns" in feature_columns:
                idx = feature_column_names.index(feature_name)
                fc = feature_columns["feature_columns"][idx]
            else:
                # DNNLinearCombinedXXX models have dnn_feature_columns and linear_feature_columns params.
                idx = -1
                try:
                    idx = feature_column_names_map[
                        "dnn_feature_columns"].index(feature_name)
                    fc = feature_columns["dnn_feature_columns"][idx]
                except:
                    try:
                        idx = feature_column_names_map[
                            "linear_feature_columns"].index(feature_name)
                        fc = feature_columns["linear_feature_columns"][idx]
                    except:
                        pass
                if idx == -1:
                    raise ValueError(
                        "cannot find feature %s in any feature column" %
                        feature_name)
            if dtype_str == "float32" or dtype_str == "float64":
                # need to pass a tuple(float, )
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "int32" or dtype_str == "int64":
                numeric_type = type(tf.feature_column.numeric_column("tmp"))
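                # numeric_column features are serialized as floats even for
                # integer dtypes; other columns keep int64 values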
                if type(fc) == numeric_type:
                    example.features.feature[
                        feature_name].float_list.value.extend(
                            (float(x[0][i][0]), ))
                else:
                    example.features.feature[
                        feature_name].int64_list.value.extend(
                            (int(x[0][i][0]), ))
            elif dtype_str == "string":
                example.features.feature[feature_name].bytes_list.value.extend(
                    x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with db.buffered_db_writer(driver, conn, result_table, column_names, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for features in predict_generator:
            result = predict(features)
            row = []
            for idx, _ in enumerate(feature_column_names):
                per_feature = features[0][idx]
                if isinstance(per_feature, tuple) or isinstance(
                        per_feature, list):
                    # is sparse feature: tuple (indices, values, shape) or scalar
                    val = per_feature[0]
                elif isinstance(per_feature, np.ndarray):
                    val = per_feature
                # val = features[0][idx][0]
                row.append(str(val))
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
Example #13
def evaluate_and_store_result(bst, dpred, feature_file_id, validation_metrics,
                              model_params, feature_column_names, label_meta,
                              is_pai, conn, result_table, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass):
    preds = bst.predict(dpred)
    # FIXME(typhoonzero): copied from predict.py
    if model_params:
        obj = model_params["objective"]
        if obj.startswith("binary:"):
            preds = (preds > 0.5).astype(int)
        elif obj.startswith("multi:"):
            preds = np.argmax(np.array(preds), axis=1)
        else:
            # using the original prediction result of predict API by default
            pass
    else:
        # prediction output for a multi-class job has two dimensions; this is a
        # temporary workaround that can be removed once the model meta can be
        # loaded outside the PAI submitter.
        if len(preds.shape) == 2:
            preds = np.argmax(np.array(preds), axis=1)

    if is_pai:
        feature_file_read = open("predict.txt", "r")
    else:
        feature_file_read = open("predict.txt_%d" % feature_file_id, "r")

    y_test_list = []
    for line in feature_file_read:
        row = [i for i in line.strip().split("\t")]
        # DMatrix stores the label in the first column
        if label_meta["dtype"] == "float32":
            label = float(row[0])
        elif label_meta["dtype"] == "int64" or label_meta["dtype"] == "int32":
            label = int(row[0])
        else:
            raise ValueError("unsupported label dtype: %s" %
                             label_meta["dtype"])
        y_test_list.append(label)
    y_test = np.array(y_test_list)

    evaluate_results = dict()
    for metric_name in validation_metrics:
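        # resolve the metric callable by name from the current namespace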
        metric_func = eval(metric_name)
        metric_value = metric_func(y_test, preds)
        evaluate_results[metric_name] = metric_value

    # write evaluation result to result table
    if is_pai:
        driver = "pai_maxcompute"
    else:
        driver = conn.driver
    result_columns = ["loss"] + validation_metrics
    with db.buffered_db_writer(driver,
                               conn,
                               result_table,
                               result_columns,
                               100,
                               hdfs_namenode_addr=hdfs_namenode_addr,
                               hive_location=hive_location,
                               hdfs_user=hdfs_user,
                               hdfs_pass=hdfs_pass) as w:
        row = ["0.0"]
        for mn in validation_metrics:
            row.append(str(evaluate_results[mn]))
        w.write(row)
Example #14
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, result_col_name,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass, is_pai, pai_table):
    if not is_pai:
        conn = connect_with_data_source(datasource)

    column_names = feature_column_names[:]
    column_names.append(result_col_name)

    if is_pai:
        driver = "pai_maxcompute"
        conn = None
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        predict_generator = pai_maxcompute_db_generator(
            formatted_pai_table, feature_column_names, None, feature_metas)()
    else:
        driver = conn.driver
        predict_generator = db_generator(conn.driver, conn, select,
                                         feature_column_names, None,
                                         feature_metas)()
    # load from the exported model
    if save.startswith("oss://"):
        with open("exported_path", "r") as fn:
            export_path = fn.read()
        parts = save.split("?")
        export_path_oss = parts[0] + export_path
        if TF_VERSION_2:
            imported = tf.saved_model.load(export_path_oss)
        else:
            imported = tf.saved_model.load_v2(export_path_oss)
    else:
        with open("exported_path", "r") as fn:
            export_path = fn.read()
        if TF_VERSION_2:
            imported = tf.saved_model.load(export_path)
        else:
            imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["is_sparse"]:
                # NOTE(typhoonzero): sparse feature will get (indices,values,shape) here, use indices only
                values = x[0][i][0].flatten()
            else:
                values = x[0][i].flatten()
            if dtype_str == "float32" or dtype_str == "float64":
                example.features.feature[feature_name].float_list.value.extend(
                    list(values))
            elif dtype_str == "int32" or dtype_str == "int64":
                example.features.feature[feature_name].int64_list.value.extend(
                    list(values))
        else:
            if dtype_str == "float32" or dtype_str == "float64":
                # need to pass a tuple(float, )
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "int32" or dtype_str == "int64":
                # FIXME(typhoonzero): figure out why int64 features need to be converted to float
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "string":
                example.features.feature[feature_name].bytes_list.value.extend(
                    x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with buffered_db_writer(driver, conn, result_table, column_names, 100,
                            hdfs_namenode_addr, hive_location, hdfs_user,
                            hdfs_pass) as w:
        for features in predict_generator:
            result = predict(features)
            row = []
            for idx, _ in enumerate(feature_column_names):
                per_feature = features[0][idx]
                if isinstance(per_feature, tuple) or isinstance(
                        per_feature, list):
                    # is sparse feature: tuple (indices, values, shape) or scalar
                    val = per_feature[0]
                elif isinstance(per_feature, np.ndarray):
                    val = per_feature
                # val = features[0][idx][0]
                row.append(str(val))
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
Example #15
def predict_and_store_result(bst, dpred, feature_file_id, model_params,
                             selected_cols, label_name, is_pai, conn,
                             result_table, hdfs_namenode_addr, hive_location,
                             hdfs_user, hdfs_pass):
    preds = bst.predict(dpred)

    # TODO(yancey1989): save train_params and model_params on all submitters, not only on the PAI submitter
    # TODO(yancey1989): output the original result for various objective functions
    if model_params:
        obj = model_params["objective"]
        if obj.startswith("binary:"):
            preds = (preds > 0.5).astype(int)
        elif obj.startswith("multi:"):
            preds = np.argmax(np.array(preds), axis=1)
        else:
            # using the original prediction result of predict API by default
            pass
    else:
        # prediction output for a multi-class job has two dimensions; this is a
        # temporary workaround that can be removed once the model meta can be
        # loaded outside the PAI submitter.
        if len(preds.shape) == 2:
            preds = np.argmax(np.array(preds), axis=1)

    if is_pai:
        feature_file_read = open("predict.txt.raw", "r")
    else:
        feature_file_read = open(
            "predict.raw.dir/predict.txt_%d" % feature_file_id, "r")

    result_column_names = selected_cols

    # Users may use "SELECT ..., label ... TO PREDICT new_table.new_label" to
    # write both the actual label and the predicted label into the result
    # table for comparison. If "new_label == label", we should write the result
    # table with "INSERT INTO new_table (..., label) VALUES ...", and if
    # "new_label != label", with
    # "INSERT INTO new_table (..., label, new_label) VALUES ...".
    # "new_label == label" is equivalent to "label_name in selected_cols".
    label_index = selected_cols.index(
        label_name) if label_name in selected_cols else None
    if label_index is None:
        result_column_names.append(label_name)

    line_no = 0
    if is_pai:
        driver = "pai_maxcompute"
    else:
        driver = conn.driver
    with db.buffered_db_writer(driver,
                               conn,
                               result_table,
                               result_column_names,
                               100,
                               hdfs_namenode_addr=hdfs_namenode_addr,
                               hive_location=hive_location,
                               hdfs_user=hdfs_user,
                               hdfs_pass=hdfs_pass) as w:
        while True:
            line = feature_file_read.readline()
            if not line:
                break
            row = [
                item.split(":")[1]
                for i, item in enumerate(line.strip().split("\t"))
                if i != label_index
            ]
            row.append(str(preds[line_no]))
            w.write(row)
            line_no += 1
Example #16
def pred(is_keras_model,
         datasource,
         estimator,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_metas={},
         label_meta={},
         model_params={},
         save="",
         batch_size=1,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass=""):
    conn = connect_with_data_source(datasource)
    model_params.update(feature_columns)
    if not is_keras_model:
        model_params['model_dir'] = save
        classifier = estimator(**model_params)
    else:
        if not issubclass(estimator, tf.keras.Model):
            # functional model need field_metas parameter
            model_params["field_metas"] = feature_metas
        classifier = estimator(**model_params)
        classifier_pkg = sys.modules[estimator.__module__]

    if is_keras_model:

        def eval_input_fn(batch_size, cache=False):
            feature_types = []
            for name in feature_column_names:
                # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
                if feature_metas[name]["is_sparse"]:
                    feature_types.append((tf.int64, tf.int32, tf.int64))
                else:
                    feature_types.append(
                        get_dtype(feature_metas[name]["dtype"]))

            gen = db_generator(conn.driver, conn, select, feature_column_names,
                               label_meta["feature_name"], feature_metas)

            dataset = tf.data.Dataset.from_generator(
                gen,
                (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])))
            ds_mapper = functools.partial(
                parse_sparse_feature,
                feature_column_names=feature_column_names,
                feature_metas=feature_metas)
            dataset = dataset.map(ds_mapper).batch(batch_size)
            if cache:
                dataset = dataset.cache()
            return dataset

        # NOTE: always use batch_size=1 when predicting to get the pairs of features and predict results
        #       to insert into result table.
        pred_dataset = eval_input_fn(1)
        one_batch = pred_dataset.__iter__().next()
        # NOTE: must run predict one batch to initialize parameters
        # see: https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models
        classifier.predict_on_batch(one_batch[0])
        classifier.load_weights(save)
        del pred_dataset
        pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()
        buff_rows = []
        column_names = feature_column_names[:]
        column_names.append(label_meta["feature_name"])
        with buffered_db_writer(conn.driver, conn, result_table, column_names,
                                100, hdfs_namenode_addr, hive_location,
                                hdfs_user, hdfs_pass) as w:
            while True:
                try:
                    features = pred_dataset.get_next()
                except tf.errors.OutOfRangeError:
                    break
                result = classifier.predict_on_batch(features[0])
                result = classifier_pkg.prepare_prediction_column(result[0])
                row = []
                for idx, name in enumerate(feature_column_names):
                    val = features[0][name].numpy()[0]
                    row.append(str(val))
                row.append(str(result))
                w.write(row)
        del pred_dataset

    else:

        def fast_input_fn(generator):
            feature_types = []
            for name in feature_column_names:
                if feature_metas[name]["is_sparse"]:
                    feature_types.append((tf.int64, tf.int32, tf.int64))
                else:
                    feature_types.append(
                        get_dtype(feature_metas[name]["dtype"]))

            def _inner_input_fn():
                dataset = tf.data.Dataset.from_generator(
                    generator,
                    (tuple(feature_types), eval(
                        "tf.%s" % label_meta["dtype"])))
                ds_mapper = functools.partial(
                    parse_sparse_feature,
                    feature_column_names=feature_column_names,
                    feature_metas=feature_metas)
                dataset = dataset.map(ds_mapper).batch(1).cache()
                iterator = dataset.make_one_shot_iterator()
                features = iterator.get_next()
                return features

            return _inner_input_fn

        column_names = feature_column_names[:]
        column_names.append(label_meta["feature_name"])
        pred_gen = db_generator(conn.driver, conn, select,
                                feature_column_names,
                                label_meta["feature_name"], feature_metas)()
        fast_predictor = FastPredict(classifier, fast_input_fn)

        with buffered_db_writer(conn.driver, conn, result_table, column_names,
                                100, hdfs_namenode_addr, hive_location,
                                hdfs_user, hdfs_pass) as w:
            while True:
                try:
                    features = next(pred_gen)
                except StopIteration:
                    break
                result = fast_predictor.predict(features)
                row = []
                for idx, _ in enumerate(feature_column_names):
                    val = features[0][idx]
                    row.append(str(val))
                if "class_ids" in list(result)[0]:
                    row.append(str(list(result)[0]["class_ids"][0]))
                else:
                    # regression predictions
                    row.append(str(list(result)[0]["predictions"][0]))
                w.write(row)
        fast_predictor.close()

    print("Done predicting. Predict table : %s" % result_table)
Example #17
def pred(is_keras_model,
         datasource,
         estimator,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_metas={},
         label_meta={},
         model_params={},
         save="",
         batch_size=1,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         is_pai=False,
         pai_table=""):
    global FLAGS
    define_tf_flags()
    if not is_pai:
        conn = connect_with_data_source(datasource)
    model_params.update(feature_columns)

    if is_keras_model:
        if not issubclass(estimator, tf.keras.Model):
            # functional model need field_metas parameter
            model_params["field_metas"] = feature_metas
        classifier = estimator(**model_params)
        classifier_pkg = sys.modules[estimator.__module__]

        def eval_input_fn(batch_size, cache=False):
            feature_types = []
            for name in feature_column_names:
                # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
                if feature_metas[name]["is_sparse"]:
                    feature_types.append((tf.int64, tf.int32, tf.int64))
                else:
                    feature_types.append(
                        get_dtype(feature_metas[name]["dtype"]))

            gen = db_generator(conn.driver, conn, select, feature_column_names,
                               label_meta["feature_name"], feature_metas)

            dataset = tf.data.Dataset.from_generator(
                gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])))
            ds_mapper = functools.partial(
                parse_sparse_feature,
                feature_column_names=feature_column_names,
                feature_metas=feature_metas)
            dataset = dataset.map(ds_mapper).batch(batch_size)
            if cache:
                dataset = dataset.cache()
            return dataset

        # NOTE: always use batch_size=1 when predicting to get the pairs of features and predict results
        #       to insert into result table.
        pred_dataset = eval_input_fn(1)
        one_batch = pred_dataset.__iter__().next()
        # NOTE: must run predict one batch to initialize parameters
        # see: https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models
        classifier.predict_on_batch(one_batch[0])
        classifier.load_weights(save)
        del pred_dataset
        pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()
        buff_rows = []
        column_names = feature_column_names[:]
        column_names.append(label_meta["feature_name"])
        with buffered_db_writer(conn.driver, conn, result_table, column_names,
                                100, hdfs_namenode_addr, hive_location,
                                hdfs_user, hdfs_pass) as w:
            while True:
                try:
                    features = pred_dataset.get_next()
                except tf.errors.OutOfRangeError:
                    break
                result = classifier.predict_on_batch(features[0])
                result = classifier_pkg.prepare_prediction_column(result[0])
                row = []
                for idx, name in enumerate(feature_column_names):
                    val = features[0][name].numpy()[0]
                    row.append(str(val))
                row.append(str(result))
                w.write(row)
        del pred_dataset

    else:
        if is_pai:
            model_params["model_dir"] = FLAGS.checkpointDir
        else:
            model_params['model_dir'] = save
        classifier = estimator(**model_params)

        # FIXME(typhoonzero): copied from train.py
        def pai_maxcompute_input_fn():
            table_parts = pai_table.split(".")
            if len(table_parts) == 2:
                database, table_name = table_parts
            elif len(table_parts) == 1:
                table_name = pai_table
                driver, dsn = datasource.split("://")
                database = parseMaxComputeDSN(dsn)[-1]
            else:
                raise ValueError("error database.table format: %s" % pai_table)

            tables = ["odps://%s/tables/%s" % (database, table_name)]
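            # one default tensor per feature column tells the table reader each
            # column's dtype and shape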
            record_defaults = []
            for name in feature_column_names:
                dtype = get_dtype(feature_metas[name]["dtype"])
                record_defaults.append(
                    tf.constant(0,
                                dtype=dtype,
                                shape=feature_metas[name]["shape"]))

            dataset = tf.data.TableRecordDataset(
                tables,
                record_defaults=record_defaults,
                selected_cols=",".join(feature_column_names))
            def tensor_to_dict(*args):
                num_features = len(feature_column_names)
                features_dict = dict()
                for idx in range(num_features):
                    name = feature_column_names[idx]
                    features_dict[name] = tf.reshape(args[idx], [-1])
                return features_dict

            return dataset.map(tensor_to_dict)

        def fast_input_fn(generator):
            feature_types = []
            for name in feature_column_names:
                if feature_metas[name]["is_sparse"]:
                    feature_types.append((tf.int64, tf.int32, tf.int64))
                else:
                    feature_types.append(get_dtype(feature_metas[name]["dtype"]))

            def _inner_input_fn():
                if is_pai:
                    dataset = pai_maxcompute_input_fn()
                else:
                    dataset = tf.data.Dataset.from_generator(
                        generator,
                        (tuple(feature_types), eval(
                            "tf.%s" % label_meta["dtype"])))
                    ds_mapper = functools.partial(
                        parse_sparse_feature,
                        feature_column_names=feature_column_names,
                        feature_metas=feature_metas)
                    dataset = dataset.map(ds_mapper)
                dataset = dataset.batch(1).cache()
                iterator = dataset.make_one_shot_iterator()
                features = iterator.get_next()
                return features

            return _inner_input_fn


        column_names = feature_column_names[:]
        column_names.append(label_meta["feature_name"])
        pred_gen = db_generator(conn.driver, conn, select,
                                feature_column_names,
                                label_meta["feature_name"], feature_metas)()
        fast_predictor = FastPredict(classifier, fast_input_fn)

        with buffered_db_writer(conn.driver, conn, result_table, column_names,
                                100, hdfs_namenode_addr, hive_location,
                                hdfs_user, hdfs_pass) as w:
            while True:
                try:
                    features = next(pred_gen)
                except StopIteration:
                    break
                result = fast_predictor.predict(features)
                row = []
                for idx, _ in enumerate(feature_column_names):
                    val = features[0][idx]
                    row.append(str(val))
                if "class_ids" in list(result)[0]:
                    row.append(str(list(result)[0]["class_ids"][0]))
                else:
                    # regression predictions
                    row.append(str(list(result)[0]["predictions"][0]))
                w.write(row)
        fast_predictor.close()

    print("Done predicting. Predict table : %s" % result_table)