Example No. 1
def main():
    CONFIG = Config()
    model_conf = CONFIG.read_model_conf()['model_conf']
    traindata_list = FileListGenerator(model_conf['data_dir_train']).generate()
    testdata_list = FileListGenerator(model_conf['data_dir_pred']).generate()

    if model_conf['mode'] == 'train':
        traindata = next(traindata_list)
        tf.logging.info('Start training {}'.format(traindata))
        t0 = time.time()
        train1 = LR(traindata, mode='train').lr_model()
        t1 = time.time()
        tf.logging.info('Finish training {}, take {} mins'.format(
            traindata, float((t1 - t0) / 60)))

    else:
        testdata = next(testdata_list)
        tf.logging.info('Start evaluation {}'.format(testdata))
        t0 = time.time()
        Accuracy, AUC = LR(testdata, mode='pred').lr_model()
        t1 = time.time()
        tf.logging.info('Finish evaluation {}, take {} mins'.format(
            testdata, float((t1 - t0) / 60)))
        print("LR_Accuracy: %f" % Accuracy)
        print("LR_AUC: %f" % AUC)
Example No. 2
def gen_analyzed_data():
    """
    Generate the data to be analyzed from the original pred data
    """
    # schemas
    SCHEMA = Config().read_schema(
    )  # dict id -> col_name, e.g. SCHEMA[1]='clk'
    del SCHEMA[1]
    header_str = [v for k, v in SCHEMA.items()]
    header_int = [k for k, v in SCHEMA.items()]
    col2id = {v: k for k, v in SCHEMA.items()}
    feature_conf_dic = CONF.read_feature_conf()
    cross_feature_list = CONF.read_cross_feature_conf()

    # load data
    df = pd.read_table(FLAGS.pred_data + "/pred1", header=header_int)

    # reformat the table, only analyzed columns are left
    keep_columns_str = get_analyzed_columns(feature_conf_dic)
    keep_columns_int = [col2id[v] for v in keep_columns_str]
    keep_columns_int.sort()
    df_keep_columns_int = [
        col - 2 for col in keep_columns_int
    ]  # dataframe columns start from 0, while our schema map starts from 2
    analyzed_table = df.iloc[:, df_keep_columns_int]

    # save to csv
    analyzed_table.to_csv(FLAGS.analyzed_data,
                          header=[SCHEMA[k] for k in keep_columns_int],
                          index=False)
    print("Analyzed data generation finished.")
Example No. 3
 def __init__(self, data_file):
     self._conf = Config()
     self._data_file = data_file
     self._feature_conf_dic = self._conf.read_feature_conf()[0]
     self._feature_used = self._conf.read_feature_conf()[1]
     self._all_features = self._conf.read_schema_conf()
     self.model_conf = self._conf.read_model_conf()['model_conf']
     self._csv_defaults = self._column_to_csv_defaults()
Example No. 4
    def __init__(self, data_file, mode):
        self._conf = Config()
        self._data_file = data_file
        self._Tf_Data = TF_Data(self._data_file)
        self.dataset_train = self._Tf_Data.gbdt_input()
        self.lr_conf = self._conf.read_model_conf()['lr_conf']

        self._mode = mode
        self._gbdt_spr = GBDT_spr(self._data_file).gbdt_model(self._mode)
Example No. 5
 def __init__(self, data_file):
     self._data_file = data_file
     self._DataSet = DataSet(self._data_file)
     self._conf = Config()
     self.dataset = self._DataSet.input_fn()
     self.batch_dataset = self._DataSet.iter_minibatches()
     self._feature_colums = self._feature_colums()
     self.gbdt_conf = self._conf.read_model_conf()['gbdt_conf']
     self.model_conf = self._conf.read_model_conf()['model_conf']
Example No. 6
 def __init__(self, data_file):
     self._data_file = data_file
     self._Tf_Data = TF_Data(self._data_file)
     self._conf = Config()
     self.dataset_train = self._Tf_Data.gbdt_input()
     self.dataset_trans = self._Tf_Data.gbdt_input()
     self.dataset_pred = self._Tf_Data.gbdt_input()
     self.gbdt_conf = self._conf.read_model_conf()['gbdt_conf']
     self.model_conf = self._conf.read_model_conf()['model_conf']
Example No. 7
def main():
    CONFIG = Config()
    model_conf = CONFIG.read_model_conf()['model_conf']
    if model_conf['mode'] == 'train':
        train1 = LR(model_conf['data_dir_train'], mode='train').lr_model()
    else:
        Accuracy, AUC = LR(model_conf['data_dir_pred'], mode='pred').lr_model()
        print("LR_Accuracy: %f" % Accuracy)
        print("LR_AUC: %f" % AUC)
Example No. 8
    def __init__(self):

        self._conf = Config()
        self._train_conf = self._conf.train
        self._cnn_conf = self._conf.model

        x_train, y_train, x_test, y_test, x_train_categ, x_test_categ, x_train_conti, x_test_conti, all_data \
            = preprocessing()
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.x_train_categ = x_train_categ  # categorical features in the training set
        self.x_test_categ = x_test_categ  # categorical features in the test set
        self.x_train_conti = x_train_conti  # continuous features in the training set
        self.x_test_conti = x_test_conti  # continuous features in the test set
        self.all_data = all_data
        self.poly = PolynomialFeatures(degree=2, interaction_only=True)
        # take the cross product of the categorical features
        self.x_train_categ_poly = self.poly.fit_transform(x_train_categ)
        self.x_test_categ_poly = self.poly.transform(x_test_categ)
        self.categ_inputs = None
        self.conti_input = None
        self.deep_component_outlayer = None
        self.logistic_input = None
        self.model = None
Example No. 9
def gen_pred_csv():
    """
    Save the pred data as csv
    """
    # schemas
    SCHEMA = Config().read_schema(
    )  # dict id -> col_name, e.g. SCHEMA[1]='clk'
    del SCHEMA[1]

    # load data
    df = pd.read_table(FLAGS.pred_data + "/pred1")

    # save to csv
    df.to_csv("../data/pred/pred1.csv",
              header=[v for k, v in SCHEMA.items()],
              index=False)
    print("Csv generation finished.")
Example No. 10
def pred_input_fn(csv_data):
    """Prediction input fn for a single data, used for serving client"""
    conf = Config()
    feature = conf.get_feature_name()
    feature_unused = conf.get_feature_name('unused')
    feature_conf = conf.read_feature_conf()
    csv_default = column_to_dtype(feature, feature_conf)
    csv_default.pop('label')

    feature_dict = {}
    for idx, f in enumerate(csv_default.keys()):
        if f in feature_unused:
            continue
        else:
            if csv_default[f] == tf.string:
                feature_dict[f] = _bytes_feature(csv_data[idx])
            else:
                feature_dict[f] = _float_feature(float(csv_data[idx]))
    return feature_dict
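
A minimal sketch of the _bytes_feature / _float_feature / _int_feature helpers assumed by the pred_input_fn examples (they are not shown in this listing); they simply wrap raw CSV values into tf.train.Feature protos so the feature_dict can be packed into a tf.train.Example for the serving client:

# Hypothetical helpers, not taken from the project; TF 1.x tf.train proto API.
import tensorflow as tf

def _bytes_feature(value):
    values = value if isinstance(value, (list, tuple)) else [value]
    values = [v.encode('utf-8') if isinstance(v, str) else v for v in values]
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=values))

def _float_feature(value):
    values = value if isinstance(value, (list, tuple)) else [value]
    return tf.train.Feature(float_list=tf.train.FloatList(value=values))

def _int_feature(value):
    values = value if isinstance(value, (list, tuple)) else [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))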
Example No. 11
def main():

    CONFIG = Config()
    model_conf = CONFIG.read_model_conf()['model_conf']
    traindata_list = FileListGenerator(model_conf['data_dir_train']).generate()
    testdata_list = FileListGenerator(model_conf['data_dir_pred']).generate()

    model = build_estimator()

    traindata = next(traindata_list)
    testdata = next(testdata_list)

    t0 = time.time()
    tf.logging.info('Start training {}'.format(traindata))

    model.train(input_fn=lambda: input_fn(traindata, 'train'),
                hooks=None,
                steps=None,
                max_steps=None,
                saving_listeners=None)
    t1 = time.time()
    tf.logging.info('Finish training {}, take {} mins'.format(
        traindata, float((t1 - t0) / 60)))

    tf.logging.info('Start evaluating {}'.format(testdata))
    t2 = time.time()

    results = model.evaluate(
        input_fn=lambda: input_fn(testdata, 'eval'),
        steps=None,  # Number of steps for which to evaluate model.
        hooks=None,
        checkpoint_path=None,  # latest checkpoint in model_dir is used.
        name=None)
    t3 = time.time()
    tf.logging.info('Finish evaluation {}, take {} mins'.format(
        testdata, float((t3 - t2) / 60)))

    # Display evaluation metrics
    for key in sorted(results):
        print('{}: {}'.format(key, results[key]))
Example No. 12
def pred_input_fn(csv_data):
    """Prediction input fn for a single data, used for serving client"""
    conf = Config()
  #  feature = conf.read_schema_conf().values()
  #  feature_unused = conf.get_feature_name('unused')
    feature_conf = conf.read_feature_conf()[1]
    csv_default = TF_Data('/home/zhangqifan/data/part_0.csv')._column_to_csv_defaults()
    csv_default.pop('label')
    print(csv_default)

    feature_dict = {}
    for idx, f in enumerate(csv_default.keys()):
        print(f)
        print(type(csv_default[f]))
        if f in feature_conf:

            if csv_default[f] == ['']:
                print('yes')
                feature_dict[f] = _bytes_feature(csv_data[idx])
            else:
                feature_dict[f] = _float_feature(float(csv_data[idx]))
    return feature_dict
Example No. 13
 def __init__(self, data_file):
     # check the file exists; turn it into a list so that data_file can be either a file or a directory.
     assert tf.gfile.Exists(data_file), (
         'data file: {} not found. Please check input data path'.format(data_file))
     if tf.gfile.IsDirectory(data_file):
         data_file_list = [f for f in tf.gfile.ListDirectory(data_file) if not f.startswith('.')]
         data_file = [data_file + '/' + file_name for file_name in data_file_list]
     self._data_file = data_file
     self._conf = Config()
     self._train_conf = self._conf.train
     self._dist_conf = self._conf.distribution
     self._shuffle_buffer_size = self._train_conf["num_examples"]
     self._num_parallel_calls = self._train_conf["num_parallel_calls"]
     self._train_epochs = self._train_conf["train_epochs"]
Example No. 14
def main():
    CONFIG = Config()
    model_conf = CONFIG.read_model_conf()['model_conf']
    model = build_estimator()
    predictions = model.predict(input_fn=lambda: input_fn('/home/leadtek/zhangqifan/reflux_user_pro/data/pred_data/all_data.csv','pred'),
                                predict_keys=None,
                                hooks=None,
                                checkpoint_path=None)  # defaults None to use latest_checkpoint
    res = []
    for pred_dict in predictions:  # dict{probabilities, classes, class_ids}
        opt = []
        class_id = pred_dict['class_ids'][0]
        opt.append(class_id)
        probability = pred_dict['probabilities']
        opt.append(probability[1])
        res.append(opt)
        # print('class_id:',class_id,'probability:',probability)
    res_df = pd.DataFrame(res, columns=['class_id','probability'])
    x = res_df[res_df['class_id'].isin([1])]
    sample = pd.read_csv("/home/leadtek/zhangqifan/reflux_user_pro/data/opt_all_data.csv",sep=' ')
    res_sample = pd.concat([sample,res_df],axis=1)
    res_sample.to_csv(r"/home/leadtek/zhangqifan/reflux_user_pro/res.csv", header=True, index=False,
                                    sep=' ')
Example No. 15
def wenqi_pred_input_fn(csv_data):
    """Prediction input fn for a single data, used for serving client"""
    conf = Config()
    feature = conf.get_feature_name()
    feature_unused = conf.get_feature_name('unused')
    feature_conf = conf.read_feature_conf()
    csv_default = column_to_dtype(feature, feature_conf)
    csv_default.pop('label')

    feature_dict = {}
    for idx, f in enumerate(csv_default.keys()):
        if f in feature_unused:
            continue
        else:
            # print(csv_default[f])
            if csv_default[f] == tf.string:
                # for i in range(FLAGS.num_tests):
                csv_data_list = [csv_data[idx] for i in range(FLAGS.num_tests)]
                feature_dict[f] = _bytes_feature(csv_data_list)
            elif csv_default[f] == tf.int32 or csv_default[f] == tf.int64:
                feature_dict[f] = _int_feature(int(csv_data[idx]))
            else:
                feature_dict[f] = _float_feature(float(csv_data[idx]))
    return feature_dict
Example No. 16
def gen_sample_csv():
    """
    Generate sample csv that contains both hashed and one-hot-encoded features
    """
    # schemas
    SCHEMA = Config().read_schema(
    )  # dict id -> col_name, e.g. SCHEMA[1]='clk'
    del SCHEMA[1]

    # load data
    df = pd.read_csv("../data/pred/pred1.csv")

    # save to csv
    sample_col = [
        "request_id", "account_id", "adplan_id", "os", "client_type", "hour"
    ]
    sample_table = df.loc[:, sample_col]
    sample_table.to_csv("../data/sample/sample.csv",
                        header=sample_col,
                        index=False)
    print("Csv generation finished.")
Example No. 17
class LR(object):
    '''
    LR class
    LR model training and prediction.
    '''
    def __init__(self, data_file, mode):
        self._conf = Config()
        self.lr_conf = self._conf.read_model_conf()['lr_conf']
        self._data_file = data_file
        self._mode = mode
        self._gbdt_spr = GBDT_spr(self._data_file)

    def lr_model(self):
        '''
        Train the LR model, or run prediction and evaluation.
        :return: (accuracy, AUC) in 'pred' mode
        '''
        if self._mode == 'train':
            gbdt_features, y_label = self._gbdt_spr.gbdt_model(self._mode)
            grd_lm = LogisticRegression(penalty=self.lr_conf['penalty'],
                                        solver=self.lr_conf['solver'],
                                        C=float(self.lr_conf['c']))
            grd_lm.fit(gbdt_features, y_label)
            joblib.dump(grd_lm, os.path.join(MODEL_DIR, "lr_model.m"))

        else:
            gbdt_features, y_label = self._gbdt_spr.gbdt_model(self._mode)
            grd_lm = joblib.load(os.path.join(MODEL_DIR, "lr_model.m"))

            y_pred_grd_lm = grd_lm.predict_proba(gbdt_features)[:, 1]
            pred_res = grd_lm.predict(gbdt_features)
            accuracy_score = metrics.accuracy_score(y_label, pred_res)

            fpr_grd_lm, tpr_grd_lm, _ = metrics.roc_curve(
                y_label, y_pred_grd_lm)
            roc_auc = metrics.auc(fpr_grd_lm, tpr_grd_lm)

            AUC_Score = metrics.roc_auc_score(y_label, y_pred_grd_lm)

            return accuracy_score, AUC_Score
Example No. 18
def main(unused_argv):
    CONFIG = Config()
    print("Using TensorFlow Version %s" % tf.__version__)
    assert "1.4" <= tf.__version__, "Need TensorFlow r1.4 or Later."
    print('\nModel Type: {}'.format(FLAGS.model_type))
    model_dir = os.path.join(FLAGS.model_dir, FLAGS.model_type)
    print('\nModel Directory: {}'.format(model_dir))

    print("\nUsing Train Config:")
    for k, v in CONFIG.train.items():
        print('{}: {}'.format(k, v))
    print("\nUsing Model Config:")
    for k, v in CONFIG.model.items():
        print('{}: {}'.format(k, v))

    if not FLAGS.keep_train:
        # Clean up the model directory if not keep training
        shutil.rmtree(model_dir, ignore_errors=True)
        print('Remove model directory: {}'.format(model_dir))
    model = build_custom_estimator(model_dir, FLAGS.model_type)
    tf.logging.info('Build estimator: {}'.format(model))
    train_and_eval_api(model)
Example No. 19
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import os
import sys
import time

import tensorflow as tf

from lib.read_conf import Config
from lib.dataset import input_fn
from lib.build_estimator import build_estimator
from lib.utils.util import elapse_time

CONFIG = Config().train
parser = argparse.ArgumentParser(description='Evaluate Wide and Deep Model.')

parser.add_argument('--model_dir',
                    type=str,
                    default=CONFIG["model_dir"],
                    help='Model checkpoint dir for evaluating.')

parser.add_argument('--model_type',
                    type=str,
                    default=CONFIG["model_type"],
                    help="Valid model types: {'wide', 'deep', 'wide_deep'}.")

parser.add_argument('--test_data',
                    type=str,
                    default=CONFIG["test_data"],
Example No. 20
from lib.read_conf import Config
from lib.utils.model_util import activation_fn
from lib.joint import WideAndDeepClassifier

# wide_deep columns
categorical_column_with_identity = tf.feature_column.categorical_column_with_identity
categorical_column_with_hash_bucket = tf.feature_column.categorical_column_with_hash_bucket
categorical_column_with_vocabulary_list = tf.feature_column.categorical_column_with_vocabulary_list
crossed_column = tf.feature_column.crossed_column
bucketized_column = tf.feature_column.bucketized_column
# deep columns
embedding_column = tf.feature_column.embedding_column
indicator_column = tf.feature_column.indicator_column
numeric_column = tf.feature_column.numeric_column

CONF = Config()
if CONF.train['pos_sample_loss_weight'] is None and CONF.train[
        'neg_sample_loss_weight'] is None:
    weight_column = None
else:
    weight_column = 'weight_column'
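
# A minimal sketch (assumption, not this project's code): 'weight_column' names a
# per-example weight feature that the estimator head multiplies into the loss.
# For 0/1 labels the weight reduces to
#     label * pos_sample_loss_weight + (1 - label) * neg_sample_loss_weight
# and the input_fn would add it as features['weight_column'].
def _example_weight(labels, pos_w, neg_w):
    labels = tf.cast(labels, tf.float32)
    return labels * pos_w + (1.0 - labels) * neg_w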


def _build_model_columns():
    """
    Build wide and deep feature columns from the custom feature conf using the tf.feature_column API
    wide_columns: category features + cross_features + [discretized continuous features]
    deep_columns: continuous features + category features(onehot or embedding for sparse features) + [cross_features(embedding)]
    Return:
        _CategoricalColumn and _DenseColumn instances from the tf.feature_column API
    """
Example No. 21
from absl import logging
import tensorflow as tf
import numpy as np

import os
import sys
PACKAGE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, PACKAGE_DIR)

from lib.read_conf import Config
from lib.dataset import input_fn
from lib.build_estimator import build_custom_estimator, build_estimator

TEST_CSV = os.path.join(os.path.dirname(PACKAGE_DIR), 'data/test/test2')
USED_FEATURE_KEY = Config().get_feature_name('used')


def _read_test_input(all_lines=False):
    if all_lines:
        return open(TEST_CSV).readlines()
    else:
        return open(TEST_CSV).readline()


TEST_INPUT_VALUES = _read_test_input()
TEST_INPUT_KEYS = Config().get_feature_name()
TEST_INPUT = dict(
    zip(TEST_INPUT_KEYS,
        TEST_INPUT_VALUES.strip().split("\t")[1:]))
for key in TEST_INPUT:
    TEST_INPUT[key] = TEST_INPUT[key].split(',')
Example No. 22
"""Export the trained estimator as a SavedModel for serving.
https://www.tensorflow.org/programmers_guide/saved_model#using_savedmodel_with_estimators
"""
from __future__ import print_function

import os
import sys

import tensorflow as tf

PACKAGE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, PACKAGE_DIR)

from lib.build_estimator import _build_model_columns, build_custom_estimator
from lib.read_conf import Config

model_base_dir = Config().train['model_dir']
CONF = Config().serving['SavedModel']

tf.app.flags.DEFINE_string('model_type', CONF['model_type'],
                           """Model type to export""")
tf.app.flags.DEFINE_string(
    'checkpoint_path', CONF['checkpoint_path'],
    """Directory to read training checkpoints. If None, use latest.""")
tf.app.flags.DEFINE_string('export_dir', CONF['model_dir'],
                           """Directory to export inference model.""")
tf.app.flags.DEFINE_integer('model_version', CONF['model_version'],
                            'version number of the model.')
FLAGS = tf.app.flags.FLAGS


def main(_):
Example No. 23
class GBDT_spr(object):
    '''
    GBDT_spr class
    GBDT model training; generates discrete (leaf-index) features.
    '''
    def __init__(self, data_file):
        self._data_file = data_file
        self._DataSet = DataSet(self._data_file)
        self._conf = Config()
        self.dataset = self._DataSet.input_fn()
        self.batch_dataset = self._DataSet.iter_minibatches()
        self._feature_colums = self._feature_colums()
        self.gbdt_conf = self._conf.read_model_conf()['gbdt_conf']
        self.model_conf = self._conf.read_model_conf()['model_conf']

    def _feature_colums(self):
        '''
        Build the feature column transforms.
        :return:
            gbdt_colums, type: list
        '''
        gbdt_colums = []
        feature_conf_dic = self._conf.read_feature_conf()[0]
        for feature, conf in feature_conf_dic.items():
            f_type, f_tran = conf["type"], conf["transform"]
            if f_type == 'category':
                if f_tran == 'multivalue':
                    opt = (feature, multivalue())
                    gbdt_colums.append(opt)
                if f_tran == 'one_hot':
                    opt = (feature, one_hot())
                    gbdt_colums.append(opt)

            else:
                opt = ([feature], min_max())
                gbdt_colums.append(opt)
        return gbdt_colums

    def gbdt_model(self, mode):
        '''
        Train the GBDT model and generate discrete features.
        :param
            mode: 'train' or 'pred'
        :return:
            lr_feat: one-hot encoded discrete features generated by the GBDT
            y: labels of the corresponding data
        '''
        mapper = DataFrameMapper(self._feature_colums, sparse=True)
        if mode == 'train':
            X = mapper.fit_transform(self.dataset)
            y = list(self.dataset['label'])
            grd = GradientBoostingClassifier(
                n_estimators=int(self.gbdt_conf['n_estimators']),
                #    random_state=int(self.gbdt_conf['random_state']),
                learning_rate=float(self.gbdt_conf['learning_rate']),
                #    subsample=float(self.gbdt_conf['subsample']),
                min_samples_leaf=int(self.gbdt_conf['min_samples_leaf']),
                max_depth=int(self.gbdt_conf['max_depth']),
                max_leaf_nodes=int(self.gbdt_conf['max_leaf_nodes']),
                min_samples_split=int(self.gbdt_conf['min_samples_split']))
            if self.model_conf['batch_size'] == '0':
                grd.fit(X, y)
                joblib.dump(grd, os.path.join(MODEL_DIR, "gbdt_model.m"))
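                # grd.apply(X) returns the leaf index each sample falls into for every
                # tree, shape (n_samples, n_estimators, 1) for binary classification;
                # reshaping + one-hot encoding turns those leaf ids into the sparse
                # features that feed the LR model.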
                new_feature = grd.apply(X)
                new_feature = new_feature.reshape(
                    -1, int(self.gbdt_conf['n_estimators']))
                enc = OneHotEncoder()
                enc.fit(new_feature)
                lr_feat = np.array(enc.transform(new_feature).toarray())
            else:
                for i, dataset in enumerate(self.batch_dataset):
                    #    print(dataset)
                    batch_X = mapper.fit_transform(dataset)
                    batch_y = list(dataset['label'])
                    grd.fit(batch_X, batch_y)
                    new_feature = grd.apply(batch_X)
                    new_feature = new_feature.reshape(
                        -1, int(self.gbdt_conf['n_estimators']))
                    enc = OneHotEncoder()
                    enc.fit(new_feature)
                    new_feature2 = np.array(
                        enc.transform(new_feature).toarray())
                    print(new_feature2)
                    if i == 0:
                        lr_feat = new_feature2
                    else:
                        lr_feat = np.concatenate([lr_feat, new_feature2],
                                                 axis=0)
                joblib.dump(grd, os.path.join(MODEL_DIR, "gbdt_model.m"))

        else:
            X = mapper.fit_transform(self.dataset)
            y = list(self.dataset['label'])
            grd = joblib.load(os.path.join(MODEL_DIR, "gbdt_model.m"))
            new_feature = grd.apply(X)
            new_feature = new_feature.reshape(
                -1, int(self.gbdt_conf['n_estimators']))
            enc = OneHotEncoder()
            enc.fit(new_feature)
            lr_feat = np.array(enc.transform(new_feature).toarray())
        return lr_feat, y
Example No. 24
# from tensorflow.python.ops import partitioned_variables
# from tensorflow.python.ops import state_ops
# from tensorflow.python.ops import variable_scope
# from tensorflow.python.summary import summary
# from tensorflow.python.training import sync_replicas_optimizer
# from tensorflow.python.training import training_util

# # The default learning rates are a historical artifact of the initial implementation.
# _DNN_LEARNING_RATE = 0.001  # 0.05
# _LINEAR_LEARNING_RATE = 0.005
# _CNN_LEARNING_RATE = 0.001

# # Weight decay learning rate implementation.
# decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)

CONF = Config().model
_linear_init_learning_rate = CONF['linear_initial_learning_rate'] or 0.005
_dnn_init_learning_rate = CONF['dnn_initial_learning_rate'] or 0.001
_cnn_init_learning_rate = CONF['cnn_initial_learning_rate'] or 0.001
_linear_decay_rate = CONF['linear_decay_rate'] or 1
_dnn_decay_rate = CONF['dnn_decay_rate'] or 1
_cnn_decay_rate = CONF['cnn_decay_rate'] or 1

_batch_size = Config().train['batch_size']
_num_examples = Config().train['num_examples']
decay_steps = _num_examples / _batch_size
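
# Minimal sketch (assumption, not the project's actual optimizer wiring):
# tf.train.exponential_decay (TF 1.x) implements the decayed_learning_rate
# formula in the comment above; global_step is the training step counter and
# decay_steps = num_examples / batch_size makes the rate decay once per epoch.
def _decayed_learning_rate(init_lr, decay_rate, global_step):
    return tf.train.exponential_decay(init_lr, global_step, decay_steps, decay_rate)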


def _wide_deep_combined_model_fn(
        features, labels, mode, head,
        model_type,
Example No. 25
from tensorflow.python.ops import init_ops
from tensorflow.python.layers import core as core_layers
from tensorflow.python.layers import normalization
from tensorflow.python.ops.losses import losses
from tensorflow.python.keras.engine import training
# from tensorflow.keras.regularizers import l1, l2, l1_l2

import os
import sys
PACKAGE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, PACKAGE_DIR)

from lib.read_conf import Config
from lib.utils.model_util import add_layer_summary, _get_optimizer_instance, _get_activation_fn

CONF = Config().model
ACTIVATION_FN = _get_activation_fn(CONF['dnn_activation_function'])
DROPOUT = CONF['dnn_dropout']
BATCH_NORM = CONF['dnn_batch_normalization']
DNN_L1 = CONF['dnn_l1']
DNN_L2 = CONF['dnn_l2']
regularizer_list = []
if DNN_L1:
    regularizer_list.append(tf.contrib.layers.l1_regularizer(DNN_L1))
if DNN_L2:
    regularizer_list.append(tf.contrib.layers.l2_regularizer(DNN_L2))
if len(regularizer_list) == 0:
    REG = None
else:
    REG = tf.contrib.layers.sum_regularizer(regularizer_list)
Example No. 26
def build_model_columns():
    def embedding_dim(dim):
        """empirical embedding dim"""
        return int(np.power(2, np.ceil(np.log(dim**0.5))))
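    # e.g. embedding_dim(10000): sqrt -> 100, ln(100) ~= 4.61, ceil -> 5, so a 2**5 = 32-dim embedding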

    wide_columns = []
    wide_dim = 0
    deep_columns = []
    deep_dim = 0
    normalizer_scaler = 'min_max'
    _feature_conf_dic = Config().read_feature_conf()[0]
    for feature, conf in _feature_conf_dic.items():
        f_type, f_tran, f_param, is_deep = conf["type"], conf[
            "transform"], conf["parameter"], conf["is_deep"]
        if feature == 'tag' or feature == 'main_actor':
            col = tf.feature_column.categorical_column_with_vocabulary_file(
                feature, vocabulary_file=f_param)
            wide_columns.append(col)
            wide_dim += int(conf["dim"])
            if is_deep:
                embed_dim = 20
                deep_columns.append(
                    tf.feature_column.embedding_column(
                        col,
                        dimension=embed_dim,
                        combiner='mean',
                        initializer=None,
                        ckpt_to_load_from=None,
                        tensor_name_in_ckpt=None,
                        max_norm=None,
                        trainable=True))
                deep_dim += embed_dim

        else:
            if f_type == 'category':
                if f_tran == 'hash_bucket':
                    hash_bucket_size = int(f_param)
                    col = tf.feature_column.categorical_column_with_hash_bucket(
                        feature,
                        hash_bucket_size=hash_bucket_size,
                        dtype=tf.string)
                    wide_columns.append(col)
                    wide_dim += hash_bucket_size
                    if is_deep:
                        embed_dim = embedding_dim(hash_bucket_size)
                        deep_columns.append(
                            tf.feature_column.embedding_column(
                                col,
                                dimension=embed_dim,
                                combiner='mean',
                                initializer=None,
                                ckpt_to_load_from=None,
                                tensor_name_in_ckpt=None,
                                max_norm=None,
                                trainable=True))
                        deep_dim += embed_dim
                elif f_tran == 'vocab':
                    col = tf.feature_column.categorical_column_with_vocabulary_list(
                        feature,
                        vocabulary_list=list(map(str, f_param)),
                        dtype=None,
                        default_value=-1,
                        num_oov_buckets=0)
                    wide_columns.append(col)
                    wide_dim += len(f_param)
                    if is_deep:
                        deep_columns.append(
                            tf.feature_column.indicator_column(col))
                        deep_dim += len(f_param)
                elif f_tran == 'identity':
                    num_buckets = f_param
                    col = tf.feature_column.categorical_column_with_identity(
                        feature, num_buckets=num_buckets, default_value=0)
                    wide_columns.append(col)
                    wide_dim += num_buckets
                    if is_deep:
                        deep_columns.append(
                            tf.feature_column.indicator_column(col))
                        deep_dim += num_buckets
            else:
                normalization_params = []
                normalization_params.append(int(f_param[0]))
                normalization_params.append(int(f_param[2]))
                normalizer_fn = normalizer_fn_builder(
                    normalizer_scaler, tuple(normalization_params))
                col = tf.feature_column.numeric_column(
                    feature,
                    shape=(1, ),
                    default_value=0,
                    dtype=tf.float32,
                    normalizer_fn=normalizer_fn)
                wide_columns.append(col)
                wide_dim += 1
                if is_deep:
                    deep_columns.append(col)
                    deep_dim += 1

    # for cross_features, hash_bucket_size, is_deep in cross_feature_list:
    #     cf_list = []
    #     for f in cross_features:
    #
    #         f_type = feature_conf_dic[f]["type"]
    #         f_tran = feature_conf_dic[f]["transform"]
    #         f_param = feature_conf_dic[f]["parameter"]
    #         if f_tran == 'identity':
    #             cf_list.append(tf.feature_column.categorical_column_with_identity(f, num_buckets=f_param,
    #                                                                               default_value=0))
    #         else:
    #             cf_list.append(f)
    #     col = tf.feature_column.crossed_column(cf_list, int(hash_bucket_size))
    #     wide_columns.append(col)
    #     wide_dim += int(hash_bucket_size)
    #     if is_deep:
    #         deep_columns.append(tf.feature_column.embedding_column(col, dimension=embedding_dim(int(hash_bucket_size))))
    #         deep_dim += embedding_dim(int(hash_bucket_size))

    tf.logging.info('Build total {} wide columns'.format(len(wide_columns)))
    for col in wide_columns:
        tf.logging.debug('Wide columns: {}'.format(col))
    tf.logging.info('Wide input dimension is: {}'.format(wide_dim))

    tf.logging.info('Build total {} deep columns'.format(len(deep_columns)))
    for col in deep_columns:
        tf.logging.debug('Deep columns: {}'.format(col))
    tf.logging.info('Deep input dimension is: {}'.format(deep_dim))

    return wide_columns, deep_columns
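
For a self-contained illustration of how the returned column lists are consumed, they can be plugged directly into the canned TF 1.x wide-and-deep estimator (this project wires them into its own custom estimator instead; model_dir and hidden_units below are placeholder values):

# Minimal usage sketch with the canned estimator; placeholder model_dir / hidden_units.
wide_columns, deep_columns = build_model_columns()
model = tf.estimator.DNNLinearCombinedClassifier(
    model_dir='./model_dir',
    linear_feature_columns=wide_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[128, 64, 32])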
Example No. 27
class LR(object):
    '''
    LR class
    LR model training and prediction.
    '''
    def __init__(self, data_file, mode):
        self._conf = Config()
        self._data_file = data_file
        self._Tf_Data = TF_Data(self._data_file)
        self.dataset_train = self._Tf_Data.gbdt_input()
        self.lr_conf = self._conf.read_model_conf()['lr_conf']

        self._mode = mode
        self._gbdt_spr = GBDT_spr(self._data_file).gbdt_model(self._mode)

    def lr_model(self):
        '''
        Train the LR model, or run prediction and evaluation.
        :return: (accuracy, AUC) in 'pred' mode
        '''
        if self._mode == 'train':
            grd_lm = SGDClassifier(penalty=self.lr_conf['penalty'],
                                   loss='log',
                                   warm_start=True)
            i = 0
            while True:
                try:
                    dataset = next(self._gbdt_spr)
                    batch_X = dataset[0]
                    batch_y = dataset[1]
                    print('start training LR epochs_%d' % i)
                    grd_lm = grd_lm.partial_fit(batch_X,
                                                batch_y,
                                                classes=[0, 1])
                    i += 1
                    del (dataset)
                    del (batch_y)
                    del (batch_X)
                    gc.collect()
                except StopIteration as e:
                    print('Generator return value:', e.value)
                    break
            joblib.dump(grd_lm, os.path.join(MODEL_DIR, "lr_model.m"))
        else:
            y_all_label = []
            y_all_pred_grd_lm = []
            pred_all_res = []
            grd_lm = joblib.load(os.path.join(MODEL_DIR, "lr_model.m"))
            while True:
                try:
                    dataset = next(self._gbdt_spr)
                    gbdt_features = dataset[0]
                    y_label = dataset[1]
                    y_pred_grd_lm = grd_lm.predict_proba(gbdt_features)[:, 1]
                    pred_res = grd_lm.predict(gbdt_features)
                    y_all_label.extend(y_label)
                    y_all_pred_grd_lm.extend(y_pred_grd_lm)
                    pred_all_res.extend(pred_res)
                    del (dataset)
                    del (gbdt_features)
                    gc.collect()
                except StopIteration as e:
                    print('Generator return value:', e.value)
                    break
            accuracy_score = metrics.accuracy_score(y_all_label, pred_all_res)
            fpr_grd_lm, tpr_grd_lm, _ = metrics.roc_curve(
                y_all_label, y_all_pred_grd_lm)
            roc_auc = metrics.auc(fpr_grd_lm, tpr_grd_lm)
            AUC_Score = metrics.roc_auc_score(y_all_label, y_all_pred_grd_lm)
            return accuracy_score, AUC_Score
Example No. 28
                                                     fractions={
                                                         '0': keep_prob,
                                                         '1': 1
                                                     },
                                                     seed=0).values()
    print('down sampling finished.')
    print(data.first())
    if os.path.exists(outpath):
        shutil.rmtree(outpath)
    data.map(lambda x: "\t".join(x)).saveAsTextFile(outpath)
    sc.stop()
    ss.stop()


if __name__ == '__main__':
    CONF = Config().read_data_process_conf()
    SCHEMA = Config().read_schema()
    feature_index_list = CONF['category_feature_index_list']
    keep_prob = CONF['downsampling_keep_ratio']
    conf = SparkConf().setAppName('wide_deep'). \
        set('spark.executor.memory', '10g').set('spark.driver.memory', '10g').setMaster('local[*]')
    sc = SparkContext(conf=conf)
    ss = SparkSession.builder.getOrCreate()
    inpath = '/Users/lapis-hong/Documents/NetEase/wide_deep/data/train'
    outpath = '/Users/lapis-hong/Documents/NetEase/wide_deep/data/spark'
    # if len(sys.argv) < 3:
    #     exit('Missing arguments: \nUsage: $ python data_process_local_test.py $inpath $outpath')
    if len(sys.argv) == 3:
        inpath = sys.argv[1]
        outpath = sys.argv[2]
    local_data_preprocess2(inpath, outpath)
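
The sampling call at the top of this snippet is truncated; the fractions dict and the trailing .values() suggest a label-keyed downsampling step. A minimal sketch of that pattern with PySpark's sampleByKey, assuming the label is the first tab-separated field (the original keying is not shown):

# Hypothetical sketch of label-keyed negative downsampling; the label is assumed
# to be the first tab-separated field, which the original snippet does not show.
def downsample_negatives(sc, inpath, keep_prob, seed=0):
    return (sc.textFile(inpath)
              .map(lambda line: line.split('\t'))
              .keyBy(lambda cols: cols[0])          # key each row by its label '0'/'1'
              .sampleByKey(False,                   # sample without replacement
                           {'0': keep_prob, '1': 1.0},
                           seed)
              .values())                            # drop the key, keep the row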
Example No. 29
from tensorflow.python.estimator.canned import head as head_lib

import os
import sys
PACKAGE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, PACKAGE_DIR)

from lib.read_conf import Config
from lib.linear import linear_logit_fn_builder
from lib.dnn import multidnn_logit_fn_builder
from lib.utils.model_util import add_layer_summary, check_no_sync_replicas_optimizer, activation_fn, get_optimizer_instance

# # Weight decay learning rate implementation.
# decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)

CONF = Config().model
_linear_init_learning_rate = CONF['linear_initial_learning_rate'] or 0.005
_dnn_init_learning_rate = CONF['dnn_initial_learning_rate'] or 0.001
_linear_decay_rate = CONF['linear_decay_rate'] or 1
_dnn_decay_rate = CONF['dnn_decay_rate'] or 1

_batch_size = Config().train['batch_size']
_num_examples = Config().train['num_examples']
decay_steps = _num_examples / _batch_size

_feature_sequence = Config().get_feature_name('sequence')  # sequence features


def _wide_deep_combined_model_fn(features,
                                 labels,
                                 mode,
Example No. 30
def main(unused_argv):
    CONFIG = Config()
    print("Using TensorFlow Version %s" % tf.__version__)
    # assert "1.4" <= tf.__version__, "Need TensorFlow r1.4 or Later."
    print('\nModel Type: {}'.format(FLAGS.model_type))
    model_dir = os.path.join(FLAGS.model_dir, FLAGS.model_type)
    print('\nModel Directory: {}'.format(model_dir))

    print("\nUsing Train Config:")
    for k, v in CONFIG.train.items():
        print('{}: {}'.format(k, v))
    print("\nUsing Model Config:")
    for k, v in CONFIG.model.items():
        print('{}: {}'.format(k, v))

    if not FLAGS.keep_train:
        # Clean up the model directory if not keep training
        shutil.rmtree(model_dir, ignore_errors=True)
        print('Remove model directory: {}'.format(model_dir))
    # model = build_estimator(model_dir, FLAGS.model_type)
    model = build_custom_estimator(model_dir, FLAGS.model_type)
    tf.logging.info('Build estimator: {}'.format(model))

    if CONFIG.train['dynamic_train']:
        train_fn = dynamic_train
        print("Using dynamic train mode.")
    else:
        train_fn = train_and_eval

    if CONFIG.distribution["is_distribution"]:
        print("Using PID: {}".format(os.getpid()))
        cluster = CONFIG.distribution["cluster"]
        job_name = CONFIG.distribution["job_name"]
        task_index = CONFIG.distribution["task_index"]
        print(
            "Using Distributed TensorFlow. Local host: {} Job_name: {} Task_index: {}"
            .format(cluster[job_name][task_index], job_name, task_index))
        cluster = tf.train.ClusterSpec(CONFIG.distribution["cluster"])
        server = tf.train.Server(cluster,
                                 job_name=job_name,
                                 task_index=task_index)
        # distributed mode does not include eval.
        train_fn = train
        if job_name == 'ps':
            # wait for incoming connection forever
            server.join()
            # sess = tf.Session(server.target)
            # queue = create_done_queue(task_index, num_workers)
            # for i in range(num_workers):
            #     sess.run(queue.dequeue())
            #     print("ps {} received worker {} done".format(task_index, i)
            # print("ps {} quitting".format(task_index))
        else:  # TODO: supervisor & MonitoredTrainingSession & experiment (deprecated)
            train_fn(model)
            # train_and_eval(model)
            # Each worker only needs to contact the PS task(s) and the local worker task.
            # config = tf.ConfigProto(device_filters=[
            #     '/job:ps', '/job:worker/task:%d' % arguments.task_index])
            # with tf.device(tf.train.replica_device_setter(
            #         worker_device="/job:worker/task:%d" % task_index,
            #         cluster=cluster)):
            # e = _create_experiment_fn()
            # e.train_and_evaluate()  # call estimator's train() and evaluate() method
            # hooks = [tf.train.StopAtStepHook(last_step=10000)]
            # with tf.train.MonitoredTrainingSession(
            #         master=server.target,
            #         is_chief=(task_index == 0),
            #         checkpoint_dir=args.model_dir,
            #         hooks=hooks) as mon_sess:
            #     while not mon_sess.should_stop():
            #         # mon_sess.run()
            #         classifier.fit(input_fn=train_input_fn, steps=1)
    else:
        # local run
        train_fn(model)