Example #1
def pred_input_fn(csv_data):
    """Prediction input fn for a single data, used for serving client"""
    conf = Config()
    feature = conf.get_feature_name()
    feature_unused = conf.get_feature_name('unused')
    feature_conf = conf.read_feature_conf()
    csv_default = column_to_dtype(feature, feature_conf)
    csv_default.pop('label')

    feature_dict = {}
    for idx, f in enumerate(csv_default.keys()):
        if f in feature_unused:
            continue
        else:
            if csv_default[f] == tf.string:
                feature_dict[f] = _bytes_feature(csv_data[idx])
            else:
                feature_dict[f] = _float_feature(float(csv_data[idx]))
    return feature_dict
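The _bytes_feature and _float_feature helpers used above are not shown in this snippet. A minimal sketch of what they typically look like (standard tf.train.Feature wrappers; the exact signatures in the original project may differ):

import tensorflow as tf

def _bytes_feature(value):
    # Wrap a string/bytes value (or a list of values) into a tf.train.Feature.
    if not isinstance(value, (list, tuple)):
        value = [value]
    value = [v.encode('utf-8') if isinstance(v, str) else v for v in value]
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))

def _float_feature(value):
    # Wrap a float value (or a list of floats) into a tf.train.Feature.
    if not isinstance(value, (list, tuple)):
        value = [value]
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))

def _int_feature(value):
    # Wrap an int value (or a list of ints) into a tf.train.Feature.
    if not isinstance(value, (list, tuple)):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))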
Example #2
def pred_input_fn(csv_data):
    """Prediction input fn for a single example, used by the serving client."""
    conf = Config()
    feature_conf = conf.read_feature_conf()[1]
    # Column defaults are taken from the same schema used for the training data.
    csv_default = TF_Data('/home/zhangqifan/data/part_0.csv')._column_to_csv_defaults()
    csv_default.pop('label')

    feature_dict = {}
    for idx, f in enumerate(csv_default.keys()):
        if f in feature_conf:
            # A default of [''] marks a string column; everything else is numeric.
            if csv_default[f] == ['']:
                feature_dict[f] = _bytes_feature(csv_data[idx])
            else:
                feature_dict[f] = _float_feature(float(csv_data[idx]))
    return feature_dict
Example #3
def wenqi_pred_input_fn(csv_data):
    """Prediction input fn for a single example, used by the serving client."""
    conf = Config()
    feature = conf.get_feature_name()
    feature_unused = conf.get_feature_name('unused')
    feature_conf = conf.read_feature_conf()
    csv_default = column_to_dtype(feature, feature_conf)
    csv_default.pop('label')

    feature_dict = {}
    for idx, f in enumerate(csv_default.keys()):
        if f in feature_unused:
            continue
        else:
            if csv_default[f] == tf.string:
                # Replicate the string value num_tests times to build a batched request.
                csv_data_list = [csv_data[idx] for _ in range(FLAGS.num_tests)]
                feature_dict[f] = _bytes_feature(csv_data_list)
            elif csv_default[f] == tf.int32 or csv_default[f] == tf.int64:
                feature_dict[f] = _int_feature(int(csv_data[idx]))
            else:
                feature_dict[f] = _float_feature(float(csv_data[idx]))
    return feature_dict
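All three prediction input fns return a plain dict of tf.train.Feature values. A minimal sketch of how a serving client might pack that dict into a serialized tf.train.Example (the wrapper function and its name are illustrative, not taken from the original project):

import tensorflow as tf

def serialize_for_serving(csv_data):
    # Build the {feature_name: tf.train.Feature} dict as shown above.
    feature_dict = pred_input_fn(csv_data)
    # Wrap it into a tf.train.Example and serialize it, which is the format a
    # TensorFlow Serving model exported with serialized-Example inputs expects.
    example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
    return example.SerializeToString()

# csv_data is assumed to be one already-split CSV row (a list of column values):
# serialized = serialize_for_serving(csv_data)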
Example #4
class GBDT_spr(object):
    '''
    GBDT_spr class
    Trains the gbdt model and generates discrete features.
    '''
    def __init__(self, data_file):
        self._data_file = data_file
        self._DataSet = DataSet(self._data_file)
        self._conf = Config()
        self.dataset = self._DataSet.input_fn()
        self.batch_dataset = self._DataSet.iter_minibatches()
        self._feature_colums = self._feature_colums()
        self.gbdt_conf = self._conf.read_model_conf()['gbdt_conf']
        self.model_conf = self._conf.read_model_conf()['model_conf']

    def _feature_colums(self):
        '''
        Build the feature columns for the gbdt stage.
        :return:
            gbdt_colums, type: list
        '''
        gbdt_colums = []
        feature_conf_dic = self._conf.read_feature_conf()[0]
        for feature, conf in feature_conf_dic.items():
            f_type, f_tran = conf["type"], conf["transform"]
            if f_type == 'category':
                if f_tran == 'multivalue':
                    opt = (feature, multivalue())
                    gbdt_colums.append(opt)
                if f_tran == 'one_hot':
                    opt = (feature, one_hot())
                    gbdt_colums.append(opt)

            else:
                opt = ([feature], min_max())
                gbdt_colums.append(opt)
        return gbdt_colums

    def gbdt_model(self, mode):
        '''
        Train the gbdt model and generate discrete features.
        :param
            mode: 'train' or 'pred'
        :return:
            lr_feat: discrete features generated by the gbdt
            y: labels of the corresponding data
        '''
        mapper = DataFrameMapper(self._feature_colums, sparse=True)
        if mode == 'train':
            X = mapper.fit_transform(self.dataset)
            y = list(self.dataset['label'])
            grd = GradientBoostingClassifier(
                n_estimators=int(self.gbdt_conf['n_estimators']),
                #    random_state=int(self.gbdt_conf['random_state']),
                learning_rate=float(self.gbdt_conf['learning_rate']),
                #    subsample=float(self.gbdt_conf['subsample']),
                min_samples_leaf=int(self.gbdt_conf['min_samples_leaf']),
                max_depth=int(self.gbdt_conf['max_depth']),
                max_leaf_nodes=int(self.gbdt_conf['max_leaf_nodes']),
                min_samples_split=int(self.gbdt_conf['min_samples_split']))
            if self.model_conf['batch_size'] == '0':
                grd.fit(X, y)
                joblib.dump(grd, os.path.join(MODEL_DIR, "gbdt_model.m"))
                new_feature = grd.apply(X)
                new_feature = new_feature.reshape(
                    -1, int(self.gbdt_conf['n_estimators']))
                enc = OneHotEncoder()
                enc.fit(new_feature)
                lr_feat = np.array(enc.transform(new_feature).toarray())
            else:
                for i, dataset in enumerate(self.batch_dataset):
                    #    print(dataset)
                    batch_X = mapper.fit_transform(dataset)
                    batch_y = list(dataset['label'])
                    grd.fit(batch_X, batch_y)
                    new_feature = grd.apply(batch_X)
                    new_feature = new_feature.reshape(
                        -1, int(self.gbdt_conf['n_estimators']))
                    enc = OneHotEncoder()
                    enc.fit(new_feature)
                    new_feature2 = np.array(
                        enc.transform(new_feature).toarray())
                    if i == 0:
                        lr_feat = new_feature2
                    else:
                        lr_feat = np.concatenate([lr_feat, new_feature2],
                                                 axis=0)
                joblib.dump(grd, os.path.join(MODEL_DIR, "gbdt_model.m"))

        else:
            X = mapper.fit_transform(self.dataset)
            y = list(self.dataset['label'])
            grd = joblib.load(os.path.join(MODEL_DIR, "gbdt_model.m"))
            new_feature = grd.apply(X)
            new_feature = new_feature.reshape(
                -1, int(self.gbdt_conf['n_estimators']))
            enc = OneHotEncoder()
            enc.fit(new_feature)
            lr_feat = np.array(enc.transform(new_feature).toarray())
        return lr_feat, y
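A minimal usage sketch for GBDT_spr, assuming the config files read by Config() are in place (the data path and the downstream LogisticRegression step are illustrative, not part of the original class):

from sklearn.linear_model import LogisticRegression

# Train the GBDT and get the one-hot encoded leaf features plus the labels.
gbdt = GBDT_spr('/path/to/train_data.csv')   # hypothetical data path
lr_feat, y = gbdt.gbdt_model('train')

# The discrete features generated by the GBDT feed a downstream LR model.
lr = LogisticRegression(max_iter=1000)
lr.fit(lr_feat, y)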
Example #5
class DataSet(object):
    '''
    DataSet class
    Preprocesses the input data.
    '''
    def __init__(self, data_file):
        self._conf = Config()
        self._data_file = data_file
        self._feature_conf_dic = self._conf.read_feature_conf()[0]
        self._feature_used = self._conf.read_feature_conf()[1]
        self._all_features = self._conf.read_schema_conf()
        self.model_conf = self._conf.read_model_conf()['model_conf']
        self._csv_defaults = self._column_to_csv_defaults()

    def _column_to_csv_defaults(self):
        '''
        Define the input dtypes and collect the column names.
        :return:
            all_columns: name of each column in the data, type: list
            csv_defaults: default dtype of each column, {'feature name': dtype, ...}
        '''
        features = []
        for i in range(1, len(self._all_features) + 1):
            features.append(self._all_features[str(i)])
        all_columns = ['label'] + features
        csv_defaults = {}
        # Use built-in Python types: the np.int / np.str / np.float aliases
        # are deprecated and removed in recent NumPy releases.
        csv_defaults['label'] = int
        for f in self._all_features.values():
            if f in self._feature_used:
                conf = self._feature_conf_dic[f]
                if conf['type'] == 'category':
                    if conf['transform'] == 'identity':
                        csv_defaults[f] = int
                    else:
                        csv_defaults[f] = str
                else:
                    csv_defaults[f] = float
            else:
                csv_defaults[f] = str
        return all_columns, csv_defaults

    def iter_minibatches(self):
        '''
        Generator: given a file stream (e.g. a large file), yield minibatch_size rows at a time.
        :return:
            each chunk converted to a DataFrame
        '''

        cur_line_num = 0
        dataset = []
        all_columns, csv_defaults = self._csv_defaults
        # A context manager closes the file even if the caller stops iterating early.
        with open(self._data_file, 'rt', encoding="utf-8") as csvfile:
            reader = csv.reader(csvfile, delimiter=' ')
            for line in reader:
                dataset.append(line)
                cur_line_num += 1
                if cur_line_num >= int(self.model_conf['batch_size']):
                    dataset = pd.DataFrame(dataset, columns=all_columns)
                    dataset = dataset.astype(csv_defaults)
                    yield dataset
                    dataset = []
                    cur_line_num = 0
            # Yield the final (possibly smaller) batch.
            dataset = pd.DataFrame(dataset, columns=all_columns)
            dataset = dataset.astype(csv_defaults)
            yield dataset

    def input_fn(self):
        '''
        Read the csv file into a DataFrame and fill NaN values.
        :return:
            dataset
        '''
        all_columns, csv_defaults = self._csv_defaults
        dataset = pd.read_csv(self._data_file,
                              sep=' ',
                              names=all_columns,
                              dtype=csv_defaults)
        dataset = dataset.fillna('-')
        return dataset
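A minimal usage sketch for DataSet, assuming a space-separated data file that matches the configured schema (the path is a placeholder):

ds = DataSet('/path/to/train_data.csv')   # hypothetical path

# Load the whole file at once as a DataFrame...
df = ds.input_fn()

# ...or stream it in chunks sized by model_conf['batch_size'].
for batch_df in ds.iter_minibatches():
    print(batch_df.shape)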
Example #6
class TF_Data(object):
    def __init__(self, data_file):
        self._conf = Config()
        self._data_file = data_file
        self._feature_conf_dic = self._conf.read_feature_conf()[0]
        self._feature_used = self._conf.read_feature_conf()[1]
        self._all_features = self._conf.read_schema_conf()
        self.model_conf = self._conf.read_model_conf()['model_conf']
        self._csv_defaults = self._column_to_csv_defaults()

    def _column_to_csv_defaults(self):
        """
        定义csv文件中各个特征默认的数据类型
        :return:
            OrderedDict {'feature name': [''],...}
        """
        csv_defaults = OrderedDict()
        csv_defaults['label'] = [0]
        for f in self._all_features.values():
            if f == 'label':
                csv_defaults['label'] = [0]
            elif f in self._feature_used:
                conf = self._feature_conf_dic[f]
                if conf['type'] == 'category':
                    if conf['transform'] == 'identity':
                        csv_defaults[f] = [0]
                    else:
                        csv_defaults[f] = ['']
                else:
                    csv_defaults[f] = [0.0]
            else:
                csv_defaults[f] = ['']
        return csv_defaults

    def _parse_csv(self, field_delim='&', na_value='-'):
        """
        csv数据的解析函数
        :param field_delim: csv字段分隔符
        :param na_value: 使用csv默认值填充na_value
        :return:
            feature dict: {feature: Tensor ... }
        """
        csv_defaults = self._csv_defaults

        def decode_csv(value):
            parsed_line = tf.decode_csv(value,
                                        record_defaults=list(
                                            csv_defaults.values()),
                                        field_delim=field_delim,
                                        na_value=na_value)
            features = dict(zip(self._csv_defaults.keys(), parsed_line))
            label = None
            for f in self._all_features.values():
                if f != 'label':
                    if f not in self._feature_used:
                        features.pop(f)
                else:
                    label = features.pop('label')
            return features, label

        return decode_csv

    def input_fn(self, mode):
        """
        生成dataset(tensor)
        :return:
            generator
        """
        dataset = tf.data.TextLineDataset(self._data_file)
        dataset = dataset.map(self._parse_csv())  # Decode each line

        # Shuffle, repeat, and batch the examples.
        if mode == 'train':
            dataset = dataset.repeat(10)
        padding_dic = {k: () for k in self._feature_used}
        padded_shapes = (padding_dic, ())
        dataset = dataset.padded_batch(int(self.model_conf['batch_size']),
                                       padded_shapes=padded_shapes)

        # Return the read end of the pipeline.
        return dataset.make_one_shot_iterator().get_next()
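A minimal sketch of consuming this TF1-style pipeline in a session (the path and mode are placeholders):

import tensorflow as tf

tf_data = TF_Data('/path/to/train_data.csv')   # hypothetical path
features, label = tf_data.input_fn('train')    # next-element tensors

with tf.Session() as sess:
    while True:
        try:
            batch_features, batch_label = sess.run([features, label])
        except tf.errors.OutOfRangeError:
            break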
Example #7
# fix ImportError: No module named lib.*
import sys
import xgb_model_zzr
import xgb2tensorflow

conf = Config()
train_conf = conf.train
num_parallel_calls = train_conf["num_parallel_calls"]
shuffle_buffer_size = train_conf["num_examples"]
train_epochs = train_conf["train_epochs"]

use_weight = False
feature = conf.get_feature_name()  # all features
feature_used = conf.get_feature_name('used')  # used features
feature_unused = conf.get_feature_name('unused')  # unused features
feature_conf = conf.read_feature_conf()  # feature conf dict
csv_defaults_values = [0.0] * 31 + [0.0]
feature_name = [
    "id", "vars0", "vars1", "vars2", "vars3", "vars4", "vars5", "vars6",
    "vars7", "vars8", "vars9", "vars10", "vars11", "vars12", "vars13",
    "vars14", "vars15", "vars16", "vars17", "vars18", "vars19", "vars20",
    "vars21", "vars22", "vars23", "vars24", "vars25", "vars26", "vars27",
    "vars28", "vars29", "label"
]
# self._multivalue = self._train_conf["multivalue"]

#
# csv_defaults_keys = ["var01", "var02", "var03", "var04", "var05", "var06", "var07", "var08", "var09", "var10", "var11",
#                      "var12", "var13", "var14", "var15", "var16", "var17", "var18", "var19", "var20", "var21", "var22",
#                      "var23", "var24", "var25", "var26", "var27", "var28", "var29", "var30", "var31", "var32", "var33",
#                      "var34", "var35", "var36", "var37", "var38", "var39", "var40", "var41", "var42", "var43", "var44",
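The feature_name and csv_defaults_values lists above line up (32 columns each) for a decode_csv-style parser like the ones in the other examples. A minimal sketch of how they might be used, assuming a space-delimited file and the TF1 API:

import tensorflow as tf

def parse_line(line):
    # Map one CSV line onto a {column: tensor} dict using the defaults above.
    record_defaults = [[v] for v in csv_defaults_values]
    parsed = tf.decode_csv(line, record_defaults=record_defaults, field_delim=' ')
    columns = dict(zip(feature_name, parsed))
    label = columns.pop('label')
    return columns, label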
Example #8
class TF_Data(object):
    def __init__(self, data_file):
        self._conf = Config()
        self._data_file = data_file
        self._feature_conf_dic = self._conf.read_feature_conf()[0]
        self._feature_used = self._conf.read_feature_conf()[1]
        self._all_features = self._conf.read_schema_conf()
        self.model_conf = self._conf.read_model_conf()['model_conf']
        self._csv_defaults = self._column_to_csv_defaults()

    def _normalizer_fn_builder(self, scaler, normalization_params):
        """normalizer_fn builder"""
        if scaler == 'min_max':

            return lambda x: (x - normalization_params[0]) / (
                max(normalization_params[1] - normalization_params[0], 0.001))
        elif scaler == 'standard':
            return lambda x: (x - normalization_params[0]) / normalization_params[1]
        else:
            return lambda x: tf.log(x)


    def _column_to_csv_defaults(self):
        """
        定义csv文件中各个特征默认的数据类型
        :return:
            OrderedDict {'feature name': [''],...}
        """
        csv_defaults = OrderedDict()
        csv_defaults['label'] = [0]
        for f in self._all_features.values():
            if f in self._feature_used:
                conf = self._feature_conf_dic[f]
                if conf['type'] == 'category':
                    if conf['transform'] == 'identity':
                        csv_defaults[f] = [0]
                    else:
                        csv_defaults[f] = ['']
                else:
                    csv_defaults[f] = [0.0]
            else:
                csv_defaults[f] = ['']
        return csv_defaults

    def _parse_csv(self, field_delim=' ', na_value='-'):
        """
        csv数据的解析函数
        :param field_delim: csv字段分隔符
        :param na_value: 使用csv默认值填充na_value
        :return:
            feature dict: {feature: Tensor ... }
        """
        csv_defaults = self._csv_defaults

        def decode_csv(value):
            parsed_line = tf.decode_csv(value,
                                        record_defaults=list(csv_defaults.values()),
                                        field_delim=field_delim,
                                        na_value=na_value)
            features = dict(zip(self._csv_defaults.keys(), parsed_line))
            # Drop columns that are not used by the model.
            for f in self._all_features.values():
                if f not in self._feature_used:
                    features.pop(f)
            # Multivalue columns are comma-separated; split them into individual values.
            for f, tensor in features.items():
                if f in ('tag', 'main_actor'):
                    features[f] = tf.string_split([tensor], ',').values
            label = features.pop('label')
            return features, label

        return decode_csv

    def input_fn(self):
        """
        生成dataset(tensor)
        :return:
            generator
        """
        dataset = tf.data.TextLineDataset(self._data_file)
        dataset = dataset.map(self._parse_csv())  # Decode each line

        # Shuffle, repeat, and batch the examples.
        # dataset = dataset.shuffle(10).repeat(1)
        padding_dic = {k: () for k in self._feature_used}
        padding_dic['tag'] = [None]
        # padding_dic['main_actor'] = [None]
        padded_shapes = (padding_dic, ())
        dataset = dataset.padded_batch(int(self.model_conf['batch_size']), padded_shapes=padded_shapes)

        # Return the read end of the pipeline.
        return dataset.make_one_shot_iterator().get_next()

    def feat_column(self):
        """
        特征列处理
        :return:
            wide_columns
        """
        wide_columns = []
        wide_dim = 0
        for feature, conf in self._feature_conf_dic.items():
            f_type, f_tran, f_param = conf["type"], conf["transform"], conf["parameter"]
            if feature == 'tag' or feature == 'main_actor':
                col = tf.feature_column.categorical_column_with_vocabulary_file(feature,
                                                                                vocabulary_file=f_param)
                col = tf.feature_column.indicator_column(col)
                wide_columns.append(col)
                wide_dim += int(conf["dim"])
            else:
                if f_type == 'category':
                    if f_tran == 'hash_bucket':
                        hash_bucket_size = int(f_param)
                        col = tf.feature_column.categorical_column_with_hash_bucket(feature,
                                                                                    hash_bucket_size=hash_bucket_size,
                                                                                    dtype=tf.string)
                        col = tf.feature_column.indicator_column(col)
                        wide_columns.append(col)
                        wide_dim += hash_bucket_size
                    elif f_tran == 'vocab':
                        col = tf.feature_column.categorical_column_with_vocabulary_list(feature,
                                                                                        vocabulary_list=list(map(str, f_param)),
                                                                                        dtype=None,
                                                                                        default_value=-1,
                                                                                        num_oov_buckets=0)
                        col = tf.feature_column.indicator_column(col)
                        wide_columns.append(col)
                        wide_dim += len(f_param)
                    elif f_tran == 'identity':
                        # num_buckets must be an int for categorical_column_with_identity.
                        num_buckets = int(f_param)
                        col = tf.feature_column.categorical_column_with_identity(feature,
                                                                                 num_buckets=num_buckets,
                                                                                 default_value=0)
                        col = tf.feature_column.indicator_column(col)
                        wide_columns.append(col)
                        wide_dim += num_buckets
                else:
                    normalizer_fn = self._normalizer_fn_builder(scaler=f_tran, normalization_params=(0, 1))
                    col = tf.feature_column.numeric_column(feature,
                                                           shape=(1,),
                                                           default_value=0,
                                                           dtype=tf.float32)
                                            #               normalizer_fn=normalizer_fn)
             #       col = tf.feature_column.indicator_column(col)
                    wide_columns.append(col)
                    wide_dim += 1
        return wide_columns

    def gbdt_input(self):
        """
        将特征列处理后的数据转化为array输出
        :return:
            process_data:训练或预估数据集; type:array
            label:数据集对应的标签; type:array
        """
        # Build the input pipeline once so features and labels stay aligned
        # (calling input_fn() twice would create two independent iterators).
        features, label_element = self.input_fn()
        tensor = tf.feature_column.input_layer(features, self.feat_column())
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.tables_initializer())
            while True:
                try:
                    # Fetch both in a single run so they come from the same batch.
                    process_data, label = sess.run([tensor, label_element])
                    yield process_data, label
                except tf.errors.OutOfRangeError:
                    break
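A minimal sketch of feeding the batches emitted by gbdt_input into a scikit-learn model (the path and the GradientBoostingClassifier settings are illustrative):

import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

tf_data = TF_Data('/path/to/train_data.csv')   # hypothetical path

# Collect the dense batches produced by the feature columns into one matrix.
X_parts, y_parts = [], []
for process_data, label in tf_data.gbdt_input():
    X_parts.append(process_data)
    y_parts.append(label)
X = np.concatenate(X_parts, axis=0)
y = np.concatenate(y_parts, axis=0)

grd = GradientBoostingClassifier(n_estimators=50, max_depth=4)  # illustrative settings
grd.fit(X, y)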