def pred_input_fn(csv_data):
    """Prediction input fn for a single sample, used by the serving client."""
    conf = Config()
    feature = conf.get_feature_name()
    feature_unused = conf.get_feature_name('unused')
    feature_conf = conf.read_feature_conf()
    csv_default = column_to_dtype(feature, feature_conf)
    csv_default.pop('label')
    feature_dict = {}
    for idx, f in enumerate(csv_default.keys()):
        if f in feature_unused:
            continue
        if csv_default[f] == tf.string:
            feature_dict[f] = _bytes_feature(csv_data[idx])
        else:
            feature_dict[f] = _float_feature(float(csv_data[idx]))
    return feature_dict
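# A minimal usage sketch, assuming the _bytes_feature/_float_feature helpers
# wrap values into tf.train.Feature protos (as the function above implies).
# The sample row below is hypothetical; the resulting feature_dict can be
# packed into a tf.train.Example for a serving request.
import tensorflow as tf

sample_row = ['u123', '3.5', 'action']    # one CSV record, already split
feature_dict = pred_input_fn(sample_row)
example = tf.train.Example(
    features=tf.train.Features(feature=feature_dict))
serialized = example.SerializeToString()  # bytes payload for the model server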
def pred_input_fn(csv_data):
    """Prediction input fn for a single sample, used by the serving client.

    Variant that takes the CSV column defaults from TF_Data instead of
    column_to_dtype.
    """
    conf = Config()
    feature_conf = conf.read_feature_conf()[1]  # names of the used features
    csv_default = TF_Data('/home/zhangqifan/data/part_0.csv')._column_to_csv_defaults()
    csv_default.pop('label')
    feature_dict = {}
    for idx, f in enumerate(csv_default.keys()):
        if f in feature_conf:
            if csv_default[f] == ['']:  # string column (default is [''])
                feature_dict[f] = _bytes_feature(csv_data[idx])
            else:
                feature_dict[f] = _float_feature(float(csv_data[idx]))
    return feature_dict
def wenqi_pred_input_fn(csv_data):
    """Prediction input fn for a single sample, used by the serving client.

    String features are replicated FLAGS.num_tests times so the request
    carries a batch of identical examples.
    """
    conf = Config()
    feature = conf.get_feature_name()
    feature_unused = conf.get_feature_name('unused')
    feature_conf = conf.read_feature_conf()
    csv_default = column_to_dtype(feature, feature_conf)
    csv_default.pop('label')
    feature_dict = {}
    for idx, f in enumerate(csv_default.keys()):
        if f in feature_unused:
            continue
        if csv_default[f] == tf.string:
            csv_data_list = [csv_data[idx] for _ in range(FLAGS.num_tests)]
            feature_dict[f] = _bytes_feature(csv_data_list)
        elif csv_default[f] in (tf.int32, tf.int64):
            feature_dict[f] = _int_feature(int(csv_data[idx]))
        else:
            feature_dict[f] = _float_feature(float(csv_data[idx]))
    return feature_dict
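# The _bytes_feature/_float_feature/_int_feature helpers referenced above are
# not defined in this file; a minimal sketch of what they are assumed to look
# like (standard tf.train.Feature wrappers, accepting scalars or lists):
import tensorflow as tf

def _bytes_feature(value):
    value = value if isinstance(value, list) else [value]
    value = [v.encode() if isinstance(v, str) else v for v in value]
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))

def _float_feature(value):
    value = value if isinstance(value, list) else [value]
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))

def _int_feature(value):
    value = value if isinstance(value, list) else [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))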
class GBDT_spr(object):
    """GBDT_spr class.

    Trains the GBDT model and generates discrete features (one-hot encoded
    leaf indices) for the downstream LR model.
    """

    def __init__(self, data_file):
        self._data_file = data_file
        self._DataSet = DataSet(self._data_file)
        self._conf = Config()
        self.dataset = self._DataSet.input_fn()
        self.batch_dataset = self._DataSet.iter_minibatches()
        # Note: this rebinds the attribute name to the method's result.
        self._feature_colums = self._feature_colums()
        self.gbdt_conf = self._conf.read_model_conf()['gbdt_conf']
        self.model_conf = self._conf.read_model_conf()['model_conf']

    def _feature_colums(self):
        """Build the feature columns.

        :return: gbdt_colums, type: list
        """
        gbdt_colums = []
        feature_conf_dic = self._conf.read_feature_conf()[0]
        for feature, conf in feature_conf_dic.items():
            f_type, f_tran = conf["type"], conf["transform"]
            if f_type == 'category':
                if f_tran == 'multivalue':
                    gbdt_colums.append((feature, multivalue()))
                if f_tran == 'one_hot':
                    gbdt_colums.append((feature, one_hot()))
            else:
                gbdt_colums.append(([feature], min_max()))
        return gbdt_colums

    def gbdt_model(self, mode):
        """Train the GBDT model and generate discrete features.

        :param mode: 'train' or 'pred'
        :return: lr_feat: discrete features generated by the GBDT;
                 y: labels of the corresponding data
        """
        mapper = DataFrameMapper(self._feature_colums, sparse=True)
        if mode == 'train':
            X = mapper.fit_transform(self.dataset)
            y = list(self.dataset['label'])
            grd = GradientBoostingClassifier(
                n_estimators=int(self.gbdt_conf['n_estimators']),
                learning_rate=float(self.gbdt_conf['learning_rate']),
                min_samples_leaf=int(self.gbdt_conf['min_samples_leaf']),
                max_depth=int(self.gbdt_conf['max_depth']),
                max_leaf_nodes=int(self.gbdt_conf['max_leaf_nodes']),
                min_samples_split=int(self.gbdt_conf['min_samples_split']))
            if self.model_conf['batch_size'] == '0':
                # Full-batch training: fit once on the whole dataset.
                grd.fit(X, y)
                joblib.dump(grd, os.path.join(MODEL_DIR, "gbdt_model.m"))
                new_feature = grd.apply(X)
                new_feature = new_feature.reshape(
                    -1, int(self.gbdt_conf['n_estimators']))
                enc = OneHotEncoder()
                enc.fit(new_feature)
                lr_feat = np.array(enc.transform(new_feature).toarray())
            else:
                # Mini-batch training: refit on each batch. Note that fitting
                # a fresh OneHotEncoder per batch can yield inconsistent
                # feature widths across batches.
                for i, dataset in enumerate(self.batch_dataset):
                    batch_X = mapper.fit_transform(dataset)
                    batch_y = list(dataset['label'])
                    grd.fit(batch_X, batch_y)
                    new_feature = grd.apply(batch_X)
                    new_feature = new_feature.reshape(
                        -1, int(self.gbdt_conf['n_estimators']))
                    enc = OneHotEncoder()
                    enc.fit(new_feature)
                    new_feature2 = np.array(
                        enc.transform(new_feature).toarray())
                    if i == 0:
                        lr_feat = new_feature2
                    else:
                        lr_feat = np.concatenate([lr_feat, new_feature2],
                                                 axis=0)
                joblib.dump(grd, os.path.join(MODEL_DIR, "gbdt_model.m"))
        else:
            X = mapper.fit_transform(self.dataset)
            y = list(self.dataset['label'])
            grd = joblib.load(os.path.join(MODEL_DIR, "gbdt_model.m"))
            new_feature = grd.apply(X)
            new_feature = new_feature.reshape(
                -1, int(self.gbdt_conf['n_estimators']))
            enc = OneHotEncoder()
            enc.fit(new_feature)
            lr_feat = np.array(enc.transform(new_feature).toarray())
        return lr_feat, y
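# A minimal usage sketch for GBDT_spr. The file names and the downstream
# LogisticRegression step are illustrative assumptions, not part of this repo;
# they show the classic GBDT + LR stacking the class is built for.
from sklearn.linear_model import LogisticRegression

gbdt = GBDT_spr('train.csv')                 # hypothetical training file
lr_feat, y = gbdt.gbdt_model('train')        # one-hot leaf indices + labels
lr = LogisticRegression()
lr.fit(lr_feat, y)

test_feat, test_y = GBDT_spr('test.csv').gbdt_model('pred')
print(lr.score(test_feat, test_y))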
class DataSet(object):
    """DataSet class: preprocesses the input data."""

    def __init__(self, data_file):
        self._conf = Config()
        self._data_file = data_file
        self._feature_conf_dic = self._conf.read_feature_conf()[0]
        self._feature_used = self._conf.read_feature_conf()[1]
        self._all_features = self._conf.read_schema_conf()
        self.model_conf = self._conf.read_model_conf()['model_conf']
        self._csv_defaults = self._column_to_csv_defaults()

    def _column_to_csv_defaults(self):
        """Define the input dtypes and collect the column names.

        :return: all_columns: name of every column, type: list
                 csv_defaults: default dtype per column,
                 {'feature name': dtype, ...}
        """
        features = []
        for i in range(1, len(self._all_features) + 1):
            features.append(self._all_features[str(i)])
        all_columns = ['label'] + features
        # Builtin types instead of the removed np.int/np.str/np.float aliases.
        csv_defaults = {}
        csv_defaults['label'] = int
        for f in self._all_features.values():
            if f in self._feature_used:
                conf = self._feature_conf_dic[f]
                if conf['type'] == 'category':
                    if conf['transform'] == 'identity':
                        csv_defaults[f] = int
                    else:
                        csv_defaults[f] = str
                else:
                    csv_defaults[f] = float
            else:
                csv_defaults[f] = str
        return all_columns, csv_defaults

    def iter_minibatches(self):
        """Iterate over a file stream (e.g. a large file), yielding
        batch_size rows at a time, converted to a DataFrame.
        """
        cur_line_num = 0
        dataset = []
        csvfile = open(self._data_file, 'rt', encoding="utf-8")
        reader = csv.reader(csvfile, delimiter=' ')
        all_columns, csv_defaults = self._csv_defaults
        for line in reader:
            dataset.append(line)
            cur_line_num += 1
            if cur_line_num >= int(self.model_conf['batch_size']):
                dataset = pd.DataFrame(dataset, columns=all_columns)
                dataset = dataset.astype(csv_defaults)
                yield dataset
                dataset = []
                cur_line_num = 0
        if dataset:  # flush the final, possibly smaller batch
            dataset = pd.DataFrame(dataset, columns=all_columns)
            dataset = dataset.astype(csv_defaults)
            yield dataset
        csvfile.close()

    def input_fn(self):
        """Read the csv file into a DataFrame and fill NaN values.

        :return: dataset
        """
        all_columns, csv_defaults = self._csv_defaults
        dataset = pd.read_csv(self._data_file, sep=' ',
                              names=all_columns, dtype=csv_defaults)
        dataset = dataset.fillna('-')
        return dataset
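# Minimal usage sketch for the mini-batch iterator (the path is hypothetical;
# assumes a space-delimited file and a batch_size entry in model_conf, as the
# class above expects):
ds = DataSet('/path/to/train.csv')
for batch in ds.iter_minibatches():
    print(batch.shape)  # (batch_size, n_columns) per iteration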
class TF_Data(object):

    def __init__(self, data_file):
        self._conf = Config()
        self._data_file = data_file
        self._feature_conf_dic = self._conf.read_feature_conf()[0]
        self._feature_used = self._conf.read_feature_conf()[1]
        self._all_features = self._conf.read_schema_conf()
        self.model_conf = self._conf.read_model_conf()['model_conf']
        self._csv_defaults = self._column_to_csv_defaults()

    def _column_to_csv_defaults(self):
        """Define the default value (and hence dtype) of every csv column.

        :return: OrderedDict {'feature name': [''], ...}
        """
        csv_defaults = OrderedDict()
        csv_defaults['label'] = [0]
        for f in self._all_features.values():
            if f == 'label':
                csv_defaults['label'] = [0]
            elif f in self._feature_used:
                conf = self._feature_conf_dic[f]
                if conf['type'] == 'category':
                    if conf['transform'] == 'identity':
                        csv_defaults[f] = [0]
                    else:
                        csv_defaults[f] = ['']
                else:
                    csv_defaults[f] = [0.0]
            else:
                csv_defaults[f] = ['']
        return csv_defaults

    def _parse_csv(self, field_delim='&', na_value='-'):
        """Build the csv parsing function.

        :param field_delim: csv field delimiter
        :param na_value: value to replace with the csv defaults
        :return: decode_csv, which maps a line to
                 ({feature: Tensor, ...}, label)
        """
        csv_defaults = self._csv_defaults

        def decode_csv(value):
            parsed_line = tf.decode_csv(value,
                                        record_defaults=list(csv_defaults.values()),
                                        field_delim=field_delim,
                                        na_value=na_value)
            features = dict(zip(self._csv_defaults.keys(), parsed_line))
            label = None
            for f in self._all_features.values():
                if f != 'label':
                    if f not in self._feature_used:
                        features.pop(f)
                else:
                    label = features.pop('label')
            return features, label

        return decode_csv

    def input_fn(self, mode):
        """Build the dataset tensors.

        :return: the next-element op of a one-shot iterator
        """
        dataset = tf.data.TextLineDataset(self._data_file)
        dataset = dataset.map(self._parse_csv())  # Decode each line
        # Repeat and batch the examples.
        if mode == 'train':
            dataset = dataset.repeat(10)
        padding_dic = {k: () for k in self._feature_used}
        padded_shapes = (padding_dic, ())
        dataset = dataset.padded_batch(int(self.model_conf['batch_size']),
                                       padded_shapes=padded_shapes)
        # Return the read end of the pipeline.
        return dataset.make_one_shot_iterator().get_next()
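# Minimal usage sketch (TF 1.x session API, matching the code above; the data
# path is hypothetical):
features, label = TF_Data('/path/to/train.csv').input_fn('train')
with tf.Session() as sess:
    try:
        while True:
            batch_features, batch_label = sess.run([features, label])
    except tf.errors.OutOfRangeError:
        pass  # end of the repeated epochs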
# fix ImportError: No module named lib.*
import sys

import xgb_model_zzr
import xgb2tensorflow

conf = Config()
train_conf = conf.train
num_parallel_calls = train_conf["num_parallel_calls"]
shuffle_buffer_size = train_conf["num_examples"]
train_epochs = train_conf["train_epochs"]
use_weight = False

feature = conf.get_feature_name()                 # all features
feature_used = conf.get_feature_name('used')      # used features
feature_unused = conf.get_feature_name('unused')  # unused features
feature_conf = conf.read_feature_conf()           # feature conf dict

# 31 columns (id plus vars0..vars29) and the label, all defaulting to 0.0.
csv_defaults_values = [0.0] * 31 + [0.0]
feature_name = [
    "id", "vars0", "vars1", "vars2", "vars3", "vars4", "vars5", "vars6",
    "vars7", "vars8", "vars9", "vars10", "vars11", "vars12", "vars13",
    "vars14", "vars15", "vars16", "vars17", "vars18", "vars19", "vars20",
    "vars21", "vars22", "vars23", "vars24", "vars25", "vars26", "vars27",
    "vars28", "vars29", "label"
]
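# Minimal sketch of how these constants would be consumed: pairing the column
# names with their defaults for tf.decode_csv. This decode_line helper is
# illustrative, not from the repo.
import tensorflow as tf

record_defaults = [[v] for v in csv_defaults_values]

def decode_line(line):
    fields = tf.decode_csv(line, record_defaults=record_defaults)
    columns = dict(zip(feature_name, fields))
    label = columns.pop('label')
    columns.pop('id')  # the id column is not a model feature
    return columns, label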
class TF_Data(object):

    def __init__(self, data_file):
        self._conf = Config()
        self._data_file = data_file
        self._feature_conf_dic = self._conf.read_feature_conf()[0]
        self._feature_used = self._conf.read_feature_conf()[1]
        self._all_features = self._conf.read_schema_conf()
        self.model_conf = self._conf.read_model_conf()['model_conf']
        self._csv_defaults = self._column_to_csv_defaults()

    def _normalizer_fn_builder(self, scaler, normalization_params):
        """normalizer_fn builder"""
        if scaler == 'min_max':
            return lambda x: (x - normalization_params[0]) / (
                max(normalization_params[1] - normalization_params[0], 0.001))
        elif scaler == 'standard':
            return lambda x: (x - normalization_params[0]) / normalization_params[1]
        else:
            return lambda x: tf.log(x)

    def _column_to_csv_defaults(self):
        """Define the default value (and hence dtype) of every csv column.

        :return: OrderedDict {'feature name': [''], ...}
        """
        csv_defaults = OrderedDict()
        csv_defaults['label'] = [0]
        for f in self._all_features.values():
            if f in self._feature_used:
                conf = self._feature_conf_dic[f]
                if conf['type'] == 'category':
                    if conf['transform'] == 'identity':
                        csv_defaults[f] = [0]
                    else:
                        csv_defaults[f] = ['']
                else:
                    csv_defaults[f] = [0.0]
            else:
                csv_defaults[f] = ['']
        return csv_defaults

    def _parse_csv(self, field_delim=' ', na_value='-'):
        """Build the csv parsing function.

        :param field_delim: csv field delimiter
        :param na_value: value to replace with the csv defaults
        :return: decode_csv, which maps a line to
                 ({feature: Tensor, ...}, label)
        """
        csv_defaults = self._csv_defaults

        def decode_csv(value):
            parsed_line = tf.decode_csv(value,
                                        record_defaults=list(csv_defaults.values()),
                                        field_delim=field_delim,
                                        na_value=na_value)
            features = dict(zip(self._csv_defaults.keys(), parsed_line))
            # Keep 'label' here so it can be popped below.
            for f in self._all_features.values():
                if f != 'label' and f not in self._feature_used:
                    features.pop(f)
            # Multivalue columns are comma-separated; split them into
            # variable-length string values.
            for f, tensor in features.items():
                if f in ('tag', 'main_actor'):
                    features[f] = tf.string_split([tensor], ',').values
            label = features.pop('label')
            return features, label

        return decode_csv

    def input_fn(self):
        """Build the dataset tensors.

        :return: the next-element op of a one-shot iterator
        """
        dataset = tf.data.TextLineDataset(self._data_file)
        dataset = dataset.map(self._parse_csv())  # Decode each line
        # dataset = dataset.shuffle(10).repeat(1)
        # Variable-length 'tag' values are padded to the batch maximum.
        padding_dic = {k: () for k in self._feature_used}
        padding_dic['tag'] = [None]
        # padding_dic['main_actor'] = [None]
        padded_shapes = (padding_dic, ())
        dataset = dataset.padded_batch(int(self.model_conf['batch_size']),
                                       padded_shapes=padded_shapes)
        # Return the read end of the pipeline.
        return dataset.make_one_shot_iterator().get_next()

    def feat_column(self):
        """Build the feature columns.

        :return: wide_columns
        """
        wide_columns = []
        wide_dim = 0
        for feature, conf in self._feature_conf_dic.items():
            f_type, f_tran, f_param = conf["type"], conf["transform"], conf["parameter"]
            if feature in ('tag', 'main_actor'):
                col = tf.feature_column.categorical_column_with_vocabulary_file(
                    feature, vocabulary_file=f_param)
                col = tf.feature_column.indicator_column(col)
                wide_columns.append(col)
                wide_dim += int(conf["dim"])
            elif f_type == 'category':
                if f_tran == 'hash_bucket':
                    hash_bucket_size = int(f_param)
                    col = tf.feature_column.categorical_column_with_hash_bucket(
                        feature,
                        hash_bucket_size=hash_bucket_size,
                        dtype=tf.string)
                    col = tf.feature_column.indicator_column(col)
                    wide_columns.append(col)
                    wide_dim += hash_bucket_size
                elif f_tran == 'vocab':
                    col = tf.feature_column.categorical_column_with_vocabulary_list(
                        feature,
                        vocabulary_list=list(map(str, f_param)),
                        dtype=None,
                        default_value=-1,
                        num_oov_buckets=0)
                    col = tf.feature_column.indicator_column(col)
                    wide_columns.append(col)
                    wide_dim += len(f_param)
                elif f_tran == 'identity':
                    num_buckets = int(f_param)
                    col = tf.feature_column.categorical_column_with_identity(
                        feature, num_buckets=num_buckets, default_value=0)
                    col = tf.feature_column.indicator_column(col)
                    wide_columns.append(col)
                    wide_dim += num_buckets
            else:
                normalizer_fn = self._normalizer_fn_builder(
                    scaler=f_tran, normalization_params=(0, 1))
                col = tf.feature_column.numeric_column(
                    feature, shape=(1,), default_value=0, dtype=tf.float32)
                    # normalizer_fn=normalizer_fn)
                wide_columns.append(col)
                wide_dim += 1
        return wide_columns

    def gbdt_input(self):
        """Convert the feature-column output into arrays.

        :return: process_data: training or prediction data, type: array
                 label: labels of the dataset, type: array
        """
        # Build the graph once so features and labels come from the same
        # iterator; calling input_fn() twice would create two independent
        # iterators and misalign the batches.
        features, label_element = self.input_fn()
        tensor = tf.feature_column.input_layer(features, self.feat_column())
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.tables_initializer())
            while True:
                try:
                    # Run both ops in one call so each batch's data and
                    # labels stay aligned.
                    process_data, label = sess.run([tensor, label_element])
                    yield process_data, label
                except tf.errors.OutOfRangeError:
                    break
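# Minimal usage sketch: stream the dense batches from gbdt_input into an
# sklearn model. The path and the vstack/hstack accumulation are illustrative
# assumptions, not part of this repo.
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

batches = TF_Data('/path/to/train.csv').gbdt_input()
X_parts, y_parts = zip(*batches)
X, y = np.vstack(X_parts), np.hstack(y_parts)
grd = GradientBoostingClassifier(n_estimators=30)
grd.fit(X, y)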