def main():
    CONFIG = Config()
    model_conf = CONFIG.read_model_conf()['model_conf']
    traindata_list = FileListGenerator(model_conf['data_dir_train']).generate()
    testdata_list = FileListGenerator(model_conf['data_dir_pred']).generate()
    if model_conf['mode'] == 'train':
        traindata = next(traindata_list)
        tf.logging.info('Start training {}'.format(traindata))
        t0 = time.time()
        train1 = LR(traindata, mode='train').lr_model()
        t1 = time.time()
        tf.logging.info('Finish training {}, took {} mins'.format(
            traindata, (t1 - t0) / 60))
    else:
        testdata = next(testdata_list)
        tf.logging.info('Start evaluation {}'.format(testdata))
        t0 = time.time()
        Accuracy, AUC = LR(testdata, mode='pred').lr_model()
        t1 = time.time()
        tf.logging.info('Finish evaluation {}, took {} mins'.format(
            testdata, (t1 - t0) / 60))
        print("LR_Accuracy: %f" % Accuracy)
        print("LR_AUC: %f" % AUC)
def main():
    CONFIG = Config()
    model_conf = CONFIG.read_model_conf()['model_conf']
    if model_conf['mode'] == 'train':
        train1 = LR(model_conf['data_dir_train'], mode='train').lr_model()
    else:
        Accuracy, AUC = LR(model_conf['data_dir_pred'], mode='pred').lr_model()
        print("LR_Accuracy: %f" % Accuracy)
        print("LR_AUC: %f" % AUC)
def main():
    CONFIG = Config()
    model_conf = CONFIG.read_model_conf()['model_conf']
    traindata_list = FileListGenerator(model_conf['data_dir_train']).generate()
    testdata_list = FileListGenerator(model_conf['data_dir_pred']).generate()
    model = build_estimator()
    traindata = next(traindata_list)
    testdata = next(testdata_list)
    t0 = time.time()
    tf.logging.info('Start training {}'.format(traindata))
    model.train(input_fn=lambda: input_fn(traindata, 'train'),
                hooks=None,
                steps=None,
                max_steps=None,
                saving_listeners=None)
    t1 = time.time()
    tf.logging.info('Finish training {}, took {} mins'.format(
        traindata, (t1 - t0) / 60))
    tf.logging.info('Start evaluating {}'.format(testdata))
    t2 = time.time()
    results = model.evaluate(
        input_fn=lambda: input_fn(testdata, 'eval'),
        steps=None,  # Number of steps for which to evaluate the model.
        hooks=None,
        checkpoint_path=None,  # The latest checkpoint in model_dir is used.
        name=None)
    t3 = time.time()
    tf.logging.info('Finish evaluation {}, took {} mins'.format(
        testdata, (t3 - t2) / 60))
    # Display evaluation metrics.
    for key in sorted(results):
        print('{}: {}'.format(key, results[key]))
class LR(object):
    '''
    LR class
    Trains the LR model and runs prediction.
    '''

    def __init__(self, data_file, mode):
        self._conf = Config()
        self.lr_conf = self._conf.read_model_conf()['lr_conf']
        self._data_file = data_file
        self._mode = mode
        self._gbdt_spr = GBDT_spr(self._data_file)

    def lr_model(self):
        '''
        Trains the LR model ('train' mode) or evaluates it ('pred' mode).
        :return: accuracy and AUC in 'pred' mode
        '''
        if self._mode == 'train':
            gbdt_features, y_label = self._gbdt_spr.gbdt_model(self._mode)
            grd_lm = LogisticRegression(penalty=self.lr_conf['penalty'],
                                        solver=self.lr_conf['solver'],
                                        C=float(self.lr_conf['c']))
            grd_lm.fit(gbdt_features, y_label)
            joblib.dump(grd_lm, os.path.join(MODEL_DIR, "lr_model.m"))
        else:
            gbdt_features, y_label = self._gbdt_spr.gbdt_model(self._mode)
            grd_lm = joblib.load(os.path.join(MODEL_DIR, "lr_model.m"))
            y_pred_grd_lm = grd_lm.predict_proba(gbdt_features)[:, 1]
            pred_res = grd_lm.predict(gbdt_features)
            accuracy_score = metrics.accuracy_score(y_label, pred_res)
            fpr_grd_lm, tpr_grd_lm, _ = metrics.roc_curve(
                y_label, y_pred_grd_lm)
            roc_auc = metrics.auc(fpr_grd_lm, tpr_grd_lm)
            AUC_Score = metrics.roc_auc_score(y_label, y_pred_grd_lm)
            return accuracy_score, AUC_Score
def main():
    CONFIG = Config()
    model_conf = CONFIG.read_model_conf()['model_conf']
    model = build_estimator()
    predictions = model.predict(
        input_fn=lambda: input_fn(
            '/home/leadtek/zhangqifan/reflux_user_pro/data/pred_data/all_data.csv',
            'pred'),
        predict_keys=None,
        hooks=None,
        checkpoint_path=None)  # None defaults to the latest checkpoint.
    res = []
    for pred_dict in predictions:  # dict{probabilities, classes, class_ids}
        opt = []
        class_id = pred_dict['class_ids'][0]
        opt.append(class_id)
        probability = pred_dict['probabilities']
        opt.append(probability[1])
        res.append(opt)
        # print('class_id:', class_id, 'probability:', probability)
    res_df = pd.DataFrame(res, columns=['class_id', 'probability'])
    x = res_df[res_df['class_id'].isin([1])]  # rows predicted as positive
    sample = pd.read_csv(
        "/home/leadtek/zhangqifan/reflux_user_pro/data/opt_all_data.csv",
        sep=' ')
    res_sample = pd.concat([sample, res_df], axis=1)
    res_sample.to_csv(r"/home/leadtek/zhangqifan/reflux_user_pro/res.csv",
                      header=True, index=False, sep=' ')
class GBDT_spr(object):
    '''
    GBDT_spr class
    Trains the GBDT model and generates discrete (leaf-index) features.
    '''

    def __init__(self, data_file):
        self._data_file = data_file
        self._DataSet = DataSet(self._data_file)
        self._conf = Config()
        self.dataset = self._DataSet.input_fn()
        self.batch_dataset = self._DataSet.iter_minibatches()
        self._feature_columns = self._feature_columns()
        self.gbdt_conf = self._conf.read_model_conf()['gbdt_conf']
        self.model_conf = self._conf.read_model_conf()['model_conf']

    def _feature_columns(self):
        '''
        Builds the feature-column transforms.
        :return: gbdt_columns, type: list
        '''
        gbdt_columns = []
        feature_conf_dic = self._conf.read_feature_conf()[0]
        for feature, conf in feature_conf_dic.items():
            f_type, f_tran = conf["type"], conf["transform"]
            if f_type == 'category':
                if f_tran == 'multivalue':
                    opt = (feature, multivalue())
                    gbdt_columns.append(opt)
                if f_tran == 'one_hot':
                    opt = (feature, one_hot())
                    gbdt_columns.append(opt)
            else:
                opt = ([feature], min_max())
                gbdt_columns.append(opt)
        return gbdt_columns

    def gbdt_model(self, mode):
        '''
        Trains the GBDT model and generates the discrete features.
        :param mode: 'train' or 'pred'
        :return: lr_feat: discrete features generated by the GBDT
                 y: labels of the corresponding data
        '''
        mapper = DataFrameMapper(self._feature_columns, sparse=True)
        if mode == 'train':
            X = mapper.fit_transform(self.dataset)
            y = list(self.dataset['label'])
            grd = GradientBoostingClassifier(
                n_estimators=int(self.gbdt_conf['n_estimators']),
                # random_state=int(self.gbdt_conf['random_state']),
                learning_rate=float(self.gbdt_conf['learning_rate']),
                # subsample=float(self.gbdt_conf['subsample']),
                min_samples_leaf=int(self.gbdt_conf['min_samples_leaf']),
                max_depth=int(self.gbdt_conf['max_depth']),
                max_leaf_nodes=int(self.gbdt_conf['max_leaf_nodes']),
                min_samples_split=int(self.gbdt_conf['min_samples_split']))
            if self.model_conf['batch_size'] == '0':
                grd.fit(X, y)
                joblib.dump(grd, os.path.join(MODEL_DIR, "gbdt_model.m"))
                # Map each sample to the leaf it falls into in every tree
                # (one column per tree), then one-hot encode the leaf indices.
                new_feature = grd.apply(X)
                new_feature = new_feature.reshape(
                    -1, int(self.gbdt_conf['n_estimators']))
                enc = OneHotEncoder()
                enc.fit(new_feature)
                lr_feat = np.array(enc.transform(new_feature).toarray())
            else:
                y = []  # accumulate labels so they stay aligned with lr_feat
                for i, dataset in enumerate(self.batch_dataset):
                    batch_X = mapper.fit_transform(dataset)
                    batch_y = list(dataset['label'])
                    # Note: fit() retrains from scratch on every batch.
                    grd.fit(batch_X, batch_y)
                    y.extend(batch_y)
                    new_feature = grd.apply(batch_X)
                    new_feature = new_feature.reshape(
                        -1, int(self.gbdt_conf['n_estimators']))
                    enc = OneHotEncoder()
                    enc.fit(new_feature)
                    new_feature2 = np.array(
                        enc.transform(new_feature).toarray())
                    if i == 0:
                        lr_feat = new_feature2
                    else:
                        lr_feat = np.concatenate([lr_feat, new_feature2],
                                                 axis=0)
                joblib.dump(grd, os.path.join(MODEL_DIR, "gbdt_model.m"))
        else:
            X = mapper.fit_transform(self.dataset)
            y = list(self.dataset['label'])
            grd = joblib.load(os.path.join(MODEL_DIR, "gbdt_model.m"))
            new_feature = grd.apply(X)
            new_feature = new_feature.reshape(
                -1, int(self.gbdt_conf['n_estimators']))
            enc = OneHotEncoder()
            enc.fit(new_feature)
            lr_feat = np.array(enc.transform(new_feature).toarray())
        return lr_feat, y
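The heart of GBDT_spr is the grd.apply(X) → OneHotEncoder step. Here is a minimal, self-contained sketch of that leaf-index trick on synthetic data (all names below are illustrative, not part of the project code): each sample is mapped to the leaf it reaches in every tree, and those leaf indices become the one-hot features fed to the LR stage.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
gbdt = GradientBoostingClassifier(n_estimators=30, max_depth=3, random_state=0)
gbdt.fit(X, y)

# apply() returns the leaf index per sample per tree: shape (200, 30, 1)
# for binary classification, flattened here to (200, n_estimators).
leaf_idx = gbdt.apply(X).reshape(-1, 30)
enc = OneHotEncoder()
leaf_onehot = enc.fit_transform(leaf_idx)  # sparse (200, total distinct leaves)
print(leaf_onehot.shape)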
class LR(object):
    '''
    LR class
    Trains the LR model and runs prediction.
    '''

    def __init__(self, data_file, mode):
        self._conf = Config()
        self._data_file = data_file
        self._Tf_Data = TF_Data(self._data_file)
        self.dataset_train = self._Tf_Data.gbdt_input()
        self.lr_conf = self._conf.read_model_conf()['lr_conf']
        self._mode = mode
        self._gbdt_spr = GBDT_spr(self._data_file).gbdt_model(self._mode)

    def lr_model(self):
        '''
        Trains the LR model ('train' mode) or evaluates it ('pred' mode).
        :return: accuracy and AUC in 'pred' mode
        '''
        if self._mode == 'train':
            # SGDClassifier with log loss is a logistic regression that
            # supports incremental training via partial_fit.
            grd_lm = SGDClassifier(penalty=self.lr_conf['penalty'],
                                   loss='log',
                                   warm_start=True)
            i = 0
            while True:
                try:
                    dataset = next(self._gbdt_spr)
                    batch_X = dataset[0]
                    batch_y = dataset[1]
                    print('start training LR epochs_%d' % i)
                    grd_lm = grd_lm.partial_fit(batch_X, batch_y,
                                                classes=[0, 1])
                    i += 1
                    del dataset
                    del batch_y
                    del batch_X
                    gc.collect()
                except StopIteration as e:
                    print('Generator return value:', e.value)
                    break
            joblib.dump(grd_lm, os.path.join(MODEL_DIR, "lr_model.m"))
        else:
            y_all_label = []
            y_all_pred_grd_lm = []
            pred_all_res = []
            grd_lm = joblib.load(os.path.join(MODEL_DIR, "lr_model.m"))
            while True:
                try:
                    dataset = next(self._gbdt_spr)
                    gbdt_features = dataset[0]
                    y_label = dataset[1]
                    y_pred_grd_lm = grd_lm.predict_proba(gbdt_features)[:, 1]
                    pred_res = grd_lm.predict(gbdt_features)
                    y_all_label.extend(y_label)
                    y_all_pred_grd_lm.extend(y_pred_grd_lm)
                    pred_all_res.extend(pred_res)
                    del dataset
                    del gbdt_features
                    gc.collect()
                except StopIteration as e:
                    print('Generator return value:', e.value)
                    break
            accuracy_score = metrics.accuracy_score(y_all_label, pred_all_res)
            fpr_grd_lm, tpr_grd_lm, _ = metrics.roc_curve(
                y_all_label, y_all_pred_grd_lm)
            roc_auc = metrics.auc(fpr_grd_lm, tpr_grd_lm)
            AUC_Score = metrics.roc_auc_score(y_all_label, y_all_pred_grd_lm)
            return accuracy_score, AUC_Score
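A minimal sketch of the incremental-training pattern lr_model() relies on, with synthetic batches (the names and shapes are illustrative). loss='log' matches the sklearn versions this code targets; newer sklearn releases spell it 'log_loss'.

import numpy as np
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(loss='log', penalty='l2')
rng = np.random.RandomState(0)
for _ in range(5):  # five synthetic minibatches
    batch_X = rng.randn(64, 20)
    batch_y = rng.randint(0, 2, size=64)
    # classes must be passed (at least on the first call) so the model
    # knows the full label set before seeing all batches.
    clf.partial_fit(batch_X, batch_y, classes=[0, 1])

print(clf.predict_proba(rng.randn(3, 20))[:, 1])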
class GBDT_spr(object):
    '''
    GBDT_spr class
    Trains the GBDT model and generates discrete (leaf-index) features.
    '''

    def __init__(self, data_file):
        self._data_file = data_file
        self._Tf_Data = TF_Data(self._data_file)
        self._conf = Config()
        self.dataset_train = self._Tf_Data.gbdt_input()
        self.dataset_trans = self._Tf_Data.gbdt_input()
        self.dataset_pred = self._Tf_Data.gbdt_input()
        self.gbdt_conf = self._conf.read_model_conf()['gbdt_conf']
        self.model_conf = self._conf.read_model_conf()['model_conf']

    def gbdt_model(self, mode):
        '''
        Trains the GBDT model and generates the discrete features.
        :param mode: 'train' or 'pred'
        :return: yields (transformed_training_matrix, batch_y): the one-hot
                 leaf features generated by the GBDT and the batch labels
        '''
        params = {
            'task': 'train',
            'boosting_type': self.gbdt_conf['boosting_type'],
            'objective': 'binary',
            'metric': {'binary_logloss'},
            'num_leaves': int(self.gbdt_conf['num_leaves']),
            # 'num_trees': 60,
            'min_data_in_leaf': int(self.gbdt_conf['min_data_in_leaf']),
            'learning_rate': float(self.gbdt_conf['learning_rate']),
            'feature_fraction': float(self.gbdt_conf['feature_fraction']),
            'bagging_fraction': float(self.gbdt_conf['bagging_fraction']),
            # 'bagging_freq': 5,
            'verbose': -1
        }
        if mode == 'train':
            if self.model_conf['batch_size'] == '0':
                print('TODO')
            else:
                i = 0
                while True:
                    try:
                        dataset = next(self.dataset_train)
                        batch_X = dataset[0]
                        batch_y = dataset[1]
                        lgb_train = lgb.Dataset(batch_X, batch_y)
                        if i == 0:
                            gbm = lgb.train(params,
                                            lgb_train,
                                            valid_sets=lgb_train,
                                            keep_training_booster=True)
                            i += 1
                        else:
                            # Continue training from the previously saved booster.
                            gbm = lgb.train(
                                params,
                                lgb_train,
                                valid_sets=lgb_train,
                                keep_training_booster=True,
                                init_model='/home/zhangqifan/LightGBM_model.txt'
                            )
                            i += 1
                        gbm.save_model('/home/zhangqifan/LightGBM_model.txt')
                        del dataset
                        del batch_y
                        del batch_X
                        gc.collect()
                    except StopIteration:
                        break
                joblib.dump(gbm, os.path.join(MODEL_DIR, "gbdt_model.m"))
            while True:
                try:
                    dataset = next(self.dataset_trans)
                    batch_X = dataset[0]
                    batch_y = dataset[1]
                    gbm_trans = joblib.load(
                        os.path.join(MODEL_DIR, "gbdt_model.m"))
                    # pred_leaf=True returns, per sample, the index of the
                    # leaf reached in each tree: shape N x num_trees.
                    y_pred = gbm_trans.predict(batch_X, pred_leaf=True)
                    transformed_training_matrix = np.zeros(
                        [
                            len(y_pred),
                            len(y_pred[0]) * int(self.gbdt_conf['num_leaves'])
                        ],
                        dtype=np.int64)  # N x num_trees x num_leaves
                    for m in range(0, len(y_pred)):
                        # temp holds the global index of the leaf each tree
                        # routed the sample to: tree offsets 0, 64, 128, ...,
                        # 6336 for 100 trees with 64 leaves each, plus the
                        # within-tree leaf index.
                        temp = np.arange(len(y_pred[0])) * int(
                            self.gbdt_conf['num_leaves']) + np.array(
                                y_pred[m])
                        # Build the one-hot training matrix.
                        transformed_training_matrix[m][temp] += 1
                    del dataset
                    del batch_X
                    gc.collect()
                    yield transformed_training_matrix, batch_y
                except StopIteration:
                    break
        else:
            while True:
                try:
                    dataset = next(self.dataset_pred)
                    gbm_trans = joblib.load(
                        os.path.join(MODEL_DIR, "gbdt_model.m"))
                    batch_X = dataset[0]
                    batch_y = dataset[1]
                    y_pred = gbm_trans.predict(batch_X, pred_leaf=True)
                    transformed_training_matrix = np.zeros(
                        [
                            len(y_pred),
                            len(y_pred[0]) * int(self.gbdt_conf['num_leaves'])
                        ],
                        dtype=np.int64)  # N x num_trees x num_leaves
                    for m in range(0, len(y_pred)):
                        # Same global leaf-index construction as above.
                        temp = np.arange(len(y_pred[0])) * int(
                            self.gbdt_conf['num_leaves']) + np.array(y_pred[m])
                        # Build the one-hot matrix.
                        transformed_training_matrix[m][temp] += 1
                    yield transformed_training_matrix, batch_y
                except StopIteration:
                    break
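A tiny worked example of the index arithmetic in those loops (illustrative numbers only): with 3 trees and 4 leaves per tree, a sample landing in leaves [2, 0, 3] gets global one-hot positions 0*4+2, 1*4+0, 2*4+3 = [2, 4, 11].

import numpy as np

num_trees, num_leaves = 3, 4
leaf_per_tree = np.array([2, 0, 3])  # pred_leaf=True output for one sample
temp = np.arange(num_trees) * num_leaves + leaf_per_tree
row = np.zeros(num_trees * num_leaves, dtype=np.int64)
row[temp] += 1
print(temp)  # [ 2  4 11]
print(row)   # [0 0 1 0 1 0 0 0 0 0 0 1]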
class DataSet(object):
    '''
    DataSet class
    Handles the input data.
    '''

    def __init__(self, data_file):
        self._conf = Config()
        self._data_file = data_file
        self._feature_conf_dic = self._conf.read_feature_conf()[0]
        self._feature_used = self._conf.read_feature_conf()[1]
        self._all_features = self._conf.read_schema_conf()
        self.model_conf = self._conf.read_model_conf()['model_conf']
        self._csv_defaults = self._column_to_csv_defaults()

    def _column_to_csv_defaults(self):
        '''
        Defines the input dtypes and collects the feature names.
        :return: all_columns: name of each data column; type: list
                 csv_defaults: default csv dtypes {'feature name': dtype, ...}
        '''
        features = []
        for i in range(1, len(self._all_features) + 1):
            features.append(self._all_features[str(i)])
        all_columns = ['label'] + features
        csv_defaults = {}
        # Built-in types instead of the np.int/np.str/np.float aliases,
        # which are removed in recent NumPy versions.
        csv_defaults['label'] = int
        for f in self._all_features.values():
            if f in self._feature_used:
                conf = self._feature_conf_dic[f]
                if conf['type'] == 'category':
                    if conf['transform'] == 'identity':
                        csv_defaults[f] = int
                    else:
                        csv_defaults[f] = str
                else:
                    csv_defaults[f] = float
            else:
                csv_defaults[f] = str
        return all_columns, csv_defaults

    def iter_minibatches(self):
        '''
        Generator: reads the file stream (e.g. a large file) and yields
        batch_size rows at a time, converted to a DataFrame.
        '''
        cur_line_num = 0
        dataset = []
        csvfile = open(self._data_file, 'rt', encoding="utf-8")
        reader = csv.reader(csvfile, delimiter=' ')
        all_columns, csv_defaults = self._csv_defaults
        for line in reader:
            dataset.append(line)
            cur_line_num += 1
            if cur_line_num >= int(self.model_conf['batch_size']):
                dataset = pd.DataFrame(dataset, columns=all_columns)
                dataset = dataset.astype(csv_defaults)
                yield dataset
                dataset = []
                cur_line_num = 0
        if dataset:  # yield the last, possibly smaller, batch
            dataset = pd.DataFrame(dataset, columns=all_columns)
            dataset = dataset.astype(csv_defaults)
            yield dataset
        csvfile.close()

    def input_fn(self):
        '''
        Reads the csv file into a DataFrame and fills NaN values.
        :return: dataset
        '''
        all_columns, csv_defaults = self._csv_defaults
        dataset = pd.read_csv(self._data_file,
                              sep=' ',
                              names=all_columns,
                              dtype=csv_defaults)
        dataset = dataset.fillna('-')
        return dataset
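How iter_minibatches() is meant to be consumed (a sketch; it assumes the feature/schema/model conf files are in place and the file path is illustrative):

ds = DataSet('train_data.csv')
for batch_df in ds.iter_minibatches():
    # Each yielded DataFrame holds at most batch_size rows, already cast
    # to the dtypes declared in _column_to_csv_defaults().
    print(batch_df.shape)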
class TF_Data(object):

    def __init__(self, data_file):
        self._conf = Config()
        self._data_file = data_file
        self._feature_conf_dic = self._conf.read_feature_conf()[0]
        self._feature_used = self._conf.read_feature_conf()[1]
        self._all_features = self._conf.read_schema_conf()
        self.model_conf = self._conf.read_model_conf()['model_conf']
        self._csv_defaults = self._column_to_csv_defaults()

    def _column_to_csv_defaults(self):
        """
        Defines the default dtype of each feature in the csv file.
        :return: OrderedDict {'feature name': [default], ...}
        """
        csv_defaults = OrderedDict()
        csv_defaults['label'] = [0]
        for f in self._all_features.values():
            if f == 'label':
                csv_defaults['label'] = [0]
            elif f in self._feature_used:
                conf = self._feature_conf_dic[f]
                if conf['type'] == 'category':
                    if conf['transform'] == 'identity':
                        csv_defaults[f] = [0]
                    else:
                        csv_defaults[f] = ['']
                else:
                    csv_defaults[f] = [0.0]
            else:
                csv_defaults[f] = ['']
        return csv_defaults

    def _parse_csv(self, field_delim='&', na_value='-'):
        """
        Builds the csv parsing function.
        :param field_delim: csv field delimiter
        :param na_value: values to replace with the csv defaults
        :return: feature dict: {feature: Tensor, ...}
        """
        csv_defaults = self._csv_defaults

        def decode_csv(value):
            parsed_line = tf.decode_csv(value,
                                        record_defaults=list(
                                            csv_defaults.values()),
                                        field_delim=field_delim,
                                        na_value=na_value)
            features = dict(zip(self._csv_defaults.keys(), parsed_line))
            label = None
            for f in self._all_features.values():
                if f != 'label':
                    # Drop features that are not used by the model.
                    if f not in self._feature_used:
                        features.pop(f)
                else:
                    label = features.pop('label')
            return features, label

        return decode_csv

    def input_fn(self, mode):
        """
        Builds the dataset (tensors).
        :return: generator
        """
        dataset = tf.data.TextLineDataset(self._data_file)
        dataset = dataset.map(self._parse_csv())  # Decode each line.
        # Shuffle, repeat, and batch the examples.
        if mode == 'train':
            dataset = dataset.repeat(10)
        padding_dic = {k: () for k in self._feature_used}
        padded_shapes = (padding_dic, ())
        dataset = dataset.padded_batch(int(self.model_conf['batch_size']),
                                       padded_shapes=padded_shapes)
        # Return the read end of the pipeline.
        return dataset.make_one_shot_iterator().get_next()
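A standalone sketch (TF 1.x API, synthetic data) of the padded_batch mechanics used by input_fn above: scalar features get shape (), while a variable-length field needs shape [None], as the next version of TF_Data does for 'tag'. After string_split, each example has a different number of tokens, and padded_batch pads them to the longest example in the batch (with '' for strings).

import tensorflow as tf

ds = tf.data.Dataset.from_tensor_slices(['a,b', 'c', 'd,e,f', 'g'])
ds = ds.map(lambda s: tf.string_split([s], ',').values)
ds = ds.padded_batch(2, padded_shapes=[None])
nxt = ds.make_one_shot_iterator().get_next()
with tf.Session() as sess:
    print(sess.run(nxt))  # [[b'a' b'b'], [b'c' b'']] -- padded with ''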
class TF_Data(object):

    def __init__(self, data_file):
        self._conf = Config()
        self._data_file = data_file
        self._feature_conf_dic = self._conf.read_feature_conf()[0]
        self._feature_used = self._conf.read_feature_conf()[1]
        self._all_features = self._conf.read_schema_conf()
        self.model_conf = self._conf.read_model_conf()['model_conf']
        self._csv_defaults = self._column_to_csv_defaults()

    def _normalizer_fn_builder(self, scaler, normalization_params):
        """normalizer_fn builder"""
        if scaler == 'min_max':
            return lambda x: (x - normalization_params[0]) / (
                max(normalization_params[1] - normalization_params[0], 0.001))
        elif scaler == 'standard':
            return lambda x: (
                x - normalization_params[0]) / normalization_params[1]
        else:
            return lambda x: tf.log(x)

    def _column_to_csv_defaults(self):
        """
        Defines the default dtype of each feature in the csv file.
        :return: OrderedDict {'feature name': [default], ...}
        """
        csv_defaults = OrderedDict()
        csv_defaults['label'] = [0]
        for f in self._all_features.values():
            if f in self._feature_used:
                conf = self._feature_conf_dic[f]
                if conf['type'] == 'category':
                    if conf['transform'] == 'identity':
                        csv_defaults[f] = [0]
                    else:
                        csv_defaults[f] = ['']
                else:
                    csv_defaults[f] = [0.0]
            else:
                csv_defaults[f] = ['']
        return csv_defaults

    def _parse_csv(self, field_delim=' ', na_value='-'):
        """
        Builds the csv parsing function.
        :param field_delim: csv field delimiter
        :param na_value: values to replace with the csv defaults
        :return: feature dict: {feature: Tensor, ...}
        """
        csv_defaults = self._csv_defaults

        def decode_csv(value):
            parsed_line = tf.decode_csv(value,
                                        record_defaults=list(
                                            csv_defaults.values()),
                                        field_delim=field_delim,
                                        na_value=na_value)
            features = dict(zip(self._csv_defaults.keys(), parsed_line))
            # Drop unused features; keep 'label' so it can be popped below.
            for f in self._all_features.values():
                if f != 'label' and f not in self._feature_used:
                    features.pop(f)
            # Split multivalue fields into variable-length token tensors.
            for f, tensor in features.items():
                if f == 'tag':
                    features[f] = tf.string_split([tensor], ',').values
                if f == 'main_actor':
                    features[f] = tf.string_split([tensor], ',').values
            label = features.pop('label')
            return features, label

        return decode_csv

    def input_fn(self):
        """
        Builds the dataset (tensors).
        :return: generator
        """
        dataset = tf.data.TextLineDataset(self._data_file)
        dataset = dataset.map(self._parse_csv())  # Decode each line.
        # Shuffle, repeat, and batch the examples.
        # dataset = dataset.shuffle(10).repeat(1)
        padding_dic = {k: () for k in self._feature_used}
        padding_dic['tag'] = [None]  # variable-length field, pad to batch max
        # padding_dic['main_actor'] = [None]
        padded_shapes = (padding_dic, ())
        dataset = dataset.padded_batch(int(self.model_conf['batch_size']),
                                       padded_shapes=padded_shapes)
        # Return the read end of the pipeline.
        return dataset.make_one_shot_iterator().get_next()

    def feat_column(self):
        """
        Builds the feature columns.
        :return: wide_columns
        """
        wide_columns = []
        wide_dim = 0
        for feature, conf in self._feature_conf_dic.items():
            f_type, f_tran, f_param = conf["type"], conf["transform"], conf["parameter"]
            if feature == 'tag' or feature == 'main_actor':
                col = tf.feature_column.categorical_column_with_vocabulary_file(
                    feature, vocabulary_file=f_param)
                col = tf.feature_column.indicator_column(col)
                wide_columns.append(col)
                wide_dim += int(conf["dim"])
            else:
                if f_type == 'category':
                    if f_tran == 'hash_bucket':
                        hash_bucket_size = int(f_param)
                        col = tf.feature_column.categorical_column_with_hash_bucket(
                            feature,
                            hash_bucket_size=hash_bucket_size,
                            dtype=tf.string)
                        col = tf.feature_column.indicator_column(col)
                        wide_columns.append(col)
                        wide_dim += hash_bucket_size
                    elif f_tran == 'vocab':
                        col = tf.feature_column.categorical_column_with_vocabulary_list(
                            feature,
                            vocabulary_list=list(map(str, f_param)),
                            dtype=None,
                            default_value=-1,
                            num_oov_buckets=0)
                        col = tf.feature_column.indicator_column(col)
                        wide_columns.append(col)
                        wide_dim += len(f_param)
                    elif f_tran == 'identity':
                        num_buckets = int(f_param)
                        col = tf.feature_column.categorical_column_with_identity(
                            feature, num_buckets=num_buckets, default_value=0)
                        col = tf.feature_column.indicator_column(col)
                        wide_columns.append(col)
                        wide_dim += num_buckets
                else:
                    normalizer_fn = self._normalizer_fn_builder(
                        scaler=f_tran, normalization_params=(0, 1))
                    col = tf.feature_column.numeric_column(feature,
                                                           shape=(1,),
                                                           default_value=0,
                                                           dtype=tf.float32)
                    # normalizer_fn=normalizer_fn)
                    # col = tf.feature_column.indicator_column(col)
                    wide_columns.append(col)
                    wide_dim += 1
        return wide_columns

    def gbdt_input(self):
        """
        Converts the feature-column-processed data to arrays.
        :return: process_data: the training or prediction data; type: array
                 label: labels of the dataset; type: array
        """
        # Build the feature tensor and the label tensor from a single
        # input_fn() call so they share one iterator.
        features, label_element = self.input_fn()
        tensor = tf.feature_column.input_layer(features, self.feat_column())
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.tables_initializer())
            while True:
                try:
                    # Fetch features and labels in one run() call so both
                    # come from the same batch of the iterator.
                    process_data, label = sess.run([tensor, label_element])
                    yield process_data, label
                except tf.errors.OutOfRangeError:
                    break
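How gbdt_input() is consumed upstream (a sketch; it assumes the conf files are in place, and the file path is illustrative): it is a plain Python generator, so GBDT_spr simply calls next() on it until the underlying one-shot iterator is exhausted.

batches = TF_Data('train_data.csv').gbdt_input()
while True:
    try:
        batch_X, batch_y = next(batches)
        print(batch_X.shape, batch_y.shape)  # dense features and labels per batch
    except StopIteration:
        break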