def __init__(self, train_data, test_data, model): LrModel.__init__(self, train_data, test_data) if not model in config.model_list: print "Wrong model name when initializing EM model." exit(-1) self.model = model self.em_log = []
def main():
    """Train an LR bidder on one iPinYou campaign and log test performance.

    Command line: test_lr.py campaign_id learn_rate (budget_prop)
    Writes a per-round CSV log and the best weight vector under ../output/.
    NOTE: Python 2 code -- `x` is backtick-repr(x) and print is a statement.
    """
    if len(sys.argv) < 3:
        print "Usage python test_lr.py campaign_id learn_rate (budget_prop)"
        exit(-1)
    data_folder = "../../make-ipinyou-data/"
    config.campaign_id = int(sys.argv[1])
    # print config.campaign
    # print config.campaign_id
    # exit(-1)
    config.lr_alpha = float(sys.argv[2])
    if len(sys.argv) == 4:
        config.budget_prop = int(sys.argv[3])
    train_path = data_folder + ` config.campaign_id ` + "/train.yzx.txt"
    test_path = data_folder + ` config.campaign_id ` + "/test.yzx.txt"
    train_data = Dataset(train_path, config.campaign_id)
    train_data.shuffle()  # make train data shuffled
    test_data = Dataset(test_path, config.campaign_id)
    print "Load done."

    lr_model = LrModel(train_data, test_data)
    print "campaign v = " + ` lr_model.camp_v `
    print "learn_rate = " + ` config.lr_alpha `
    print "budget = " + ` lr_model.budget `
    if config.ds_ratio > 0:
        print "Need calibration."
    else:
        print "No calibration."

    # Train up to lr_train_round rounds, evaluating after each round; stop
    # early when tool.judge_stop decides the test log has plateaued.
    print "Begin training ..."
    for i in range(0, config.lr_train_round):
        lr_model.train()
        lr_model.test()
        print "Round " + ` i + 1 ` + "\t" + ` tool.get_last_log(lr_model.test_log)['performance'] `
        if tool.judge_stop(lr_model.test_log):
            break
    print "Train done."

    log_file = ` config.campaign_id ` + "_lrlin_" + ` config.lr_alpha ` \
        + "_" + ` config.budget_prop ` + ".csv"
    fo = open("../output/" + log_file, 'w')
    print "Begin log ..."
    header = "camp_id\tmodel\tdataset\trevenue\tctr\tcpc\tauc\trmse\tcpm\tbids\timps\tclks\tlaplace\tinterval\tlearn_rate\tnds_ratio\tbudget_prop"
    best_test_log = lr_model.get_best_test_log()
    best_test_line = `config.campaign_id` + "\t" + "LR\ttest\t" \
        + tool.gen_performance_line(best_test_log) + "\t" \
        + 'None' + "\t" + "None" + "\t" + `config.lr_alpha` + "\t" \
        + "None" + "\t" + `config.budget_prop`
    fo.write(header + "\n")
    fo.write(best_test_line + "\n")

    # search for best linear parameter
    opt_param = lr_model.lin_bid(best_test_log['weight'])

    # Replay the best weights at each candidate budget proportion and write
    # one tab-separated row per proportion.
    fo.write(
        "prop\trevenue\troi\tctr\tcpc\tauc\trmse\tcpm\timps\tclks\tlin_param\n"
    )
    for prop in config.budget_props:
        performance = lr_model.replay(best_test_log['weight'],
                                      lr_model.test_data, prop)
        fo.write( ` prop `)
        fo.write("\t")
        fo.write( ` performance['revenue'] `)
        fo.write("\t")
        fo.write( ` performance['roi'] `)
        fo.write("\t")
        fo.write( ` performance['ctr'] `)
        fo.write("\t")
        fo.write( ` performance['cpc'] `)
        fo.write("\t")
        fo.write( ` performance['auc'] `)
        fo.write("\t")
        fo.write( ` performance['rmse'] `)
        fo.write("\t")
        fo.write( ` performance['cpm'] `)
        fo.write("\t")
        fo.write( ` performance['imps'] `)
        fo.write("\t")
        fo.write( ` performance['clks'] `)
        fo.write("\t")
        fo.write( ` opt_param `)
        fo.write("\n")
    fo.write("\n")

    # Per-round test-performance history.
    fo.write("Round\tTest\tctr\tcpc\tauc\trmse\tcpm\tclks\timps\tbids\n")
    for i in range(0, len(lr_model.test_log)):
        test_log = lr_model.test_log[i]
        line = `i+1` + "\t" + `test_log['performance']['revenue']` \
            + "\t" + `test_log['performance']['ctr']` \
            + "\t" + `test_log['performance']['cpc']` \
            + "\t" + `test_log['performance']['auc']` \
            + "\t" + `test_log['performance']['rmse']` \
            + "\t" + `test_log['performance']['cpm']` \
            + "\t" + `test_log['performance']['clks']` \
            + "\t" + `test_log['performance']['imps']` \
            + "\t" + `test_log['performance']['bids']`
        fo.write(line + "\n")
    fo.close()
    print "Log done."

    # Persist the best weight vector alongside the CSV log.
    weight_path = `config.campaign_id` + "_" + "lrlin_best_weight" \
        + "_" + `config.lr_alpha` + "_" + `config.budget_prop` \
        + ".weight"
    lr_model.output_weight(best_test_log['weight'], "../output/" + weight_path)
def __init__(self, train_data, test_data):
    """Initialize by delegating straight to the base LrModel constructor.

    Args:
        train_data: training dataset forwarded to LrModel.
        test_data: test dataset forwarded to LrModel.
    """
    LrModel.__init__(self, train_data, test_data)
# NOTE(review): the lines down to the first blank separator are the tail of a
# train() function whose beginning lies outside this chunk -- the indentation
# below is reconstructed and should be verified against the full file.
            model.x: batch_xs, model.y_: batch_ys})
        total_batch += 1
        if total_batch - last_improved > require_improvement:
            # Validation accuracy has not improved for a long time: stop early.
            print("No optimization for a long time, auto-stopping...")
            flag = True
            break
    if flag:
        break


# TODO: implement later if needed
def test():
    """Placeholder for a standalone evaluation entry point.

    Currently a single corpus is supplied and split into a training set and
    a validation set.  Alternatively, two corpora could be supplied (one
    training corpus split into train/validation via sklearn, plus a separate
    held-out set evaluated here), or three (train / validation / test, with
    the test set evaluated here).
    """
    pass


if __name__ == "__main__":
    config = LrConfig()
    # Build the TF-IDF feature pipeline from the raw dataset.
    data_get = DataProcess(config.dataset_path, config.stopwords_path,
                           config.tfidf_model_save_path)
    X_train, X_test, y_train, y_test, seq_length = get_data()
    model = LrModel(config, seq_length)
    train(X_train, X_test, y_train, y_test)
# NOTE(review): the first two lines are the tail of pre_data(), whose
# beginning lies outside this chunk -- indentation is reconstructed.
    text_list.append(' '.join(text))
    return text_list


def read_categories():
    """Read the category names saved during training.

    Returns the first line of the categories file split on '|'.
    """
    with open(config.categories_save_path, 'r', encoding='utf-8') as f:
        categories = f.readlines()
    return categories[0].split('|')


def predict_line(data, categories):
    """Predict the category of a single TF-IDF feature row.

    Restores the trained model from config.lr_save_path and returns the
    category name of the first (only) sample in data.
    """
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=config.lr_save_path)
    y_pred_cls = session.run(model.y_pred_cls, feed_dict={model.x: data})
    return categories[y_pred_cls[0]]


if __name__ == "__main__":
    # Sample news headline (Chinese) used as a smoke-test input.
    data = "北京城区最大规模经适房昨摇号 比例可达3:1 11月28日,城八区年内最大规模经适房摇号在石景山区举行"
    config = LrConfig()
    line = pre_data(data, config)
    tfidf_model = joblib.load(config.tfidf_model_save_path)
    X_test = tfidf_model.transform(line).toarray()
    model = LrModel(config, len(X_test[0]))
    categories = read_categories()
    print(predict_line(X_test, categories))
# NOTE(review): tail of a train() loop whose beginning lies outside this
# chunk -- indentation is reconstructed and should be verified against the
# full file.
            sess.run(model.train_step, feed_dict={
                model.x: batch_xs, model.y_: batch_ys})
            total_batch += 1
            if total_batch > max_batch:
                # Hard cap on total batches reached: stop training early.
                print("Too much batchs, auto-stopping...")
                flag = True
                break
            if total_batch - last_improved > require_improvement:
                # Validation accuracy has not improved for a long time: stop early.
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break


if __name__ == "__main__":
    config = Config()  # configuration parameters
    # Data section: build TF-IDF features from the segmented training corpus.
    data_get = DataProcess(config.train_path, config.stopwords_path,
                           config.seg_train_path, config.tfidf_model_save_path)
    X_train, X_test, y_train, y_test, seq_length, num_classes = get_data()
    model = LrModel(config, seq_length, num_classes)
    train(X_train, X_test, y_train, y_test)
# NOTE(review): the first two lines are the tail of pre_data(), whose
# beginning lies outside this chunk -- indentation is reconstructed.
    print(text_list[0])
    return text_list


def read_categories():
    """Read the category names saved during training.

    Returns the first line of the categories file split on '|'.
    """
    with open(config.categories_save_path, 'r', encoding='utf-8') as f:
        categories = f.readlines()
    return categories[0].split('|')


def predict_line(data, categories):
    """Predict categories for a batch of TF-IDF feature rows.

    Restores the trained model from config.lr_save_path and maps each
    predicted class index to its category name.
    """
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=config.lr_save_path)
    y_pred_cls = session.run(model.y_pred_cls, feed_dict={model.x: data})
    print(y_pred_cls[:100])  # debug: first 100 predicted class ids
    return [categories[i] for i in y_pred_cls]


if __name__ == "__main__":
    config = Config()
    line = pre_data(config.dataset_path, config)
    tfidf_model = joblib.load(config.tfidf_model_save_path)
    X_test = tfidf_model.transform(line).toarray()
    categories = read_categories()
    model = LrModel(config, len(X_test[0]), len(categories))
    print(predict_line(X_test, categories))