示例#1
0
 def __init__(self, train_data, test_data, model):
     """Set up the EM model on top of the LrModel initializer.

     Runs ``LrModel.__init__`` with the two datasets first, then checks
     ``model`` against ``config.model_list``. An unknown name prints an
     error message and terminates the process through ``exit(-1)``.

     Args:
         train_data: training dataset, forwarded to ``LrModel.__init__``.
         test_data: test dataset, forwarded to ``LrModel.__init__``.
         model: model name; must be a member of ``config.model_list``.
     """
     LrModel.__init__(self, train_data, test_data)
     if model not in config.model_list:
         print("Wrong model name when initializing EM model.")
         exit(-1)
     # Starts out empty; nothing in this initializer appends to it.
     self.em_log = []
     self.model = model
def main():
    """Train an LrModel for one campaign, then log results and best weights.

    Command line: ``python test_lr.py campaign_id learn_rate (budget_prop)``.

    Steps:
      1. Parse ``sys.argv`` into ``config.campaign_id``, ``config.lr_alpha``
         and (only when exactly three arguments are given)
         ``config.budget_prop``.
      2. Load the train/test datasets of the campaign and shuffle the
         training set.
      3. Run up to ``config.lr_train_round`` train/test rounds, stopping
         early when ``tool.judge_stop`` says so.
      4. Write a tab-separated report to ``../output/<...>.csv`` and the best
         weights to ``../output/<...>.weight``.

    Exits with status -1 when fewer than two arguments are supplied.
    """
    if len(sys.argv) < 3:
        print("Usage python test_lr.py campaign_id learn_rate (budget_prop)")
        # sys.exit instead of the site-provided ``exit`` helper, which is not
        # guaranteed to exist (e.g. ``python -S``).
        sys.exit(-1)

    data_folder = "../../make-ipinyou-data/"
    config.campaign_id = int(sys.argv[1])
    config.lr_alpha = float(sys.argv[2])
    if len(sys.argv) == 4:
        config.budget_prop = int(sys.argv[3])

    campaign_dir = data_folder + repr(config.campaign_id)
    train_path = campaign_dir + "/train.yzx.txt"
    test_path = campaign_dir + "/test.yzx.txt"

    train_data = Dataset(train_path, config.campaign_id)
    train_data.shuffle()  # make train data shuffled
    test_data = Dataset(test_path, config.campaign_id)
    print("Load done.")

    lr_model = LrModel(train_data, test_data)
    print("campaign v = " + repr(lr_model.camp_v))
    print("learn_rate = " + repr(config.lr_alpha))
    print("budget = " + repr(lr_model.budget))

    if config.ds_ratio > 0:
        print("Need calibration.")
    else:
        print("No calibration.")

    print("Begin training ...")
    for i in range(0, config.lr_train_round):
        lr_model.train()
        lr_model.test()
        last_performance = tool.get_last_log(lr_model.test_log)['performance']
        print("Round " + repr(i + 1) + "\t" + repr(last_performance))
        if tool.judge_stop(lr_model.test_log):
            break
    print("Train done.")

    log_file = (repr(config.campaign_id) + "_lrlin_" + repr(config.lr_alpha)
                + "_" + repr(config.budget_prop) + ".csv")

    # Column order of the per-budget replay rows and of the per-round rows;
    # each must stay in step with the header line written just before it.
    replay_keys = ('revenue', 'roi', 'ctr', 'cpc', 'auc', 'rmse', 'cpm',
                   'imps', 'clks')
    round_keys = ('revenue', 'ctr', 'cpc', 'auc', 'rmse', 'cpm', 'clks',
                  'imps', 'bids')

    # The context manager closes the report even if a later step raises.
    with open("../output/" + log_file, 'w') as fo:
        print("Begin log ...")
        header = "camp_id\tmodel\tdataset\trevenue\tctr\tcpc\tauc\trmse\tcpm\tbids\timps\tclks\tlaplace\tinterval\tlearn_rate\tnds_ratio\tbudget_prop"
        best_test_log = lr_model.get_best_test_log()
        best_test_line = "\t".join([
            repr(config.campaign_id),
            "LR",
            "test",
            tool.gen_performance_line(best_test_log),
            "None",
            "None",
            repr(config.lr_alpha),
            "None",
            repr(config.budget_prop),
        ])
        fo.write(header + "\n")
        fo.write(best_test_line + "\n")

        # search for best linear parameter
        opt_param = lr_model.lin_bid(best_test_log['weight'])
        fo.write(
            "prop\trevenue\troi\tctr\tcpc\tauc\trmse\tcpm\timps\tclks\tlin_param\n"
        )
        for prop in config.budget_props:
            performance = lr_model.replay(best_test_log['weight'],
                                          lr_model.test_data, prop)
            fields = [repr(prop)]
            fields.extend(repr(performance[key]) for key in replay_keys)
            fields.append(repr(opt_param))
            fo.write("\t".join(fields) + "\n")

        fo.write("\n")

        fo.write("Round\tTest\tctr\tcpc\tauc\trmse\tcpm\tclks\timps\tbids\n")
        for round_no, test_log in enumerate(lr_model.test_log, 1):
            round_performance = test_log['performance']
            fields = [repr(round_no)]
            fields.extend(repr(round_performance[key]) for key in round_keys)
            fo.write("\t".join(fields) + "\n")
    print("Log done.")

    weight_path = (repr(config.campaign_id) + "_" + "lrlin_best_weight"
                   + "_" + repr(config.lr_alpha) + "_"
                   + repr(config.budget_prop) + ".weight")
    lr_model.output_weight(best_test_log['weight'], "../output/" + weight_path)
	def __init__(self, train_data, test_data):
		"""Initialize the instance by delegating to ``LrModel.__init__``.

		Both datasets are forwarded unchanged to the LrModel initializer.
		"""
		LrModel.__init__(self, train_data, test_data)
                             model.x: batch_xs,
                             model.y_: batch_ys
                         })
                total_batch += 1

                if total_batch - last_improved > require_improvement:
                    #  验证集准确率长期不提升,提前结束训练
                    print("No optimization for a long time, auto-stopping...")
                    flag = True
                    break
            if flag:
                break


# TODO: implement later if it turns out to be needed.
def test():
    """Placeholder for a dedicated test-set evaluation step (does nothing).

    Design notes:
      * Currently a single corpus is fed in and split into a training set
        and a validation set.
      * Alternatively two corpora could be supplied: one split with sklearn
        into training/validation sets, plus a separate set evaluated here.
      * Or separate training, validation and test sets could be supplied,
        with the test set evaluated here.
    """


if __name__ == "__main__":
    # Script entry point: build the configuration and data pipeline, create
    # the model, then train it.
    # `config`, `data_get` and `model` are bound at module level here and are
    # not passed to get_data() / train() as arguments.
    # NOTE(review): presumably those functions read them as globals, which
    # would make the statement order below significant -- confirm.
    config = LrConfig()
    data_get = DataProcess(config.dataset_path, config.stopwords_path,
                           config.tfidf_model_save_path)
    X_train, X_test, y_train, y_test, seq_length = get_data()
    # seq_length comes from get_data() and is handed to the model constructor.
    model = LrModel(config, seq_length)
    train(X_train, X_test, y_train, y_test)
    text_list.append(' '.join(text))
    return text_list


def read_categories():
    """Return the category labels stored in the categories file.

    Opens ``config.categories_save_path`` as UTF-8 text and splits its first
    line on ``'|'``. Only the first line is used.

    Returns:
        list[str]: the '|'-separated fields of the first line.

    Raises:
        IndexError: if the file contains no lines.
    """
    with open(config.categories_save_path, 'r', encoding='utf-8') as handle:
        first_line = handle.readlines()[0]
    return first_line.split('|')


def predict_line(data, categories):
    """Predict the category of the first sample in ``data``.

    Restores the checkpoint at ``config.lr_save_path`` into a fresh session
    and evaluates ``model.y_pred_cls`` with ``data`` fed to ``model.x``.

    Args:
        data: feature rows fed to ``model.x``.
        categories: sequence of labels indexed by predicted class id.

    Returns:
        The entry of ``categories`` selected by the first predicted class id.
    """
    # Run inside a context manager so the session is closed (and its
    # resources released) even when restoring or running the graph fails.
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess=session, save_path=config.lr_save_path)
        y_pred_cls = session.run(model.y_pred_cls, feed_dict={model.x: data})
    # Only the first prediction is used.
    return categories[y_pred_cls[0]]


if __name__ == "__main__":
    # Script entry point: classify one hard-coded sample sentence.
    data = "北京城区最大规模经适房昨摇号 比例可达3:1 11月28日,城八区年内最大规模经适房摇号在石景山区举行"
    config = LrConfig()
    # pre_data() is defined outside this excerpt.
    # NOTE(review): presumably it preprocesses the raw text into the form the
    # TF-IDF model expects -- confirm.
    line = pre_data(data, config)
    # Load the saved TF-IDF model and turn the sample into a dense array.
    tfidf_model = joblib.load(config.tfidf_model_save_path)
    X_test = tfidf_model.transform(line).toarray()
    # The length of the first feature row is passed to the model constructor.
    model = LrModel(config, len(X_test[0]))
    categories = read_categories()
    print(predict_line(X_test, categories))
示例#6
0
                sess.run(model.train_step,
                         feed_dict={
                             model.x: batch_xs,
                             model.y_: batch_ys
                         })
                total_batch += 1

                if total_batch > max_batch:
                    #  轮次,提前结束训练
                    print("Too much batchs, auto-stopping...")
                    flag = True
                    break
                if total_batch - last_improved > require_improvement:
                    #  验证集准确率长期不提升,提前结束训练
                    print("No optimization for a long time, auto-stopping...")
                    flag = True
                    break
            if flag:
                break


if __name__ == "__main__":
    # Script entry point: build the configuration and data pipeline, create
    # the model, then train it.
    # `config`, `data_get` and `model` are bound at module level here and are
    # not passed to get_data() / train() as arguments.
    # NOTE(review): presumably those functions read them as globals -- confirm.
    config = Config()  # configuration parameters
    # Data section
    data_get = DataProcess(config.train_path, config.stopwords_path,
                           config.seg_train_path, config.tfidf_model_save_path)
    X_train, X_test, y_train, y_test, seq_length, num_classes = get_data()

    # seq_length and num_classes come from get_data() and are handed to the
    # model constructor.
    model = LrModel(config, seq_length, num_classes)
    train(X_train, X_test, y_train, y_test)
示例#7
0
    print(text_list[0])
    return text_list


def read_categories():
    """Read the category labels from ``config.categories_save_path``.

    The file is opened as UTF-8 text; its first line is split on ``'|'`` and
    any further lines are ignored.

    Returns:
        list[str]: the '|'-separated fields of the first line.

    Raises:
        IndexError: if the file contains no lines.
    """
    with open(config.categories_save_path, 'r', encoding='utf-8') as src:
        lines = src.readlines()
    header = lines[0]
    return header.split('|')


def predict_line(data, categories):
    """Predict a category label for every sample in ``data``.

    Restores the checkpoint at ``config.lr_save_path`` into a fresh session
    and evaluates ``model.y_pred_cls`` with ``data`` fed to ``model.x``.

    Args:
        data: feature rows fed to ``model.x``.
        categories: sequence of labels indexed by predicted class id.

    Returns:
        list: one entry of ``categories`` per predicted class id, in order.
    """
    # Run inside a context manager so the session is closed (and its
    # resources released) even when restoring or running the graph fails.
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess=session, save_path=config.lr_save_path)
        y_pred_cls = session.run(model.y_pred_cls, feed_dict={model.x: data})
    # Show the first 100 predicted class ids.
    print(y_pred_cls[:100])
    return [categories[i] for i in y_pred_cls]


if __name__ == "__main__":
    # Script entry point: predict categories for the data referenced by
    # config.dataset_path and print the resulting labels.
    config = Config()
    # pre_data() is defined outside this excerpt.
    # NOTE(review): presumably it loads and preprocesses the texts -- confirm.
    line = pre_data(config.dataset_path, config)
    # Load the saved TF-IDF model and turn the texts into a dense array.
    tfidf_model = joblib.load(config.tfidf_model_save_path)
    X_test = tfidf_model.transform(line).toarray()
    categories = read_categories()
    # The length of the first feature row and the number of categories are
    # passed to the model constructor.
    model = LrModel(config, len(X_test[0]), len(categories))
    print(predict_line(X_test, categories))