Example #1
def get_optimizer(self):
    optimizer = AdagradOptimizer(learning_rate=0.2)
    return optimizer
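The method above only sets the learning rate; in static-graph Paddle 1.x the optimizer collects its trainable parameters from the default program when minimize is called. Below is a minimal, illustrative sketch of that usage against a toy regression network (the network and data are placeholders, not part of the original example):

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.optimizer import AdagradOptimizer

# toy network: one fully connected layer with a squared-error loss
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
pred = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))

optimizer = AdagradOptimizer(learning_rate=0.2)
optimizer.minimize(loss)  # appends Adagrad update ops for every trainable parameter

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
out = exe.run(feed={'x': np.random.rand(4, 13).astype('float32'),
                    'y': np.random.rand(4, 1).astype('float32')},
              fetch_list=[loss])
print(out[0])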
Example #2
def train_ptb_lm():
    args = parse_args()

    # warn if use_gpu=True is set on a CPU-only PaddlePaddle build
    model_check.check_cuda(args.use_gpu)
    # check that the installed PaddlePaddle version is recent enough
    model_check.check_version()

    model_type = args.model_type

    vocab_size = 37484
    if model_type == "test":
        num_layers = 1
        batch_size = 2
        hidden_size = 10
        num_steps = 4
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 1
        max_epoch = 1
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "small":
        num_layers = 2
        batch_size = 20
        hidden_size = 200
        num_steps = 20
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 4
        max_epoch = 2
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "gru4rec":
        num_layers = 1
        batch_size = 500
        hidden_size = 100
        num_steps = 10
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 10
        max_epoch = 5
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 0.05
    elif model_type == "medium":
        num_layers = 2
        batch_size = 20
        hidden_size = 650
        num_steps = 35
        init_scale = 0.05
        max_grad_norm = 5.0
        epoch_start_decay = 6
        max_epoch = 39
        dropout = 0.5
        lr_decay = 0.8
        base_learning_rate = 1.0
    elif model_type == "large":
        num_layers = 2
        batch_size = 20
        hidden_size = 1500
        num_steps = 35
        init_scale = 0.04
        max_grad_norm = 10.0
        epoch_start_decay = 14
        max_epoch = 55
        dropout = 0.65
        lr_decay = 1.0 / 1.15
        base_learning_rate = 1.0
    else:
        print("model type not support")
        return

    with fluid.dygraph.guard(core.CUDAPlace(0)):
        if args.ce:
            print("ce mode")
            seed = 33
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            max_epoch = 1
        ptb_model = PtbModel("ptb_model",
                             hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale,
                             dropout=dropout)

        if args.init_from_pretrain_model:
            if not os.path.exists(args.init_from_pretrain_model + '.pdparams'):
                print(args.init_from_pretrain_model)
                raise ValueError("The pretrained params do not exist.")
            # load_dygraph returns (param_dict, optimizer_dict); apply the params to the model
            param_dict, _ = fluid.load_dygraph(args.init_from_pretrain_model)
            ptb_model.set_dict(param_dict)
            print("finished initializing model from pretrained params at %s" %
                  args.init_from_pretrain_model)

        dy_param_updated = dict()
        dy_param_init = dict()
        dy_loss = None
        last_hidden = None

        data_path = args.data_path
        print("begin to load data")
        ptb_data = reader.get_ptb_data(data_path)
        print("finished load data")
        train_data, valid_data, test_data = ptb_data

        batch_len = len(train_data) // batch_size
        total_batch_size = (batch_len - 1) // num_steps
        print("total_batch_size:", total_batch_size)
        log_interval = total_batch_size // 20

        bd = []
        lr_arr = [base_learning_rate]
        for i in range(1, max_epoch):
            bd.append(total_batch_size * i)
            new_lr = base_learning_rate * (lr_decay**max(
                i + 1 - epoch_start_decay, 0.0))
            lr_arr.append(new_lr)

        sgd = AdagradOptimizer(parameter_list=ptb_model.parameters(),
                               learning_rate=fluid.layers.piecewise_decay(
                                   boundaries=bd, values=lr_arr))

        print("parameters:--------------------------------")
        for para in ptb_model.parameters():
            print(para.name)
        print("parameters:--------------------------------")

        def eval(model, data):
            print("begion to eval")
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')

            model.eval()
            train_data_iter = reader.get_data_iter(data, batch_size, num_steps)
            init_hidden = to_variable(init_hidden_data)
            accum_num_recall = 0.0
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, num_steps, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                dy_loss, last_hidden, acc = model(x, y, init_hidden)

                out_loss = dy_loss.numpy()
                acc_ = acc.numpy()[0]
                accum_num_recall += acc_
                if batch_id % 1 == 0:
                    print("batch_id:%d  recall@20:%.4f" %
                          (batch_id, accum_num_recall / (batch_id + 1)))

                init_hidden = last_hidden

                total_loss += out_loss
                iters += num_steps

            print("eval finished")
            ppl = np.exp(total_loss / iters)
            print("recall@20 ", accum_num_recall / (batch_id + 1))
            if args.ce:
                print("kpis\ttest_ppl\t%0.3f" % ppl[0])

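        # clip gradients by their global norm before each update (passed to minimize below)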
        grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(max_grad_norm)
        for epoch_id in range(max_epoch):
            ptb_model.train()
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')

            train_data_iter = reader.get_data_iter(train_data, batch_size,
                                                   num_steps)
            init_hidden = to_variable(init_hidden_data)

            start_time = time.time()
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, num_steps, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                dy_loss, last_hidden, acc = ptb_model(x, y, init_hidden)

                out_loss = dy_loss.numpy()
                acc_ = acc.numpy()[0]

                init_hidden = last_hidden
                dy_loss.backward()
                sgd.minimize(dy_loss, grad_clip=grad_clip)
                ptb_model.clear_gradients()
                total_loss += out_loss
                iters += num_steps

                if batch_id > 0 and batch_id % 100 == 1:
                    ppl = np.exp(total_loss / iters)
                    print(
                        "-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, acc: %.5f, lr: %.5f"
                        % (epoch_id, batch_id, ppl[0], acc_,
                           sgd._global_learning_rate().numpy()))

            print("one ecpoh finished", epoch_id)
            print("time cost ", time.time() - start_time)
            ppl = np.exp(total_loss / iters)
            print("-- Epoch:[%d]; ppl: %.5f" % (epoch_id, ppl[0]))
            if args.ce:
                print("kpis\ttrain_ppl\t%0.3f" % ppl[0])
            save_model_dir = os.path.join(args.save_model_dir, str(epoch_id),
                                          'params')
            fluid.save_dygraph(ptb_model.state_dict(), save_model_dir)
            print("Saved model to: %s.\n" % save_model_dir)
            eval(ptb_model, test_data)
Example #3
def get_optimizer_dygraph(self, parameter_list):
    optimizer = AdagradOptimizer(
        learning_rate=0.2, parameter_list=parameter_list)
    return optimizer
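In dygraph mode the optimizer must be told explicitly which parameters it updates, which is why this variant passes parameter_list. A minimal, illustrative sketch of driving such an optimizer in a Paddle 1.x dygraph loop (the Linear layer and random data are placeholders, not part of the original example):

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear, to_variable
from paddle.fluid.optimizer import AdagradOptimizer

with fluid.dygraph.guard():
    model = Linear(13, 1)  # placeholder model standing in for the real network
    optimizer = AdagradOptimizer(learning_rate=0.2,
                                 parameter_list=model.parameters())
    for _ in range(3):
        x = to_variable(np.random.rand(4, 13).astype('float32'))
        y = to_variable(np.random.rand(4, 1).astype('float32'))
        loss = fluid.layers.mean(
            fluid.layers.square_error_cost(input=model(x), label=y))
        loss.backward()           # accumulate gradients on the tracked parameters
        optimizer.minimize(loss)  # apply the Adagrad update
        model.clear_gradients()   # reset gradients before the next batch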
Example #4
def train_model_2_dnn(
        click_seq0_1,
        embedding_weight,
        save_model_path=f'./t2_dnngru/load_emb_save_model_{cfg.now_phase}'):
    model = Model_2_dnngru(click_seq0_1,
                           embedding_weight,
                           gru_steps=cfg.gru_steps,
                           gru_num_layers=1)
    if not os.path.exists('./t2_dnngru'):  # create a folder for intermediate data and saved models
        os.makedirs('./t2_dnngru')
    data_path = f'./t2_dnngru/list6_0-{cfg.now_phase}.pkl'
    if not os.path.exists(data_path):
        # preprocess the data
        user_data, click_info, click_id, click_txt, click_img, y_click_id = data_pre_dnngru(
            click_seq0_1)
        pickle.dump((user_data, click_info, click_id, click_txt, click_img,
                     y_click_id), open(data_path, 'wb'))
    else:
        user_data, click_info, click_id, click_txt, click_img, y_click_id = pickle.load(
            open(data_path, 'rb'))
    batch_len = len(user_data) // cfg.batch_size
    total_batch_size = (batch_len - 1) // cfg.gru_steps
    print("total_batch_size:", total_batch_size)
    #     opt = fluid.optimizer.Adam(learning_rate=0.05, parameter_list=model.parameters())
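    # same per-epoch piecewise learning-rate schedule as in Example #2:
    # boundaries are global step counts, values decay by lr_decay per epoch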
    bd = []
    lr_arr = [cfg.base_learning_rate]
    for i in range(1, cfg.max_epoch):
        bd.append(total_batch_size * i)
        new_lr = cfg.base_learning_rate * (cfg.lr_decay**max(
            i + 1 - cfg.epoch_start_decay, 0.0))
        lr_arr.append(new_lr)

    # define gradient clipping, i.e. the allowed range of gradient values
    grad_clip = fluid.clip.GradientClipByGlobalNorm(cfg.max_grad_norm)
    # choosing Adam as the optimizer lowers training accuracy and plain SGD overfits, so Adagrad is used
    sgd = AdagradOptimizer(parameter_list=model.parameters(),
                           learning_rate=fluid.layers.piecewise_decay(
                               boundaries=bd, values=lr_arr),
                           grad_clip=grad_clip)

    model.train()
    for epoch in range(cfg.max_epoch):
        start_time = time.time()
        train_loader = data_loader_dnngru(user_data, click_info, click_id,
                                          click_txt, click_img, y_click_id,
                                          cfg.batch_size, cfg.gru_steps)

        init_hidden_data = np.zeros(
            (model.gru_num_layers, cfg.batch_size, model.gru_hidden_size),
            dtype='float32')
        init_hidden = to_variable(init_hidden_data)

        for batch_id, data in enumerate(train_loader):
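            # each sample packs its features along the last axis:
            # [0:4] user fields, [4:6] click info, [6] click id,
            # [7:135] 128-d text embedding, [135:263] 128-d image embedding, [263] label id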
            (user_data_pp, click_info_pp, click_id_pp, click_txt_pp,
             click_img_pp,
             y_click_id_pp) = (data[..., :4], data[..., 4:6], data[..., 6:7],
                               data[..., 7:128 + 7],
                               data[..., 128 + 7:256 + 7],
                               data[..., 256 + 7:256 + 8])
            user_data_pp = user_data_pp.astype(int)
            (user_id_pp, user_age_level_pp, user_gender_pp,
             user_city_level_pp) = (user_data_pp[:, :, 0],
                                    user_data_pp[:, :, 1],
                                    user_data_pp[:, :, 2],
                                    user_data_pp[:, :, 3])
            user_id_pp = to_variable(user_id_pp)
            user_age_level_pp = to_variable(user_age_level_pp)
            user_gender_pp = to_variable(user_gender_pp)
            user_city_level_pp = to_variable(user_city_level_pp)

            stay_data_pp = to_variable(
                click_info_pp[..., 0:1].astype('float32'))

            click_id_pp = to_variable(click_id_pp[..., 0].astype(int))
            click_txt_pp = to_variable(click_txt_pp.astype('float32'))
            click_img_pp = to_variable(click_img_pp.astype('float32'))

            y_click_id_pp = to_variable(y_click_id_pp.astype(int))

            pred_out, last_hidden = model([
                user_id_pp, user_age_level_pp, user_gender_pp,
                user_city_level_pp
            ], stay_data_pp, [click_id_pp, click_txt_pp, click_img_pp],
                                          init_hidden)
            init_hidden = last_hidden.detach()
            # cross-entropy loss
            loss = fluid.layers.softmax_with_cross_entropy(logits=pred_out,
                                                           label=y_click_id_pp,
                                                           soft_label=False,
                                                           axis=2)
            # compute the recall@50 metric
            pre_2d = fluid.layers.reshape(pred_out, shape=[-1, cfg.vocab_size])
            label_2d = fluid.layers.reshape(y_click_id_pp, shape=[-1, 1])
            acc = fluid.layers.accuracy(input=pre_2d, label=label_2d, k=50)
            acc_ = acc.numpy()[0]
            # aggregate the loss over all batches and sequence lengths (different from 5.2)
            loss = fluid.layers.reduce_mean(loss)

            loss.backward()
            sgd.minimize(loss)
            model.clear_gradients()
            out_loss = loss.numpy()

            # print progress information at intervals
            if batch_id > 0 and batch_id % 100 == 1:
                print("-- Epoch:[%d]; Batch:[%d]; loss: %.5f, acc: %.5f" %
                      (epoch, batch_id, out_loss, acc_))

        print("one ecpoh finished", epoch)
        print("time cost ", time.time() - start_time)
        print("loss: %.5f, acc: %.5f" % (out_loss, acc_))

    fluid.save_dygraph(model.state_dict(), save_model_path)
    print("Saved model to: %s.\n" % save_model_path)