Example #1
def train_ptb_lm():

    args = parse_args()
    model_type = args.model_type

    vocab_size = 10000
    if model_type == "test":
        num_layers = 1
        batch_size = 2
        hidden_size = 10
        num_steps = 3
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 1
        max_epoch = 1
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "small":
        num_layers = 2
        batch_size = 20
        hidden_size = 200
        num_steps = 20
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 4
        max_epoch = 13
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "medium":
        num_layers = 2
        batch_size = 20
        hidden_size = 650
        num_steps = 35
        init_scale = 0.05
        max_grad_norm = 5.0
        epoch_start_decay = 6
        max_epoch = 39
        dropout = 0.5
        lr_decay = 0.8
        base_learning_rate = 1.0
    elif model_type == "large":
        num_layers = 2
        batch_size = 20
        hidden_size = 1500
        num_steps = 35
        init_scale = 0.04
        max_grad_norm = 10.0
        epoch_start_decay = 14
        max_epoch = 55
        dropout = 0.65
        lr_decay = 1.0 / 1.15
        base_learning_rate = 1.0
    else:
        print("model type not support")
        return

    with fluid.dygraph.guard(core.CUDAPlace(0)):
        if args.ce:
            print("ce mode")
            seed = 33
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            max_epoch = 1
        ptb_model = PtbModel("ptb_model",
                             hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale,
                             dropout=dropout)

        dy_param_updated = dict()
        dy_param_init = dict()
        dy_loss = None
        last_hidden = None
        last_cell = None

        data_path = args.data_path
        print("begin to load data")
        ptb_data = reader.get_ptb_data(data_path)
        print("finished load data")
        train_data, valid_data, test_data = ptb_data

        batch_len = len(train_data) // batch_size
        total_batch_size = (batch_len - 1) // num_steps
        log_interval = total_batch_size // 20

        bd = []
        lr_arr = [base_learning_rate]
        for i in range(1, max_epoch):
            bd.append(total_batch_size * i)
            new_lr = base_learning_rate * (lr_decay**max(
                i + 1 - epoch_start_decay, 0.0))
            lr_arr.append(new_lr)

        sgd = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
            boundaries=bd, values=lr_arr))

        def eval(model, data):
            print("begion to eval")
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='float32')

            model.eval()
            train_data_iter = reader.get_data_iter(data, batch_size, num_steps)
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                init_hidden = to_variable(init_hidden_data)
                init_cell = to_variable(init_cell_data)
                dy_loss, last_hidden, last_cell = model(
                    x, y, init_hidden, init_cell)

                out_loss = dy_loss.numpy()

                init_hidden_data = last_hidden.numpy()
                init_cell_data = last_cell.numpy()

                total_loss += out_loss
                iters += num_steps

            print("eval finished")
            ppl = np.exp(total_loss / iters)
            print("ppl ", batch_id, ppl[0])
            if args.ce:
                print("kpis\ttest_ppl\t%0.3f" % ppl[0])

        grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(max_grad_norm)
        for epoch_id in range(max_epoch):
            ptb_model.train()
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='float32')

            train_data_iter = reader.get_data_iter(train_data, batch_size,
                                                   num_steps)

            start_time = time.time()
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                init_hidden = to_variable(init_hidden_data)
                init_cell = to_variable(init_cell_data)
                dy_loss, last_hidden, last_cell = ptb_model(
                    x, y, init_hidden, init_cell)

                out_loss = dy_loss.numpy()

                init_hidden_data = last_hidden.numpy()
                init_cell_data = last_cell.numpy()
                dy_loss.backward()
                sgd.minimize(dy_loss, grad_clip=grad_clip)

                ptb_model.clear_gradients()
                total_loss += out_loss
                iters += num_steps

                if batch_id > 0 and batch_id % log_interval == 0:
                    ppl = np.exp(total_loss / iters)
                    print(epoch_id, "ppl ", batch_id, ppl[0],
                          sgd._global_learning_rate().numpy())

            print("one ecpoh finished", epoch_id)
            print("time cost ", time.time() - start_time)
            ppl = np.exp(total_loss / iters)
            print("ppl ", epoch_id, ppl[0])
            if args.ce:
                print("kpis\ttrain_ppl\t%0.3f" % ppl[0])

        eval(ptb_model, test_data)
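
A note on the helpers above: `reader.get_data_iter` is not shown in any of these examples. Below is a minimal sketch of the standard PTB batching it is assumed to implement (consistent with `batch_len = len(train_data) // batch_size` and `total_batch_size = (batch_len - 1) // num_steps` above; the real helper may differ):

import numpy as np

def get_data_iter(raw_data, batch_size, num_steps):
    # Split the flat token-id list into batch_size parallel streams, then
    # yield (x, y) windows where y is x shifted forward by one token.
    data = np.asarray(raw_data, dtype='int64')
    batch_len = len(data) // batch_size
    data = data[:batch_size * batch_len].reshape((batch_size, batch_len))
    epoch_size = (batch_len - 1) // num_steps
    for i in range(epoch_size):
        start = i * num_steps
        x = data[:, start:start + num_steps]
        y = data[:, start + 1:start + 1 + num_steps]
        yield x, y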
Example #2
def main():
    args = parse_args()

    # check whether use_gpu=True was set in the CPU-only paddlepaddle build
    check_cuda(args.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    logger = logging.getLogger("lm")
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    if args.log_path:
        file_handler = logging.FileHandler(args.log_path)
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    else:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)
    logger.info('Running with args : {}'.format(args))

    config = RNNConfig(args)

    # define train program
    main_program = fluid.Program()
    startup_program = fluid.Program()
    if args.enable_ce:
        startup_program.random_seed = SEED
    with fluid.program_guard(main_program, startup_program):
        with fluid.unique_name.guard():
            res_vars = lm_model.lm_model(
                config.hidden_size,
                config.vocab_size,
                config.batch_size,
                num_layers=config.num_layers,
                num_steps=config.num_steps,
                init_scale=config.init_scale,
                dropout=config.dropout,
                rnn_model=config.rnn_model,
                use_dataloader=args.use_dataloader)

            if args.use_dataloader:
                dataloader = res_vars[-1]
                res_vars = res_vars[:-1]
            loss, last_hidden, last_cell, feed_order = res_vars

            fluid.clip.set_gradient_clip(
                clip=fluid.clip.GradientClipByGlobalNorm(
                    clip_norm=config.max_grad_norm))

            learning_rate = fluid.layers.create_global_var(
                name="learning_rate",
                shape=[1],
                value=1.0,
                dtype='float32',
                persistable=True)

            optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
            optimizer.minimize(loss)

    # define inference program
    inference_program = fluid.Program()
    inference_startup_program = fluid.Program()
    with fluid.program_guard(inference_program, inference_startup_program):
        with fluid.unique_name.guard():
            lm_model.lm_model(
                config.hidden_size,
                config.vocab_size,
                config.batch_size,
                num_layers=config.num_layers,
                num_steps=config.num_steps,
                init_scale=config.init_scale,
                dropout=config.dropout,
                rnn_model=config.rnn_model,
                use_dataloader=False)
    # Some op behaves differently for train and inference, we need to call
    # this clone function to ensure every op is right for inference.
    inference_program = inference_program.clone(for_test=True)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = Executor(place)
    exe.run(startup_program)

    if args.init_from_pretrain_model:
        if not os.path.exists(args.init_from_pretrain_model + '.pdparams'):
            print(args.init_from_pretrain_model)
            raise Warning("The pretrained params do not exist.")
            return
        fluid.load(main_program, args.init_from_pretrain_model)
        print("finish initing model from pretrained params from %s" %
              (args.init_from_pretrain_model))

    device_count = len(fluid.cuda_places()) if args.use_gpu else len(
        fluid.cpu_places())

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = device_count
    exec_strategy.num_iteration_per_drop_scope = 100

    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = True

    if args.parallel:
        train_program = fluid.compiler.CompiledProgram(
            main_program).with_data_parallel(
                loss_name=loss.name,
                build_strategy=build_strategy,
                exec_strategy=exec_strategy)
    else:
        train_program = fluid.compiler.CompiledProgram(main_program)

    data_path = args.data_path
    print("begin to load data")
    ptb_data = reader.get_ptb_data(data_path)
    print("finished load data")
    train_data, valid_data, test_data = ptb_data

    def generate_init_data():
        if args.rnn_model == "lod":
            init_hidden = np.zeros(
                (config.batch_size, config.num_layers, config.hidden_size),
                dtype='float32')
            init_cell = np.zeros(
                (config.batch_size, config.num_layers, config.hidden_size),
                dtype='float32')
        else:
            init_hidden = np.zeros(
                (config.num_layers, config.batch_size, config.hidden_size),
                dtype='float32')
            init_cell = np.zeros(
                (config.num_layers, config.batch_size, config.hidden_size),
                dtype='float32')
        return init_hidden, init_cell

    def generate_new_lr(epoch_id=0, device_count=1):
        new_lr = config.base_learning_rate * (config.lr_decay**max(
            epoch_id + 1 - config.epoch_start_decay, 0.0))
        lr = np.ones((device_count), dtype='float32') * new_lr
        return lr

    def prepare_input(batch,
                      init_hidden=None,
                      init_cell=None,
                      epoch_id=0,
                      with_lr=True,
                      device_count=1):
        x, y = batch
        batch_size = x.shape[0]
        x = x.reshape((-1, config.num_steps, 1))
        y = y.reshape((-1, 1))
        if args.rnn_model == "lod":
            x = to_lodtensor(x.reshape((-1, 1)), place, [
                range(0, (batch_size + 1) * config.num_steps, config.num_steps)
            ])
            y = to_lodtensor(y.reshape((-1, 1)), place, [
                range(0, (batch_size + 1) * config.num_steps, config.num_steps)
            ])
        res = {}
        res['x'] = x
        res['y'] = y
        if init_hidden is not None:
            res['init_hidden'] = init_hidden
        if init_cell is not None:
            res['init_cell'] = init_cell
        if with_lr:
            res['learning_rate'] = generate_new_lr(epoch_id, device_count)

        return res

    def eval(data):
        # NOTE: eval here reuses the training batch_size and num_steps
        eval_data_iter = reader.get_data_iter(data, config.batch_size,
                                              config.num_steps)
        total_loss = 0.0
        iters = 0
        init_hidden, init_cell = generate_init_data()
        for batch_id, batch in enumerate(eval_data_iter):
            input_data_feed = prepare_input(
                batch, init_hidden, init_cell, epoch_id=0, with_lr=False)
            fetch_outs = exe.run(
                program=inference_program,
                feed=input_data_feed,
                fetch_list=[loss.name, last_hidden.name, last_cell.name],
                use_program_cache=False)

            cost_eval = np.array(fetch_outs[0])
            init_hidden = np.array(fetch_outs[1])
            init_cell = np.array(fetch_outs[2])

            total_loss += cost_eval
            iters += config.num_steps

        ppl = np.exp(total_loss / iters)
        return ppl

    def get_log_interval(data_len):
        num_batches = data_len // config.batch_size
        epoch_size = (num_batches - 1) // config.num_steps
        log_interval = max(1, epoch_size // 10)
        return log_interval

    def train_an_epoch(epoch_id, batch_times):
        # get train epoch size
        log_interval = get_log_interval(len(train_data))
        train_data_iter = reader.get_data_iter(train_data, config.batch_size,
                                               config.num_steps)

        total_loss = 0
        iters = 0

        init_hidden, init_cell = generate_init_data()
        for batch_id, batch in enumerate(train_data_iter):
            input_data_feed = prepare_input(
                batch,
                init_hidden=init_hidden,
                init_cell=init_cell,
                epoch_id=epoch_id,
                with_lr=True,
                device_count=device_count)
            batch_start_time = time.time()
            fetch_outs = exe.run(train_program,
                                 feed=input_data_feed,
                                 fetch_list=[
                                     loss.name, "learning_rate",
                                     last_hidden.name, last_cell.name
                                 ],
                                 use_program_cache=True)
            batch_time = time.time() - batch_start_time
            batch_times.append(batch_time)

            cost_train = np.array(fetch_outs[0])
            lr = np.array(fetch_outs[1])
            init_hidden = np.array(fetch_outs[2])
            init_cell = np.array(fetch_outs[3])

            total_loss += cost_train
            iters += config.num_steps
            if batch_id > 0 and batch_id % log_interval == 0:
                ppl = np.exp(total_loss / iters)
                print(
                    "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                    % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))
        ppl = np.exp(total_loss / iters)
        return ppl

    def train_an_epoch_dataloader(epoch_id, batch_times):
        # get train epoch size
        log_interval = get_log_interval(len(train_data))

        init_hidden, init_cell = generate_init_data()

        total_loss = 0
        iters = 0

        dataloader.start()
        batch_id = 0
        try:
            while True:
                data_feeds = {}
                if batch_id == 0:
                    batch_time = 0
                    batch_start_time = time.time()
                else:
                    batch_time = time.time() - batch_start_time
                    batch_times.append(batch_time)
                    batch_start_time = time.time()

                new_lr = generate_new_lr(epoch_id, device_count)
                data_feeds['learning_rate'] = new_lr
                data_feeds["init_hidden"] = init_hidden
                data_feeds["init_cell"] = init_cell

                fetch_outs = exe.run(train_program,
                                     feed=data_feeds,
                                     fetch_list=[
                                         loss.name, "learning_rate",
                                         last_hidden.name, last_cell.name
                                     ],
                                     use_program_cache=True)

                cost_train = np.array(fetch_outs[0])
                lr = np.array(fetch_outs[1])
                init_hidden = np.array(fetch_outs[2])
                init_cell = np.array(fetch_outs[3])

                total_loss += cost_train
                iters += config.num_steps
                if batch_id > 0 and (log_interval == 0 or
                                     batch_id % log_interval == 0):
                    ppl = np.exp(total_loss / iters)
                    print(
                        "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                        % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))

                batch_id += 1
        except fluid.core.EOFException:
            dataloader.reset()

        batch_times.append(time.time() - batch_start_time)
        ppl = np.exp(total_loss / iters)
        return ppl

    def train():
        if args.use_dataloader:
            def data_gen():
                data_iter_size = config.batch_size // device_count
                train_batches = reader.get_data_iter(train_data, data_iter_size,
                                                     config.num_steps)
                for batch in train_batches:
                    x, y = batch
                    x = x.reshape((-1, config.num_steps, 1))
                    y = y.reshape((-1, 1))
                    if args.rnn_model == "lod":
                        x = to_lodtensor(x.reshape((-1, 1)), place, [
                            range(0, (data_iter_size + 1) * config.num_steps,
                                  config.num_steps)
                        ])
                        y = to_lodtensor(y.reshape((-1, 1)), place, [
                            range(0, (data_iter_size + 1) * config.num_steps,
                                  config.num_steps)
                        ])

                    yield x, y

            dataloader.set_batch_generator(data_gen)

        total_time = 0.0
        for epoch_id in range(config.max_epoch):
            batch_times = []
            epoch_start_time = time.time()
            if args.use_dataloader:
                train_ppl = train_an_epoch_dataloader(epoch_id, batch_times)
            else:
                train_ppl = train_an_epoch(epoch_id, batch_times)
            epoch_time = time.time() - epoch_start_time
            total_time += epoch_time
            print(
                "\nTrain epoch:[%d]; epoch Time: %.5f; ppl: %.5f; avg_time: %.5f steps/s \n"
                % (epoch_id, epoch_time, train_ppl[0],
                   len(batch_times) / sum(batch_times)))

            # FIXME(zjl): ppl[0] increases as batch_size increases.
            # We should find a better way to calculate ppl by normalizing batch_size.
            if device_count == 1 and config.batch_size <= 20 and epoch_id == 0 and train_ppl[
                    0] > 1000:
                # a bad initialization can leave the loss above 1000 after the
                # first epoch, so there is no point in continuing
                print(
                    "Parameters are randomly initialized and not good this time because the loss is over 1000 after the first epoch."
                )
                print("Abort this training process and please start again.")
                return

            if epoch_id == config.max_epoch - 1 and args.enable_ce:
                # kpis
                print("ptblm\tlstm_language_model_%s_duration_card%d\t%s" %
                      (args.rnn_model, device_count,
                       total_time / config.max_epoch))
                print("ptblm\tlstm_language_model_%s_loss_card%d\t%s" %
                      (args.rnn_model, device_count, train_ppl[0]))

            # NOTE(zjl): sometimes there is not enough data for eval when batch_size is large, e.g., 2100.
            # Just skip eval in that case to avoid an error.
            def is_valid_data(data, batch_size, num_steps):
                data_len = len(data)
                batch_len = data_len // batch_size
                epoch_size = (batch_len - 1) // num_steps
                return epoch_size >= 1

            valid_data_valid = is_valid_data(valid_data, config.batch_size,
                                             config.num_steps)
            if valid_data_valid:
                valid_ppl = eval(valid_data)
                print("Valid ppl: %.5f" % valid_ppl[0])
            else:
                print(
                    'WARNING: length of valid_data is {}, which is not enough for batch_size {} and num_steps {}'.
                    format(
                        len(valid_data), config.batch_size, config.num_steps))

            save_model_dir = os.path.join(args.save_model_dir,
                                          str(epoch_id), "params")
            fluid.save(main_program, save_model_dir)
            print("Saved model to: %s.\n" % save_model_dir)

    with profile_context(args.profile):
        train()

    test_ppl = eval(test_data)
    print("Test ppl:", test_ppl[0])
Example #3
def train_ptb_lm():
    args = parse_args()

    # check whether use_gpu=True was set in the CPU-only paddlepaddle build
    model_check.check_cuda(args.use_gpu)

    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()

    # check if paddlepaddle version is satisfied
    model_check.check_version()

    model_type = args.model_type

    vocab_size = 10000
    if model_type == "test":
        num_layers = 1
        batch_size = 2
        hidden_size = 10
        num_steps = 3
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 1
        max_epoch = 1
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "small":
        num_layers = 2
        batch_size = 20
        hidden_size = 200
        num_steps = 20
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 4
        max_epoch = 13
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "medium":
        num_layers = 2
        batch_size = 20
        hidden_size = 650
        num_steps = 35
        init_scale = 0.05
        max_grad_norm = 5.0
        epoch_start_decay = 6
        max_epoch = 39
        dropout = 0.5
        lr_decay = 0.8
        base_learning_rate = 1.0
    elif model_type == "large":
        num_layers = 2
        batch_size = 20
        hidden_size = 1500
        num_steps = 35
        init_scale = 0.04
        max_grad_norm = 10.0
        epoch_start_decay = 14
        max_epoch = 55
        dropout = 0.65
        lr_decay = 1.0 / 1.15
        base_learning_rate = 1.0
    else:
        print("model type not support")
        return

    with fluid.dygraph.guard(place):
        if args.ce:
            print("ce mode")
            seed = 33
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            max_epoch = 1
        ptb_model = PtbModel(
            hidden_size=hidden_size,
            vocab_size=vocab_size,
            num_layers=num_layers,
            num_steps=num_steps,
            init_scale=init_scale,
            dropout=dropout)

        if args.init_from_pretrain_model:
            if not os.path.exists(args.init_from_pretrain_model + '.pdparams'):
                print(args.init_from_pretrain_model)
                raise IOError("The pretrained params do not exist.")
            # load_dygraph returns (param_dict, optimizer_dict); apply the params
            param_dict, _ = fluid.load_dygraph(args.init_from_pretrain_model)
            ptb_model.set_dict(param_dict)
            print("finished initializing model from pretrained params at %s" %
                  args.init_from_pretrain_model)

        dy_param_updated = dict()
        dy_param_init = dict()
        dy_loss = None
        last_hidden = None
        last_cell = None

        data_path = args.data_path
        print("begin to load data")
        ptb_data = reader.get_ptb_data(data_path)
        print("finished load data")
        train_data, valid_data, test_data = ptb_data

        batch_len = len(train_data) // batch_size
        total_batch_size = (batch_len - 1) // num_steps
        log_interval = 200

        bd = []
        lr_arr = [base_learning_rate]
        for i in range(1, max_epoch):
            bd.append(total_batch_size * i)
            new_lr = base_learning_rate * (lr_decay**
                                           max(i + 1 - epoch_start_decay, 0.0))
            lr_arr.append(new_lr)

        sgd = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
            boundaries=bd, values=lr_arr), parameter_list=ptb_model.parameters())

        def eval(model, data):
            print("begin to eval")
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')
            init_cell_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')

            model.eval()
            train_data_iter = reader.get_data_iter(data, batch_size, num_steps)
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, num_steps, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                init_hidden = to_variable(init_hidden_data)
                init_cell = to_variable(init_cell_data)
                dy_loss, last_hidden, last_cell = model(x, y, init_hidden,
                                                        init_cell)

                out_loss = dy_loss.numpy()

                init_hidden_data = last_hidden.numpy()
                init_cell_data = last_cell.numpy()

                total_loss += out_loss
                iters += num_steps

            print("eval finished")
            ppl = np.exp(total_loss / iters)
            print("ppl ", batch_id, ppl[0])
            if args.ce:
                print("kpis\ttest_ppl\t%0.3f" % ppl[0])

        grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(max_grad_norm)
        
        for epoch_id in range(max_epoch):
            ptb_model.train()
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')
            init_cell_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')

            train_data_iter = reader.get_data_iter(train_data, batch_size,
                                                   num_steps)
            init_hidden = to_variable(init_hidden_data)
            init_cell = to_variable(init_cell_data)
            start_time = time.time()
            start = time.time()
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch

                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, num_steps, 1))

                x = to_variable(x_data)
                y = to_variable(y_data)

                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
                                                            init_cell)
                init_hidden = last_hidden
                init_cell = last_cell
                init_hidden.stop_gradient = True
                init_cell.stop_gradient = True
                out_loss = dy_loss.numpy()

                dy_loss.backward()
                sgd.minimize(dy_loss, grad_clip=grad_clip)

                ptb_model.clear_gradients()
                total_loss += out_loss
                iters += num_steps

                if batch_id > 0 and batch_id % log_interval == 0:
                    ppl = np.exp(total_loss / iters)
                    print("-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f" %
                          (epoch_id, batch_id, ppl[0],
                           sgd._global_learning_rate().numpy(), out_loss))

            end = time.time()
            print("One epoch cost {}".format(end - start))
            print("one epoch finished", epoch_id)
            print("time cost ", time.time() - start_time)
            ppl = np.exp(total_loss / iters)
            print("-- Epoch:[%d]; ppl: %.5f" % (epoch_id, ppl[0]))

            if batch_size <= 20 and epoch_id == 0 and ppl[0] > 1000:
                # a bad initialization can leave the loss above 1000 after the
                # first epoch, so there is no point in continuing
                print("Parameters are randomly initialized and not good this time because the loss is over 1000 after the first epoch.")
                print("Abort this training process and please start again.")
                return 

            if args.ce:
                print("kpis\ttrain_ppl\t%0.3f" % ppl[0])
            save_model_dir = os.path.join(args.save_model_dir,
                                          str(epoch_id), 'params')
            fluid.save_dygraph(ptb_model.state_dict(), save_model_dir)
            print("Saved model to: %s.\n" % save_model_dir)

            eval(ptb_model, valid_data)

        eval(ptb_model, test_data)
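
For reference, the `bd`/`lr_arr` loop above implements "hold the base rate until epoch_start_decay, then decay geometrically". A standalone check with the "small" config (total_batch_size is illustrative; the real value depends on the data):

base_learning_rate, lr_decay, epoch_start_decay, max_epoch = 1.0, 0.5, 4, 13
total_batch_size = 1000  # illustrative only

bd, lr_arr = [], [base_learning_rate]
for i in range(1, max_epoch):
    bd.append(total_batch_size * i)
    lr_arr.append(base_learning_rate *
                  lr_decay ** max(i + 1 - epoch_start_decay, 0.0))

print(lr_arr[:7])  # [1.0, 1.0, 1.0, 1.0, 0.5, 0.25, 0.125]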
Example #4
def train_ptb_lm():
    args = parse_args()

    # check whether use_gpu=True was set in the CPU-only paddlepaddle build
    model_check.check_cuda(args.use_gpu)
    # check if paddlepaddle version is satisfied
    model_check.check_version()

    model_type = args.model_type

    vocab_size = 37484
    if model_type == "test":
        num_layers = 1
        batch_size = 2
        hidden_size = 10
        num_steps = 4
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 1
        max_epoch = 1
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "small":
        num_layers = 2
        batch_size = 20
        hidden_size = 200
        num_steps = 20
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 4
        max_epoch = 2
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "gru4rec":
        num_layers = 1
        batch_size = 500
        hidden_size = 100
        num_steps = 10
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 10
        max_epoch = 5
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 0.05
    elif model_type == "medium":
        num_layers = 2
        batch_size = 20
        hidden_size = 650
        num_steps = 35
        init_scale = 0.05
        max_grad_norm = 5.0
        epoch_start_decay = 6
        max_epoch = 39
        dropout = 0.5
        lr_decay = 0.8
        base_learning_rate = 1.0
    elif model_type == "large":
        num_layers = 2
        batch_size = 20
        hidden_size = 1500
        num_steps = 35
        init_scale = 0.04
        max_grad_norm = 10.0
        epoch_start_decay = 14
        max_epoch = 55
        dropout = 0.65
        lr_decay = 1.0 / 1.15
        base_learning_rate = 1.0
    else:
        print("model type not support")
        return

    with fluid.dygraph.guard(core.CUDAPlace(0)):
        if args.ce:
            print("ce mode")
            seed = 33
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            max_epoch = 1
        ptb_model = PtbModel("ptb_model",
                             hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale,
                             dropout=dropout)

        if args.init_from_pretrain_model:
            if not os.path.exists(args.init_from_pretrain_model + '.pdparams'):
                print(args.init_from_pretrain_model)
                raise IOError("The pretrained params do not exist.")
            # load_dygraph returns (param_dict, optimizer_dict); apply the params
            param_dict, _ = fluid.load_dygraph(args.init_from_pretrain_model)
            ptb_model.set_dict(param_dict)
            print("finished initializing model from pretrained params at %s" %
                  args.init_from_pretrain_model)

        dy_param_updated = dict()
        dy_param_init = dict()
        dy_loss = None
        last_hidden = None

        data_path = args.data_path
        print("begin to load data")
        ptb_data = reader.get_ptb_data(data_path)
        print("finished load data")
        train_data, valid_data, test_data = ptb_data

        batch_len = len(train_data) // batch_size
        total_batch_size = (batch_len - 1) // num_steps
        print("total_batch_size:", total_batch_size)
        log_interval = total_batch_size // 20

        bd = []
        lr_arr = [base_learning_rate]
        for i in range(1, max_epoch):
            bd.append(total_batch_size * i)
            new_lr = base_learning_rate * (lr_decay**max(
                i + 1 - epoch_start_decay, 0.0))
            lr_arr.append(new_lr)

        sgd = AdagradOptimizer(parameter_list=ptb_model.parameters(),
                               learning_rate=fluid.layers.piecewise_decay(
                                   boundaries=bd, values=lr_arr))

        print("parameters:--------------------------------")
        for para in ptb_model.parameters():
            print(para.name)
        print("parameters:--------------------------------")

        def eval(model, data):
            print("begion to eval")
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')

            model.eval()
            train_data_iter = reader.get_data_iter(data, batch_size, num_steps)
            init_hidden = to_variable(init_hidden_data)
            accum_num_recall = 0.0
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, num_steps, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                dy_loss, last_hidden, acc = model(x, y, init_hidden)

                out_loss = dy_loss.numpy()
                acc_ = acc.numpy()[0]
                accum_num_recall += acc_
                # batch_id % 1 == 0 was always true, so log every batch
                print("batch_id:%d  recall@20:%.4f" %
                      (batch_id, accum_num_recall / (batch_id + 1)))

                init_hidden = last_hidden

                total_loss += out_loss
                iters += num_steps

            print("eval finished")
            ppl = np.exp(total_loss / iters)
            print("recall@20 ", accum_num_recall / (batch_id + 1))
            if args.ce:
                print("kpis\ttest_ppl\t%0.3f" % ppl[0])

        grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(max_grad_norm)
        for epoch_id in range(max_epoch):
            ptb_model.train()
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')

            train_data_iter = reader.get_data_iter(train_data, batch_size,
                                                   num_steps)
            init_hidden = to_variable(init_hidden_data)

            start_time = time.time()
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, num_steps, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                dy_loss, last_hidden, acc = ptb_model(x, y, init_hidden)

                out_loss = dy_loss.numpy()
                acc_ = acc.numpy()[0]

                init_hidden = last_hidden
                dy_loss.backward()
                sgd.minimize(dy_loss, grad_clip=grad_clip)
                ptb_model.clear_gradients()
                total_loss += out_loss
                iters += num_steps

                if batch_id > 0 and batch_id % 100 == 1:
                    ppl = np.exp(total_loss / iters)
                    print(
                        "-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, acc: %.5f, lr: %.5f"
                        % (epoch_id, batch_id, ppl[0], acc_,
                           sgd._global_learning_rate().numpy()))

            print("one ecpoh finished", epoch_id)
            print("time cost ", time.time() - start_time)
            ppl = np.exp(total_loss / iters)
            print("-- Epoch:[%d]; ppl: %.5f" % (epoch_id, ppl[0]))
            if args.ce:
                print("kpis\ttrain_ppl\t%0.3f" % ppl[0])
            save_model_dir = os.path.join(args.save_model_dir, str(epoch_id),
                                          'params')
            fluid.save_dygraph(ptb_model.state_dict(), save_model_dir)
            print("Saved model to: %s.\n" % save_model_dir)
            eval(ptb_model, test_data)
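
The gru4rec variant in Example #4 fetches an `acc` output and reports it as recall@20. The model code is not shown, so as an assumption about what that metric computes, here is a self-contained numpy sketch:

import numpy as np

def recall_at_k(logits, labels, k=20):
    # logits: (num_examples, vocab_size) scores; labels: (num_examples,) ids.
    # A prediction counts as a hit when the label appears in the top-k scores.
    topk = np.argsort(-logits, axis=1)[:, :k]
    hits = (topk == labels.reshape(-1, 1)).any(axis=1)
    return hits.mean()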
Example #5
def train_ptb_lm():
    args = parse_args()

    # check whether use_gpu=True was set in the CPU-only paddlepaddle build
    model_check.check_cuda(args.use_gpu)

    if args.use_gpu:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    # check if paddlepaddle version is satisfied
    model_check.check_version()

    model_type = args.model_type

    vocab_size = 10000
    if model_type == "test":
        num_layers = 1
        batch_size = 2
        hidden_size = 10
        num_steps = 3
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 1
        max_epoch = 1
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "small":
        num_layers = 2
        batch_size = 20
        hidden_size = 200
        num_steps = 20
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 4
        max_epoch = 13
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "medium":
        num_layers = 2
        batch_size = 20
        hidden_size = 650
        num_steps = 35
        init_scale = 0.05
        max_grad_norm = 5.0
        epoch_start_decay = 6
        max_epoch = 39
        dropout = 0.5
        lr_decay = 0.8
        base_learning_rate = 1.0
    elif model_type == "large":
        num_layers = 2
        batch_size = 20
        hidden_size = 1500
        num_steps = 35
        init_scale = 0.04
        max_grad_norm = 10.0
        epoch_start_decay = 14
        max_epoch = 55
        dropout = 0.65
        lr_decay = 1.0 / 1.15
        base_learning_rate = 1.0
    else:
        print("model type not support")
        return

    with fluid.dygraph.guard(place):
        if args.ce:
            print("ce mode")
            seed = 33
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            max_epoch = 1
        ptb_model = PtbModel(hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale,
                             dropout=dropout)

        if args.init_from_pretrain_model:
            if not os.path.exists(args.init_from_pretrain_model + '.pdparams'):
                print(args.init_from_pretrain_model)
                raise IOError("The pretrained params do not exist.")
            # load_dygraph returns (param_dict, optimizer_dict); apply the params
            param_dict, _ = fluid.load_dygraph(args.init_from_pretrain_model)
            ptb_model.set_dict(param_dict)
            print("finished initializing model from pretrained params at %s" %
                  args.init_from_pretrain_model)

        dy_param_updated = dict()
        dy_param_init = dict()
        dy_loss = None
        last_hidden = None
        last_cell = None

        data_path = args.data_path
        print("begin to load data")
        ptb_data = reader.get_ptb_data(data_path)
        print("finished load data")
        train_data, valid_data, test_data = ptb_data

        batch_len = len(train_data) // batch_size
        total_batch_size = (batch_len - 1) // num_steps
        log_interval = 200

        bd = []
        lr_arr = [base_learning_rate]
        for i in range(1, max_epoch):
            bd.append(total_batch_size * i)
            new_lr = base_learning_rate * (lr_decay**max(
                i + 1 - epoch_start_decay, 0.0))
            lr_arr.append(new_lr)

        grad_clip = fluid.clip.GradientClipByGlobalNorm(max_grad_norm)
        sgd = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
            boundaries=bd, values=lr_arr),
                           parameter_list=ptb_model.parameters(),
                           grad_clip=grad_clip)

        def reader_decorator(reader):
            def __reader__():
                for item in reader:
                    x_data = item[0].reshape((-1, num_steps, 1))
                    y_data = item[1].reshape((-1, num_steps, 1))
                    yield x_data, y_data

            return __reader__

        def eval(model, data):
            print("begin to eval")
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='float32')

            model.eval()
            train_data_iter = reader_decorator(
                reader.get_data_iter(data, batch_size, num_steps))

            eval_data_loader = fluid.io.DataLoader.from_generator(capacity=200)
            eval_data_loader.set_batch_generator(train_data_iter, places=place)

            for batch_id, batch in enumerate(eval_data_loader):
                x, y = batch
                init_hidden = to_variable(init_hidden_data)
                init_cell = to_variable(init_cell_data)
                dy_loss, last_hidden, last_cell = model(
                    x, y, init_hidden, init_cell)

                out_loss = dy_loss.numpy()

                init_hidden_data = last_hidden.numpy()
                init_cell_data = last_cell.numpy()

                total_loss += out_loss
                iters += num_steps

            print("eval finished")
            ppl = np.exp(total_loss / iters)
            print("ppl ", batch_id, ppl[0])

        ce_time = []
        ce_ppl = []

        total_batch_num = 0  #this is for benchmark
        for epoch_id in range(max_epoch):
            epoch_start = time.time()

            ptb_model.train()
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='float32')

            train_data_iter = reader_decorator(
                reader.get_data_iter(train_data, batch_size, num_steps))

            train_data_loader = fluid.io.DataLoader.from_generator(
                capacity=200)
            train_data_loader.set_batch_generator(train_data_iter,
                                                  places=place)

            init_hidden = to_variable(init_hidden_data)
            init_cell = to_variable(init_cell_data)

            batch_cost_avg = TimeCostAverage()
            reader_cost_avg = TimeCostAverage()

            batch_start = time.time()
            for batch_id, batch in enumerate(train_data_loader):
                if args.max_iter and total_batch_num == args.max_iter:
                    return

                train_reader_cost = time.time() - batch_start
                reader_cost_avg.record(train_reader_cost)

                x, y = batch

                dy_loss, last_hidden, last_cell = ptb_model(
                    x, y, init_hidden, init_cell)
                init_hidden = last_hidden.detach()
                init_cell = last_cell.detach()
                out_loss = dy_loss.numpy()

                dy_loss.backward()
                sgd.minimize(dy_loss)
                ptb_model.clear_gradients()

                global_lr = sgd._global_learning_rate().numpy()
                total_loss += out_loss
                iters += num_steps
                total_batch_num = total_batch_num + 1  #this is for benchmark

                train_batch_cost = time.time() - batch_start
                batch_cost_avg.record(train_batch_cost)

                if batch_id > 0 and batch_id % log_interval == 0:
                    ppl = np.exp(total_loss / iters)
                    print(
                        "-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f, batch_cost: %.5f sec, reader_cost: %.5f sec, ips: %.5f words/sec"
                        % (epoch_id, batch_id, ppl[0], global_lr, out_loss,
                           batch_cost_avg.get_average(),
                           reader_cost_avg.get_average(),
                           batch_size / batch_cost_avg.get_average()))
                    batch_cost_avg.reset()
                    reader_cost_avg.reset()
                batch_start = time.time()

            ppl = np.exp(total_loss / iters)
            train_epoch_cost = time.time() - epoch_start
            print("-- Epoch:[%d]; ppl: %.5f, epoch_cost: %.5f s" %
                  (epoch_id, ppl[0], train_epoch_cost))

            ce_time.append(train_epoch_cost)
            ce_ppl.append(ppl[0])

            if batch_size <= 20 and epoch_id == 0 and ppl[0] > 1000:
                # a bad initialization can leave the loss above 1000 after the
                # first epoch, so there is no point in continuing
                print(
                    "Parameters are randomly initialized and not good this time because the loss is over 1000 after the first epoch."
                )
                print("Abort this training process and please start again.")
                return

            save_model_dir = os.path.join(args.save_model_dir, str(epoch_id),
                                          'params')
            fluid.save_dygraph(ptb_model.state_dict(), save_model_dir)
            print("Saved model to: %s.\n" % save_model_dir)

            eval(ptb_model, valid_data)

        if args.ce:
            _ppl = 0
            _time = 0
            try:
                _time = ce_time[-1]
                _ppl = ce_ppl[-1]
            except IndexError:
                print("ce info error")
            print("kpis\ttrain_duration_card%s\t%s" % (dev_count, _time))
            print("kpis\ttrain_ppl_card%s\t%f" % (dev_count, _ppl))

        eval(ptb_model, test_data)
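
Example #5 times its loop with a `TimeCostAverage` helper that is not shown. Here is a minimal sketch matching the record/get_average/reset calls above (assumed, not the actual benchmark utility):

class TimeCostAverage(object):
    def __init__(self):
        self.reset()

    def reset(self):
        # Drop all recorded samples.
        self.cnt = 0
        self.total_time = 0.0

    def record(self, usetime):
        # Accumulate one timing sample, in seconds.
        self.cnt += 1
        self.total_time += usetime

    def get_average(self):
        # Mean of the recorded samples; 0 if nothing was recorded yet.
        return self.total_time / self.cnt if self.cnt else 0.0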
Example #6
def main():
    args = parse_args()
    print(args)
    train_data, _, _ = reader.get_ptb_data(args.data_path)
    train_ptb_model(train_data, use_dygraph=args.use_dygraph)
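
All six examples start from a `parse_args()` that is never shown. Here is a hypothetical reconstruction inferred from the attribute accesses above, covering the dygraph scripts (flag names match the code; defaults are guesses):

import argparse

def parse_args():
    parser = argparse.ArgumentParser("PTB language model")
    parser.add_argument("--model_type", type=str, default="small",
                        help="test | small | medium | large (Example #4 adds gru4rec)")
    parser.add_argument("--data_path", type=str, default="data/simple-examples/data")
    parser.add_argument("--save_model_dir", type=str, default="models")
    parser.add_argument("--init_from_pretrain_model", type=str, default=None)
    parser.add_argument("--use_gpu", action="store_true")
    parser.add_argument("--ce", action="store_true",
                        help="continuous-evaluation mode: fixed seed, one epoch")
    parser.add_argument("--max_iter", type=int, default=0,
                        help="stop after this many batches (0 = no limit)")
    parser.add_argument("--use_dygraph", action="store_true")
    return parser.parse_args()

Example #2's static-graph script reads further flags (--rnn_model, --use_dataloader, --parallel, --log_path, --enable_ce, --profile) that would be added the same way.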