def train_ptb_lm():
    """Train the PTB LSTM language model in dygraph mode and evaluate it.

    Hyper-parameters are selected by ``args.model_type`` ("test", "small",
    "medium" or "large"); any other value prints a message and returns.
    Training runs on CUDA device 0 (hard-coded), uses SGD with a piecewise
    learning-rate decay and global-norm gradient clipping, and finishes with
    one evaluation pass over the test split.

    When ``args.ce`` is set, random seeds are fixed, training is limited to a
    single epoch and ``kpis`` lines are printed.
    """
    args = parse_args()
    model_type = args.model_type

    vocab_size = 10000
    # Field order: num_layers, batch_size, hidden_size, num_steps, init_scale,
    # max_grad_norm, epoch_start_decay, max_epoch, dropout, lr_decay,
    # base_learning_rate
    presets = {
        "test": (1, 2, 10, 3, 0.1, 5.0, 1, 1, 0.0, 0.5, 1.0),
        "small": (2, 20, 200, 20, 0.1, 5.0, 4, 13, 0.0, 0.5, 1.0),
        "medium": (2, 20, 650, 35, 0.05, 5.0, 6, 39, 0.5, 0.8, 1.0),
        "large": (2, 20, 1500, 35, 0.04, 10.0, 14, 55, 0.65, 1.0 / 1.15, 1.0),
    }
    if model_type not in presets:
        print("model type not support")
        return
    (num_layers, batch_size, hidden_size, num_steps, init_scale,
     max_grad_norm, epoch_start_decay, max_epoch, dropout, lr_decay,
     base_learning_rate) = presets[model_type]

    with fluid.dygraph.guard(core.CUDAPlace(0)):
        if args.ce:
            print("ce mode")
            seed = 33
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            max_epoch = 1
        ptb_model = PtbModel(
            "ptb_model",
            hidden_size=hidden_size,
            vocab_size=vocab_size,
            num_layers=num_layers,
            num_steps=num_steps,
            init_scale=init_scale,
            dropout=dropout)

        data_path = args.data_path
        print("begin to load data")
        ptb_data = reader.get_ptb_data(data_path)
        print("finished load data")
        train_data, valid_data, test_data = ptb_data

        batch_len = len(train_data) // batch_size
        total_batch_size = (batch_len - 1) // num_steps
        # Clamp to 1 so that `batch_id % log_interval` cannot divide by zero
        # when an epoch has fewer than 20 batches.
        log_interval = max(1, total_batch_size // 20)

        # Piecewise schedule: one boundary per epoch; the rate starts decaying
        # once the (1-based) epoch index exceeds epoch_start_decay.
        bd = []
        lr_arr = [base_learning_rate]
        for i in range(1, max_epoch):
            bd.append(total_batch_size * i)
            new_lr = base_learning_rate * (lr_decay**max(
                i + 1 - epoch_start_decay, 0.0))
            lr_arr.append(new_lr)

        sgd = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
            boundaries=bd, values=lr_arr))

        def evaluate(model, data):
            """Run `model` over `data` in eval mode and print its perplexity."""
            print("begion to eval")
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')
            init_cell_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')

            model.eval()
            data_iter = reader.get_data_iter(data, batch_size, num_steps)
            for batch_id, batch in enumerate(data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                init_hidden = to_variable(init_hidden_data)
                init_cell = to_variable(init_cell_data)
                # Use the model that was passed in, not the enclosing one.
                dy_loss, last_hidden, last_cell = model(x, y, init_hidden,
                                                        init_cell)

                out_loss = dy_loss.numpy()

                # Carry the recurrent state over to the next batch.
                init_hidden_data = last_hidden.numpy()
                init_cell_data = last_cell.numpy()

                total_loss += out_loss
                iters += num_steps

            print("eval finished")
            if iters == 0:
                # No batch was produced; perplexity is undefined.
                print("no batch to eval, skip ppl")
                return
            ppl = np.exp(total_loss / iters)
            print("ppl ", batch_id, ppl[0])
            if args.ce:
                print("kpis\ttest_ppl\t%0.3f" % ppl[0])

        grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(max_grad_norm)
        for epoch_id in range(max_epoch):
            ptb_model.train()
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')
            init_cell_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')

            train_data_iter = reader.get_data_iter(train_data, batch_size,
                                                   num_steps)

            start_time = time.time()
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                init_hidden = to_variable(init_hidden_data)
                init_cell = to_variable(init_cell_data)
                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
                                                            init_cell)

                out_loss = dy_loss.numpy()
                # Round-trip the state through numpy so the next batch starts
                # from plain values rather than from this batch's outputs.
                init_hidden_data = last_hidden.numpy()
                init_cell_data = last_cell.numpy()
                dy_loss.backward()
                sgd.minimize(dy_loss, grad_clip=grad_clip)
                ptb_model.clear_gradients()
                total_loss += out_loss
                iters += num_steps

                if batch_id > 0 and batch_id % log_interval == 0:
                    ppl = np.exp(total_loss / iters)
                    print(epoch_id, "ppl ", batch_id, ppl[0],
                          sgd._global_learning_rate().numpy())

            print("one ecpoh finished", epoch_id)
            print("time cost ", time.time() - start_time)
            ppl = np.exp(total_loss / iters)
            print("ppl ", epoch_id, ppl[0])
            if args.ce:
                print("kpis\ttrain_ppl\t%0.3f" % ppl[0])

        evaluate(ptb_model, test_data)
def main():
    """Build, train and evaluate the static-graph LSTM language model.

    Steps, in order:
      1. Parse arguments, check the CUDA / Paddle version and set up the
         "lm" logger (to ``args.log_path`` if given, else to the console).
      2. Build the training program (model, global-norm gradient clipping,
         SGD driven by a fed ``learning_rate`` variable) and a separate
         inference program cloned with ``for_test=True``.
      3. Optionally load pretrained parameters, then compile the training
         program (data-parallel when ``args.parallel``).
      4. Train for ``config.max_epoch`` epochs, validating and saving after
         each epoch, and finally print the test perplexity.

    Raises:
        Warning: if ``args.init_from_pretrain_model`` is set but the
            corresponding ``.pdparams`` file does not exist.
    """
    args = parse_args()

    # check if set use_gpu=True in paddlepaddle cpu version
    check_cuda(args.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    logger = logging.getLogger("lm")
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    if args.log_path:
        file_handler = logging.FileHandler(args.log_path)
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    else:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    logger.info('Running with args : {}'.format(args))

    config = RNNConfig(args)

    # define train program
    main_program = fluid.Program()
    startup_program = fluid.Program()
    if args.enable_ce:
        startup_program.random_seed = SEED
    with fluid.program_guard(main_program, startup_program):
        with fluid.unique_name.guard():
            res_vars = lm_model.lm_model(
                config.hidden_size,
                config.vocab_size,
                config.batch_size,
                num_layers=config.num_layers,
                num_steps=config.num_steps,
                init_scale=config.init_scale,
                dropout=config.dropout,
                rnn_model=config.rnn_model,
                use_dataloader=args.use_dataloader)

            if args.use_dataloader:
                # In dataloader mode the loader is returned as the last item.
                dataloader = res_vars[-1]
                res_vars = res_vars[:-1]
            loss, last_hidden, last_cell, feed_order = res_vars

            fluid.clip.set_gradient_clip(
                clip=fluid.clip.GradientClipByGlobalNorm(
                    clip_norm=config.max_grad_norm))

            # The learning rate is a persistable variable that is fed a new
            # value on every step (see generate_new_lr).
            learning_rate = fluid.layers.create_global_var(
                name="learning_rate",
                shape=[1],
                value=1.0,
                dtype='float32',
                persistable=True)

            optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
            optimizer.minimize(loss)

    # define inference program
    inference_program = fluid.Program()
    inference_startup_program = fluid.Program()
    with fluid.program_guard(inference_program, inference_startup_program):
        with fluid.unique_name.guard():
            lm_model.lm_model(
                config.hidden_size,
                config.vocab_size,
                config.batch_size,
                num_layers=config.num_layers,
                num_steps=config.num_steps,
                init_scale=config.init_scale,
                dropout=config.dropout,
                rnn_model=config.rnn_model,
                use_dataloader=False)
    # Some op behaves differently for train and inference, we need to call
    # this clone function to ensure every op is right for inference.
    inference_program = inference_program.clone(for_test=True)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = Executor(place)
    exe.run(startup_program)

    if args.init_from_pretrain_model:
        if not os.path.exists(args.init_from_pretrain_model + '.pdparams'):
            print(args.init_from_pretrain_model)
            raise Warning("The pretrained params do not exist.")
        fluid.load(main_program, args.init_from_pretrain_model)
        print("finish initing model from pretrained params from %s" %
              (args.init_from_pretrain_model))

    device_count = len(fluid.cuda_places()) if args.use_gpu else len(
        fluid.cpu_places())

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = device_count
    exec_strategy.num_iteration_per_drop_scope = 100

    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = True

    if args.parallel:
        train_program = fluid.compiler.CompiledProgram(
            main_program).with_data_parallel(
                loss_name=loss.name,
                build_strategy=build_strategy,
                exec_strategy=exec_strategy)
    else:
        train_program = fluid.compiler.CompiledProgram(main_program)

    data_path = args.data_path
    print("begin to load data")
    ptb_data = reader.get_ptb_data(data_path)
    print("finished load data")
    train_data, valid_data, test_data = ptb_data

    def generate_init_data():
        """Return zero-filled (init_hidden, init_cell) arrays for the RNN."""
        if args.rnn_model == "lod":
            # The "lod" model takes batch-major state.
            state_shape = (config.batch_size, config.num_layers,
                           config.hidden_size)
        else:
            state_shape = (config.num_layers, config.batch_size,
                           config.hidden_size)
        init_hidden = np.zeros(state_shape, dtype='float32')
        init_cell = np.zeros(state_shape, dtype='float32')
        return init_hidden, init_cell

    def generate_new_lr(epoch_id=0, device_count=1):
        """Return the decayed learning rate, replicated once per device."""
        new_lr = config.base_learning_rate * (config.lr_decay**max(
            epoch_id + 1 - config.epoch_start_decay, 0.0))
        lr = np.ones((device_count), dtype='float32') * new_lr
        return lr

    def prepare_input(batch,
                      init_hidden=None,
                      init_cell=None,
                      epoch_id=0,
                      with_lr=True,
                      device_count=1):
        """Build the feed dict for one (x, y) batch."""
        x, y = batch
        batch_size = x.shape[0]
        x = x.reshape((-1, config.num_steps, 1))
        y = y.reshape((-1, 1))

        if args.rnn_model == "lod":
            x = to_lodtensor(x.reshape((-1, 1)), place, [
                range(0, (batch_size + 1) * config.num_steps,
                      config.num_steps)
            ])
            y = to_lodtensor(y.reshape((-1, 1)), place, [
                range(0, (batch_size + 1) * config.num_steps,
                      config.num_steps)
            ])

        res = {}
        res['x'] = x
        res['y'] = y
        if init_hidden is not None:
            res['init_hidden'] = init_hidden
        if init_cell is not None:
            res['init_cell'] = init_cell
        if with_lr:
            res['learning_rate'] = generate_new_lr(epoch_id, device_count)

        return res

    def evaluate(data):
        """Return the perplexity of the inference program over `data`."""
        # Evaluation uses the same batch size as training (config.batch_size).
        eval_data_iter = reader.get_data_iter(data, config.batch_size,
                                              config.num_steps)
        total_loss = 0.0
        iters = 0

        init_hidden, init_cell = generate_init_data()
        for batch_id, batch in enumerate(eval_data_iter):
            input_data_feed = prepare_input(
                batch, init_hidden, init_cell, epoch_id=0, with_lr=False)
            fetch_outs = exe.run(
                program=inference_program,
                feed=input_data_feed,
                fetch_list=[loss.name, last_hidden.name, last_cell.name],
                use_program_cache=False)

            cost_eval = np.array(fetch_outs[0])
            # Feed the final state back in as the next batch's initial state.
            init_hidden = np.array(fetch_outs[1])
            init_cell = np.array(fetch_outs[2])

            total_loss += cost_eval
            iters += config.num_steps

        ppl = np.exp(total_loss / iters)
        return ppl

    def get_log_interval(data_len):
        """Return how many batches to wait between progress lines (>= 1)."""
        num_batchs = data_len // config.batch_size
        epoch_size = (num_batchs - 1) // config.num_steps
        log_interval = max(1, epoch_size // 10)
        return log_interval

    def train_an_epoch(epoch_id, batch_times):
        """Train one epoch from the python reader; return its perplexity.

        Per-batch wall-clock times are appended to `batch_times`.
        """
        # get train epoch size
        log_interval = get_log_interval(len(train_data))
        train_data_iter = reader.get_data_iter(train_data, config.batch_size,
                                               config.num_steps)

        total_loss = 0
        iters = 0

        init_hidden, init_cell = generate_init_data()
        for batch_id, batch in enumerate(train_data_iter):
            input_data_feed = prepare_input(
                batch,
                init_hidden=init_hidden,
                init_cell=init_cell,
                epoch_id=epoch_id,
                with_lr=True,
                device_count=device_count)

            batch_start_time = time.time()
            fetch_outs = exe.run(train_program,
                                 feed=input_data_feed,
                                 fetch_list=[
                                     loss.name, "learning_rate",
                                     last_hidden.name, last_cell.name
                                 ],
                                 use_program_cache=True)
            batch_time = time.time() - batch_start_time
            batch_times.append(batch_time)

            cost_train = np.array(fetch_outs[0])
            lr = np.array(fetch_outs[1])
            init_hidden = np.array(fetch_outs[2])
            init_cell = np.array(fetch_outs[3])

            total_loss += cost_train
            iters += config.num_steps
            if batch_id > 0 and batch_id % log_interval == 0:
                ppl = np.exp(total_loss / iters)
                print(
                    "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                    % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))

        ppl = np.exp(total_loss / iters)
        return ppl

    def train_an_epoch_dataloader(epoch_id, batch_times):
        """Train one epoch from the dataloader; return its perplexity.

        The loop ends when the executor raises EOFException. Per-batch
        wall-clock times are appended to `batch_times`.
        """
        # get train epoch size
        log_interval = get_log_interval(len(train_data))

        init_hidden, init_cell = generate_init_data()

        total_loss = 0
        iters = 0

        dataloader.start()
        batch_id = 0
        try:
            while True:
                data_feeds = {}
                if batch_id == 0:
                    batch_time = 0
                    batch_start_time = time.time()
                else:
                    batch_time = time.time() - batch_start_time
                    batch_times.append(batch_time)
                    batch_start_time = time.time()

                new_lr = generate_new_lr(epoch_id, device_count)
                data_feeds['learning_rate'] = new_lr
                data_feeds["init_hidden"] = init_hidden
                data_feeds["init_cell"] = init_cell

                fetch_outs = exe.run(train_program,
                                     feed=data_feeds,
                                     fetch_list=[
                                         loss.name, "learning_rate",
                                         last_hidden.name, last_cell.name
                                     ],
                                     use_program_cache=True)

                cost_train = np.array(fetch_outs[0])
                lr = np.array(fetch_outs[1])
                init_hidden = np.array(fetch_outs[2])
                init_cell = np.array(fetch_outs[3])

                total_loss += cost_train
                iters += config.num_steps
                # get_log_interval() never returns less than 1.
                if batch_id > 0 and batch_id % log_interval == 0:
                    ppl = np.exp(total_loss / iters)
                    print(
                        "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                        % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))

                batch_id += 1
        except fluid.core.EOFException:
            dataloader.reset()

        batch_times.append(time.time() - batch_start_time)
        ppl = np.exp(total_loss / iters)
        return ppl

    def is_valid_data(data, batch_size, num_steps):
        """Return True when `data` yields at least one full batch."""
        data_len = len(data)
        batch_len = data_len // batch_size
        epoch_size = (batch_len - 1) // num_steps
        return epoch_size >= 1

    def train():
        """Run the full training loop: train, validate and save each epoch."""
        if args.use_dataloader:

            def data_gen():
                # Each device receives an equal share of the batch.
                data_iter_size = config.batch_size // device_count
                train_batches = reader.get_data_iter(
                    train_data, data_iter_size, config.num_steps)
                for batch in train_batches:
                    x, y = batch
                    x = x.reshape((-1, config.num_steps, 1))
                    y = y.reshape((-1, 1))
                    if args.rnn_model == "lod":
                        x = to_lodtensor(x.reshape((-1, 1)), place, [
                            range(0, (data_iter_size + 1) * config.num_steps,
                                  config.num_steps)
                        ])
                        y = to_lodtensor(y.reshape((-1, 1)), place, [
                            range(0, (data_iter_size + 1) * config.num_steps,
                                  config.num_steps)
                        ])
                    yield x, y

            dataloader.set_batch_generator(data_gen)

        total_time = 0.0
        for epoch_id in range(config.max_epoch):
            batch_times = []
            epoch_start_time = time.time()
            if args.use_dataloader:
                train_ppl = train_an_epoch_dataloader(epoch_id, batch_times)
            else:
                train_ppl = train_an_epoch(epoch_id, batch_times)
            epoch_time = time.time() - epoch_start_time
            total_time += epoch_time
            # Guard against an empty / zero-duration timing list.
            timed = sum(batch_times)
            steps_per_sec = len(batch_times) / timed if timed > 0 else 0.0
            print(
                "\nTrain epoch:[%d]; epoch Time: %.5f; ppl: %.5f; avg_time: %.5f steps/s \n"
                % (epoch_id, epoch_time, train_ppl[0], steps_per_sec))

            # FIXME(zjl): ppl[0] increases as batch_size increases.
            # We should find a better way to calculate ppl by normalizing batch_size.
            if (device_count == 1 and config.batch_size <= 20 and
                    epoch_id == 0 and train_ppl[0] > 1000):
                # for bad init, after first epoch, the loss is over 1000
                # no more need to continue
                print(
                    "Parameters are randomly initialized and not good this time because the loss is over 1000 after the first epoch."
                )
                print("Abort this training process and please start again.")
                return

            if epoch_id == config.max_epoch - 1 and args.enable_ce:
                # kpis
                print("ptblm\tlstm_language_model_%s_duration_card%d\t%s" %
                      (args.rnn_model, device_count,
                       total_time / config.max_epoch))
                print("ptblm\tlstm_language_model_%s_loss_card%d\t%s" %
                      (args.rnn_model, device_count, train_ppl[0]))

            # NOTE(zjl): sometimes we have not enough data for eval if batch_size is large, i.e., 2100
            # Just skip to avoid error
            valid_data_valid = is_valid_data(valid_data, config.batch_size,
                                             config.num_steps)
            if valid_data_valid:
                valid_ppl = evaluate(valid_data)
                print("Valid ppl: %.5f" % valid_ppl[0])
            else:
                print(
                    'WARNING: length of valid_data is {}, which is not enough for batch_size {} and num_steps {}'.
                    format(
                        len(valid_data), config.batch_size, config.num_steps))

            save_model_dir = os.path.join(args.save_model_dir,
                                          str(epoch_id), "params")
            fluid.save(main_program, save_model_dir)
            print("Saved model to: %s.\n" % save_model_dir)

    with profile_context(args.profile):
        train()

    test_ppl = evaluate(test_data)
    print("Test ppl:", test_ppl[0])
def train_ptb_lm():
    """Train the PTB LSTM language model in dygraph mode with checkpoints.

    Hyper-parameters are selected by ``args.model_type`` ("test", "small",
    "medium" or "large"); any other value prints a message and returns.
    The device is CUDA device 0 when ``args.use_gpu == True``, else CPU.
    After every epoch the model is saved under
    ``args.save_model_dir/<epoch_id>/params`` and evaluated on the validation
    split; the test split is evaluated once at the end.

    Training aborts early when the first-epoch perplexity exceeds 1000 with
    ``batch_size <= 20``.

    Raises:
        Warning: if ``args.init_from_pretrain_model`` is set but the
            corresponding ``.pdparams`` file does not exist.
    """
    args = parse_args()

    # check if set use_gpu=True in paddlepaddle cpu version
    model_check.check_cuda(args.use_gpu)

    place = core.CPUPlace()
    if args.use_gpu == True:
        place = core.CUDAPlace(0)

    # check if paddlepaddle version is satisfied
    model_check.check_version()

    model_type = args.model_type

    vocab_size = 10000
    # Field order: num_layers, batch_size, hidden_size, num_steps, init_scale,
    # max_grad_norm, epoch_start_decay, max_epoch, dropout, lr_decay,
    # base_learning_rate
    presets = {
        "test": (1, 2, 10, 3, 0.1, 5.0, 1, 1, 0.0, 0.5, 1.0),
        "small": (2, 20, 200, 20, 0.1, 5.0, 4, 13, 0.0, 0.5, 1.0),
        "medium": (2, 20, 650, 35, 0.05, 5.0, 6, 39, 0.5, 0.8, 1.0),
        "large": (2, 20, 1500, 35, 0.04, 10.0, 14, 55, 0.65, 1.0 / 1.15, 1.0),
    }
    if model_type not in presets:
        print("model type not support")
        return
    (num_layers, batch_size, hidden_size, num_steps, init_scale,
     max_grad_norm, epoch_start_decay, max_epoch, dropout, lr_decay,
     base_learning_rate) = presets[model_type]

    with fluid.dygraph.guard(place):
        if args.ce:
            print("ce mode")
            seed = 33
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            max_epoch = 1
        ptb_model = PtbModel(
            hidden_size=hidden_size,
            vocab_size=vocab_size,
            num_layers=num_layers,
            num_steps=num_steps,
            init_scale=init_scale,
            dropout=dropout)

        if args.init_from_pretrain_model:
            if not os.path.exists(args.init_from_pretrain_model + '.pdparams'):
                print(args.init_from_pretrain_model)
                raise Warning("The pretrained params do not exist.")
            # load_dygraph only reads the checkpoint; the parameters must be
            # copied into the model explicitly or they are silently dropped.
            para_dict, _ = fluid.load_dygraph(args.init_from_pretrain_model)
            ptb_model.set_dict(para_dict)
            print("finish initing model from pretrained params from %s" %
                  (args.init_from_pretrain_model))

        data_path = args.data_path
        print("begin to load data")
        ptb_data = reader.get_ptb_data(data_path)
        print("finished load data")
        train_data, valid_data, test_data = ptb_data

        batch_len = len(train_data) // batch_size
        total_batch_size = (batch_len - 1) // num_steps
        log_interval = 200

        # Piecewise schedule: one boundary per epoch; the rate starts decaying
        # once the (1-based) epoch index exceeds epoch_start_decay.
        bd = []
        lr_arr = [base_learning_rate]
        for i in range(1, max_epoch):
            bd.append(total_batch_size * i)
            new_lr = base_learning_rate * (lr_decay**max(
                i + 1 - epoch_start_decay, 0.0))
            lr_arr.append(new_lr)

        sgd = SGDOptimizer(
            learning_rate=fluid.layers.piecewise_decay(
                boundaries=bd, values=lr_arr),
            parameter_list=ptb_model.parameters())

        def evaluate(model, data):
            """Run `model` over `data` in eval mode and print its perplexity."""
            print("begin to eval")
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')
            init_cell_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')

            model.eval()
            data_iter = reader.get_data_iter(data, batch_size, num_steps)
            for batch_id, batch in enumerate(data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, num_steps, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                init_hidden = to_variable(init_hidden_data)
                init_cell = to_variable(init_cell_data)
                # Use the model that was passed in, not the enclosing one.
                dy_loss, last_hidden, last_cell = model(x, y, init_hidden,
                                                        init_cell)

                out_loss = dy_loss.numpy()

                # Carry the recurrent state over to the next batch.
                init_hidden_data = last_hidden.numpy()
                init_cell_data = last_cell.numpy()

                total_loss += out_loss
                iters += num_steps

            print("eval finished")
            if iters == 0:
                # No batch was produced; perplexity is undefined.
                print("no batch to eval, skip ppl")
                return
            ppl = np.exp(total_loss / iters)
            print("ppl ", batch_id, ppl[0])
            if args.ce:
                print("kpis\ttest_ppl\t%0.3f" % ppl[0])

        grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(max_grad_norm)
        # Iterate over every configured epoch; the learning-rate boundaries
        # above are laid out for exactly max_epoch epochs.
        for epoch_id in range(max_epoch):
            ptb_model.train()
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')
            init_cell_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')

            train_data_iter = reader.get_data_iter(train_data, batch_size,
                                                   num_steps)
            init_hidden = to_variable(init_hidden_data)
            init_cell = to_variable(init_cell_data)
            start_time = time.time()
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, num_steps, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
                                                            init_cell)
                # Reuse the final state as the next initial state, but mark
                # it stop_gradient so it is treated as a constant input.
                init_hidden = last_hidden
                init_cell = last_cell
                init_hidden.stop_gradient = True
                init_cell.stop_gradient = True
                out_loss = dy_loss.numpy()

                dy_loss.backward()
                sgd.minimize(dy_loss, grad_clip=grad_clip)

                ptb_model.clear_gradients()
                total_loss += out_loss
                iters += num_steps

                if batch_id > 0 and batch_id % log_interval == 0:
                    ppl = np.exp(total_loss / iters)
                    print(
                        "-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f"
                        % (epoch_id, batch_id, ppl[0],
                           sgd._global_learning_rate().numpy(), out_loss))

            print("One epoch cost {}".format(time.time() - start_time))

            print("one epoch finished", epoch_id)
            print("time cost ", time.time() - start_time)
            ppl = np.exp(total_loss / iters)
            print("-- Epoch:[%d]; ppl: %.5f" % (epoch_id, ppl[0]))

            if batch_size <= 20 and epoch_id == 0 and ppl[0] > 1000:
                # for bad init, after first epoch, the loss is over 1000
                # no more need to continue
                print(
                    "Parameters are randomly initialized and not good this time because the loss is over 1000 after the first epoch."
                )
                print("Abort this training process and please start again.")
                return

            if args.ce:
                print("kpis\ttrain_ppl\t%0.3f" % ppl[0])
            save_model_dir = os.path.join(args.save_model_dir,
                                          str(epoch_id), 'params')
            fluid.save_dygraph(ptb_model.state_dict(), save_model_dir)
            print("Saved model to: %s.\n" % save_model_dir)

            evaluate(ptb_model, valid_data)

        evaluate(ptb_model, test_data)
def train_ptb_lm():
    """Train the single-state recurrent model and report recall@20.

    Hyper-parameters are selected by ``args.model_type`` ("test", "small",
    "gru4rec", "medium" or "large"); any other value prints a message and
    returns. The model is optimised with Adagrad under a piecewise
    learning-rate decay and global-norm gradient clipping. After every epoch
    the model is saved under ``args.save_model_dir/<epoch_id>/params``; the
    test split is evaluated once at the end.

    Raises:
        Warning: if ``args.init_from_pretrain_model`` is set but the
            corresponding ``.pdparams`` file does not exist.
    """
    args = parse_args()

    # check if set use_gpu=True in paddlepaddle cpu version
    model_check.check_cuda(args.use_gpu)
    # check if paddlepaddle version is satisfied
    model_check.check_version()

    model_type = args.model_type

    vocab_size = 37484
    # Field order: num_layers, batch_size, hidden_size, num_steps, init_scale,
    # max_grad_norm, epoch_start_decay, max_epoch, dropout, lr_decay,
    # base_learning_rate
    presets = {
        "test": (1, 2, 10, 4, 0.1, 5.0, 1, 1, 0.0, 0.5, 1.0),
        "small": (2, 20, 200, 20, 0.1, 5.0, 4, 2, 0.0, 0.5, 1.0),
        "gru4rec": (1, 500, 100, 10, 0.1, 5.0, 10, 5, 0.0, 0.5, 0.05),
        "medium": (2, 20, 650, 35, 0.05, 5.0, 6, 39, 0.5, 0.8, 1.0),
        "large": (2, 20, 1500, 35, 0.04, 10.0, 14, 55, 0.65, 1.0 / 1.15, 1.0),
    }
    if model_type not in presets:
        print("model type not support")
        return
    (num_layers, batch_size, hidden_size, num_steps, init_scale,
     max_grad_norm, epoch_start_decay, max_epoch, dropout, lr_decay,
     base_learning_rate) = presets[model_type]

    # Honour --use_gpu instead of unconditionally requesting CUDA device 0.
    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()

    with fluid.dygraph.guard(place):
        if args.ce:
            print("ce mode")
            seed = 33
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            max_epoch = 1
        ptb_model = PtbModel(
            "ptb_model",
            hidden_size=hidden_size,
            vocab_size=vocab_size,
            num_layers=num_layers,
            num_steps=num_steps,
            init_scale=init_scale,
            dropout=dropout)

        if args.init_from_pretrain_model:
            if not os.path.exists(args.init_from_pretrain_model + '.pdparams'):
                print(args.init_from_pretrain_model)
                raise Warning("The pretrained params do not exist.")
            # load_dygraph only reads the checkpoint; the parameters must be
            # copied into the model explicitly or they are silently dropped.
            para_dict, _ = fluid.load_dygraph(args.init_from_pretrain_model)
            ptb_model.set_dict(para_dict)
            print("finish initing model from pretrained params from %s" %
                  (args.init_from_pretrain_model))

        data_path = args.data_path
        print("begin to load data")
        ptb_data = reader.get_ptb_data(data_path)
        print("finished load data")
        train_data, valid_data, test_data = ptb_data

        batch_len = len(train_data) // batch_size
        total_batch_size = (batch_len - 1) // num_steps
        print("total_batch_size:", total_batch_size)

        # Piecewise schedule: one boundary per epoch; the rate starts decaying
        # once the (1-based) epoch index exceeds epoch_start_decay.
        bd = []
        lr_arr = [base_learning_rate]
        for i in range(1, max_epoch):
            bd.append(total_batch_size * i)
            new_lr = base_learning_rate * (lr_decay**max(
                i + 1 - epoch_start_decay, 0.0))
            lr_arr.append(new_lr)

        sgd = AdagradOptimizer(
            parameter_list=ptb_model.parameters(),
            learning_rate=fluid.layers.piecewise_decay(
                boundaries=bd, values=lr_arr))

        print("parameters:--------------------------------")
        for para in ptb_model.parameters():
            print(para.name)
        print("parameters:--------------------------------")

        def evaluate(model, data):
            """Run `model` over `data` in eval mode and print recall@20."""
            print("begion to eval")
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')

            model.eval()
            data_iter = reader.get_data_iter(data, batch_size, num_steps)
            init_hidden = to_variable(init_hidden_data)
            accum_num_recall = 0.0
            for batch_id, batch in enumerate(data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, num_steps, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                # Use the model that was passed in, not the enclosing one.
                dy_loss, last_hidden, acc = model(x, y, init_hidden)

                out_loss = dy_loss.numpy()
                acc_ = acc.numpy()[0]
                accum_num_recall += acc_
                # Running mean of the per-batch metric, printed every batch.
                print("batch_id:%d recall@20:%.4f" %
                      (batch_id, accum_num_recall / (batch_id + 1)))

                init_hidden = last_hidden

                total_loss += out_loss
                iters += num_steps

            print("eval finished")
            if iters == 0:
                # No batch was produced; the metrics are undefined.
                print("no batch to eval, skip metrics")
                return
            ppl = np.exp(total_loss / iters)
            print("recall@20 ", accum_num_recall / (batch_id + 1))
            if args.ce:
                print("kpis\ttest_ppl\t%0.3f" % ppl[0])

        grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(max_grad_norm)
        for epoch_id in range(max_epoch):
            ptb_model.train()
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')

            train_data_iter = reader.get_data_iter(train_data, batch_size,
                                                   num_steps)
            init_hidden = to_variable(init_hidden_data)

            start_time = time.time()
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, num_steps, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                dy_loss, last_hidden, acc = ptb_model(x, y, init_hidden)

                out_loss = dy_loss.numpy()
                acc_ = acc.numpy()[0]

                init_hidden = last_hidden
                dy_loss.backward()
                sgd.minimize(dy_loss, grad_clip=grad_clip)
                ptb_model.clear_gradients()
                total_loss += out_loss
                iters += num_steps

                # Log on batches 101, 201, ...
                if batch_id > 0 and batch_id % 100 == 1:
                    ppl = np.exp(total_loss / iters)
                    print(
                        "-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, acc: %.5f, lr: %.5f"
                        % (epoch_id, batch_id, ppl[0], acc_,
                           sgd._global_learning_rate().numpy()))

            print("one ecpoh finished", epoch_id)
            print("time cost ", time.time() - start_time)
            ppl = np.exp(total_loss / iters)
            print("-- Epoch:[%d]; ppl: %.5f" % (epoch_id, ppl[0]))
            if args.ce:
                print("kpis\ttrain_ppl\t%0.3f" % ppl[0])
            save_model_dir = os.path.join(args.save_model_dir,
                                          str(epoch_id), 'params')
            fluid.save_dygraph(ptb_model.state_dict(), save_model_dir)
            print("Saved model to: %s.\n" % save_model_dir)

        evaluate(ptb_model, test_data)
def train_ptb_lm():
    """Train the PTB LSTM language model in dygraph mode using DataLoader.

    Hyper-parameters are selected by ``args.model_type`` ("test", "small",
    "medium" or "large"); any other value prints a message and returns.
    Batches are fed through ``fluid.io.DataLoader``; per-batch reader and
    batch costs are averaged and logged every 200 batches. After every epoch
    the model is saved under ``args.save_model_dir/<epoch_id>/params`` and
    evaluated on the validation split; the test split is evaluated at the end.

    Training returns early when ``args.max_iter`` batches have been processed,
    or when the first-epoch perplexity exceeds 1000 with ``batch_size <= 20``.

    Raises:
        Warning: if ``args.init_from_pretrain_model`` is set but the
            corresponding ``.pdparams`` file does not exist.
    """
    args = parse_args()

    # check if set use_gpu=True in paddlepaddle cpu version
    model_check.check_cuda(args.use_gpu)

    if args.use_gpu:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    # check if paddlepaddle version is satisfied
    model_check.check_version()

    model_type = args.model_type

    vocab_size = 10000
    # Field order: num_layers, batch_size, hidden_size, num_steps, init_scale,
    # max_grad_norm, epoch_start_decay, max_epoch, dropout, lr_decay,
    # base_learning_rate
    presets = {
        "test": (1, 2, 10, 3, 0.1, 5.0, 1, 1, 0.0, 0.5, 1.0),
        "small": (2, 20, 200, 20, 0.1, 5.0, 4, 13, 0.0, 0.5, 1.0),
        "medium": (2, 20, 650, 35, 0.05, 5.0, 6, 39, 0.5, 0.8, 1.0),
        "large": (2, 20, 1500, 35, 0.04, 10.0, 14, 55, 0.65, 1.0 / 1.15, 1.0),
    }
    if model_type not in presets:
        print("model type not support")
        return
    (num_layers, batch_size, hidden_size, num_steps, init_scale,
     max_grad_norm, epoch_start_decay, max_epoch, dropout, lr_decay,
     base_learning_rate) = presets[model_type]

    with fluid.dygraph.guard(place):
        if args.ce:
            print("ce mode")
            seed = 33
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            max_epoch = 1
        ptb_model = PtbModel(
            hidden_size=hidden_size,
            vocab_size=vocab_size,
            num_layers=num_layers,
            num_steps=num_steps,
            init_scale=init_scale,
            dropout=dropout)

        if args.init_from_pretrain_model:
            if not os.path.exists(args.init_from_pretrain_model + '.pdparams'):
                print(args.init_from_pretrain_model)
                raise Warning("The pretrained params do not exist.")
            # load_dygraph only reads the checkpoint; the parameters must be
            # copied into the model explicitly or they are silently dropped.
            para_dict, _ = fluid.load_dygraph(args.init_from_pretrain_model)
            ptb_model.set_dict(para_dict)
            print("finish initing model from pretrained params from %s" %
                  (args.init_from_pretrain_model))

        data_path = args.data_path
        print("begin to load data")
        ptb_data = reader.get_ptb_data(data_path)
        print("finished load data")
        train_data, valid_data, test_data = ptb_data

        batch_len = len(train_data) // batch_size
        total_batch_size = (batch_len - 1) // num_steps
        log_interval = 200

        # Piecewise schedule: one boundary per epoch; the rate starts decaying
        # once the (1-based) epoch index exceeds epoch_start_decay.
        bd = []
        lr_arr = [base_learning_rate]
        for i in range(1, max_epoch):
            bd.append(total_batch_size * i)
            new_lr = base_learning_rate * (lr_decay**max(
                i + 1 - epoch_start_decay, 0.0))
            lr_arr.append(new_lr)

        grad_clip = fluid.clip.GradientClipByGlobalNorm(max_grad_norm)
        sgd = SGDOptimizer(
            learning_rate=fluid.layers.piecewise_decay(
                boundaries=bd, values=lr_arr),
            parameter_list=ptb_model.parameters(),
            grad_clip=grad_clip)

        def reader_decorator(batches):
            """Wrap `batches` in a generator factory that reshapes x and y."""

            def __reader__():
                for item in batches:
                    x_data = item[0].reshape((-1, num_steps, 1))
                    y_data = item[1].reshape((-1, num_steps, 1))
                    yield x_data, y_data

            return __reader__

        def evaluate(model, data):
            """Run `model` over `data` in eval mode and print its perplexity."""
            print("begin to eval")
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')
            init_cell_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')

            model.eval()
            eval_data_iter = reader_decorator(
                reader.get_data_iter(data, batch_size, num_steps))

            eval_data_loader = fluid.io.DataLoader.from_generator(capacity=200)
            eval_data_loader.set_batch_generator(eval_data_iter, places=place)

            for batch_id, batch in enumerate(eval_data_loader):
                x, y = batch
                init_hidden = to_variable(init_hidden_data)
                init_cell = to_variable(init_cell_data)
                # Use the model that was passed in, not the enclosing one.
                dy_loss, last_hidden, last_cell = model(x, y, init_hidden,
                                                        init_cell)

                out_loss = dy_loss.numpy()

                # Carry the recurrent state over to the next batch.
                init_hidden_data = last_hidden.numpy()
                init_cell_data = last_cell.numpy()

                total_loss += out_loss
                iters += num_steps

            print("eval finished")
            if iters == 0:
                # No batch was produced; perplexity is undefined.
                print("no batch to eval, skip ppl")
                return
            ppl = np.exp(total_loss / iters)
            print("ppl ", batch_id, ppl[0])

        ce_time = []
        ce_ppl = []

        total_batch_num = 0  # this is for benchmark
        for epoch_id in range(max_epoch):
            epoch_start = time.time()

            ptb_model.train()
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')
            init_cell_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')

            train_data_iter = reader_decorator(
                reader.get_data_iter(train_data, batch_size, num_steps))

            train_data_loader = fluid.io.DataLoader.from_generator(
                capacity=200)
            train_data_loader.set_batch_generator(
                train_data_iter, places=place)

            init_hidden = to_variable(init_hidden_data)
            init_cell = to_variable(init_cell_data)

            batch_cost_avg = TimeCostAverage()
            reader_cost_avg = TimeCostAverage()

            batch_start = time.time()
            for batch_id, batch in enumerate(train_data_loader):
                # Benchmark hook: stop once max_iter batches have been run.
                if args.max_iter and total_batch_num == args.max_iter:
                    return

                train_reader_cost = time.time() - batch_start
                reader_cost_avg.record(train_reader_cost)

                x, y = batch

                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
                                                            init_cell)
                # Detach the state so the next batch treats it as an input.
                init_hidden = last_hidden.detach()
                init_cell = last_cell.detach()
                out_loss = dy_loss.numpy()

                dy_loss.backward()
                sgd.minimize(dy_loss)
                ptb_model.clear_gradients()

                global_lr = sgd._global_learning_rate().numpy()
                total_loss += out_loss
                iters += num_steps
                total_batch_num = total_batch_num + 1  # this is for benchmark

                train_batch_cost = time.time() - batch_start
                batch_cost_avg.record(train_batch_cost)

                if batch_id > 0 and batch_id % log_interval == 0:
                    ppl = np.exp(total_loss / iters)
                    print(
                        "-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f, batch_cost: %.5f sec, reader_cost: %.5f sec, ips: %.5f words/sec"
                        % (epoch_id, batch_id, ppl[0], global_lr, out_loss,
                           batch_cost_avg.get_average(),
                           reader_cost_avg.get_average(),
                           batch_size / batch_cost_avg.get_average()))
                    batch_cost_avg.reset()
                    reader_cost_avg.reset()
                batch_start = time.time()

            ppl = np.exp(total_loss / iters)
            train_epoch_cost = time.time() - epoch_start
            print("-- Epoch:[%d]; ppl: %.5f, epoch_cost: %.5f s" %
                  (epoch_id, ppl[0], train_epoch_cost))

            ce_time.append(train_epoch_cost)
            ce_ppl.append(ppl[0])

            if batch_size <= 20 and epoch_id == 0 and ppl[0] > 1000:
                # for bad init, after first epoch, the loss is over 1000
                # no more need to continue
                print(
                    "Parameters are randomly initialized and not good this time because the loss is over 1000 after the first epoch."
                )
                print("Abort this training process and please start again.")
                return

            save_model_dir = os.path.join(args.save_model_dir,
                                          str(epoch_id), 'params')
            fluid.save_dygraph(ptb_model.state_dict(), save_model_dir)
            print("Saved model to: %s.\n" % save_model_dir)

            evaluate(ptb_model, valid_data)

        if args.ce:
            _ppl = 0
            _time = 0
            try:
                _time = ce_time[-1]
                _ppl = ce_ppl[-1]
            except IndexError:
                # No epoch completed, so there is nothing to report.
                print("ce info error")
            print("kpis\ttrain_duration_card%s\t%s" % (dev_count, _time))
            print("kpis\ttrain_ppl_card%s\t%f" % (dev_count, _ppl))

        evaluate(ptb_model, test_data)
def main():
    """Parse the CLI arguments, echo them, and train on the PTB training split.

    Only the first of the three splits returned by ``reader.get_ptb_data``
    is used; ``args.use_dygraph`` is forwarded to ``train_ptb_model``.
    """
    cli_args = parse_args()
    print(cli_args)

    ptb_splits = reader.get_ptb_data(cli_args.data_path)
    # Exactly three splits are expected; the last two are not needed here.
    training_split, _, _ = ptb_splits

    train_ptb_model(training_split, use_dygraph=cli_args.use_dygraph)