def eval(data):
    # during eval the batch_size is set to 1
    eval_data_iter = reader.get_data_iter(data, config.batch_size,
                                          config.num_steps)
    total_loss = 0.0
    iters = 0
    init_hidden, init_cell = generate_init_data()
    for batch_id, batch in enumerate(eval_data_iter):
        input_data_feed = prepare_input(
            batch, init_hidden, init_cell, epoch_id=0, with_lr=False)
        fetch_outs = exe.run(
            program=inference_program,
            feed=input_data_feed,
            fetch_list=[loss.name, last_hidden.name, last_cell.name],
            use_program_cache=False)

        cost_eval = np.array(fetch_outs[0])
        init_hidden = np.array(fetch_outs[1])
        init_cell = np.array(fetch_outs[2])

        total_loss += cost_eval
        iters += config.num_steps

    ppl = np.exp(total_loss / iters)
    return ppl
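# The eval() above relies on a generate_init_data() helper that is not shown
# in these snippets. A minimal sketch of what it presumably returns, assuming
# the zero-filled (num_layers, batch_size, hidden_size) state shape that the
# other variants in this file build inline:
import numpy as np

def generate_init_data():
    # hypothetical helper: zero initial hidden/cell states for the LSTM
    init_hidden = np.zeros(
        (config.num_layers, config.batch_size, config.hidden_size),
        dtype='float32')
    init_cell = np.zeros(
        (config.num_layers, config.batch_size, config.hidden_size),
        dtype='float32')
    return init_hidden, init_cell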
def eval(data):
    # during eval the batch_size is set to 1
    eval_data_iter = reader.get_data_iter(data, batch_size, num_steps)
    total_loss = 0.0
    iters = 0
    init_hidden = np.zeros((num_layers, batch_size, hidden_size),
                           dtype='float32')
    init_cell = np.zeros((num_layers, batch_size, hidden_size),
                         dtype='float32')
    for batch_id, batch in enumerate(eval_data_iter):
        input_data_feed = prepare_input(batch, init_hidden, init_cell,
                                        epoch_id, with_lr=False)
        fetch_outs = exe.run(
            inference_program,
            feed=input_data_feed,
            fetch_list=[loss.name, last_hidden.name, last_cell.name],
            use_program_cache=True)

        cost_train = np.array(fetch_outs[0])
        init_hidden = np.array(fetch_outs[1])
        init_cell = np.array(fetch_outs[2])

        total_loss += cost_train
        iters += num_steps
    ppl = np.exp(total_loss / iters)
    return ppl
def eval(model, data): print("begion to eval") total_loss = 0.0 iters = 0.0 init_hidden_data = np.zeros((num_layers, batch_size, hidden_size), dtype='float32') init_cell_data = np.zeros((num_layers, batch_size, hidden_size), dtype='float32') model.eval() train_data_iter = reader.get_data_iter(data, batch_size, num_steps) for batch_id, batch in enumerate(train_data_iter): x_data, y_data = batch x_data = x_data.reshape((-1, num_steps, 1)) y_data = y_data.reshape((-1, 1)) x = to_variable(x_data) y = to_variable(y_data) init_hidden = to_variable(init_hidden_data) init_cell = to_variable(init_cell_data) dy_loss, last_hidden, last_cell = ptb_model( x, y, init_hidden, init_cell) out_loss = dy_loss.numpy() init_hidden_data = last_hidden.numpy() init_cell_data = last_cell.numpy() total_loss += out_loss iters += num_steps print("eval finished") ppl = np.exp(total_loss / iters) print("ppl ", batch_id, ppl[0]) if args.ce: print("kpis\ttest_ppl\t%0.3f" % ppl[0])
def eval(model, data): print("begin to eval") total_loss = 0.0 iters = 0.0 init_hidden_data = np.zeros((num_layers, batch_size, hidden_size), dtype='float32') init_cell_data = np.zeros((num_layers, batch_size, hidden_size), dtype='float32') model.eval() train_data_iter = reader_decorator( reader.get_data_iter(data, batch_size, num_steps)) eval_data_loader = fluid.io.DataLoader.from_generator(capacity=200) eval_data_loader.set_batch_generator(train_data_iter, places=place) for batch_id, batch in enumerate(eval_data_loader): x, y = batch init_hidden = to_variable(init_hidden_data) init_cell = to_variable(init_cell_data) dy_loss, last_hidden, last_cell = ptb_model( x, y, init_hidden, init_cell) out_loss = dy_loss.numpy() init_hidden_data = last_hidden.numpy() init_cell_data = last_cell.numpy() total_loss += out_loss iters += num_steps print("eval finished") ppl = np.exp(total_loss / iters) print("ppl ", batch_id, ppl[0])
def data_gen():
    data_iter_size = config.batch_size
    train_batches = reader.get_data_iter(train_data, data_iter_size,
                                         config.num_steps)
    for batch in train_batches:
        x, y = batch
        x = x.reshape((-1, config.num_steps, 1))
        y = y.reshape((-1, 1))
        yield x, y
def data_gen():
    data_iter_size = batch_size // device_count
    train_batches = reader.get_data_iter(train_data, data_iter_size,
                                         num_steps)
    for batch in train_batches:
        x, y = batch
        x = x.reshape((-1, num_steps, 1))
        y = y.reshape((-1, 1))
        yield x, y
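# How a generator like data_gen() is presumably consumed: the eval() variant
# earlier in this file wires a batch generator into a fluid DataLoader, so a
# minimal usage sketch along the same lines (capacity and places mirror that
# variant; train_loader is a hypothetical name) would be:
train_loader = fluid.io.DataLoader.from_generator(capacity=200)
train_loader.set_batch_generator(data_gen, places=place)
for batch_id, batch in enumerate(train_loader):
    x, y = batch  # numpy arrays shaped (-1, num_steps, 1) and (-1, 1)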
def fetch_loss_grad(x):
    data_iter = reader.get_data_iter(x, 1)
    for batch_id, batch in enumerate(data_iter):
        input_data_feed = prepare_input(batch)
        result = exe.run(test_program,
                         fetch_list=fetch_list,
                         feed=input_data_feed)
        loss = result[0]
        grad = result[1]
        # only the first batch is used
        return loss, grad
def eval(data, epoch_id=0):
    model.eval()
    eval_data_iter = reader.get_data_iter(data, batch_size, mode='eval')
    total_loss = 0.0
    word_count = 0.0
    for batch_id, batch in enumerate(eval_data_iter):
        input_data_feed, word_num = prepare_input(batch, epoch_id)
        loss = model(input_data_feed)

        total_loss += loss * batch_size
        word_count += word_num
    ppl = np.exp(total_loss.numpy() / word_count)
    model.train()
    return ppl
def train():
    startup_program = fluid.default_startup_program()
    main_program = fluid.default_main_program()

    raw_data = reader.raw_data('fra.txt', num_samples=num_samples)
    train_data = raw_data[0]
    data_vars = raw_data[1]

    model = BaseModel(hidden_size=latent_dim,
                      src_vocab_size=data_vars['num_encoder_tokens'],
                      tar_vocab_size=data_vars['num_decoder_tokens'],
                      batch_size=batch_size,
                      batch_first=True)
    loss = model.build_graph()

    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    optimizer.minimize(loss)

    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(framework.default_startup_program())

    ce_ppl = []
    for epoch_id in range(num_epochs):
        print("epoch ", epoch_id)
        train_data_iter = reader.get_data_iter(train_data, batch_size)

        total_loss = 0
        word_count = 0.0
        for batch_id, batch in enumerate(train_data_iter):
            input_data_feed, word_num = prepare_input(batch,
                                                      epoch_id=epoch_id)
            fetch_outs = exe.run(feed=input_data_feed,
                                 fetch_list=[loss.name],
                                 use_program_cache=True)

            cost_train = np.array(fetch_outs[0])
            total_loss += cost_train * batch_size
            word_count += word_num

            if batch_id > 0 and batch_id % batch_size == 0:
                print(" ppl", batch_id, np.exp(total_loss / word_count))
                ce_ppl.append(np.exp(total_loss / word_count))
                total_loss = 0.0
                word_count = 0.0
def train_an_epoch(epoch_id, batch_times):
    # get train epoch size
    log_interval = get_log_interval(len(train_data))
    train_data_iter = reader.get_data_iter(
        train_data, config.batch_size * device_count, config.num_steps)

    total_loss = 0
    iters = 0
    init_hidden, init_cell = generate_init_data()
    for batch_id, batch in enumerate(train_data_iter):
        input_data_feed = prepare_input(batch,
                                        init_hidden=init_hidden,
                                        init_cell=init_cell,
                                        epoch_id=epoch_id,
                                        with_lr=True,
                                        device_count=device_count)

        batch_start_time = time.time()
        fetch_outs = exe.run(train_program,
                             feed=input_data_feed,
                             fetch_list=[
                                 loss.name, "learning_rate",
                                 last_hidden.name, last_cell.name
                             ],
                             use_program_cache=True)
        batch_time = time.time() - batch_start_time
        batch_times.append(batch_time)

        cost_train = np.array(fetch_outs[0])
        lr = np.array(fetch_outs[1])
        init_hidden = np.array(fetch_outs[2])
        init_cell = np.array(fetch_outs[3])

        total_loss += cost_train
        iters += config.num_steps
        if batch_id > 0 and batch_id % log_interval == 0:
            ppl = np.exp(total_loss / iters)
            print(
                "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))

        # profiler tools for benchmark
        if args.profile and batch_id == log_interval:
            profiler.reset_profiler()
        elif args.profile and batch_id == (log_interval + 5):
            break
    ppl = np.exp(total_loss / iters)
    return ppl
def train_an_epoch(epoch_id, batch_times):
    # get train epoch size
    log_interval = get_log_interval(len(train_data), batch_size)
    train_data_iter = reader.get_data_iter(train_data, batch_size, num_steps)

    total_loss = 0
    iters = 0
    for batch_id, batch in enumerate(train_data_iter):
        if batch_id == 0:
            init_hidden, init_cell = get_init_data()
        else:
            init_hidden = None
            init_cell = None
        input_data_feed = prepare_input(batch,
                                        init_hidden=init_hidden,
                                        init_cell=init_cell,
                                        epoch_id=epoch_id,
                                        device_count=device_count)

        batch_start_time = time.time()
        fetch_outs = exe.run(train_program,
                             feed=input_data_feed,
                             fetch_list=[loss.name, "learning_rate"],
                             use_program_cache=True)
        batch_time = time.time() - batch_start_time
        batch_times.append(batch_time)

        cost_train = np.array(fetch_outs[0])
        lr = np.array(fetch_outs[1])
        total_loss += cost_train
        iters += num_steps
        if batch_id > 0 and batch_id % log_interval == 0:
            ppl = np.exp(total_loss / iters)
            print(
                "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))

        if args.profile:
            if batch_id == 1:
                profiler.reset_profiler()
            elif batch_id >= 11:
                break
    ppl = np.exp(total_loss / iters)
    return ppl
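# get_log_interval() is not defined in these snippets. A minimal sketch of a
# plausible implementation, mirroring the inline computation used by the
# non-refactored variant later in this file (roughly ten log points per epoch):
def get_log_interval(data_len, batch_size):
    num_batches = data_len // batch_size
    epoch_size = (num_batches - 1) // num_steps  # num_steps from enclosing scope
    return max(1, epoch_size // 10)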
def eval(sess, data):
    if args.inference_only:
        sess.run(init)

    batch_times = []
    start_time = time.time()
    eval_loss = 0.0
    eval_iters = 0
    eval_data_iter = reader.get_data_iter(data, batch_size, num_steps)
    init_h = np.zeros((num_layers, batch_size, hidden_size), dtype='float32')
    init_c = np.zeros((num_layers, batch_size, hidden_size), dtype='float32')
    for batch in eval_data_iter:
        x, y = batch
        feed_dict = {}
        feed_dict[feeding_list[0]] = x
        feed_dict[feeding_list[1]] = y
        feed_dict[feeding_list[2]] = init_h
        feed_dict[feeding_list[3]] = init_c

        batch_start_time = time.time()
        output = sess.run([cost, final_h, final_c], feed_dict)
        batch_times.append(time.time() - batch_start_time)

        train_cost = output[0]
        init_h = output[1]
        init_c = output[2]

        eval_loss += train_cost
        eval_iters += num_steps
    ppl = np.exp(eval_loss / eval_iters)

    eval_time_total = time.time() - start_time
    eval_time_run = np.sum(batch_times)
    if args.inference_only:
        print(
            "Eval batch_size: %d; Time (total): %.5f s; Time (only run): %.5f s; ppl: %.5f"
            % (batch_size, eval_time_total, eval_time_run, ppl))
    return ppl, eval_time_total
def eval(model, data): print("begion to eval") total_loss = 0.0 iters = 0.0 init_hidden_data = np.zeros((num_layers, batch_size, hidden_size), dtype='float32') model.eval() train_data_iter = reader.get_data_iter(data, batch_size, num_steps) init_hidden = paddle.to_tensor(data=init_hidden_data, dtype=None, place=None, stop_gradient=True) accum_num_recall = 0.0 for batch_id, batch in enumerate(train_data_iter): x_data, y_data = batch x_data = x_data.reshape((-1, num_steps, 1)) y_data = y_data.reshape((-1, num_steps, 1)) x = paddle.to_tensor(data=x_data, dtype=None, place=None, stop_gradient=True) y = paddle.to_tensor(data=y_data, dtype=None, place=None, stop_gradient=True) dy_loss, last_hidden, acc = ptb_model(x, y, init_hidden) out_loss = dy_loss.numpy() acc_ = acc.numpy()[0] accum_num_recall += acc_ if batch_id % 1 == 0: print("batch_id:%d recall@20:%.4f" % (batch_id, accum_num_recall / (batch_id + 1))) init_hidden = last_hidden total_loss += out_loss iters += num_steps print("eval finished") ppl = np.exp(total_loss / iters) print("recall@20 ", accum_num_recall / (batch_id + 1)) if args.ce: print("kpis\ttest_ppl\t%0.3f" % ppl[0])
def data_gen():
    data_iter_size = config.batch_size // device_count
    train_batches = reader.get_data_iter(train_data, data_iter_size,
                                         config.num_steps)
    for batch in train_batches:
        x, y = batch
        x = x.reshape((-1, config.num_steps, 1))
        y = y.reshape((-1, 1))
        if args.rnn_model == "lod":
            x = to_lodtensor(x.reshape((-1, 1)), place, [
                range(0, (data_iter_size + 1) * config.num_steps,
                      config.num_steps)
            ])
            y = to_lodtensor(y.reshape((-1, 1)), place, [
                range(0, (data_iter_size + 1) * config.num_steps,
                      config.num_steps)
            ])
        yield x, y
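# to_lodtensor() is another helper that is only called, never defined, in
# these snippets. A hedged sketch, assuming the classic fluid pattern of
# wrapping a numpy array in a fluid.core.LoDTensor and attaching the given
# level-of-detail offsets:
def to_lodtensor(data, place, lod=None):
    res = fluid.core.LoDTensor()
    res.set(data, place)
    if lod is not None:
        # set_lod expects a list of lists of ints, so materialize any ranges
        res.set_lod([list(level) for level in lod])
    return res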
def eval(data, epoch_id=0):
    eval_data_iter = reader.get_data_iter(data, batch_size, mode='eval')
    total_loss = 0.0
    word_count = 0.0
    for batch_id, batch in enumerate(eval_data_iter):
        input_data_feed, word_num = prepare_input(batch, epoch_id,
                                                  with_lr=False)
        fetch_outs = exe.run(inference_program,
                             feed=input_data_feed,
                             fetch_list=[loss.name],
                             use_program_cache=False)

        cost_train = np.array(fetch_outs[0])

        total_loss += cost_train * batch_size
        word_count += word_num

    ppl = np.exp(total_loss / word_count)
    return ppl
def eval(data):
    eval_data_iter = reader.get_data_iter(data, batch_size, mode='eval')
    total_loss = 0.0
    word_count = 0.0
    batch_count = 0.0
    for batch_id, batch in enumerate(eval_data_iter):
        input_data_feed, src_word_num, dec_word_sum = prepare_input(batch)
        fetch_outs = exe.run(inference_program,
                             feed=input_data_feed,
                             fetch_list=[loss.name],
                             use_program_cache=False)

        cost_train = np.array(fetch_outs[0])

        total_loss += cost_train * batch_size
        word_count += dec_word_sum
        batch_count += batch_size

    nll = total_loss / batch_count
    ppl = np.exp(total_loss / word_count)
    return nll, ppl
def train():
    ce_time = []
    ce_ppl = []
    max_epoch = args.max_epoch
    kl_w = args.kl_start
    lr_w = args.learning_rate
    best_valid_nll = 1e100  # +inf
    best_epoch_id = -1
    decay_cnt = 0
    max_decay = args.max_decay
    decay_factor = 0.5
    decay_ts = 2
    steps_not_improved = 0
    for epoch_id in range(max_epoch):
        start_time = time.time()
        if args.enable_ce:
            train_data_iter = reader.get_data_iter(train_data,
                                                   batch_size,
                                                   args.sort_cache,
                                                   args.cache_num,
                                                   enable_ce=True)
        else:
            train_data_iter = reader.get_data_iter(train_data, batch_size,
                                                   args.sort_cache,
                                                   args.cache_num)

        total_loss = 0
        total_rec_loss = 0
        total_kl_loss = 0
        word_count = 0.0
        batch_count = 0.0
        batch_times = []
        for batch_id, batch in enumerate(train_data_iter):
            batch_start_time = time.time()
            kl_w = min(1.0, kl_w + anneal_r)
            kl_weight = kl_w
            input_data_feed, src_word_num, dec_word_sum = prepare_input(
                batch, kl_weight, lr_w)

            fetch_outs = exe.run(
                program=train_program,
                feed=input_data_feed,
                fetch_list=[loss.name, kl_loss.name, rec_loss.name],
                use_program_cache=False)

            cost_train = np.array(fetch_outs[0])
            kl_cost_train = np.array(fetch_outs[1])
            rec_cost_train = np.array(fetch_outs[2])

            total_loss += cost_train * batch_size
            total_rec_loss += rec_cost_train * batch_size
            total_kl_loss += kl_cost_train * batch_size
            word_count += dec_word_sum
            batch_count += batch_size
            batch_end_time = time.time()
            batch_time = batch_end_time - batch_start_time
            batch_times.append(batch_time)

            if batch_id > 0 and batch_id % 200 == 0:
                print("-- Epoch:[%d]; Batch:[%d]; Time: %.4f s; "
                      "kl_weight: %.4f; kl_loss: %.4f; rec_loss: %.4f; "
                      "nll: %.4f; ppl: %.4f" %
                      (epoch_id, batch_id, batch_time, kl_w,
                       total_kl_loss / batch_count,
                       total_rec_loss / batch_count,
                       total_loss / batch_count,
                       np.exp(total_loss / word_count)))
                ce_ppl.append(np.exp(total_loss / word_count))

        end_time = time.time()
        epoch_time = end_time - start_time
        ce_time.append(epoch_time)
        print(
            "\nTrain epoch:[%d]; Epoch Time: %.4f; avg_time: %.4f s/step\n"
            % (epoch_id, epoch_time, sum(batch_times) / len(batch_times)))

        val_nll, val_ppl = eval(valid_data)
        print("dev ppl", val_ppl)
        test_nll, test_ppl = eval(test_data)
        print("test ppl", test_ppl)

        if val_nll < best_valid_nll:
            best_valid_nll = val_nll
            steps_not_improved = 0
            best_nll = test_nll
            best_ppl = test_ppl
            best_epoch_id = epoch_id
            save_path = os.path.join(args.model_path,
                                     "epoch_" + str(best_epoch_id),
                                     "checkpoint")
            print("save model {}".format(save_path))
            fluid.save(main_program, save_path)
        else:
            steps_not_improved += 1
            if steps_not_improved == decay_ts:
                old_lr = lr_w
                lr_w *= decay_factor
                steps_not_improved = 0
                new_lr = lr_w
                print('-----\nchange lr, old lr: %f, new lr: %f\n-----' %
                      (old_lr, new_lr))

                dir_name = args.model_path + "/epoch_" + str(best_epoch_id)
                fluid.load(main_program, dir_name, exe)

                decay_cnt += 1
                if decay_cnt == max_decay:
                    break

    print('\nbest testing nll: %.4f, best testing ppl %.4f\n' %
          (best_nll, best_ppl))
    if args.enable_ce:
        card_num = get_cards()
        _ppl = 0
        _time = 0
        try:
            _time = ce_time[-1]
            _ppl = ce_ppl[-1]
        except:
            print("ce info error")
        print("kpis\ttrain_duration_card%s\t%s" % (card_num, _time))
        print("kpis\ttrain_ppl_card%s\t%f" % (card_num, _ppl))
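# Note: anneal_r above is a free variable defined outside train(). Given the
# update kl_w = min(1.0, kl_w + anneal_r), it is presumably the per-batch
# increment that ramps the KL weight linearly from args.kl_start up to 1.0.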
def train_ptb_lm():
    args = parse_args()

    model_type = args.model_type
    vocab_size = 10000
    if model_type == "test":
        num_layers = 1
        batch_size = 2
        hidden_size = 10
        num_steps = 3
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 1
        max_epoch = 1
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "small":
        num_layers = 2
        batch_size = 20
        hidden_size = 200
        num_steps = 20
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 4
        max_epoch = 13
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "medium":
        num_layers = 2
        batch_size = 20
        hidden_size = 650
        num_steps = 35
        init_scale = 0.05
        max_grad_norm = 5.0
        epoch_start_decay = 6
        max_epoch = 39
        dropout = 0.5
        lr_decay = 0.8
        base_learning_rate = 1.0
    elif model_type == "large":
        num_layers = 2
        batch_size = 20
        hidden_size = 1500
        num_steps = 35
        init_scale = 0.04
        max_grad_norm = 10.0
        epoch_start_decay = 14
        max_epoch = 55
        dropout = 0.65
        lr_decay = 1.0 / 1.15
        base_learning_rate = 1.0
    else:
        print("model type not supported")
        return

    with fluid.dygraph.guard(core.CUDAPlace(0)):
        if args.ce:
            print("ce mode")
            seed = 33
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            max_epoch = 1
        ptb_model = PtbModel("ptb_model",
                             hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale,
                             dropout=dropout)

        dy_param_updated = dict()
        dy_param_init = dict()
        dy_loss = None
        last_hidden = None
        last_cell = None

        data_path = args.data_path
        print("begin to load data")
        ptb_data = reader.get_ptb_data(data_path)
        print("finished load data")
        train_data, valid_data, test_data = ptb_data

        batch_len = len(train_data) // batch_size
        total_batch_size = (batch_len - 1) // num_steps
        log_interval = total_batch_size // 20

        bd = []
        lr_arr = [1.0]
        for i in range(1, max_epoch):
            bd.append(total_batch_size * i)
            new_lr = base_learning_rate * (lr_decay**max(
                i + 1 - epoch_start_decay, 0.0))
            lr_arr.append(new_lr)

        sgd = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
            boundaries=bd, values=lr_arr))

        def eval(model, data):
            print("begin to eval")
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='float32')
            model.eval()
            train_data_iter = reader.get_data_iter(data, batch_size,
                                                   num_steps)
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                init_hidden = to_variable(init_hidden_data)
                init_cell = to_variable(init_cell_data)
                dy_loss, last_hidden, last_cell = ptb_model(
                    x, y, init_hidden, init_cell)

                out_loss = dy_loss.numpy()

                init_hidden_data = last_hidden.numpy()
                init_cell_data = last_cell.numpy()

                total_loss += out_loss
                iters += num_steps

            print("eval finished")
            ppl = np.exp(total_loss / iters)
            print("ppl ", batch_id, ppl[0])
            if args.ce:
                print("kpis\ttest_ppl\t%0.3f" % ppl[0])

        grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(
            max_grad_norm)
        for epoch_id in range(max_epoch):
            ptb_model.train()
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='float32')

            train_data_iter = reader.get_data_iter(train_data, batch_size,
                                                   num_steps)

            start_time = time.time()
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                init_hidden = to_variable(init_hidden_data)
                init_cell = to_variable(init_cell_data)
                dy_loss, last_hidden, last_cell = ptb_model(
                    x, y, init_hidden, init_cell)

                out_loss = dy_loss.numpy()

                init_hidden_data = last_hidden.numpy()
                init_cell_data = last_cell.numpy()

                dy_loss.backward()
                sgd.minimize(dy_loss, grad_clip=grad_clip)

                ptb_model.clear_gradients()
                total_loss += out_loss
                iters += num_steps

                if batch_id > 0 and batch_id % log_interval == 0:
                    ppl = np.exp(total_loss / iters)
                    print(epoch_id, "ppl ", batch_id, ppl[0],
                          sgd._global_learning_rate().numpy())

            print("one epoch finished", epoch_id)
            print("time cost ", time.time() - start_time)
            ppl = np.exp(total_loss / iters)
            print("ppl ", epoch_id, ppl[0])
            if args.ce:
                print("kpis\ttrain_ppl\t%0.3f" % ppl[0])

        eval(ptb_model, test_data)
def train_ptb_lm():
    args = parse_args()

    # check if set use_gpu=True in paddlepaddle cpu version
    model_check.check_cuda(args.use_gpu)
    # check if paddlepaddle version is satisfied
    model_check.check_version()

    model_type = args.model_type

    vocab_size = 37484
    if model_type == "test":
        num_layers = 1
        batch_size = 2
        hidden_size = 10
        num_steps = 4
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 1
        max_epoch = 1
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "small":
        num_layers = 2
        batch_size = 20
        hidden_size = 200
        num_steps = 20
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 4
        max_epoch = 2
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "gru4rec":
        num_layers = 1
        batch_size = 500
        hidden_size = 100
        num_steps = 10
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 10
        max_epoch = 5
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 0.05
    elif model_type == "medium":
        num_layers = 2
        batch_size = 20
        hidden_size = 650
        num_steps = 35
        init_scale = 0.05
        max_grad_norm = 5.0
        epoch_start_decay = 6
        max_epoch = 39
        dropout = 0.5
        lr_decay = 0.8
        base_learning_rate = 1.0
    elif model_type == "large":
        num_layers = 2
        batch_size = 20
        hidden_size = 1500
        num_steps = 35
        init_scale = 0.04
        max_grad_norm = 10.0
        epoch_start_decay = 14
        max_epoch = 55
        dropout = 0.65
        lr_decay = 1.0 / 1.15
        base_learning_rate = 1.0
    else:
        print("model type not supported")
        return

    with fluid.dygraph.guard(core.CUDAPlace(0)):
        if args.ce:
            print("ce mode")
            seed = 33
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            max_epoch = 1
        ptb_model = PtbModel("ptb_model",
                             hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale,
                             dropout=dropout)

        if args.init_from_pretrain_model:
            if not os.path.exists(args.init_from_pretrain_model +
                                  '.pdparams'):
                print(args.init_from_pretrain_model)
                raise Warning("The pretrained params do not exist.")
                return
            fluid.load_dygraph(args.init_from_pretrain_model)
            print("finished initializing model from pretrained params at %s" %
                  (args.init_from_pretrain_model))

        dy_param_updated = dict()
        dy_param_init = dict()
        dy_loss = None
        last_hidden = None

        data_path = args.data_path
        print("begin to load data")
        ptb_data = reader.get_ptb_data(data_path)
        print("finished load data")
        train_data, valid_data, test_data = ptb_data

        batch_len = len(train_data) // batch_size
        total_batch_size = (batch_len - 1) // num_steps
        print("total_batch_size:", total_batch_size)
        log_interval = total_batch_size // 20

        bd = []
        lr_arr = [base_learning_rate]
        for i in range(1, max_epoch):
            bd.append(total_batch_size * i)
            new_lr = base_learning_rate * (lr_decay**max(
                i + 1 - epoch_start_decay, 0.0))
            lr_arr.append(new_lr)

        sgd = AdagradOptimizer(parameter_list=ptb_model.parameters(),
                               learning_rate=fluid.layers.piecewise_decay(
                                   boundaries=bd, values=lr_arr))

        print("parameters:--------------------------------")
        for para in ptb_model.parameters():
            print(para.name)
        print("parameters:--------------------------------")

        def eval(model, data):
            print("begin to eval")
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')
            model.eval()
            train_data_iter = reader.get_data_iter(data, batch_size,
                                                   num_steps)
            init_hidden = to_variable(init_hidden_data)
            accum_num_recall = 0.0
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, num_steps, 1))

                x = to_variable(x_data)
                y = to_variable(y_data)
                dy_loss, last_hidden, acc = ptb_model(x, y, init_hidden)

                out_loss = dy_loss.numpy()
                acc_ = acc.numpy()[0]
                accum_num_recall += acc_
                if batch_id % 1 == 0:
                    print("batch_id:%d recall@20:%.4f" %
                          (batch_id, accum_num_recall / (batch_id + 1)))

                init_hidden = last_hidden

                total_loss += out_loss
                iters += num_steps

            print("eval finished")
            ppl = np.exp(total_loss / iters)
            print("recall@20 ", accum_num_recall / (batch_id + 1))
            if args.ce:
                print("kpis\ttest_ppl\t%0.3f" % ppl[0])

        grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(
            max_grad_norm)
        for epoch_id in range(max_epoch):
            ptb_model.train()
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')

            train_data_iter = reader.get_data_iter(train_data, batch_size,
                                                   num_steps)
            init_hidden = to_variable(init_hidden_data)

            start_time = time.time()
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch

                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, num_steps, 1))

                x = to_variable(x_data)
                y = to_variable(y_data)

                dy_loss, last_hidden, acc = ptb_model(x, y, init_hidden)

                out_loss = dy_loss.numpy()
                acc_ = acc.numpy()[0]
                init_hidden = last_hidden

                dy_loss.backward()
                sgd.minimize(dy_loss, grad_clip=grad_clip)
                ptb_model.clear_gradients()

                total_loss += out_loss
                iters += num_steps

                if batch_id > 0 and batch_id % 100 == 1:
                    ppl = np.exp(total_loss / iters)
                    print(
                        "-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, acc: %.5f, lr: %.5f"
                        % (epoch_id, batch_id, ppl[0], acc_,
                           sgd._global_learning_rate().numpy()))

            print("one epoch finished", epoch_id)
            print("time cost ", time.time() - start_time)
            ppl = np.exp(total_loss / iters)
            print("-- Epoch:[%d]; ppl: %.5f" % (epoch_id, ppl[0]))

            if args.ce:
                print("kpis\ttrain_ppl\t%0.3f" % ppl[0])

            save_model_dir = os.path.join(args.save_model_dir, str(epoch_id),
                                          'params')
            fluid.save_dygraph(ptb_model.state_dict(), save_model_dir)
            print("Saved model to: %s.\n" % save_model_dir)

        eval(ptb_model, test_data)
def train(sess):
    sess.run(init)

    if args.profile:
        profiler_step = 0
        profiler = model_analyzer.Profiler(graph=sess.graph)
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()

    total_time = 0.0
    epoch_times = []
    for epoch_id in xrange(max_epoch):
        batch_times = []
        epoch_start_time = time.time()
        train_data_iter = reader.get_data_iter(train_data, batch_size,
                                               num_steps)

        # assign lr, update the learning rate
        new_lr_1 = base_learning_rate * (lr_decay**max(
            epoch_id + 1 - epoch_start_decay, 0.0))
        sess.run(lr_update, {new_lr: new_lr_1})

        total_loss = 0.0
        iters = 0
        batch_len = len(train_data) // batch_size
        epoch_size = (batch_len - 1) // num_steps
        if args.profile:
            log_fre = 1
        else:
            log_fre = epoch_size // 10

        init_h = np.zeros((num_layers, batch_size, hidden_size),
                          dtype='float32')
        init_c = np.zeros((num_layers, batch_size, hidden_size),
                          dtype='float32')

        count = 0.0
        for batch_id, batch in enumerate(train_data_iter):
            x, y = batch
            feed_dict = {}
            feed_dict[feeding_list[0]] = x
            feed_dict[feeding_list[1]] = y
            feed_dict[feeding_list[2]] = init_h
            feed_dict[feeding_list[3]] = init_c

            batch_start_time = time.time()
            if args.profile:
                output = sess.run([cost, final_h, final_c, train_op],
                                  feed_dict,
                                  options=run_options,
                                  run_metadata=run_metadata)
                profiler.add_step(step=profiler_step, run_meta=run_metadata)
                profiler_step = profiler_step + 1
                if batch_id >= 10:
                    break
            else:
                output = sess.run([cost, final_h, final_c, train_op],
                                  feed_dict)
            batch_time = time.time() - batch_start_time
            batch_times.append(batch_time)

            train_cost = output[0]
            init_h = output[1]
            init_c = output[2]

            total_loss += train_cost
            iters += num_steps
            count = count + 1
            if batch_id > 0 and batch_id % log_fre == 0:
                ppl = np.exp(total_loss / iters)
                print(
                    "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                    % (epoch_id, batch_id, batch_time, ppl, new_lr_1))

        ppl = np.exp(total_loss / iters)
        epoch_time = time.time() - epoch_start_time
        epoch_times.append(epoch_time)
        total_time += epoch_time
        print(
            "\nTrain epoch:[%d]; epoch Time: %.5f s; ppl: %.5f; avg_time: %.5f steps/s\n"
            % (epoch_id, epoch_time, ppl, (batch_id + 1) / sum(batch_times)))

        valid_ppl, _ = eval(sess, valid_data)
        print("Valid ppl: %.5f" % valid_ppl)

    test_ppl, test_time = eval(sess, test_data)
    print("Test Time (total): %.5f, ppl: %.5f" % (test_time, test_ppl))

    if args.profile:
        profile_op_opt_builder = option_builder.ProfileOptionBuilder()
        profile_op_opt_builder.select(['micros', 'occurrence'])
        profile_op_opt_builder.order_by('micros')
        profile_op_opt_builder.with_max_depth(50)
        profiler.profile_operations(profile_op_opt_builder.build())
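# The lr_update op and new_lr placeholder fed above are presumably built with
# the standard TF1 assign pattern; a minimal sketch (the names here mirror the
# snippet, but the construction itself is an assumption):
lr = tf.Variable(0.0, trainable=False, name="learning_rate")
new_lr = tf.placeholder(tf.float32, shape=[], name="new_lr")
lr_update = tf.assign(lr, new_lr)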
def main():
    args = parse_args()
    print(args)

    num_layers = args.num_layers
    src_vocab_size = args.src_vocab_size
    tar_vocab_size = args.tar_vocab_size
    batch_size = args.batch_size
    dropout = args.dropout
    init_scale = args.init_scale
    max_grad_norm = args.max_grad_norm
    hidden_size = args.hidden_size

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    with fluid.dygraph.guard(place):
        # args.enable_ce = True
        if args.enable_ce:
            fluid.default_startup_program().random_seed = 102
            fluid.default_main_program().random_seed = 102
            np.random.seed(102)
            random.seed(102)

        # Training process
        if args.attention:
            model = AttentionModel(hidden_size,
                                   src_vocab_size,
                                   tar_vocab_size,
                                   batch_size,
                                   num_layers=num_layers,
                                   init_scale=init_scale,
                                   dropout=dropout)
        else:
            model = BaseModel(hidden_size,
                              src_vocab_size,
                              tar_vocab_size,
                              batch_size,
                              num_layers=num_layers,
                              init_scale=init_scale,
                              dropout=dropout)
        global_norm_clip = GradientClipByGlobalNorm(max_grad_norm)
        lr = args.learning_rate
        opt_type = args.optimizer
        if opt_type == "sgd":
            optimizer = fluid.optimizer.SGD(lr,
                                            parameter_list=model.parameters(),
                                            grad_clip=global_norm_clip)
        elif opt_type == "adam":
            optimizer = fluid.optimizer.Adam(
                lr,
                parameter_list=model.parameters(),
                grad_clip=global_norm_clip)
        else:
            print("only support [sgd|adam]")
            raise Exception("opt type not support")

        train_data_prefix = args.train_data_prefix
        eval_data_prefix = args.eval_data_prefix
        test_data_prefix = args.test_data_prefix
        vocab_prefix = args.vocab_prefix
        src_lang = args.src_lang
        tar_lang = args.tar_lang
        print("begin to load data")
        raw_data = reader.raw_data(src_lang, tar_lang, vocab_prefix,
                                   train_data_prefix, eval_data_prefix,
                                   test_data_prefix, args.max_len)
        print("finished load data")
        train_data, valid_data, test_data, _ = raw_data

        def prepare_input(batch, epoch_id=0):
            src_ids, src_mask, tar_ids, tar_mask = batch
            res = {}
            src_ids = src_ids.reshape((src_ids.shape[0], src_ids.shape[1]))
            in_tar = tar_ids[:, :-1]
            label_tar = tar_ids[:, 1:]

            in_tar = in_tar.reshape((in_tar.shape[0], in_tar.shape[1]))
            label_tar = label_tar.reshape(
                (label_tar.shape[0], label_tar.shape[1], 1))
            inputs = [src_ids, in_tar, label_tar, src_mask, tar_mask]
            return inputs, np.sum(tar_mask)

        # get train epoch size
        def eval(data, epoch_id=0):
            model.eval()
            eval_data_iter = reader.get_data_iter(data, batch_size,
                                                  mode='eval')
            total_loss = 0.0
            word_count = 0.0
            for batch_id, batch in enumerate(eval_data_iter):
                input_data_feed, word_num = prepare_input(batch, epoch_id)
                loss = model(input_data_feed)

                total_loss += loss * batch_size
                word_count += word_num
            ppl = np.exp(total_loss.numpy() / word_count)
            model.train()
            return ppl

        ce_time = []
        ce_ppl = []
        max_epoch = args.max_epoch
        for epoch_id in range(max_epoch):
            epoch_start = time.time()

            model.train()
            if args.enable_ce:
                train_data_iter = reader.get_data_iter(train_data,
                                                       batch_size,
                                                       enable_ce=True)
            else:
                train_data_iter = reader.get_data_iter(train_data,
                                                       batch_size)

            total_loss = 0
            word_count = 0.0
            batch_times = []
            total_reader_cost = 0.0
            interval_time_start = time.time()
            batch_start = time.time()
            for batch_id, batch in enumerate(train_data_iter):
                batch_reader_end = time.time()
                total_reader_cost += batch_reader_end - batch_start
                input_data_feed, word_num = prepare_input(batch,
                                                          epoch_id=epoch_id)
                word_count += word_num
                loss = model(input_data_feed)
                loss.backward()
                optimizer.minimize(loss)
                model.clear_gradients()
                total_loss += loss * batch_size
                total_loss_value = total_loss.numpy()
                batch_times.append(time.time() - batch_start)

                if batch_id > 0 and batch_id % 100 == 0:
                    print(
                        "-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, batch_cost: %.5f sec, reader_cost: %.5f sec, ips: %.5f words/sec"
                        % (epoch_id, batch_id,
                           np.exp(total_loss_value / word_count),
                           (time.time() - interval_time_start) / 100,
                           total_reader_cost / 100,
                           word_count / (time.time() - interval_time_start)))
                    ce_ppl.append(np.exp(total_loss_value / word_count))
                    total_loss = 0.0
                    word_count = 0.0
                    total_reader_cost = 0.0
                    interval_time_start = time.time()
                batch_start = time.time()

            train_epoch_cost = time.time() - epoch_start
            print(
                "\nTrain epoch:[%d]; epoch_cost: %.5f sec; avg_batch_cost: %.5f s/step\n"
                % (epoch_id, train_epoch_cost,
                   sum(batch_times) / len(batch_times)))
            ce_time.append(train_epoch_cost)

            dir_name = os.path.join(args.model_path,
                                    "epoch_" + str(epoch_id))
            print("begin to save", dir_name)
            paddle.fluid.save_dygraph(model.state_dict(), dir_name)
            print("save finished")

            dev_ppl = eval(valid_data)
            print("dev ppl", dev_ppl)
            test_ppl = eval(test_data)
            print("test ppl", test_ppl)

    if args.enable_ce:
        card_num = get_cards()
        _ppl = 0
        _time = 0
        try:
            _time = ce_time[-1]
            _ppl = ce_ppl[-1]
        except:
            print("ce info error")
        print("kpis\ttrain_duration_card%s\t%s" % (card_num, _time))
        print("kpis\ttrain_ppl_card%s\t%f" % (card_num, _ppl))
def eval(data):
    if args.inference_only and args.init_params_path:
        dirname = args.init_params_path
        filename = None
        if not os.path.isdir(args.init_params_path):
            dirname = os.path.dirname(args.init_params_path)
            filename = os.path.basename(args.init_params_path)
        fluid.io.load_persistables(exe,
                                   dirname,
                                   main_program=main_program,
                                   filename=filename)
        print("Load parameters from: %s." % args.init_params_path)

    batch_times = []
    start_time = time.time()
    # during eval the batch_size is set to 1
    eval_data_iter = reader.get_data_iter(data, batch_size, num_steps)
    total_loss = 0.0
    iters = 0
    init_hidden = np.zeros((num_layers, batch_size, hidden_size),
                           dtype='float32')
    init_cell = np.zeros((num_layers, batch_size, hidden_size),
                         dtype='float32')
    for batch_id, batch in enumerate(eval_data_iter):
        input_data_feed = prepare_input(batch,
                                        init_hidden,
                                        init_cell,
                                        epoch_id=0,
                                        with_lr=False)
        batch_start_time = time.time()
        # eval should not run the grad op and change the parameters.
        # use Executor to eval
        fetch_outs = exe.run(
            program=inference_program,
            feed=input_data_feed,
            fetch_list=[loss.name, last_hidden.name, last_cell.name],
            use_program_cache=True)
        batch_times.append(time.time() - batch_start_time)

        cost_train = np.array(fetch_outs[0])
        init_hidden = np.array(fetch_outs[1])
        init_cell = np.array(fetch_outs[2])

        total_loss += cost_train
        iters += num_steps
    ppl = np.exp(total_loss / iters)

    eval_time_total = time.time() - start_time
    eval_time_run = np.sum(batch_times)

    # Benchmark
    if args.inference_only:
        print("\n======== Benchmark Result ========")
        print(
            "Eval batch_size: %d; Time (total): %.5f s; Time (only run): %.5f s; ppl: %.5f"
            % (batch_size, eval_time_total, eval_time_run, ppl[0]))
        print("")

        # Save the inference model for C++ inference purpose
        fluid.io.save_inference_model(save_model_dir,
                                      feed_order,
                                      [loss, last_hidden, last_cell],
                                      exe,
                                      main_program=inference_program,
                                      model_filename="model",
                                      params_filename="params")
        print("Save inference model to: %s." % save_model_dir)

    return ppl
def train_an_epoch(epoch_id, batch_times):
    # get train epoch size
    num_batches = len(train_data) // batch_size
    epoch_size = (num_batches - 1) // num_steps
    if args.profile:
        log_interval = 1
    else:
        log_interval = max(1, epoch_size // 10)

    data_iter_size = batch_size
    if device_count > 1 and args.parallel:
        data_iter_size = batch_size * device_count
    train_data_iter = reader.get_data_iter(train_data, data_iter_size,
                                           num_steps)

    total_loss = 0
    iters = 0
    if device_count > 1 and args.parallel:
        init_hidden = np.zeros(
            (num_layers * device_count, batch_size, hidden_size),
            dtype='float32')
        init_cell = np.zeros(
            (num_layers * device_count, batch_size, hidden_size),
            dtype='float32')
    else:
        init_hidden = np.zeros((num_layers, batch_size, hidden_size),
                               dtype='float32')
        init_cell = np.zeros((num_layers, batch_size, hidden_size),
                             dtype='float32')
    for batch_id, batch in enumerate(train_data_iter):
        input_data_feed = prepare_input(batch,
                                        init_hidden,
                                        init_cell,
                                        epoch_id=epoch_id,
                                        device_count=device_count)
        batch_start_time = time.time()
        fetch_outs = exe.run(train_program,
                             feed=input_data_feed,
                             fetch_list=[
                                 loss.name, last_hidden.name,
                                 last_cell.name, "learning_rate"
                             ],
                             use_program_cache=True)
        batch_time = time.time() - batch_start_time
        batch_times.append(batch_time)

        cost_train = np.array(fetch_outs[0])
        init_hidden = np.array(fetch_outs[1])
        init_cell = np.array(fetch_outs[2])
        lr = np.array(fetch_outs[3])

        total_loss += cost_train
        iters += num_steps
        if batch_id > 0 and batch_id % log_interval == 0:
            ppl = np.exp(total_loss / iters)
            print(
                "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))

        if args.profile:
            if batch_id == 1:
                profiler.reset_profiler()
            elif batch_id >= 11:
                break
    ppl = np.exp(total_loss / iters)
    return ppl
def train():
    model = BaseModel(batch_size=batch_size, maxlen=n_frames)
    loss, acc, output, no_grad_set = model.build_graph()
    main_program = fluid.default_main_program()
    inference_program = fluid.default_main_program().clone(for_test=True)
    optimizer = fluid.optimizer.Adadelta(0.001)
    optimizer.minimize(loss, no_grad_set=no_grad_set)
    place = fluid.CPUPlace()
    exe = Executor(place)
    exe.run(framework.default_startup_program())

    log_writer = LogWriter(log_path, sync_cycle=10)
    with log_writer.mode("train") as logger:
        log_train_loss = logger.scalar(tag="train_loss")
        log_train_acc = logger.scalar(tag="train_acc")
    with log_writer.mode("validation") as logger:
        log_valid_loss = logger.scalar(tag="validation_loss")
        log_valid_acc = logger.scalar(tag="validation_acc")

    def prepare_input(batch):
        x, y, x_seqlen = batch
        res = {}
        res['input'] = np.array(x).astype("float32")
        res['input_seqlen'] = np.array(x_seqlen).astype("int64")
        res['label'] = np.array(y).astype("float32")
        return res

    # (samples, seq, width, height, pixel)
    noisy_movies, shifted_movies = reader.generate_movies(n_samples, n_frames)
    data = noisy_movies[:1000], shifted_movies[:1000]
    train_data, validation_data = split(data, validation_split)

    step_id = 0
    for epoch_id in range(max_epoch):
        start_time = time.time()
        print("epoch id", epoch_id)

        valid_data_iter = reader.get_data_iter(validation_data, batch_size)
        train_data_iter = reader.get_data_iter(train_data, batch_size)

        # train
        total_loss = 0
        batch_id = 0
        for batch in train_data_iter:
            input_data_feed = prepare_input(batch)
            fetch_outs = exe.run(program=main_program,
                                 feed=input_data_feed,
                                 fetch_list=[loss.name, acc.name],
                                 use_program_cache=False)
            cost_train = np.array(fetch_outs[0])
            acc_train = fetch_outs[1]
            total_loss += cost_train

            if batch_id > 0 and batch_id % 5 == 0:
                log_train_loss.add_record(step_id, total_loss)
                log_train_acc.add_record(step_id, acc_train)
                step_id += 1
                print("current loss: %.7f, for batch %d" %
                      (total_loss, batch_id))
                total_loss = 0.0
            batch_id += 1

        # validate
        total_loss = 0
        total_acc = 0
        batch_id = 0
        for batch in valid_data_iter:
            input_data_feed = prepare_input(batch)
            fetch_outs = exe.run(program=inference_program,
                                 feed=input_data_feed,
                                 fetch_list=[loss.name, acc.name],
                                 use_program_cache=False)
            cost_train = np.array(fetch_outs[0])
            acc_train = fetch_outs[1]
            total_loss += cost_train
            # accumulate accuracy so the per-epoch average below is meaningful
            total_acc += acc_train
            batch_id += 1
        log_valid_loss.add_record(epoch_id, total_loss)
        log_valid_acc.add_record(epoch_id, total_acc / batch_id)
        print("validation loss: %.7f" % (total_loss))

    fluid.io.save_inference_model(
        dirname=params_path,
        feeded_var_names=['input', 'input_seqlen'],
        target_vars=[loss, acc],
        executor=exe)
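# split() is not shown in these snippets. A minimal sketch of a plausible
# implementation, assuming it holds out the trailing validation_split
# fraction of the samples:
def split(data, validation_split):
    x, y = data
    n_valid = max(1, int(len(x) * validation_split))
    train = (x[:-n_valid], y[:-n_valid])
    valid = (x[-n_valid:], y[-n_valid:])
    return train, valid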
def train_ptb_lm():
    args = parse_args()

    # check if set use_gpu=True in paddlepaddle cpu version
    model_check.check_cuda(args.use_gpu)

    place = core.CPUPlace()
    if args.use_gpu == True:
        place = core.CUDAPlace(0)

    # check if paddlepaddle version is satisfied
    model_check.check_version()

    model_type = args.model_type
    vocab_size = 10000
    if model_type == "test":
        num_layers = 1
        batch_size = 2
        hidden_size = 10
        num_steps = 3
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 1
        max_epoch = 1
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "small":
        num_layers = 2
        batch_size = 20
        hidden_size = 200
        num_steps = 20
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 4
        max_epoch = 13
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "medium":
        num_layers = 2
        batch_size = 20
        hidden_size = 650
        num_steps = 35
        init_scale = 0.05
        max_grad_norm = 5.0
        epoch_start_decay = 6
        max_epoch = 39
        dropout = 0.5
        lr_decay = 0.8
        base_learning_rate = 1.0
    elif model_type == "large":
        num_layers = 2
        batch_size = 20
        hidden_size = 1500
        num_steps = 35
        init_scale = 0.04
        max_grad_norm = 10.0
        epoch_start_decay = 14
        max_epoch = 55
        dropout = 0.65
        lr_decay = 1.0 / 1.15
        base_learning_rate = 1.0
    else:
        print("model type not supported")
        return

    with fluid.dygraph.guard(place):
        if args.ce:
            print("ce mode")
            seed = 33
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            max_epoch = 1
        ptb_model = PtbModel(
            hidden_size=hidden_size,
            vocab_size=vocab_size,
            num_layers=num_layers,
            num_steps=num_steps,
            init_scale=init_scale,
            dropout=dropout)

        if args.init_from_pretrain_model:
            if not os.path.exists(args.init_from_pretrain_model +
                                  '.pdparams'):
                print(args.init_from_pretrain_model)
                raise Warning("The pretrained params do not exist.")
                return
            fluid.load_dygraph(args.init_from_pretrain_model)
            print("finished initializing model from pretrained params at %s" %
                  (args.init_from_pretrain_model))

        dy_param_updated = dict()
        dy_param_init = dict()
        dy_loss = None
        last_hidden = None
        last_cell = None

        data_path = args.data_path
        print("begin to load data")
        ptb_data = reader.get_ptb_data(data_path)
        print("finished load data")
        train_data, valid_data, test_data = ptb_data

        batch_len = len(train_data) // batch_size
        total_batch_size = (batch_len - 1) // num_steps
        log_interval = 200

        bd = []
        lr_arr = [1.0]
        for i in range(1, max_epoch):
            bd.append(total_batch_size * i)
            new_lr = base_learning_rate * (lr_decay**max(
                i + 1 - epoch_start_decay, 0.0))
            lr_arr.append(new_lr)

        sgd = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
            boundaries=bd, values=lr_arr),
                           parameter_list=ptb_model.parameters())

        def eval(model, data):
            print("begin to eval")
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='float32')
            model.eval()
            train_data_iter = reader.get_data_iter(data, batch_size,
                                                   num_steps)
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, num_steps, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                init_hidden = to_variable(init_hidden_data)
                init_cell = to_variable(init_cell_data)
                dy_loss, last_hidden, last_cell = ptb_model(
                    x, y, init_hidden, init_cell)

                out_loss = dy_loss.numpy()

                init_hidden_data = last_hidden.numpy()
                init_cell_data = last_cell.numpy()

                total_loss += out_loss
                iters += num_steps

            print("eval finished")
            ppl = np.exp(total_loss / iters)
            print("ppl ", batch_id, ppl[0])
            if args.ce:
                print("kpis\ttest_ppl\t%0.3f" % ppl[0])

        grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(
            max_grad_norm)
        for epoch_id in range(1):
            ptb_model.train()
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='float32')

            train_data_iter = reader.get_data_iter(train_data, batch_size,
                                                   num_steps)
            init_hidden = to_variable(init_hidden_data)
            init_cell = to_variable(init_cell_data)
            start_time = time.time()
            start = time.time()
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch

                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, num_steps, 1))

                x = to_variable(x_data)
                y = to_variable(y_data)

                dy_loss, last_hidden, last_cell = ptb_model(
                    x, y, init_hidden, init_cell)
                init_hidden = last_hidden
                init_cell = last_cell
                init_hidden.stop_gradient = True
                init_cell.stop_gradient = True
                out_loss = dy_loss.numpy()

                dy_loss.backward()
                sgd.minimize(dy_loss, grad_clip=grad_clip)

                ptb_model.clear_gradients()
                total_loss += out_loss
                iters += num_steps

                if batch_id > 0 and batch_id % log_interval == 0:
                    ppl = np.exp(total_loss / iters)
                    print(
                        "-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f"
                        % (epoch_id, batch_id, ppl[0],
                           sgd._global_learning_rate().numpy(), out_loss))
            end = time.time()
            print("One epoch cost {}".format(end - start))
            print("one epoch finished", epoch_id)
            print("time cost ", time.time() - start_time)
            ppl = np.exp(total_loss / iters)
            print("-- Epoch:[%d]; ppl: %.5f" % (epoch_id, ppl[0]))

            if batch_size <= 20 and epoch_id == 0 and ppl[0] > 1000:
                # for bad init, after first epoch, the loss is over 1000
                # no more need to continue
                print(
                    "Parameters are randomly initialized and not good this time because the loss is over 1000 after the first epoch."
                )
                print("Abort this training process and please start again.")
                return

            if args.ce:
                print("kpis\ttrain_ppl\t%0.3f" % ppl[0])

            save_model_dir = os.path.join(args.save_model_dir, str(epoch_id),
                                          'params')
            fluid.save_dygraph(ptb_model.state_dict(), save_model_dir)
            print("Saved model to: %s.\n" % save_model_dir)

        eval(ptb_model, valid_data)
        eval(ptb_model, test_data)
def train():
    args = parse_args()
    model_type = args.model_type
    rnn_model = args.rnn_model

    logger = logging.getLogger("lm")
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    if args.enable_ce:
        fluid.default_startup_program().random_seed = SEED
    if args.log_path:
        file_handler = logging.FileHandler(args.log_path)
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    else:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    logger.info('Running with args : {}'.format(args))

    vocab_size = 10000
    if model_type == "test":
        num_layers = 1
        batch_size = 2
        hidden_size = 10
        num_steps = 3
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 1
        max_epoch = 1
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "small":
        num_layers = 2
        batch_size = 20
        hidden_size = 200
        num_steps = 20
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 4
        max_epoch = 13
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "medium":
        num_layers = 2
        batch_size = 20
        hidden_size = 650
        num_steps = 35
        init_scale = 0.05
        max_grad_norm = 5.0
        epoch_start_decay = 6
        max_epoch = 39
        dropout = 0.5
        lr_decay = 0.8
        base_learning_rate = 1.0
    elif model_type == "large":
        num_layers = 2
        batch_size = 20
        hidden_size = 1500
        num_steps = 35
        init_scale = 0.04
        max_grad_norm = 10.0
        epoch_start_decay = 14
        max_epoch = 55
        dropout = 0.65
        lr_decay = 1.0 / 1.15
        base_learning_rate = 1.0
    else:
        print("model type not supported")
        return

    # Training process
    loss, last_hidden, last_cell, feed_order = lm_model.lm_model(
        hidden_size,
        vocab_size,
        batch_size,
        num_layers=num_layers,
        num_steps=num_steps,
        init_scale=init_scale,
        dropout=dropout,
        rnn_model=rnn_model)

    # clone from default main program and use it as the validation program
    main_program = fluid.default_main_program()
    inference_program = fluid.default_main_program().clone(for_test=True)

    fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
        clip_norm=max_grad_norm))

    learning_rate = fluid.layers.create_global_var(name="learning_rate",
                                                   shape=[1],
                                                   value=1.0,
                                                   dtype='float32',
                                                   persistable=True)

    optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
    optimizer.minimize(loss)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = Executor(place)
    exe.run(framework.default_startup_program())

    data_path = args.data_path
    print("begin to load data")
    raw_data = reader.ptb_raw_data(data_path)
    print("finished load data")
    train_data, valid_data, test_data, _ = raw_data

    def prepare_input(batch, init_hidden, init_cell, epoch_id=0,
                      with_lr=True):
        x, y = batch
        new_lr = base_learning_rate * (lr_decay**max(
            epoch_id + 1 - epoch_start_decay, 0.0))
        lr = np.ones((1), dtype='float32') * new_lr
        res = {}
        x = x.reshape((-1, num_steps, 1))
        y = y.reshape((-1, 1))

        res['x'] = x
        res['y'] = y
        res['init_hidden'] = init_hidden
        res['init_cell'] = init_cell
        if with_lr:
            res['learning_rate'] = lr

        return res

    def eval(data):
        # during eval the batch_size is set to 1
        eval_data_iter = reader.get_data_iter(data, batch_size, num_steps)
        total_loss = 0.0
        iters = 0
        init_hidden = np.zeros((num_layers, batch_size, hidden_size),
                               dtype='float32')
        init_cell = np.zeros((num_layers, batch_size, hidden_size),
                             dtype='float32')
        for batch_id, batch in enumerate(eval_data_iter):
            input_data_feed = prepare_input(batch,
                                            init_hidden,
                                            init_cell,
                                            epoch_id,
                                            with_lr=False)
            fetch_outs = exe.run(
                inference_program,
                feed=input_data_feed,
                fetch_list=[loss.name, last_hidden.name, last_cell.name],
                use_program_cache=True)

            cost_train = np.array(fetch_outs[0])
            init_hidden = np.array(fetch_outs[1])
            init_cell = np.array(fetch_outs[2])

            total_loss += cost_train
            iters += num_steps
        ppl = np.exp(total_loss / iters)
        return ppl

    # get train epoch size
    batch_len = len(train_data) // batch_size
    epoch_size = (batch_len - 1) // num_steps
    log_interval = epoch_size // 10
    total_time = 0.0
    for epoch_id in range(max_epoch):
        start_time = time.time()
        print("epoch id", epoch_id)
        train_data_iter = reader.get_data_iter(train_data, batch_size,
                                               num_steps)

        total_loss = 0
        init_hidden = None
        init_cell = None
        #debug_para(fluid.framework.default_main_program(), parallel_executor)
        total_loss = 0
        iters = 0
        init_hidden = np.zeros((num_layers, batch_size, hidden_size),
                               dtype='float32')
        init_cell = np.zeros((num_layers, batch_size, hidden_size),
                             dtype='float32')
        for batch_id, batch in enumerate(train_data_iter):
            input_data_feed = prepare_input(batch,
                                            init_hidden,
                                            init_cell,
                                            epoch_id=epoch_id)
            fetch_outs = exe.run(feed=input_data_feed,
                                 fetch_list=[
                                     loss.name, last_hidden.name,
                                     last_cell.name, 'learning_rate'
                                 ],
                                 use_program_cache=True)

            cost_train = np.array(fetch_outs[0])
            init_hidden = np.array(fetch_outs[1])
            init_cell = np.array(fetch_outs[2])

            lr = np.array(fetch_outs[3])

            total_loss += cost_train
            iters += num_steps
            if batch_id > 0 and batch_id % log_interval == 0:
                ppl = np.exp(total_loss / iters)
                print("ppl ", batch_id, ppl[0], lr[0])

        ppl = np.exp(total_loss / iters)
        if epoch_id == 0 and ppl[0] > 1000:
            # for bad init, after first epoch, the loss is over 1000
            # no more need to continue
            return
        end_time = time.time()
        total_time += end_time - start_time
        print("train ppl", ppl[0])

        if epoch_id == max_epoch - 1 and args.enable_ce:
            card_num = get_cards()
            print("ptblm\tlstm_language_model_%s_duration_card%d\t%s" %
                  (args.rnn_model, card_num, total_time / max_epoch))
            print("ptblm\tlstm_language_model_%s_loss_card%d\t%s" %
                  (args.rnn_model, card_num, ppl[0]))

        model_path = os.path.join("model_new/", str(epoch_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(executor=exe,
                                   dirname=model_path,
                                   main_program=main_program)
        valid_ppl = eval(valid_data)
        print("valid ppl", valid_ppl[0])
    test_ppl = eval(test_data)
    print("test ppl", test_ppl[0])
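# Worked example of the decay schedule computed in prepare_input() above,
# using the "medium" config (base_learning_rate=1.0, lr_decay=0.8,
# epoch_start_decay=6):
#   epoch_id 0..5 -> max(epoch_id + 1 - 6, 0) = 0 -> lr = 1.0 * 0.8**0 = 1.0
#   epoch_id 6    -> max(7 - 6, 0) = 1             -> lr = 1.0 * 0.8**1 = 0.8
#   epoch_id 7    -> max(8 - 6, 0) = 2             -> lr = 1.0 * 0.8**2 = 0.64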
def train():
    args = parse_args()

    num_layers = args.num_layers
    src_vocab_size = args.src_vocab_size
    tar_vocab_size = args.tar_vocab_size
    batch_size = args.batch_size
    dropout = args.dropout
    init_scale = args.init_scale
    max_grad_norm = args.max_grad_norm
    hidden_size = args.hidden_size

    if args.enable_ce:
        fluid.default_main_program().random_seed = 102
        framework.default_startup_program().random_seed = 102

    # Training process
    if args.attention:
        model = AttentionModel(hidden_size,
                               src_vocab_size,
                               tar_vocab_size,
                               batch_size,
                               num_layers=num_layers,
                               init_scale=init_scale,
                               dropout=dropout)
    else:
        model = BaseModel(hidden_size,
                          src_vocab_size,
                          tar_vocab_size,
                          batch_size,
                          num_layers=num_layers,
                          init_scale=init_scale,
                          dropout=dropout)
    loss = model.build_graph()

    # clone from default main program and use it as the validation program
    main_program = fluid.default_main_program()
    inference_program = fluid.default_main_program().clone(for_test=True)

    fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
        clip_norm=max_grad_norm))

    lr = args.learning_rate
    opt_type = args.optimizer
    if opt_type == "sgd":
        optimizer = fluid.optimizer.SGD(lr)
    elif opt_type == "adam":
        optimizer = fluid.optimizer.Adam(lr)
    else:
        print("only support [sgd|adam]")
        raise Exception("opt type not support")

    optimizer.minimize(loss)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = Executor(place)
    exe.run(framework.default_startup_program())

    train_data_prefix = args.train_data_prefix
    eval_data_prefix = args.eval_data_prefix
    test_data_prefix = args.test_data_prefix
    vocab_prefix = args.vocab_prefix
    src_lang = args.src_lang
    tar_lang = args.tar_lang
    print("begin to load data")
    raw_data = reader.raw_data(src_lang, tar_lang, vocab_prefix,
                               train_data_prefix, eval_data_prefix,
                               test_data_prefix, args.max_len)
    print("finished load data")
    train_data, valid_data, test_data, _ = raw_data

    def prepare_input(batch, epoch_id=0, with_lr=True):
        src_ids, src_mask, tar_ids, tar_mask = batch
        res = {}
        src_ids = src_ids.reshape((src_ids.shape[0], src_ids.shape[1], 1))
        in_tar = tar_ids[:, :-1]
        label_tar = tar_ids[:, 1:]

        in_tar = in_tar.reshape((in_tar.shape[0], in_tar.shape[1], 1))
        label_tar = label_tar.reshape(
            (label_tar.shape[0], label_tar.shape[1], 1))

        res['src'] = src_ids
        res['tar'] = in_tar
        res['label'] = label_tar
        res['src_sequence_length'] = src_mask
        res['tar_sequence_length'] = tar_mask

        return res, np.sum(tar_mask)

    # get train epoch size
    def eval(data, epoch_id=0):
        eval_data_iter = reader.get_data_iter(data, batch_size, mode='eval')
        total_loss = 0.0
        word_count = 0.0
        for batch_id, batch in enumerate(eval_data_iter):
            input_data_feed, word_num = prepare_input(batch,
                                                      epoch_id,
                                                      with_lr=False)
            fetch_outs = exe.run(inference_program,
                                 feed=input_data_feed,
                                 fetch_list=[loss.name],
                                 use_program_cache=False)
            cost_train = np.array(fetch_outs[0])

            total_loss += cost_train * batch_size
            word_count += word_num
        ppl = np.exp(total_loss / word_count)
        return ppl

    ce_time = []
    ce_ppl = []
    max_epoch = args.max_epoch
    for epoch_id in range(max_epoch):
        start_time = time.time()
        print("epoch id", epoch_id)
        if args.enable_ce:
            train_data_iter = reader.get_data_iter(train_data,
                                                   batch_size,
                                                   enable_ce=True)
        else:
            train_data_iter = reader.get_data_iter(train_data, batch_size)

        total_loss = 0
        word_count = 0.0
        for batch_id, batch in enumerate(train_data_iter):
            input_data_feed, word_num = prepare_input(batch,
                                                      epoch_id=epoch_id)
            fetch_outs = exe.run(feed=input_data_feed,
                                 fetch_list=[loss.name],
                                 use_program_cache=True)

            cost_train = np.array(fetch_outs[0])
            total_loss += cost_train * batch_size
            word_count += word_num

            if batch_id > 0 and batch_id % 100 == 0:
                print("ppl", batch_id, np.exp(total_loss / word_count))
                ce_ppl.append(np.exp(total_loss / word_count))
                total_loss = 0.0
                word_count = 0.0

        end_time = time.time()
        time_gap = end_time - start_time
        ce_time.append(time_gap)

        dir_name = args.model_path + "/epoch_" + str(epoch_id)
        print("begin to save", dir_name)
        fluid.io.save_params(exe, dir_name)
        print("save finished")
        dev_ppl = eval(valid_data)
        print("dev ppl", dev_ppl)
        test_ppl = eval(test_data)
        print("test ppl", test_ppl)

    if args.enable_ce:
        card_num = get_cards()
        _ppl = 0
        _time = 0
        try:
            _time = ce_time[-1]
            _ppl = ce_ppl[-1]
        except:
            print("ce info error")
        print("kpis\ttrain_duration_card%s\t%s" % (card_num, _time))
        print("kpis\ttrain_ppl_card%s\t%f" % (card_num, _ppl))
def train():
    def prepare_input(batch):
        src_ids, label = batch
        res = {}
        res['src'] = src_ids
        res['label'] = label
        return res

    # Set parameters:
    # ngram_range = 2 will add bi-grams features
    ngram_range = 2
    max_features = 20000
    maxlen = 400
    batch_size = 32
    embedding_dims = 50
    epochs = 5

    print('Loading data...')
    all_data = reader.raw_data(num_words=max_features)
    x_train, y_train, x_test, y_test = all_data
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')
    print('Average train sequence length: {}'.format(
        np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(
        np.mean(list(map(len, x_test)), dtype=int)))

    if ngram_range > 1:
        print('Adding {}-gram features'.format(ngram_range))
        # Create set of unique n-gram from the training set.
        ngram_set = set()
        for input_list in x_train:
            for i in range(2, ngram_range + 1):
                set_of_ngram = create_ngram_set(input_list, ngram_value=i)
                ngram_set.update(set_of_ngram)

        # Dictionary mapping n-gram token to a unique integer.
        # Integer values are greater than max_features in order
        # to avoid collision with existing features.
        start_index = max_features + 1
        token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
        indice_token = {token_indice[k]: k for k in token_indice}

        # max_features is the highest integer that could be found in the dataset.
        max_features = np.max(list(indice_token.keys())) + 1

        # Augmenting x_train and x_test with n-grams features
        x_train = add_ngram(x_train, token_indice, ngram_range)
        x_test = add_ngram(x_test, token_indice, ngram_range)
        print('Average train sequence length: {}'.format(
            np.mean(list(map(len, x_train)), dtype=int)))
        print('Average test sequence length: {}'.format(
            np.mean(list(map(len, x_test)), dtype=int)))

    print('Pad sequences (samples x time)')
    x_train = reader.pad_sequences(x_train, maxlen=maxlen)
    x_test = reader.pad_sequences(x_test, maxlen=maxlen)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)
    all_data = x_train, y_train, x_test, y_test

    print('Build model...')
    model = BaseModel(max_features=max_features)
    loss, acc = model.build_graph()
    main_program = fluid.default_main_program()
    inference_program = fluid.default_main_program().clone(for_test=True)
    optimizer = fluid.optimizer.Adam(0.01)
    optimizer.minimize(loss)
    place = fluid.CPUPlace()
    exe = Executor(place)
    exe.run(framework.default_startup_program())

    for epoch_id in range(epochs):
        start_time = time.time()
        print("epoch id", epoch_id)

        train_data_iter = reader.get_data_iter(all_data, batch_size)

        total_loss = 0
        total_acc = 0
        batch_id = 0
        for batch in train_data_iter:
            input_data_feed = prepare_input(batch)
            fetch_outs = exe.run(feed=input_data_feed,
                                 fetch_list=[loss.name, acc.name],
                                 use_program_cache=False)
            cost_train = np.array(fetch_outs[0])
            acc_train = np.array(fetch_outs[1])
            total_loss += cost_train
            total_acc += acc_train

            if batch_id > 0 and batch_id % 10 == 0:
                print("current loss: %.3f, current acc: %.3f for step %d" %
                      (total_loss, total_acc * 0.1, batch_id))
                total_loss = 0.0
                total_acc = 0.0
            batch_id += 1

        test_data_iter = reader.get_data_iter(all_data, batch_size,
                                              mode='test')
        all_acc = []
        for batch in test_data_iter:
            input_data_feed = prepare_input(batch)
            fetch_outs = exe.run(program=inference_program,
                                 feed=input_data_feed,
                                 fetch_list=[loss.name, acc.name],
                                 use_program_cache=False)
            all_acc.append(fetch_outs[1])
        all_acc = np.array(all_acc).astype("float32")
        print("test acc: %.3f" % all_acc.mean())
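# create_ngram_set() and add_ngram() are used above but not defined in these
# snippets. A sketch of the usual implementations of these two helpers (as in
# the Keras fasttext example this script mirrors):
def create_ngram_set(input_list, ngram_value=2):
    # e.g. create_ngram_set([1, 4, 9, 4], ngram_value=2) -> {(1, 4), (4, 9), (9, 4)}
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))

def add_ngram(sequences, token_indice, ngram_range=2):
    # append the integer id of every known n-gram to each sequence
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(new_list) - ngram_value + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences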
def train():
    ce_time = []
    ce_ppl = []
    max_epoch = args.max_epoch
    for epoch_id in range(max_epoch):
        start_time = time.time()
        if args.enable_ce:
            train_data_iter = reader.get_data_iter(train_data,
                                                   batch_size,
                                                   enable_ce=True)
        else:
            train_data_iter = reader.get_data_iter(train_data, batch_size)

        total_loss = 0
        word_count = 0.0
        batch_times = []
        time_interval = 0.0
        batch_start_time = time.time()
        epoch_word_count = 0.0
        total_reader_cost = 0.0
        batch_read_start = time.time()
        for batch_id, batch in enumerate(train_data_iter):
            input_data_feed, word_num = prepare_input(batch,
                                                      epoch_id=epoch_id)
            word_count += word_num
            total_reader_cost += time.time() - batch_read_start
            fetch_outs = exe.run(program=CompiledProgram,
                                 feed=input_data_feed,
                                 fetch_list=[loss.name],
                                 use_program_cache=True)

            cost_train = np.mean(fetch_outs[0])
            # print(cost_train)
            total_loss += cost_train * batch_size

            batch_end_time = time.time()
            batch_time = batch_end_time - batch_start_time
            batch_times.append(batch_time)
            time_interval += batch_time
            epoch_word_count += word_num

            if batch_id > 0 and batch_id % 100 == 0:
                print(
                    "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f; reader cost: %0.5f s; ips: %0.5f tokens/sec"
                    % (epoch_id, batch_id, batch_time,
                       np.exp(total_loss / word_count),
                       total_reader_cost / 100,
                       word_count / time_interval))
                ce_ppl.append(np.exp(total_loss / word_count))
                total_loss = 0.0
                word_count = 0.0
                time_interval = 0.0
                total_reader_cost = 0.0

            # profiler tools
            if args.profile and epoch_id == 0 and batch_id == 100:
                profiler.reset_profiler()
            elif args.profile and epoch_id == 0 and batch_id == 105:
                return
            batch_start_time = time.time()
            batch_read_start = time.time()

        end_time = time.time()
        epoch_time = end_time - start_time
        ce_time.append(epoch_time)
        print(
            "\nTrain epoch:[%d]; Epoch Time: %.5f; avg_time: %.5f s/step; ips: %0.5f tokens/sec\n"
            % (epoch_id, epoch_time, sum(batch_times) / len(batch_times),
               epoch_word_count / sum(batch_times)))

        if not args.profile:
            save_path = os.path.join(args.model_path,
                                     "epoch_" + str(epoch_id), "checkpoint")
            print("begin to save", save_path)
            fluid.save(train_program, save_path)
            print("save finished")
            dev_ppl = eval(valid_data)
            print("dev ppl", dev_ppl)
            test_ppl = eval(test_data)
            print("test ppl", test_ppl)

    if args.enable_ce:
        card_num = get_cards()
        _ppl = 0
        _time = 0
        try:
            _time = ce_time[-1]
            _ppl = ce_ppl[-1]
        except:
            print("ce info error")
        print("kpis\ttrain_duration_card%s\t%s" % (card_num, _time))
        print("kpis\ttrain_ppl_card%s\t%f" % (card_num, _ppl))
def train():
    raw_data, raw_data_test = reader.get_lt5_data()

    model = BaseModel(fine_tune=False)
    loss, acc, output = model.build_graph()
    main_program = fluid.default_main_program()
    test_program = main_program.clone(for_test=True)
    optimizer = fluid.optimizer.Adadelta(0.01)
    optimizer.minimize(loss)
    place = fluid.CPUPlace()
    exe = Executor(place)
    exe.run(framework.default_startup_program())

    def prepare_input(batch, epoch_id=0):
        x, y = batch
        res = {}
        res['img'] = np.array(x).astype("float32") / 255
        res['label'] = np.array(y).astype("int64")
        return res

    def train_test(test_batch):
        total_acc = []
        input_data_feed = prepare_input(test_batch)
        fetch_outs = exe.run(program=test_program,
                             feed=input_data_feed,
                             fetch_list=[acc.name],
                             use_program_cache=True)
        acc_train = np.array(fetch_outs[0])
        total_acc.append(acc_train)
        print("test avg acc: {0:.2%}".format(np.mean(total_acc)))

    for epoch_id in range(epochs):
        print("epoch id", epoch_id)

        train_data_iter = reader.get_data_iter(raw_data, batch_size)
        test_data_iter = reader.get_data_iter(raw_data_test, batch_size)
        data_iter = zip(train_data_iter, test_data_iter)

        total_loss = 0
        total_acc = []
        for batch_id, batch in enumerate(data_iter):
            batch_train, batch_test = batch
            input_data_feed = prepare_input(batch_train)
            fetch_outs = exe.run(program=main_program,
                                 feed=input_data_feed,
                                 fetch_list=[loss.name, acc.name],
                                 use_program_cache=True)
            cost_train = np.array(fetch_outs[0])
            acc_train = np.array(fetch_outs[1])
            total_loss += cost_train * batch_size
            total_acc.append(acc_train)

        print("train total loss: ", total_loss, np.mean(total_acc))
        train_test(batch_test)
        print()

    shutil.rmtree(temp_model_path, ignore_errors=True)
    os.makedirs(temp_model_path)
    fluid.io.save_params(executor=exe, dirname=temp_model_path)