def train_loop(args,
               logger,
               vocab,
               train_progs,
               infer_progs,
               optimizer,
               nccl2_num_trainers=1,
               nccl2_trainer_id=0,
               worker_endpoints=None):
    """Run the training loop with periodic logging, evaluation and saving.

    The recurrent state fetched from each step (``last_hidden`` /
    ``last_cell``) is fed back as ``init_hiddens`` / ``init_cells`` of the
    next step, one slice per device.

    Args:
        args: Parsed options. Attributes read here: ``use_gpu``, ``load_dir``,
            ``load_pretraining_params``, ``train_path``, ``shuffle``,
            ``log_interval``, ``dev_interval``, ``save_interval``,
            ``batch_size``, ``num_steps``, ``num_layers``, ``embed_size``,
            ``hidden_size``, ``all_train_tokens``, ``max_epoch`` and
            ``para_save_dir``.
        logger: Object exposing ``info(msg)``.
        vocab: Vocabulary handed to ``data.BidirectionalLMDataset``.
        train_progs: ``(train_prog, train_startup_prog, train_model)``.
        infer_progs: ``(infer_prog, infer_startup_prog, infer_model)``;
            forwarded to ``eval``.
        optimizer: Not used by this function.
        nccl2_num_trainers: Passed to ``ParallelExecutor`` as ``num_trainers``.
        nccl2_trainer_id: Passed to ``ParallelExecutor`` as ``trainer_id``.
        worker_endpoints: Not used by this function.

    Raises:
        ValueError: If ``args.all_train_tokens`` is smaller than one batch of
            ``args.batch_size * args.num_steps`` tokens.
    """
    train_prog, train_startup_prog, train_model = train_progs
    # Unpacked only so that a malformed triple fails before training starts;
    # the tuple itself is what gets forwarded to eval().
    infer_prog, infer_startup_prog, infer_model = infer_progs

    # prepare device
    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
    exe = Executor(place)
    if not args.use_gpu:
        place = fluid.CPUPlace()
        import multiprocessing
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    else:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()

    # initialise parameters: full checkpoint, pretraining params, or scratch
    if args.load_dir:
        logger.info('load pretrained checkpoints from {}'.format(
            args.load_dir))
        fluid.io.load_persistables(exe, args.load_dir, main_program=train_prog)
    elif args.load_pretraining_params:
        logger.info('load pretrained params from {}'.format(
            args.load_pretraining_params))
        exe.run(train_startup_prog)
        init_pretraining_params(exe,
                                args.load_pretraining_params,
                                main_program=train_prog)
    else:
        exe.run(train_startup_prog)

    # prepare data
    feed_list = [
        train_prog.global_block().var(var_name)
        for var_name in train_model.feed_order
    ]
    feeder = fluid.DataFeeder(feed_list, place)

    logger.info('Training the model...')
    exe_strategy = fluid.parallel_executor.ExecutionStrategy()
    parallel_executor = fluid.ParallelExecutor(
        loss_name=train_model.loss.name,
        main_program=train_prog,
        use_cuda=bool(args.use_gpu),
        exec_strategy=exe_strategy,
        num_trainers=nccl2_num_trainers,
        trainer_id=nccl2_trainer_id)

    logger.info("begin to load data")
    train_data = data.BidirectionalLMDataset(args.train_path,
                                             vocab,
                                             test=(not args.shuffle),
                                             shuffle_on_load=args.shuffle)
    logger.info("finished load vocab")

    log_interval = args.log_interval
    batch_size = args.batch_size

    train_data_iter = lambda: train_data.iter_batches(batch_size * dev_count,
                                                      args.num_steps)
    train_reader = read_multiple(train_data_iter, batch_size, dev_count)

    n_batch_loss = 0.0
    n_batch_cnt = 0

    # Flattened per-device recurrent state; row i belongs to device i.
    hidden_shape = (dev_count,
                    args.num_layers * 2 * batch_size * args.embed_size)
    cell_shape = (dev_count,
                  args.num_layers * 2 * batch_size * args.hidden_size)
    last_hidden_values = np.zeros(hidden_shape, dtype='float32')
    last_cell_values = np.zeros(cell_shape, dtype='float32')

    n_tokens_per_batch = args.batch_size * args.num_steps
    n_batches_per_epoch = int(args.all_train_tokens / n_tokens_per_batch)
    if n_batches_per_epoch <= 0:
        # Previously this surfaced as a ZeroDivisionError when the epoch id
        # was derived after the loop.
        raise ValueError(
            'all_train_tokens ({}) is smaller than one batch of {} tokens'.
            format(args.all_train_tokens, n_tokens_per_batch))
    n_batches_total = args.max_epoch * n_batches_per_epoch

    fetch_list = [
        train_model.loss.name, train_model.last_hidden.name,
        train_model.last_cell.name
    ]

    begin_time = time.time()
    # Stays 0 when the reader yields nothing, so the code after the loop can
    # still derive an epoch id.
    batch_id = 0
    for batch_id, batch_list in enumerate(train_reader(), 1):
        if batch_id > n_batches_total:
            break
        feed_data = batch_reader(batch_list, args)
        feed = list(feeder.feed_parallel(feed_data, dev_count))

        # carry the previous step's recurrent state into this step
        for i in range(dev_count):
            if args.use_gpu:
                placex = fluid.CUDAPlace(i)
            else:
                placex = fluid.CPUPlace()
            init_hidden_tensor = fluid.core.LoDTensor()
            init_hidden_tensor.set(last_hidden_values[i], placex)
            init_cell_tensor = fluid.core.LoDTensor()
            init_cell_tensor.set(last_cell_values[i], placex)
            feed[i]['init_hiddens'] = init_hidden_tensor
            feed[i]['init_cells'] = init_cell_tensor

        fetch_outs = parallel_executor.run(feed=feed,
                                           fetch_list=fetch_list,
                                           return_numpy=False)
        batch_loss = np.array(fetch_outs[0])
        last_hidden_values = np.array(fetch_outs[1]).reshape(hidden_shape)
        last_cell_values = np.array(fetch_outs[2]).reshape(cell_shape)

        n_batch_loss += batch_loss.sum()
        n_batch_cnt += len(batch_loss)

        if batch_id % log_interval == 0:
            # smoothed_ppl averages over the whole logging window, ppl only
            # over the current step.
            smoothed_ppl = np.exp(n_batch_loss / n_batch_cnt)
            ppl = np.exp(batch_loss.sum() / len(batch_loss))
            used_time = time.time() - begin_time
            speed = log_interval / used_time
            logger.info(
                "[train] step:{}, loss:{:.3f}, ppl:{:.3f}, smoothed_ppl:{:.3f}, speed:{:.3f}"
                .format(batch_id, n_batch_loss / n_batch_cnt, ppl,
                        smoothed_ppl, speed))
            n_batch_loss = 0.0
            n_batch_cnt = 0
            begin_time = time.time()

        if batch_id % args.dev_interval == 0:
            valid_ppl = eval(vocab, infer_progs, dev_count, logger, args)
            logger.info("valid ppl {}".format(valid_ppl))

        if batch_id % args.save_interval == 0:
            # epoch_id used to be read here before it was ever assigned,
            # which raised UnboundLocalError on the first periodic save.
            epoch_id = int(batch_id / n_batches_per_epoch)
            model_path = os.path.join(args.para_save_dir,
                                      str(batch_id + epoch_id))
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            fluid.io.save_persistables(executor=exe,
                                       dirname=model_path,
                                       main_program=train_prog)

    if batch_id == 0:
        logger.info('no training batches were read from {}'.format(
            args.train_path))

    # final checkpoint, named after the epoch reached
    epoch_id = int(batch_id / n_batches_per_epoch)
    model_path = os.path.join(args.para_save_dir, str(epoch_id))
    if not os.path.isdir(model_path):
        os.makedirs(model_path)
    fluid.io.save_persistables(executor=exe,
                               dirname=model_path,
                               main_program=train_prog)

    valid_ppl = eval(vocab, infer_progs, dev_count, logger, args)
    logger.info("valid ppl {}".format(valid_ppl))
    # Second pass kept from the original; it is called with the same
    # arguments as the validation pass above and its result is not used.
    eval(vocab, infer_progs, dev_count, logger, args)
def eval(vocab, infer_progs, dev_count, logger, args):
    """Return the perplexity of the inference model on ``args.test_path``.

    Each batch is split across devices; the per-device slices are run one
    after another on a single executor, and the fetched recurrent state is
    fed back as the initial state of the next batch.

    Args:
        vocab: Vocabulary handed to ``data.BidirectionalLMDataset``.
        infer_progs: ``(infer_prog, infer_startup_prog, infer_model)``.
        dev_count: Ignored. The value is recomputed below from ``CPU_NUM`` /
            the CPU count, or from the CUDA device count; the parameter is
            kept for call-site compatibility.
        logger: Object exposing ``info(msg)``.
        args: Parsed options. Attributes read here: ``use_gpu``,
            ``test_path``, ``batch_size``, ``num_steps``, ``num_layers``,
            ``embed_size``, ``hidden_size`` and ``log_interval``.

    Returns:
        ``np.exp(total_loss / total_cnt)`` accumulated over every fetched
        loss entry.

    Raises:
        ValueError: If no loss values were produced (for example the reader
            yielded no batches), instead of a bare ZeroDivisionError.
    """
    infer_prog, infer_startup_prog, infer_model = infer_progs
    feed_order = infer_model.feed_order

    # prepare device
    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
    exe = Executor(place)
    if not args.use_gpu:
        place = fluid.CPUPlace()
        import multiprocessing
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    else:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()

    total_loss = 0.0
    total_cnt = 0
    n_batch_cnt = 0
    n_batch_loss = 0.0

    val_feed_list = [
        infer_prog.global_block().var(var_name) for var_name in feed_order
    ]
    val_feeder = fluid.DataFeeder(val_feed_list, place)
    dev_data = data.BidirectionalLMDataset(args.test_path,
                                           vocab,
                                           test=True,
                                           shuffle_on_load=False)
    dev_data_iter = lambda: dev_data.iter_batches(args.batch_size * dev_count,
                                                  args.num_steps)
    dev_reader = read_multiple(dev_data_iter, args.batch_size, dev_count)

    # Flattened per-device recurrent state; row i belongs to device i.
    hidden_shape = (dev_count,
                    args.num_layers * 2 * args.batch_size * args.embed_size)
    cell_shape = (dev_count,
                  args.num_layers * 2 * args.batch_size * args.hidden_size)
    last_hidden_values = np.zeros(hidden_shape, dtype='float32')
    last_cell_values = np.zeros(cell_shape, dtype='float32')

    log_every_n_batch = args.log_interval

    for batch_id, batch_list in enumerate(dev_reader(), 1):
        feed_data = batch_reader(batch_list, args)
        feed = list(val_feeder.feed_parallel(feed_data, dev_count))

        # carry the previous batch's recurrent state into this batch
        for i in range(dev_count):
            if args.use_gpu:
                placex = fluid.CUDAPlace(i)
            else:
                placex = fluid.CPUPlace()
            init_hidden_tensor = fluid.core.LoDTensor()
            init_hidden_tensor.set(last_hidden_values[i], placex)
            init_cell_tensor = fluid.core.LoDTensor()
            init_cell_tensor.set(last_cell_values[i], placex)
            feed[i]['init_hiddens'] = init_hidden_tensor
            feed[i]['init_cells'] = init_cell_tensor

        hidden_per_device = []
        cell_per_device = []
        for i in range(dev_count):
            val_fetch_outs = exe.run(program=infer_prog,
                                     feed=feed[i],
                                     fetch_list=[
                                         infer_model.loss.name,
                                         infer_model.last_hidden.name,
                                         infer_model.last_cell.name
                                     ],
                                     return_numpy=False)
            hidden_per_device.append(np.array(val_fetch_outs[1]))
            cell_per_device.append(np.array(val_fetch_outs[2]))

            # convert the fetched loss once and reuse it for every counter
            device_loss = np.array(val_fetch_outs[0])
            loss_sum = device_loss.sum()
            loss_cnt = len(device_loss)
            total_loss += loss_sum
            total_cnt += loss_cnt
            n_batch_loss += loss_sum
            n_batch_cnt += loss_cnt

        last_hidden_values = np.array(hidden_per_device).reshape(hidden_shape)
        last_cell_values = np.array(cell_per_device).reshape(cell_shape)

        if log_every_n_batch > 0 and batch_id % log_every_n_batch == 0:
            logger.info('Average dev loss from batch {} to {} is {}'.format(
                batch_id - log_every_n_batch + 1, batch_id,
                "%.10f" % (n_batch_loss / n_batch_cnt)))
            n_batch_loss = 0.0
            n_batch_cnt = 0

    if total_cnt == 0:
        raise ValueError(
            'no evaluation batches were produced from {}'.format(
                args.test_path))

    ppl = np.exp(total_loss / total_cnt)
    return ppl