def main():
    """Train a language model with Horovod data-parallel distribution.

    One worker process runs per GPU; rank 0 owns logging, config saving and
    checkpointing, while model/optimizer state is broadcast from rank 0 to all
    workers. Supports resuming from a checkpoint, gradient accumulation by
    token count, periodic dev-loss logging, per-epoch perplexity evaluation,
    conversion to SGD fine-tuning, and early stopping.

    Returns:
        save_path (str): directory where the model and logs are saved.

    NOTE(review): this body was reconstructed from a whitespace-collapsed
    source; statement order is preserved, but nesting under the rank-0 guards
    was inferred and should be confirmed against the original layout.
    """
    args = parse()

    # One process per GPU: pin this process to its local device.
    hvd.init()
    torch.cuda.set_device(hvd.local_rank())
    hvd_rank = hvd.rank()

    # Load a conf file
    # (when resuming, restore every saved hyperparameter except 'resume' itself)
    if args.resume:
        conf = load_config(os.path.join(os.path.dirname(args.resume), 'conf.yml'))
        for k, v in conf.items():
            if k != 'resume':
                setattr(args, k, v)

    # Load dataset
    # n_customers=hvd.size() shards the corpus across workers.
    train_set = Dataset(corpus=args.corpus,
                        tsv_path=args.train_set,
                        dict_path=args.dict,
                        nlsyms=args.nlsyms,
                        unit=args.unit,
                        wp_model=args.wp_model,
                        batch_size=args.batch_size,
                        n_epochs=args.n_epochs,
                        min_n_tokens=args.min_n_tokens,
                        bptt=args.bptt,
                        n_customers=hvd.size(),
                        backward=args.backward,
                        serialize=args.serialize)
    dev_set = Dataset(corpus=args.corpus,
                      tsv_path=args.dev_set,
                      dict_path=args.dict,
                      nlsyms=args.nlsyms,
                      unit=args.unit,
                      wp_model=args.wp_model,
                      batch_size=args.batch_size,
                      bptt=args.bptt,
                      n_customers=hvd.size(),
                      backward=args.backward,
                      serialize=args.serialize)
    eval_set = Dataset(corpus=args.corpus,
                       tsv_path=args.eval_set,
                       dict_path=args.dict,
                       nlsyms=args.nlsyms,
                       unit=args.unit,
                       wp_model=args.wp_model,
                       batch_size=args.batch_size,
                       bptt=args.bptt,
                       n_customers=hvd.size(),
                       backward=args.backward,
                       serialize=args.serialize)
    args.vocab = train_set.vocab

    # shuffle=False: the dataset already serializes/shards batches per worker.
    train_loader = ChunkDataloader(train_set, batch_size=1,
                                   num_workers=1,
                                   distributed=True, shuffle=False)
    eval_loader = ChunkDataloader(eval_set, batch_size=1, num_workers=1,
                                  distributed=True)

    # Set save path
    if args.resume:
        save_path = os.path.dirname(args.resume)
        dir_name = os.path.basename(save_path)
    else:
        dir_name = set_lm_name(args)
        save_path = mkdir_join(args.model_save_dir, '_'.join(
            os.path.basename(args.train_set).split('.')[:-1]), dir_name)
        if hvd.rank() == 0:
            save_path = set_save_path(save_path)  # avoid overwriting
            # NOTE(review): only rank 0 gets the de-duplicated path; other
            # ranks keep the original `save_path` — confirm this divergence
            # is intended (non-zero ranks never write to it below).

    # Set logger (rank 0 only; `logger` is undefined on other ranks)
    if hvd_rank == 0:
        logger = set_logger(os.path.join(save_path, 'train.log'),
                            key='training', stdout=args.stdout)
        logger.info('PID: %s' % os.getpid())
        logger.info('USERNAME: %s' % os.uname()[1])
        logger.info('NUMBER_DEVICES: %s' % hvd.size())

    # Set process name
    setproctitle(args.job_name if args.job_name else dir_name)

    # Model setting
    model = build_lm(args, save_path)

    # GPU setting
    if args.n_gpus >= 1:
        torch.backends.cudnn.benchmark = True
        model.cuda()

    if args.resume:
        # Set optimizer: fall back to SGD if the checkpoint is past the
        # configured Adam->SGD conversion epoch.
        epoch = int(args.resume.split('-')[-1])
        optimizer = set_optimizer(model,
                                  'sgd' if epoch > conf['convert_to_sgd_epoch'] else conf['optimizer'],
                                  conf['lr'],
                                  conf['weight_decay'])

        # Restore the last saved model (rank 0 only; weights are broadcast below)
        if hvd_rank == 0:
            model, optimizer = load_checkpoint(model, args.resume, optimizer, resume=True)

        # broadcast rank-0 state to every worker
        optimizer = hvd.DistributedOptimizer(optimizer,
                                             named_parameters=model.named_parameters())
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        # Wrap optimizer by learning rate scheduler
        optimizer = LRScheduler(optimizer, conf['lr'],
                                decay_type=conf['lr_decay_type'],
                                decay_start_epoch=conf['lr_decay_start_epoch'],
                                decay_rate=conf['lr_decay_rate'],
                                decay_patient_n_epochs=conf['lr_decay_patient_n_epochs'],
                                early_stop_patient_n_epochs=conf['early_stop_patient_n_epochs'],
                                warmup_start_lr=conf['warmup_start_lr'],
                                warmup_n_steps=conf['warmup_n_steps'],
                                model_size=conf['d_model'],
                                factor=conf['lr_factor'],
                                noam=conf['lm_type'] == 'transformer')

        # Resume between convert_to_sgd_epoch -1 and convert_to_sgd_epoch
        if epoch == conf['convert_to_sgd_epoch']:
            # Carry the step/epoch counters over to the fresh SGD scheduler.
            n_epochs = optimizer.n_epochs
            n_steps = optimizer.n_steps
            optimizer = set_optimizer(model, 'sgd', args.lr, conf['weight_decay'])
            optimizer = LRScheduler(optimizer, args.lr,
                                    decay_type='always',
                                    decay_start_epoch=0,
                                    decay_rate=0.5)
            optimizer._epoch = n_epochs
            optimizer._step = n_steps
            if hvd_rank == 0:
                logger.info('========== Convert to SGD ==========')
            # broadcast
            # NOTE(review): here hvd.DistributedOptimizer wraps the
            # LRScheduler wrapper (not the raw torch optimizer) — verify
            # DistributedOptimizer accepts this; the non-resume path below
            # wraps in the opposite order.
            optimizer = hvd.DistributedOptimizer(optimizer,
                                                 named_parameters=model.named_parameters())
            hvd.broadcast_parameters(model.state_dict(), root_rank=0)
            hvd.broadcast_optimizer_state(optimizer, root_rank=0)
    else:
        # Save the conf file as a yaml file (rank 0 only, to avoid write races)
        if hvd_rank == 0:
            save_config(vars(args), os.path.join(save_path, 'conf.yml'))

            # Save the nlsyms, dictionary, and wp_model
            if args.nlsyms:
                shutil.copy(args.nlsyms, os.path.join(save_path, 'nlsyms.txt'))
            shutil.copy(args.dict, os.path.join(save_path, 'dict.txt'))
            if args.unit == 'wp':
                shutil.copy(args.wp_model, os.path.join(save_path, 'wp.model'))

            for k, v in sorted(vars(args).items(), key=lambda x: x[0]):
                logger.info('%s: %s' % (k, str(v)))

        # Count total parameters
        for n in sorted(list(model.num_params_dict.keys())):
            n_params = model.num_params_dict[n]
            if hvd.rank() == 0:
                logger.info("%s %d" % (n, n_params))
        if hvd_rank == 0:
            logger.info("Total %.2f M parameters" % (model.total_parameters / 1000000))
            logger.info(model)

        # Set optimizer (sync initial weights first, then distribute)
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        optimizer = set_optimizer(model, args.optimizer, args.lr, args.weight_decay)
        optimizer = hvd.DistributedOptimizer(optimizer,
                                             named_parameters=model.named_parameters())
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        # Wrap optimizer by learning rate scheduler
        optimizer = LRScheduler(optimizer, args.lr,
                                decay_type=args.lr_decay_type,
                                decay_start_epoch=args.lr_decay_start_epoch,
                                decay_rate=args.lr_decay_rate,
                                decay_patient_n_epochs=args.lr_decay_patient_n_epochs,
                                early_stop_patient_n_epochs=args.early_stop_patient_n_epochs,
                                warmup_start_lr=args.warmup_start_lr,
                                warmup_n_steps=args.warmup_n_steps,
                                model_size=args.d_model,
                                factor=args.lr_factor,
                                noam=args.lm_type == 'transformer')

    # Set reporter
    reporter = Reporter(save_path)

    hidden = None  # recurrent state carried across BPTT chunks
    start_time_train = time.time()
    start_time_epoch = time.time()
    start_time_step = time.time()
    data_size = len(train_set)
    accum_n_tokens = 0
    verbose = 1 if hvd_rank == 0 else 0  # only rank 0 draws the progress bar
    while True:
        model.train()
        with tqdm(total=data_size / hvd.size(),
                  desc='Train Epoch #{}'.format(optimizer.n_epochs + 1),
                  disable=not verbose) as pbar_epoch:
            # Compute loss in the training set
            for _, ys_train in enumerate(train_loader):
                accum_n_tokens += sum([len(y) for y in ys_train])
                # NOTE(review): zeroing gradients every iteration appears to
                # defeat the token-based gradient accumulation below — confirm.
                optimizer.zero_grad()
                loss, hidden, reporter = model(ys_train, hidden, reporter)
                loss.backward()
                loss.detach()  # Truncate the graph
                # Step only once enough tokens have been accumulated
                # (accum_grad_n_tokens == 0 disables accumulation).
                if args.accum_grad_n_tokens == 0 or accum_n_tokens >= args.accum_grad_n_tokens:
                    if args.clip_grad_norm > 0:
                        total_norm = torch.nn.utils.clip_grad_norm_(
                            model.parameters(), args.clip_grad_norm)
                        # reporter.add_tensorboard_scalar('total_norm', total_norm)
                    optimizer.step()
                    optimizer.zero_grad()
                    accum_n_tokens = 0
                loss_train = loss.item()
                del loss
                # Detach hidden state so the next chunk does not backprop
                # through the previous one.
                hidden = model.repackage_state(hidden)

                if optimizer.n_steps % args.print_step == 0:
                    model.eval()
                    # NOTE(review): the model is not switched back to train
                    # mode until the next epoch starts — confirm intended.
                    # Compute loss in the dev set
                    ys_dev = dev_set.next()[0]
                    loss, _, reporter = model(ys_dev, None, reporter, is_eval=True)
                    loss_dev = loss.item()
                    del loss
                    duration_step = time.time() - start_time_step
                    if hvd_rank == 0:
                        logger.info(
                            "step:%d(ep:%.2f) loss:%.3f(%.3f)/ppl:%.3f(%.3f)/lr:%.5f/bs:%d (%.2f min)"
                            % (optimizer.n_steps,
                               optimizer.n_steps / data_size * hvd.size(),
                               loss_train, loss_dev,
                               np.exp(loss_train), np.exp(loss_dev),
                               optimizer.lr, ys_train.shape[0],
                               duration_step / 60))
                    start_time_step = time.time()
                pbar_epoch.update(1)

        # Save checkpoint and evaluate model per epoch
        duration_epoch = time.time() - start_time_epoch
        if hvd_rank == 0:
            logger.info('========== EPOCH:%d (%.2f min) =========='
                        % (optimizer.n_epochs + 1, duration_epoch / 60))

        if optimizer.n_epochs + 1 < args.eval_start_epoch:
            # Save the model
            if hvd_rank == 0:
                # NOTE(review): optimizer.epoch() advancing only on rank 0
                # would desynchronize the epoch counter across workers —
                # confirm against the original indentation.
                optimizer.epoch()
                save_checkpoint(model, save_path, optimizer, optimizer.n_epochs,
                                remove_old_checkpoints=args.lm_type != 'transformer')
        else:
            start_time_eval = time.time()
            # dev
            model.eval()
            ppl_dev, _ = eval_ppl_parallel([model], eval_loader, optimizer.n_epochs,
                                           batch_size=args.batch_size)
            # Average dev perplexity across workers.
            ppl_dev = hvd.allreduce(np2tensor(np.array([ppl_dev], dtype=float),
                                              hvd.local_rank()))
            if hvd_rank == 0:
                logger.info('PPL : %.2f' % ppl_dev)
            optimizer.epoch(ppl_dev)

            if optimizer.is_best and hvd.rank() == 0:
                # Save the model
                save_checkpoint(model, save_path, optimizer, optimizer.n_epochs,
                                remove_old_checkpoints=args.lm_type != 'transformer')

            duration_eval = time.time() - start_time_eval
            if hvd_rank == 0:
                logger.info('Evaluation time: %.2f min' % (duration_eval / 60))

            # Early stopping
            if optimizer.is_early_stop:
                break

            # Convert to fine-tuning stage
            if optimizer.n_epochs == args.convert_to_sgd_epoch:
                # Carry the counters over to the fresh SGD scheduler.
                n_epochs = optimizer.n_epochs
                n_steps = optimizer.n_steps
                optimizer = set_optimizer(model, 'sgd', args.lr, args.weight_decay)
                optimizer = hvd.DistributedOptimizer(
                    optimizer, named_parameters=model.named_parameters())
                hvd.broadcast_parameters(model.state_dict(), root_rank=0)
                hvd.broadcast_optimizer_state(optimizer, root_rank=0)
                optimizer = LRScheduler(optimizer, args.lr,
                                        decay_type='always',
                                        decay_start_epoch=0,
                                        decay_rate=0.5)
                optimizer._epoch = n_epochs
                optimizer._step = n_steps
                if hvd_rank == 0:
                    logger.info('========== Convert to SGD ==========')

        if optimizer.n_epochs == args.n_epochs:
            break

        start_time_step = time.time()
        start_time_epoch = time.time()

    duration_train = time.time() - start_time_train
    if hvd_rank == 0:
        logger.info('Total time: %.2f hour' % (duration_train / 3600))

    reporter.tf_writer.close()

    return save_path
def main():
    """Train a language model on a single node with CustomDataParallel.

    Builds train/dev/eval datasets, constructs the LM and an LR-scheduled
    optimizer (optionally resuming from a checkpoint via the scheduler's own
    checkpoint API), then runs the BPTT training loop with step-count-based
    gradient accumulation, periodic dev-loss logging, per-epoch perplexity
    evaluation with top-k checkpointing, conversion to SGD fine-tuning, and
    early stopping.

    Returns:
        save_path (str): directory where the model and logs are saved.

    NOTE(review): this body was reconstructed from a whitespace-collapsed
    source; statement order is preserved but some nesting was inferred.
    """
    args = parse()

    # Load a conf file
    # (when resuming, restore every saved hyperparameter except 'resume' itself)
    if args.resume:
        conf = load_config(
            os.path.join(os.path.dirname(args.resume), 'conf.yml'))
        for k, v in conf.items():
            if k != 'resume':
                setattr(args, k, v)

    # Load dataset
    # Effective batch size scales with the number of GPUs.
    train_set = Dataset(corpus=args.corpus,
                        tsv_path=args.train_set,
                        dict_path=args.dict,
                        nlsyms=args.nlsyms,
                        unit=args.unit,
                        wp_model=args.wp_model,
                        batch_size=args.batch_size * args.n_gpus,
                        n_epochs=args.n_epochs,
                        min_n_tokens=args.min_n_tokens,
                        bptt=args.bptt,
                        shuffle=args.shuffle,
                        backward=args.backward,
                        serialize=args.serialize)
    dev_set = Dataset(corpus=args.corpus,
                      tsv_path=args.dev_set,
                      dict_path=args.dict,
                      nlsyms=args.nlsyms,
                      unit=args.unit,
                      wp_model=args.wp_model,
                      batch_size=args.batch_size * args.n_gpus,
                      bptt=args.bptt,
                      backward=args.backward,
                      serialize=args.serialize)
    eval_sets = [
        Dataset(corpus=args.corpus,
                tsv_path=s,
                dict_path=args.dict,
                nlsyms=args.nlsyms,
                unit=args.unit,
                wp_model=args.wp_model,
                batch_size=1,
                bptt=args.bptt,
                backward=args.backward,
                serialize=args.serialize) for s in args.eval_sets
    ]
    args.vocab = train_set.vocab

    # Set save path
    if args.resume:
        save_path = os.path.dirname(args.resume)
        dir_name = os.path.basename(save_path)
    else:
        dir_name = set_lm_name(args)
        save_path = mkdir_join(
            args.model_save_dir,
            '_'.join(os.path.basename(args.train_set).split('.')[:-1]),
            dir_name)
        save_path = set_save_path(save_path)  # avoid overwriting

    # Set logger
    # NOTE(review): return value not bound — presumably configures a
    # module-level `logger` used below; confirm.
    set_logger(os.path.join(save_path, 'train.log'), stdout=args.stdout)

    # Model setting
    model = build_lm(args, save_path)

    # Noam (inverse-sqrt warmup) scheduling only applies to Transformer LMs.
    if args.resume:
        transformer = conf['lm_type'] in ['transformer', 'transformer_xl']
    else:
        transformer = args.lm_type in ['transformer', 'transformer_xl']

    if args.resume:
        # Set optimizer: fall back to SGD if the checkpoint is past the
        # configured Adam->SGD conversion epoch.
        epoch = int(args.resume.split('-')[-1])
        optimizer = set_optimizer(
            model,
            'sgd' if epoch > conf['convert_to_sgd_epoch'] else conf['optimizer'],
            conf['lr'], conf['weight_decay'])

        # Wrap optimizer by learning rate scheduler
        optimizer = LRScheduler(
            optimizer, conf['lr'],
            decay_type=conf['lr_decay_type'],
            decay_start_epoch=conf['lr_decay_start_epoch'],
            decay_rate=conf['lr_decay_rate'],
            decay_patient_n_epochs=conf['lr_decay_patient_n_epochs'],
            early_stop_patient_n_epochs=conf['early_stop_patient_n_epochs'],
            warmup_start_lr=conf['warmup_start_lr'],
            warmup_n_steps=conf['warmup_n_steps'],
            model_size=conf['transformer_d_model'],
            factor=conf['lr_factor'],
            noam=transformer,
            save_checkpoints_topk=1)

        # Restore the last saved model
        load_checkpoint(model, args.resume, optimizer)

        # Resume between convert_to_sgd_epoch -1 and convert_to_sgd_epoch
        if epoch == conf['convert_to_sgd_epoch']:
            optimizer.convert_to_sgd(model, args.lr, conf['weight_decay'],
                                     decay_type='always', decay_rate=0.5)
    else:
        # Save the conf file as a yaml file
        save_config(vars(args), os.path.join(save_path, 'conf.yml'))

        # Save the nlsyms, dictionary, and wp_model
        if args.nlsyms:
            shutil.copy(args.nlsyms, os.path.join(save_path, 'nlsyms.txt'))
        shutil.copy(args.dict, os.path.join(save_path, 'dict.txt'))
        if args.unit == 'wp':
            shutil.copy(args.wp_model, os.path.join(save_path, 'wp.model'))

        for k, v in sorted(vars(args).items(), key=lambda x: x[0]):
            logger.info('%s: %s' % (k, str(v)))

        # Count total parameters
        for n in sorted(list(model.num_params_dict.keys())):
            n_params = model.num_params_dict[n]
            logger.info("%s %d" % (n, n_params))
        logger.info("Total %.2f M parameters" %
                    (model.total_parameters / 1000000))
        logger.info(model)

        # Set optimizer
        optimizer = set_optimizer(model, args.optimizer, args.lr,
                                  args.weight_decay)

        # Wrap optimizer by learning rate scheduler
        optimizer = LRScheduler(
            optimizer, args.lr,
            decay_type=args.lr_decay_type,
            decay_start_epoch=args.lr_decay_start_epoch,
            decay_rate=args.lr_decay_rate,
            decay_patient_n_epochs=args.lr_decay_patient_n_epochs,
            early_stop_patient_n_epochs=args.early_stop_patient_n_epochs,
            warmup_start_lr=args.warmup_start_lr,
            warmup_n_steps=args.warmup_n_steps,
            model_size=args.transformer_d_model,
            factor=args.lr_factor,
            noam=transformer,
            save_checkpoints_topk=1)

    # GPU setting
    if args.n_gpus >= 1:
        model.cudnn_setting(deterministic=False,
                            benchmark=args.cudnn_benchmark)
        model = CustomDataParallel(model,
                                   device_ids=list(range(0, args.n_gpus)))
        model.cuda()

    # Set process name
    logger.info('PID: %s' % os.getpid())
    logger.info('USERNAME: %s' % os.uname()[1])
    setproctitle(args.job_name if args.job_name else dir_name)

    # Set reporter
    reporter = Reporter(save_path)

    hidden = None  # recurrent state carried across BPTT chunks
    start_time_train = time.time()
    start_time_epoch = time.time()
    start_time_step = time.time()
    pbar_epoch = tqdm(total=len(train_set))
    accum_n_steps = 0
    # Resume the global step counter in units of micro-steps.
    n_steps = optimizer.n_steps * args.accum_grad_n_steps
    while True:
        # Compute loss in the training set
        ys_train, is_new_epoch = train_set.next()
        accum_n_steps += 1

        loss, hidden, observation = model(ys_train, hidden)
        reporter.add(observation)
        loss.backward()
        loss.detach()  # Truncate the graph
        # Step only once enough micro-steps have been accumulated.
        if args.accum_grad_n_steps == 1 or accum_n_steps >= args.accum_grad_n_steps:
            if args.clip_grad_norm > 0:
                total_norm = torch.nn.utils.clip_grad_norm_(
                    model.module.parameters(), args.clip_grad_norm)
                reporter.add_tensorboard_scalar('total_norm', total_norm)
            optimizer.step()
            optimizer.zero_grad()
            accum_n_steps = 0
        loss_train = loss.item()
        del loss
        # Detach hidden state so the next chunk does not backprop through
        # the previous one.
        hidden = model.module.repackage_state(hidden)
        reporter.add_tensorboard_scalar('learning_rate', optimizer.lr)
        # NOTE: loss/acc/ppl are already added in the model
        reporter.step()
        # Progress is measured in predicted tokens (batch x (len - 1)).
        pbar_epoch.update(ys_train.shape[0] * (ys_train.shape[1] - 1))
        n_steps += 1

        if n_steps % args.print_step == 0:
            # Compute loss in the dev set
            ys_dev = dev_set.next(bptt=args.bptt)[0]
            loss, _, observation = model(ys_dev, None, is_eval=True)
            reporter.add(observation, is_eval=True)
            loss_dev = loss.item()
            del loss
            reporter.step(is_eval=True)

            duration_step = time.time() - start_time_step
            logger.info(
                "step:%d(ep:%.2f) loss:%.3f(%.3f)/lr:%.5f/bs:%d (%.2f min)"
                % (n_steps, optimizer.n_epochs + train_set.epoch_detail,
                   loss_train, loss_dev, optimizer.lr,
                   ys_train.shape[0], duration_step / 60))
            start_time_step = time.time()

        # Save figures of loss and accuracy
        if n_steps % (args.print_step * 10) == 0:
            reporter.snapshot()
            model.module.plot_attention()

        # Save checkpoint and evaluate model per epoch
        if is_new_epoch:
            duration_epoch = time.time() - start_time_epoch
            logger.info('========== EPOCH:%d (%.2f min) =========='
                        % (optimizer.n_epochs + 1, duration_epoch / 60))

            if optimizer.n_epochs + 1 < args.eval_start_epoch:
                optimizer.epoch()  # lr decay
                reporter.epoch()  # plot
                # Save the model
                optimizer.save_checkpoint(model, save_path, remove_old=True)
            else:
                start_time_eval = time.time()
                # dev
                model.module.reset_length(args.bptt)
                ppl_dev, _ = eval_ppl([model.module], dev_set,
                                      batch_size=1, bptt=args.bptt)
                model.module.reset_length(args.bptt)
                optimizer.epoch(ppl_dev)  # lr decay
                reporter.epoch(ppl_dev, name='perplexity')  # plot
                logger.info('PPL (%s, ep:%d): %.2f'
                            % (dev_set.set, optimizer.n_epochs, ppl_dev))

                if optimizer.is_topk:
                    # Save the model
                    optimizer.save_checkpoint(model, save_path, remove_old=True)

                    # test
                    ppl_test_avg = 0.
                    for eval_set in eval_sets:
                        model.module.reset_length(args.bptt)
                        ppl_test, _ = eval_ppl([model.module], eval_set,
                                               batch_size=1, bptt=args.bptt)
                        model.module.reset_length(args.bptt)
                        logger.info('PPL (%s, ep:%d): %.2f'
                                    % (eval_set.set, optimizer.n_epochs, ppl_test))
                        ppl_test_avg += ppl_test
                    if len(eval_sets) > 0:
                        logger.info('PPL (avg., ep:%d): %.2f'
                                    % (optimizer.n_epochs,
                                       ppl_test_avg / len(eval_sets)))

                duration_eval = time.time() - start_time_eval
                logger.info('Evaluation time: %.2f min' % (duration_eval / 60))

                # Early stopping
                if optimizer.is_early_stop:
                    break

                # Convert to fine-tuning stage
                if optimizer.n_epochs == args.convert_to_sgd_epoch:
                    optimizer.convert_to_sgd(model, args.lr, args.weight_decay,
                                             decay_type='always',
                                             decay_rate=0.5)

            # Fresh progress bar for the next epoch.
            pbar_epoch = tqdm(total=len(train_set))

            if optimizer.n_epochs == args.n_epochs:
                break

            start_time_step = time.time()
            start_time_epoch = time.time()

    duration_train = time.time() - start_time_train
    logger.info('Total time: %.2f hour' % (duration_train / 3600))

    reporter.tf_writer.close()
    pbar_epoch.close()

    return save_path
def main():
    """Train a language model (legacy single-node variant).

    Builds train/dev/eval datasets, constructs the LM via select_lm and an
    LR-scheduled optimizer (optionally resuming from a checkpoint dict), then
    runs the BPTT training loop with token-based gradient accumulation,
    periodic dev-loss logging, per-epoch perplexity evaluation with manual
    best-score tracking, conversion to SGD fine-tuning, and patience-based
    early stopping.

    Returns:
        save_path (str): directory where the model and logs are saved.

    NOTE(review): this body was reconstructed from a whitespace-collapsed
    source; statement order is preserved but some nesting was inferred.
    """
    args = parse()

    # Load a conf file
    # (when resuming, restore every saved hyperparameter except 'resume' itself)
    if args.resume:
        conf = load_config(
            os.path.join(os.path.dirname(args.resume), 'conf.yml'))
        for k, v in conf.items():
            if k != 'resume':
                setattr(args, k, v)

    # Load dataset
    # Effective batch size scales with the number of GPUs.
    train_set = Dataset(corpus=args.corpus,
                        tsv_path=args.train_set,
                        dict_path=args.dict,
                        nlsyms=args.nlsyms,
                        unit=args.unit,
                        wp_model=args.wp_model,
                        batch_size=args.batch_size * args.n_gpus,
                        n_epochs=args.n_epochs,
                        min_n_tokens=args.min_n_tokens,
                        bptt=args.bptt,
                        backward=args.backward,
                        serialize=args.serialize)
    dev_set = Dataset(corpus=args.corpus,
                      tsv_path=args.dev_set,
                      dict_path=args.dict,
                      nlsyms=args.nlsyms,
                      unit=args.unit,
                      wp_model=args.wp_model,
                      batch_size=args.batch_size * args.n_gpus,
                      bptt=args.bptt,
                      backward=args.backward,
                      serialize=args.serialize)
    eval_sets = []
    for s in args.eval_sets:
        eval_sets += [
            Dataset(corpus=args.corpus,
                    tsv_path=s,
                    dict_path=args.dict,
                    nlsyms=args.nlsyms,
                    unit=args.unit,
                    wp_model=args.wp_model,
                    batch_size=1,
                    bptt=args.bptt,
                    backward=args.backward,
                    serialize=args.serialize)
        ]
    args.vocab = train_set.vocab

    # Set save path
    if args.resume:
        save_path = os.path.dirname(args.resume)
        dir_name = os.path.basename(save_path)
    else:
        dir_name = set_lm_name(args)
        save_path = mkdir_join(
            args.model_save_dir,
            '_'.join(os.path.basename(args.train_set).split('.')[:-1]),
            dir_name)
        save_path = set_save_path(save_path)  # avoid overwriting

    # Set logger
    logger = set_logger(os.path.join(save_path, 'train.log'), key='training')

    # Model setting
    model = select_lm(args, save_path)

    if args.resume:
        # Set optimizer: fall back to SGD if the checkpoint is past the
        # configured Adam->SGD conversion epoch.
        epoch = int(args.resume.split('-')[-1])
        optimizer = set_optimizer(
            model,
            optimizer='sgd' if epoch > conf['convert_to_sgd_epoch'] else conf['optimizer'],
            lr=float(conf['learning_rate']),  # on-the-fly
            weight_decay=float(conf['weight_decay']))

        # Restore the last saved model; the checkpoint dict supersedes the
        # freshly built optimizer and counters above.
        model, checkpoint = load_checkpoint(model, args.resume, resume=True)
        optimizer = checkpoint['optimizer']
        epoch = checkpoint['epoch']
        step = checkpoint['step']
        ppl_dev_best = checkpoint['metric_dev_best']

        # Resume between convert_to_sgd_epoch -1 and convert_to_sgd_epoch
        if epoch == conf['convert_to_sgd_epoch']:
            optimizer = set_optimizer(model, optimizer='sgd',
                                      lr=float(args.learning_rate),
                                      weight_decay=float(conf['weight_decay']))
            optimizer = LRScheduler(optimizer,
                                    lr_max=args.learning_rate,
                                    decay_type='epoch',
                                    decay_start_epoch=0,
                                    decay_rate=0.5,
                                    lower_better=True)
            logger.info('========== Convert to SGD ==========')
    else:
        # Save the conf file as a yaml file
        save_config(vars(args), os.path.join(save_path, 'conf.yml'))

        # Save the nlsyms, dictionary, and wp_model
        if args.nlsyms:
            shutil.copy(args.nlsyms, os.path.join(save_path, 'nlsyms.txt'))
        shutil.copy(args.dict, os.path.join(save_path, 'dict.txt'))
        if args.unit == 'wp':
            shutil.copy(args.wp_model, os.path.join(save_path, 'wp.model'))

        for k, v in sorted(vars(args).items(), key=lambda x: x[0]):
            logger.info('%s: %s' % (k, str(v)))

        # Count total parameters
        for n in sorted(list(model.num_params_dict.keys())):
            nparams = model.num_params_dict[n]
            logger.info("%s %d" % (n, nparams))
        logger.info("Total %.2f M parameters" %
                    (model.total_parameters / 1000000))
        logger.info(model)

        epoch, step = 0, 0
        ppl_dev_best = 10000  # sentinel: any real dev perplexity beats it

        # Set optimizer
        optimizer = set_optimizer(model,
                                  optimizer=args.optimizer,
                                  lr=float(args.learning_rate),
                                  weight_decay=float(args.weight_decay))

        # Wrap optimizer by learning rate scheduler
        # NOTE(review): placed inside the non-resume branch so the resumed
        # checkpoint optimizer is not double-wrapped — confirm against the
        # original layout (the collapsed source is ambiguous here).
        optimizer = LRScheduler(
            optimizer,
            lr_max=float(args.learning_rate),
            decay_type=args.decay_type,
            decay_start_epoch=args.decay_start_epoch,
            decay_rate=args.decay_rate,
            decay_patient_n_epochs=args.decay_patient_n_epochs,
            lower_better=True,
            best_value=ppl_dev_best,
            model_size=args.d_model,
            warmup_start_lr=args.warmup_start_learning_rate,
            warmup_n_steps=args.warmup_n_steps,
            lr_factor=args.learning_rate_factor,
            noam=args.lm_type == 'transformer')

    # GPU setting
    if args.n_gpus >= 1:
        model = CustomDataParallel(model,
                                   device_ids=list(range(0, args.n_gpus, 1)),
                                   deterministic=False,
                                   benchmark=True)
        model.cuda()

    logger.info('PID: %s' % os.getpid())
    logger.info('USERNAME: %s' % os.uname()[1])

    # Set process name
    if args.job_name:
        setproctitle(args.job_name)
    else:
        setproctitle(dir_name)

    # Set reporter
    reporter = Reporter(save_path, tensorboard=True)

    hidden = None  # recurrent state carried across BPTT chunks
    start_time_train = time.time()
    start_time_epoch = time.time()
    start_time_step = time.time()
    not_improved_n_epochs = 0
    pbar_epoch = tqdm(total=len(train_set))
    accum_n_tokens = 0
    while True:
        # Compute loss in the training set
        ys_train, is_new_epoch = train_set.next()
        accum_n_tokens += sum([len(y) for y in ys_train])
        optimizer.zero_grad()
        loss, hidden, reporter = model(ys_train, hidden, reporter)
        # loss /= args.accum_grad_n_steps
        # DataParallel returns one loss per device; backprop a vector of ones
        # to reduce them.
        if len(model.device_ids) > 1:
            loss.backward(torch.ones(len(model.device_ids)))
        else:
            loss.backward()
        loss.detach()  # Truncate the graph
        # Step only once enough tokens have been accumulated
        # (accum_grad_n_tokens == 0 disables accumulation).
        if args.accum_grad_n_tokens == 0 or accum_n_tokens >= args.accum_grad_n_tokens:
            if args.clip_grad_norm > 0:
                torch.nn.utils.clip_grad_norm_(model.module.parameters(),
                                               args.clip_grad_norm)
            optimizer.step()
            optimizer.zero_grad()
            accum_n_tokens = 0
        loss_train = loss.item()
        del loss
        # Detach hidden state so the next chunk does not backprop through
        # the previous one.
        hidden = model.module.repackage_state(hidden)
        reporter.step()
        # step += args.n_gpus

        if step % args.print_step == 0:
            # Compute loss in the dev set
            ys_dev = dev_set.next()[0]
            loss, _, reporter = model(ys_dev, None, reporter, is_eval=True)
            loss_dev = loss.item()
            del loss
            reporter.step(is_eval=True)

            duration_step = time.time() - start_time_step
            logger.info(
                "step:%d(ep:%.2f) loss:%.3f(%.3f)/ppl:%.3f(%.3f)/lr:%.5f/bs:%d (%.2f min)"
                % (step, epoch + train_set.epoch_detail,
                   loss_train, loss_dev,
                   np.exp(loss_train), np.exp(loss_dev),
                   optimizer.lr, ys_train.shape[0], duration_step / 60))
            start_time_step = time.time()
        step += args.n_gpus
        # Progress is measured in predicted tokens (batch x (len - 1)).
        pbar_epoch.update(ys_train.shape[0] * (ys_train.shape[1] - 1))

        # Save figures of loss and accuracy
        if step % (args.print_step * 10) == 0:
            reporter.snapshot()
            if args.lm_type == 'transformer':
                model.module.plot_attention()

        # Save checkpoint and evaluate model per epoch
        if is_new_epoch:
            epoch += 1
            duration_epoch = time.time() - start_time_epoch
            logger.info('========== EPOCH:%d (%.2f min) =========='
                        % (epoch, duration_epoch / 60))

            if epoch < args.eval_start_epoch:
                # Save the model
                save_checkpoint(
                    model, save_path, optimizer, epoch, step, ppl_dev_best,
                    remove_old_checkpoints=args.lm_type != 'transformer')
            else:
                start_time_eval = time.time()
                # dev
                ppl_dev, _ = eval_ppl([model.module], dev_set,
                                      batch_size=1, bptt=args.bptt)
                logger.info('PPL (%s): %.2f' % (dev_set.set, ppl_dev))

                # Update learning rate
                optimizer.decay(epoch=epoch, value=ppl_dev)

                if ppl_dev < ppl_dev_best:
                    ppl_dev_best = ppl_dev
                    not_improved_n_epochs = 0
                    logger.info('||||| Best Score |||||')

                    # Save the model
                    save_checkpoint(
                        model, save_path, optimizer, epoch, step, ppl_dev_best,
                        remove_old_checkpoints=args.lm_type != 'transformer')

                    # test (only evaluated when the dev score improves)
                    ppl_test_avg = 0.
                    for eval_set in eval_sets:
                        ppl_test, _ = eval_ppl([model.module], eval_set,
                                               batch_size=1, bptt=args.bptt)
                        logger.info('PPL (%s): %.2f' % (eval_set.set, ppl_test))
                        ppl_test_avg += ppl_test
                    if len(eval_sets) > 0:
                        logger.info('PPL (avg.): %.2f'
                                    % (ppl_test_avg / len(eval_sets)))
                else:
                    not_improved_n_epochs += 1

                duration_eval = time.time() - start_time_eval
                logger.info('Evaluation time: %.2f min' % (duration_eval / 60))

                # Early stopping
                if not_improved_n_epochs == args.not_improved_patient_n_epochs:
                    break

                # Convert to fine-tuning stage
                if epoch == args.convert_to_sgd_epoch:
                    optimizer = set_optimizer(model, optimizer='sgd',
                                              lr=args.learning_rate,
                                              weight_decay=float(
                                                  args.weight_decay))
                    optimizer = LRScheduler(optimizer,
                                            lr_max=args.learning_rate,
                                            decay_type='epoch',
                                            decay_start_epoch=0,
                                            decay_rate=0.5,
                                            lower_better=True)
                    logger.info('========== Convert to SGD ==========')

            # Fresh progress bar for the next epoch.
            pbar_epoch = tqdm(total=len(train_set))

            if epoch == args.n_epochs:
                break

            start_time_step = time.time()
            start_time_epoch = time.time()

    duration_train = time.time() - start_time_train
    logger.info('Total time: %.2f hour' % (duration_train / 3600))

    if reporter.tensorboard:
        reporter.tf_writer.close()
    pbar_epoch.close()

    return save_path