def main():
    """
    Launches data-parallel multi-gpu training.

    Parses the command line, optionally joins the distributed process group,
    builds the tokenizer, datasets, GNMT model, data loaders, translator and
    trainer, optionally restores a checkpoint and then runs the epoch loop.
    Validation and checkpointing happen on rank 0 only; the translator is run
    on every rank unless evaluation is disabled.
    """
    args = parse_args()

    if not args.cudnn:
        torch.backends.cudnn.enabled = False
    if args.seed is not None:
        # the seed is offset by the rank of the worker
        torch.manual_seed(args.seed + args.rank)

    # initialize distributed backend (multi-process runs only)
    distributed = args.world_size > 1
    if distributed:
        dist.init_process_group(backend='nccl' if args.cuda else 'gloo',
                                rank=args.rank,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # create directory for results and expose it on args
    save_path = os.path.join(args.results_dir, args.save)
    args.save_path = save_path
    os.makedirs(save_path, exist_ok=True)

    # setup logging, one log file per rank
    setup_logging(os.path.join(save_path, f'log_gpu_{args.rank}.log'))
    logging.info(f'Saving results to: {save_path}')
    logging.info(f'Run arguments: {args}')

    if args.cuda:
        torch.cuda.set_device(args.rank)

    def dataset_file(fname):
        # resolve a file name relative to the dataset directory
        return os.path.join(args.dataset_dir, fname)

    # build tokenizer
    tokenizer = Tokenizer(dataset_file(config.VOCAB_FNAME))

    # build datasets
    train_data = ParallelDataset(
        src_fname=dataset_file(config.SRC_TRAIN_FNAME),
        tgt_fname=dataset_file(config.TGT_TRAIN_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        sort=False,
        max_size=args.max_size)

    val_data = ParallelDataset(
        src_fname=dataset_file(config.SRC_VAL_FNAME),
        tgt_fname=dataset_file(config.TGT_VAL_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_val,
        max_len=args.max_length_val,
        sort=True)

    test_data = TextDataset(
        src_fname=dataset_file(config.SRC_TEST_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_test,
        max_len=args.max_length_test,
        sort=False)

    vocab_size = tokenizer.vocab_size

    # build GNMT model; extra settings come from the --model-config literal
    model_config = dict(vocab_size=vocab_size, math=args.math,
                        **literal_eval(args.model_config))
    model = GNMT(**model_config)
    logging.info(model)

    batch_first = model.batch_first

    # define loss function (criterion) and optimizer configuration
    criterion = build_criterion(vocab_size, config.PAD, args.smoothing)
    opt_config = literal_eval(args.optimization_config)
    logging.info(f'Training optimizer: {opt_config}')

    num_parameters = sum(param.nelement() for param in model.parameters())
    logging.info(f'Number of parameters: {num_parameters}')

    # get data loaders
    train_loader = train_data.get_loader(batch_size=args.batch_size,
                                         batch_first=batch_first,
                                         shuffle=True,
                                         bucketing=args.bucketing,
                                         num_workers=args.workers,
                                         drop_last=True)

    val_loader = val_data.get_loader(batch_size=args.val_batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     num_workers=args.workers,
                                     drop_last=False)

    test_loader = test_data.get_loader(batch_size=args.test_batch_size,
                                       batch_first=batch_first,
                                       shuffle=False,
                                       num_workers=args.workers,
                                       drop_last=False)

    translator = Translator(model=model,
                            tokenizer=tokenizer,
                            loader=test_loader,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_length_test,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda,
                            print_freq=args.print_freq,
                            dataset_dir=args.dataset_dir,
                            target_bleu=args.target_bleu,
                            save_path=args.save_path)

    # create trainer
    trainer = trainers.Seq2SeqTrainer(
        criterion=criterion,
        grad_clip=args.grad_clip,
        save_path=save_path,
        save_freq=args.save_freq,
        save_info={'config': args, 'tokenizer': tokenizer},
        opt_config=opt_config,
        batch_first=batch_first,
        keep_checkpoints=args.keep_checkpoints,
        math=args.math,
        print_freq=args.print_freq,
        cuda=args.cuda,
        distributed=distributed,
        intra_epoch_eval=args.intra_epoch_eval,
        translator=translator,
        model=model)

    # optionally resume from a checkpoint; a directory means "best model"
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(checkpoint_file, 'model_best.pth')
        if not os.path.isfile(checkpoint_file):
            logging.error(f'No checkpoint found at {args.resume}')
        else:
            trainer.load(checkpoint_file)

    # training loop
    best_loss = float('inf')
    for epoch in range(args.start_epoch, args.epochs):
        logging.info(f'Starting epoch {epoch}')

        if distributed:
            train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss, train_perf = trainer.optimize(train_loader)

        run_eval = not args.disable_eval
        master_eval = args.rank == 0 and run_eval

        # evaluate on validation set and checkpoint (rank 0 only)
        if master_eval:
            logging.info(f'Running validation on dev set')
            val_loss, val_perf = trainer.evaluate(val_loader)

            # track the lowest validation loss seen so far
            is_best = val_loss < best_loss
            best_loss = min(val_loss, best_loss)
            trainer.save(save_all=args.save_all, is_best=is_best)

        # the translator also reports whether training should stop
        break_training = False
        if run_eval:
            test_bleu, break_training = translator.run(calc_bleu=True,
                                                       epoch=epoch)

        if master_eval:
            summary = (f'Summary: Epoch: {epoch}\t'
                       f'Training Loss: {train_loss:.4f}\t'
                       f'Validation Loss: {val_loss:.4f}\t'
                       f'Test BLEU: {test_bleu:.2f}')
            performance = (f'Performance: Epoch: {epoch}\t'
                           f'Training: {train_perf:.0f} Tok/s\t'
                           f'Validation: {val_perf:.0f} Tok/s')
        else:
            summary = (f'Summary: Epoch: {epoch}\t'
                       f'Training Loss {train_loss:.4f}')
            performance = (f'Performance: Epoch: {epoch}\t'
                           f'Training: {train_perf:.0f} Tok/s')
        logging.info(summary)
        logging.info(performance)

        logging.info(f'Finished epoch {epoch}')
        if break_training:
            break
def main():
    """
    Launches data-parallel multi-gpu training.

    Sets up the device, distributed state, logging and seeds through the
    ``utils`` helpers, builds the tokenizer, datasets, GNMT model, loaders,
    translator and trainer, optionally restores a checkpoint and runs the
    epoch loop. After training it logs the total wall time, writes a summary
    table on rank 0 and exits with status 1 when ``utils.benchmark`` reports
    a failure.

    If the epoch range is empty (``args.start_epoch >= args.epochs``) there
    are no throughput measurements, so the summary and the benchmark check
    are skipped with a warning instead of failing on an empty average.
    """
    training_start = time.time()
    args = parse_args()
    device = utils.set_device(args.cuda, args.local_rank)
    utils.init_distributed(args.cuda)
    args.rank = utils.get_rank()

    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # create directory for results
    os.makedirs(args.save_dir, exist_ok=True)

    # setup logging, one log file per rank
    log_filename = f'log_rank_{utils.get_rank()}.log'
    utils.setup_logging(args.log_all_ranks,
                        os.path.join(args.save_dir, log_filename))

    if args.env:
        utils.log_env_info()

    logging.info(f'Saving results to: {args.save_dir}')
    logging.info(f'Run arguments: {args}')

    args.train_iter_size = set_iter_size(args.train_iter_size,
                                         args.train_global_batch_size,
                                         args.train_batch_size)

    worker_seeds, shuffling_seeds = utils.setup_seeds(args.seed,
                                                      args.epochs,
                                                      device)
    worker_seed = worker_seeds[args.rank]
    logging.info(f'Worker {args.rank} is using worker seed: {worker_seed}')
    torch.manual_seed(worker_seed)

    # build tokenizer
    pad_vocab = utils.pad_vocabulary(args.math)
    tokenizer = Tokenizer(args.vocab, args.bpe_codes, args.lang, pad_vocab)

    # build datasets
    train_data = LazyParallelDataset(
        src_fname=args.train_src,
        tgt_fname=args.train_tgt,
        tokenizer=tokenizer,
        min_len=args.train_min_length,
        max_len=args.train_max_length,
        sort=False,
        max_size=args.train_max_size,
        )

    val_data = ParallelDataset(
        src_fname=args.val_src,
        tgt_fname=args.val_tgt,
        tokenizer=tokenizer,
        min_len=args.val_min_length,
        max_len=args.val_max_length,
        sort=True,
        )

    test_data = TextDataset(
        src_fname=args.test_src,
        tokenizer=tokenizer,
        min_len=args.test_min_length,
        max_len=args.test_max_length,
        sort=True,
        )

    vocab_size = tokenizer.vocab_size

    # build GNMT model
    model_config = {
        'hidden_size': args.hidden_size,
        'vocab_size': vocab_size,
        'num_layers': args.num_layers,
        'dropout': args.dropout,
        'batch_first': False,
        'share_embedding': args.share_embedding,
        }
    model = GNMT(**model_config).to(device)
    logging.info(model)

    batch_first = model.batch_first

    # define loss function (criterion) and optimizer
    criterion = build_criterion(vocab_size, config.PAD,
                                args.smoothing).to(device)

    opt_config = {'optimizer': args.optimizer, 'lr': args.lr}
    # extra optimizer settings arrive as a Python literal on the command line
    opt_config.update(literal_eval(args.optimizer_extra))
    logging.info(f'Training optimizer config: {opt_config}')

    scheduler_config = {
        'warmup_steps': args.warmup_steps,
        'remain_steps': args.remain_steps,
        'decay_interval': args.decay_interval,
        'decay_steps': args.decay_steps,
        'decay_factor': args.decay_factor,
        }
    logging.info(f'Training LR schedule config: {scheduler_config}')

    num_parameters = sum(p.nelement() for p in model.parameters())
    logging.info(f'Number of parameters: {num_parameters}')

    batching_opt = {
        'shard_size': args.shard_size,
        'num_buckets': args.num_buckets,
        }

    # get data loaders
    train_loader = train_data.get_loader(
        batch_size=args.train_batch_size,
        seeds=shuffling_seeds,
        batch_first=batch_first,
        shuffle=True,
        batching=args.batching,
        batching_opt=batching_opt,
        num_workers=args.train_loader_workers,
        )

    val_loader = val_data.get_loader(
        batch_size=args.val_batch_size,
        batch_first=batch_first,
        shuffle=False,
        num_workers=args.val_loader_workers,
        )

    test_loader = test_data.get_loader(
        batch_size=args.test_batch_size,
        batch_first=batch_first,
        shuffle=False,
        pad=True,
        num_workers=args.test_loader_workers,
        )

    translator = Translator(
        model=model,
        tokenizer=tokenizer,
        loader=test_loader,
        beam_size=args.beam_size,
        max_seq_len=args.test_max_length,
        len_norm_factor=args.len_norm_factor,
        len_norm_const=args.len_norm_const,
        cov_penalty_factor=args.cov_penalty_factor,
        print_freq=args.print_freq,
        reference=args.test_tgt,
        )

    # create trainer
    # number of batches per epoch divided by the iteration size, over all
    # epochs
    total_train_iters = len(train_loader) // args.train_iter_size * args.epochs

    save_info = {
        'model_config': model_config,
        'config': args,
        'tokenizer': tokenizer.get_state(),
        }
    loss_scaling = {
        'init_scale': args.init_scale,
        'upscale_interval': args.upscale_interval,
        }
    trainer_options = dict(
        model=model,
        criterion=criterion,
        grad_clip=args.grad_clip,
        iter_size=args.train_iter_size,
        save_dir=args.save_dir,
        save_freq=args.save_freq,
        save_info=save_info,
        opt_config=opt_config,
        scheduler_config=scheduler_config,
        train_iterations=total_train_iters,
        keep_checkpoints=args.keep_checkpoints,
        math=args.math,
        loss_scaling=loss_scaling,
        print_freq=args.print_freq,
        intra_epoch_eval=args.intra_epoch_eval,
        translator=translator,
        prealloc_mode=args.prealloc_mode,
        )

    trainer = trainers.Seq2SeqTrainer(**trainer_options)

    # optionally resume from a checkpoint; a directory means "best model"
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(checkpoint_file, 'model_best.pth')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            logging.error(f'No checkpoint found at {args.resume}')

    # training loop
    best_loss = float('inf')
    training_perf = []
    break_training = False
    test_bleu = None
    for epoch in range(args.start_epoch, args.epochs):
        logging.info(f'Starting epoch {epoch}')

        train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss, train_perf = trainer.optimize(train_loader)
        training_perf.append(train_perf)

        # evaluate on validation set
        if args.eval:
            logging.info('Running validation on dev set')
            val_loss, val_perf = trainer.evaluate(val_loader)

            # remember the best validation loss and save checkpoint
            # (rank 0 only)
            if args.rank == 0:
                is_best = val_loss < best_loss
                best_loss = min(val_loss, best_loss)
                trainer.save(save_all=args.save_all, is_best=is_best)

        if args.eval:
            utils.barrier()
            eval_fname = f'eval_epoch_{epoch}'
            eval_path = os.path.join(args.save_dir, eval_fname)
            _, eval_stats = translator.run(
                calc_bleu=True,
                epoch=epoch,
                eval_path=eval_path,
                )
            test_bleu = eval_stats['bleu']
            if args.target_bleu and test_bleu >= args.target_bleu:
                logging.info('Target accuracy reached')
                break_training = True

        acc_log = [
            f'Summary: Epoch: {epoch}',
            f'Training Loss: {train_loss:.4f}',
            ]
        perf_log = [
            f'Performance: Epoch: {epoch}',
            f'Training: {train_perf:.0f} Tok/s',
            ]
        if args.eval:
            acc_log += [f'Validation Loss: {val_loss:.4f}']
            acc_log += [f'Test BLEU: {test_bleu:.2f}']
            perf_log += [f'Validation: {val_perf:.0f} Tok/s']

        if args.rank == 0:
            logging.info('\t'.join(acc_log))
            logging.info('\t'.join(perf_log))

        logging.info(f'Finished epoch {epoch}')
        if break_training:
            break

    utils.barrier()
    training_stop = time.time()
    training_time = training_stop - training_start
    logging.info(f'Total training time {training_time:.0f} s')

    if not training_perf:
        # Empty epoch range: there is no throughput to average and
        # `train_perf` was never assigned, so the summary below would fail
        # with ZeroDivisionError / NameError.
        logging.warning('No training epochs were executed; '
                        'skipping training summary and benchmark check')
        return

    table = TrainingTable()
    avg_training_perf = sum(training_perf) / len(training_perf)
    table.add(utils.get_world_size(), args.train_batch_size, test_bleu,
              avg_training_perf, training_time)
    if utils.get_rank() == 0:
        table.write('Training Summary', args.math)

    # the benchmark check uses the throughput of the last completed epoch
    passed = utils.benchmark(test_bleu, args.target_bleu,
                             train_perf, args.target_perf)
    if not passed:
        sys.exit(1)
def main():
    """
    Train a GNMT model with optional profiling and per-epoch BLEU scoring.

    Parses the command line, initializes the distributed backend for
    multi-process runs, builds the tokenizer, datasets, model, trainer,
    translator and data loaders, optionally restores a checkpoint and runs
    the epoch loop.

    The loop stops after the first epoch when profiling is enabled or when
    ``args.num_minibatches`` is positive. Otherwise rank 0 validates, saves a
    checkpoint, translates the test set, detokenizes it with the Moses
    script and scores it with ``sacrebleu``; reaching ``args.target_bleu``
    sets a stop flag that is broadcast to all ranks.
    """
    args = parse_args()
    print(args)

    # profiling output directory is only used when profiling is requested
    profile_dir = args.profile_dir
    if not args.profile:
        profile_dir = None

    if not args.cudnn:
        torch.backends.cudnn.enabled = False
    # `is not None` so that an explicit seed of 0 is honoured as well
    if args.seed is not None:
        torch.manual_seed(args.seed + args.rank)

    if args.cuda:
        torch.cuda.set_device(args.gpu_rank)

    # initialize distributed backend
    distributed = args.world_size > 1
    if distributed:
        print("init process group")
        backend = 'nccl' if args.cuda else 'gloo'
        dist.init_process_group(backend=backend,
                                rank=args.rank,
                                init_method=args.dist_url,
                                world_size=args.world_size)
        print("distributed backend initialized")

    # create directory for results; exist_ok avoids the check-then-create
    # race when several ranks start at the same time
    save_path = os.path.join(args.results_dir, args.save)
    os.makedirs(save_path, exist_ok=True)

    # setup logging, one log file per rank
    log_filename = 'log_gpu_{}.log'.format(args.rank)
    setup_logging(os.path.join(save_path, log_filename))
    logging.info('Saving results to: {}'.format(save_path))
    logging.info('Run arguments: {}'.format(args))

    # build tokenizer
    tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME))

    # build datasets
    train_data = ParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        sort=False,
        max_size=args.max_size)

    val_data = ParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_VAL_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_VAL_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_val,
        max_len=args.max_length_val,
        sort=True)

    # the test set reuses the validation length limits
    test_data = ParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TEST_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TEST_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_val,
        max_len=args.max_length_val,
        sort=False)

    vocab_size = tokenizer.vocab_size

    # build GNMT model
    model_config = dict(vocab_size=vocab_size, math=args.math,
                        **literal_eval(args.model_config))
    model = models.GNMT(**model_config)
    logging.info(model)

    batch_first = model.batch_first

    # define loss function (criterion) and optimizer
    criterion = build_criterion(vocab_size, config.PAD, args.smoothing)
    opt_config = literal_eval(args.optimization_config)
    logging.info('Training optimizer: {}'.format(opt_config))

    # create trainer
    trainer_options = dict(
        criterion=criterion,
        grad_clip=args.grad_clip,
        save_path=save_path,
        save_freq=args.save_freq,
        save_info={'config': args, 'tokenizer': tokenizer},
        opt_config=opt_config,
        batch_first=batch_first,
        keep_checkpoints=args.keep_checkpoints,
        math=args.math,
        print_freq=args.print_freq,
        cuda=args.cuda,
        distributed=distributed,
        log_dir=profile_dir,
        num_minibatches=args.num_minibatches,
        cupti=args.cupti)

    trainer_options['model'] = model
    trainer = trainers.Seq2SeqTrainer(**trainer_options)

    translator = Translator(model,
                            tokenizer,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_length_val,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda)

    num_parameters = sum(p.nelement() for p in model.parameters())
    logging.info('Number of parameters: {}'.format(num_parameters))

    # optionally resume from a checkpoint; a directory means "best model"
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(checkpoint_file, 'model_best.pth')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            logging.error('No checkpoint found at {}'.format(args.resume))

    # get data loaders
    train_loader = train_data.get_loader(batch_size=args.batch_size,
                                         batch_first=batch_first,
                                         shuffle=True,
                                         bucket=args.bucketing,
                                         num_workers=args.workers,
                                         drop_last=True,
                                         distributed=distributed,
                                         log_dir=profile_dir)

    val_loader = val_data.get_loader(batch_size=args.eval_batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     num_workers=args.workers,
                                     drop_last=False,
                                     distributed=False)

    test_loader = test_data.get_loader(batch_size=args.eval_batch_size,
                                       batch_first=batch_first,
                                       shuffle=False,
                                       num_workers=0,
                                       drop_last=False,
                                       distributed=False)

    # training loop
    best_loss = float('inf')
    for epoch in range(args.start_epoch, args.epochs):
        logging.info('Starting epoch {}'.format(epoch))

        if distributed:
            train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss = trainer.optimize(train_loader)

        # profiling and minibatch-limited runs stop after a single epoch
        if args.profile:
            print("profiling finished...")
            break
        if args.num_minibatches > 0:
            break

        # evaluate on validation set
        if args.rank == 0 and not args.disable_eval:
            logging.info('Running validation on dev set')
            val_loss = trainer.evaluate(val_loader)

            # remember the best validation loss and save checkpoint
            is_best = val_loss < best_loss
            best_loss = min(val_loss, best_loss)
            trainer.save(save_all=args.save_all, is_best=is_best)

            logging.info(
                'Epoch: {}\tTraining Loss {:.4f}\tValidation Loss {:.4f}'
                .format(epoch, train_loss, val_loss))
        else:
            logging.info('Epoch: {}\tTraining Loss {:.4f}'.format(
                epoch, train_loss))

        # one-element tensor so the stop flag can be broadcast across ranks
        if args.cuda:
            break_training = torch.cuda.LongTensor([0])
        else:
            break_training = torch.LongTensor([0])

        if args.rank == 0 and not args.disable_eval:
            logging.info('Running evaluation on test set')

            model.eval()
            torch.cuda.empty_cache()

            eval_path = os.path.join(save_path,
                                     'eval_epoch_{}'.format(epoch))
            beam_size = args.beam_size

            # context manager closes the file even if decoding raises
            with open(eval_path, 'w') as eval_file:
                for src, _, indices in test_loader:
                    src, src_length = src

                    if translator.batch_first:
                        batch_size = src.size(0)
                    else:
                        batch_size = src.size(1)

                    # one start token per beam of every sentence
                    bos = [translator.insert_target_start] * (
                        batch_size * beam_size)
                    bos = torch.LongTensor(bos)
                    if translator.batch_first:
                        bos = bos.view(-1, 1)
                    else:
                        bos = bos.view(1, -1)

                    src_length = torch.LongTensor(src_length)

                    if args.cuda:
                        src = src.cuda()
                        src_length = src_length.cuda()
                        bos = bos.cuda()

                    with torch.no_grad():
                        context = translator.model.encode(src, src_length)
                        context = [context, src_length, None]

                        if beam_size == 1:
                            generator = translator.generator.greedy_search
                        else:
                            generator = translator.generator.beam_search
                        preds, lengths, counter = generator(batch_size, bos,
                                                            context)

                    preds = preds.cpu()
                    lengths = lengths.cpu()

                    output = []
                    for idx, pred in enumerate(preds):
                        # slice off the first token and the tail given by
                        # the reported length
                        end = lengths[idx] - 1
                        pred = pred[1:end].tolist()
                        output.append(translator.tok.detokenize(pred))

                    # reorder the batch by the positions given in `indices`
                    output = [output[indices.index(pos)]
                              for pos in range(len(output))]
                    for line in output:
                        eval_file.write(line)
                        eval_file.write('\n')

            # run moses detokenizer
            detok_path = os.path.join(args.dataset_dir, config.DETOKENIZER)
            detok_eval_path = eval_path + '.detok'

            with open(detok_eval_path, 'w') as detok_eval_file, \
                    open(eval_path, 'r') as eval_file:
                subprocess.run(['perl', '{}'.format(detok_path)],
                               stdin=eval_file,
                               stdout=detok_eval_file,
                               stderr=subprocess.DEVNULL)

            # run sacrebleu; an argument list (no shell) keeps paths with
            # spaces or shell metacharacters intact
            reference_path = os.path.join(args.dataset_dir,
                                          config.TGT_TEST_TARGET_FNAME)
            sacrebleu = subprocess.run(
                ['sacrebleu', '--input', detok_eval_path, reference_path,
                 '--score-only', '-lc', '--tokenize', 'intl'],
                stdout=subprocess.PIPE)
            bleu = float(sacrebleu.stdout.strip())
            logging.info('Finished evaluation on test set')
            logging.info('BLEU on test dataset: {}'.format(bleu))

            if args.target_bleu:
                if bleu >= args.target_bleu:
                    logging.info('Target accuracy reached')
                    break_training[0] = 1

            torch.cuda.empty_cache()

        # propagate rank 0's stop decision to every worker
        if distributed:
            dist.broadcast(break_training, 0)

        logging.info('Finished epoch {}'.format(epoch))
        if break_training:
            break
def main():
    """
    Launches data-parallel multi-gpu training.

    Selects the device, joins the NCCL process group when ``WORLD_SIZE`` in
    the environment is greater than one, sets up logging and seeds (master
    seed, per-worker seeds, per-epoch shuffling seeds), builds the tokenizer,
    datasets, GNMT model, loaders, translator and trainer, optionally
    restores a checkpoint, trains for a single epoch and finally saves a
    checkpoint from rank 0 unless evaluation is disabled. MLPerf tags are
    emitted through ``gnmt_print`` along the way.
    """
    mlperf_log.ROOT_DIR_GNMT = os.path.dirname(os.path.abspath(__file__))
    mlperf_log.LOGGER.propagate = False

    args = parse_args()

    if args.cuda:
        torch.cuda.set_device(args.local_rank)
    device = torch.device('cuda' if args.cuda else 'cpu')

    # initialize distributed backend
    if 'WORLD_SIZE' in os.environ:
        distributed = int(os.environ['WORLD_SIZE']) > 1
    else:
        distributed = False

    if distributed:
        assert args.cuda
        # Initialize distributed communication
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        assert torch.distributed.is_initialized()

    gnmt_print(key=mlperf_log.RUN_START)

    args.rank = get_rank()

    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # create directory for results and expose it on args
    save_path = os.path.join(args.results_dir, args.save)
    args.save_path = save_path
    os.makedirs(save_path, exist_ok=True)

    # setup logging, one log file per rank
    setup_logging(os.path.join(save_path, f'log_gpu_{args.rank}.log'))
    logging.info(f'Saving results to: {save_path}')
    logging.info(f'Run arguments: {args}')

    # setup L2 promotion
    if args.cuda:
        l2_promote()

    gnmt_print(key=mlperf_log.RUN_SET_RANDOM_SEED)
    # https://github.com/mlperf/policies/issues/120#issuecomment-431111348
    if args.seed is not None:
        # master seed was specified from command line
        master_seed = args.seed
        logging.info(f'Using master seed from command line: {master_seed}')
    else:
        # random master seed, random.SystemRandom() uses /dev/urandom on Unix
        master_seed = random.SystemRandom().randint(0, 2**32 - 1)
        if get_rank() == 0:
            # master seed is reported only from rank=0 worker, it's to avoid
            # confusion, seeds from rank=0 are later broadcasted to other
            # workers
            logging.info(f'Using random master seed: {master_seed}')

    # initialize seeding RNG
    seeding_rng = random.Random(master_seed)

    # generate worker seeds, one seed for every distributed worker
    worker_seeds = generate_seeds(seeding_rng, get_world_size())

    # generate seeds for data shuffling, one seed for every epoch
    shuffling_seeds = generate_seeds(seeding_rng, args.epochs)

    # broadcast seeds from rank=0 to other workers
    worker_seeds = broadcast_seeds(worker_seeds, device)
    shuffling_seeds = broadcast_seeds(shuffling_seeds, device)

    # set worker seed
    worker_seed = worker_seeds[args.rank]
    logging.info(f'Worker {args.rank} is using worker seed: {worker_seed}')
    torch.manual_seed(worker_seed)

    def dataset_file(fname):
        # resolve a file name relative to the dataset directory
        return os.path.join(args.dataset_dir, fname)

    # build tokenizer
    tokenizer = Tokenizer(dataset_file(config.VOCAB_FNAME))

    # build datasets
    gnmt_print(key=mlperf_log.PREPROC_TOKENIZE_TRAINING)
    gnmt_print(key=mlperf_log.TRAIN_HP_MAX_SEQ_LEN,
               value=args.max_length_train)

    train_data = LazyParallelDataset(
        src_fname=dataset_file(config.SRC_TRAIN_FNAME),
        tgt_fname=dataset_file(config.TGT_TRAIN_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        sort=False,
        max_size=args.max_size)

    gnmt_print(key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES,
               value=len(train_data))

    val_data = ParallelDataset(
        src_fname=dataset_file(config.SRC_VAL_FNAME),
        tgt_fname=dataset_file(config.TGT_VAL_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_val,
        max_len=args.max_length_val,
        sort=True)

    gnmt_print(key=mlperf_log.PREPROC_TOKENIZE_EVAL)

    test_data = TextDataset(
        src_fname=dataset_file(config.SRC_TEST_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_test,
        max_len=args.max_length_test,
        sort=False)

    gnmt_print(key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES,
               value=len(test_data))

    vocab_size = tokenizer.vocab_size
    # size of the vocabulary has been padded to a multiple of 8
    gnmt_print(key=mlperf_log.PREPROC_VOCAB_SIZE, value=vocab_size)

    # build GNMT model; extra settings come from the --model-config literal
    model_config = dict(vocab_size=vocab_size, math=args.math,
                        **literal_eval(args.model_config))
    model = GNMT(**model_config)
    logging.info(model)

    batch_first = model.batch_first

    # define loss function (criterion) and optimizer
    criterion = build_criterion(vocab_size, config.PAD, args.smoothing)

    opt_config = literal_eval(args.optimization_config)
    scheduler_config = literal_eval(args.scheduler_config)
    logging.info(f'Training optimizer: {opt_config}')
    logging.info(f'Training LR Schedule: {scheduler_config}')

    num_parameters = sum(param.nelement() for param in model.parameters())
    logging.info(f'Number of parameters: {num_parameters}')

    # get data loaders
    train_loader = train_data.get_loader(
        batch_size=args.batch_size,
        seeds=shuffling_seeds,
        batch_first=batch_first,
        shuffle=True,
        bucketing=args.bucketing,
        num_workers=args.train_loader_workers)

    gnmt_print(key=mlperf_log.INPUT_BATCH_SIZE,
               value=args.batch_size * get_world_size())
    gnmt_print(key=mlperf_log.INPUT_SIZE,
               value=train_loader.sampler.num_samples)

    val_loader = val_data.get_loader(
        batch_size=args.val_batch_size,
        batch_first=batch_first,
        shuffle=False,
        num_workers=args.val_loader_workers)

    test_loader = test_data.get_loader(
        batch_size=args.test_batch_size,
        batch_first=batch_first,
        shuffle=False,
        pad=True,
        num_workers=args.test_loader_workers)

    gnmt_print(key=mlperf_log.EVAL_SIZE, value=len(test_loader.dataset))

    translator = Translator(model=model,
                            tokenizer=tokenizer,
                            loader=test_loader,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_length_test,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda,
                            print_freq=args.print_freq,
                            dataset_dir=args.dataset_dir,
                            target_bleu=args.target_bleu,
                            save_path=args.save_path)

    # create trainer
    trainer = trainers.Seq2SeqTrainer(
        criterion=criterion,
        grad_clip=args.grad_clip,
        save_path=save_path,
        save_freq=args.save_freq,
        save_info={'config': args, 'tokenizer': tokenizer.get_state()},
        opt_config=opt_config,
        scheduler_config=scheduler_config,
        batch_first=batch_first,
        keep_checkpoints=args.keep_checkpoints,
        math=args.math,
        print_freq=args.print_freq,
        cuda=args.cuda,
        distributed=distributed,
        distributed_overlap_allreduce=args.enable_apex_allreduce_overlap,
        distributed_overlap_allreduce_messagesize=args.apex_message_size,
        intra_epoch_eval=args.intra_epoch_eval,
        translator=translator,
        arch=args.arch,
        model=model)

    # optionally resume from a checkpoint; a directory means "best model"
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(checkpoint_file, 'model_best.pth')
        if not os.path.isfile(checkpoint_file):
            logging.error(f'No checkpoint found at {args.resume}')
        else:
            trainer.load(checkpoint_file)

    # training loop (this variant trains for exactly one epoch)
    gnmt_print(key=mlperf_log.TRAIN_LOOP)

    for epoch in range(1):
        logging.info(f'Starting epoch {epoch}')
        gnmt_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)

        if distributed:
            train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss, train_perf = trainer.optimize(train_loader)

        logging.info(f'Finished epoch {epoch}')

    # Save the checkpoint at the end of the training loop, after the RUN_STOP
    # tag
    # https://github.com/mlperf/policies/issues/55#issuecomment-428335773
    if not args.disable_eval:
        gnmt_print(key=mlperf_log.TRAIN_CHECKPOINT)
        if get_rank() == 0:
            trainer.save(save_all=args.save_all, is_best=True)

    gnmt_print(key=mlperf_log.RUN_FINAL)
def main():
    """
    Train a GNMT model with MLPerf logging and per-epoch BLEU scoring.

    Parses the command line, initializes the distributed backend for
    multi-process runs, builds the tokenizer, datasets, model, trainer,
    translator and data loaders, optionally restores a checkpoint and runs
    the epoch loop. On rank 0 (unless evaluation is disabled) every epoch is
    validated, checkpointed, translated on the test set, detokenized with
    the Moses script and scored with ``sacrebleu``. Reaching
    ``args.target_bleu`` sets a stop flag that is broadcast to all ranks and
    reported as the ``RUN_STOP`` success value.
    """
    mlperf_log.ROOT_DIR_GNMT = os.path.dirname(os.path.abspath(__file__))
    mlperf_log.LOGGER.propagate = False
    mlperf_log.gnmt_print(key=mlperf_log.RUN_START)

    args = parse_args()
    print(args)

    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    mlperf_log.gnmt_print(key=mlperf_log.RUN_SET_RANDOM_SEED)
    # `is not None` so that an explicit seed of 0 is honoured as well
    if args.seed is not None:
        torch.manual_seed(args.seed + args.rank)

    # initialize distributed backend
    distributed = args.world_size > 1
    if distributed:
        backend = 'nccl' if args.cuda else 'gloo'
        dist.init_process_group(backend=backend,
                                rank=args.rank,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # create directory for results
    save_path = os.path.join(args.results_dir, args.save)
    os.makedirs(save_path, exist_ok=True)

    # setup logging, one log file per rank
    log_filename = f'log_gpu_{args.rank}.log'
    setup_logging(os.path.join(save_path, log_filename))
    logging.info(f'Saving results to: {save_path}')
    logging.info(f'Run arguments: {args}')

    if args.cuda:
        torch.cuda.set_device(args.rank)

    # build tokenizer
    tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME))

    # build datasets
    mlperf_log.gnmt_print(key=mlperf_log.PREPROC_TOKENIZE_TRAINING)
    mlperf_log.gnmt_print(key=mlperf_log.TRAIN_HP_MAX_SEQ_LEN,
                          value=args.max_length_train)
    train_data = ParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        sort=False,
        max_size=args.max_size)

    mlperf_log.gnmt_print(key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES,
                          value=len(train_data))

    val_data = ParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_VAL_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_VAL_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_val,
        max_len=args.max_length_val,
        sort=True)

    mlperf_log.gnmt_print(key=mlperf_log.PREPROC_TOKENIZE_EVAL)

    # the test set reuses the validation length limits
    test_data = ParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TEST_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TEST_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_val,
        max_len=args.max_length_val,
        sort=False)

    mlperf_log.gnmt_print(key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES,
                          value=len(test_data))

    vocab_size = tokenizer.vocab_size
    mlperf_log.gnmt_print(key=mlperf_log.PREPROC_VOCAB_SIZE,
                          value=vocab_size)

    # build GNMT model
    model_config = dict(vocab_size=vocab_size, math=args.math,
                        **literal_eval(args.model_config))
    model = models.GNMT(**model_config)
    logging.info(model)

    batch_first = model.batch_first

    # define loss function (criterion) and optimizer
    criterion = build_criterion(vocab_size, config.PAD, args.smoothing)
    opt_config = literal_eval(args.optimization_config)
    logging.info(f'Training optimizer: {opt_config}')

    # create trainer
    trainer_options = dict(
        criterion=criterion,
        grad_clip=args.grad_clip,
        save_path=save_path,
        save_freq=args.save_freq,
        save_info={'config': args, 'tokenizer': tokenizer},
        opt_config=opt_config,
        batch_first=batch_first,
        keep_checkpoints=args.keep_checkpoints,
        math=args.math,
        print_freq=args.print_freq,
        cuda=args.cuda,
        distributed=distributed)

    trainer_options['model'] = model
    trainer = trainers.Seq2SeqTrainer(**trainer_options)

    translator = Translator(model,
                            tokenizer,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_length_val,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda)

    num_parameters = sum(p.nelement() for p in model.parameters())
    logging.info(f'Number of parameters: {num_parameters}')

    # optionally resume from a checkpoint; a directory means "best model"
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(checkpoint_file, 'model_best.pth')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            logging.error(f'No checkpoint found at {args.resume}')

    # get data loaders
    train_loader = train_data.get_loader(batch_size=args.batch_size,
                                         batch_first=batch_first,
                                         shuffle=True,
                                         bucket=args.bucketing,
                                         num_workers=args.workers,
                                         drop_last=True,
                                         distributed=distributed)

    mlperf_log.gnmt_print(key=mlperf_log.INPUT_BATCH_SIZE,
                          value=args.batch_size * args.world_size)
    mlperf_log.gnmt_print(key=mlperf_log.INPUT_SIZE,
                          value=train_loader.sampler.num_samples)

    val_loader = val_data.get_loader(batch_size=args.eval_batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     num_workers=args.workers,
                                     drop_last=False,
                                     distributed=False)

    test_loader = test_data.get_loader(batch_size=args.eval_batch_size,
                                       batch_first=batch_first,
                                       shuffle=False,
                                       num_workers=0,
                                       drop_last=False,
                                       distributed=False)

    mlperf_log.gnmt_print(key=mlperf_log.EVAL_SIZE,
                          value=len(test_loader.sampler))

    # One-element stop flag, kept as a tensor so it can be broadcast across
    # ranks. It is created before the loop so the RUN_STOP report below
    # still works when the epoch range is empty. The loop exits as soon as
    # the flag is set, so it is zero at the start of every epoch.
    if args.cuda:
        break_training = torch.cuda.LongTensor([0])
    else:
        break_training = torch.LongTensor([0])

    # training loop
    best_loss = float('inf')
    mlperf_log.gnmt_print(key=mlperf_log.TRAIN_LOOP)

    for epoch in range(args.start_epoch, args.epochs):
        mlperf_log.gnmt_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        logging.info(f'Starting epoch {epoch}')

        if distributed:
            train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss = trainer.optimize(train_loader)

        # evaluate on validation set
        if args.rank == 0 and not args.disable_eval:
            logging.info('Running validation on dev set')
            val_loss = trainer.evaluate(val_loader)

            # remember the best validation loss and save checkpoint
            is_best = val_loss < best_loss
            best_loss = min(val_loss, best_loss)

            mlperf_log.gnmt_print(key=mlperf_log.TRAIN_CHECKPOINT)
            trainer.save(save_all=args.save_all, is_best=is_best)

            logging.info(f'Epoch: {epoch}\t'
                         f'Training Loss {train_loss:.4f}\t'
                         f'Validation Loss {val_loss:.4f}')
        else:
            logging.info(f'Epoch: {epoch}\t'
                         f'Training Loss {train_loss:.4f}')

        if args.rank == 0 and not args.disable_eval:
            logging.info('Running evaluation on test set')
            mlperf_log.gnmt_print(key=mlperf_log.EVAL_START, value=epoch)

            model.eval()
            torch.cuda.empty_cache()

            eval_path = os.path.join(save_path, f'eval_epoch_{epoch}')
            beam_size = args.beam_size

            # context manager closes the file even if decoding raises
            with open(eval_path, 'w') as eval_file:
                for src, _, indices in test_loader:
                    src, src_length = src

                    if translator.batch_first:
                        batch_size = src.size(0)
                    else:
                        batch_size = src.size(1)

                    # one start token per beam of every sentence
                    bos = [translator.insert_target_start] * (
                        batch_size * beam_size)
                    bos = torch.LongTensor(bos)
                    if translator.batch_first:
                        bos = bos.view(-1, 1)
                    else:
                        bos = bos.view(1, -1)

                    src_length = torch.LongTensor(src_length)

                    if args.cuda:
                        src = src.cuda()
                        src_length = src_length.cuda()
                        bos = bos.cuda()

                    with torch.no_grad():
                        context = translator.model.encode(src, src_length)
                        context = [context, src_length, None]

                        if beam_size == 1:
                            generator = translator.generator.greedy_search
                        else:
                            generator = translator.generator.beam_search
                        preds, lengths, counter = generator(batch_size, bos,
                                                            context)

                    preds = preds.cpu()
                    lengths = lengths.cpu()

                    output = []
                    for idx, pred in enumerate(preds):
                        # slice off the first token and the tail given by
                        # the reported length
                        end = lengths[idx] - 1
                        pred = pred[1:end].tolist()
                        output.append(translator.tok.detokenize(pred))

                    # reorder the batch by the positions given in `indices`
                    output = [output[indices.index(pos)]
                              for pos in range(len(output))]
                    for line in output:
                        eval_file.write(line)
                        eval_file.write('\n')

            # run moses detokenizer
            detok_path = os.path.join(args.dataset_dir, config.DETOKENIZER)
            detok_eval_path = eval_path + '.detok'

            with open(detok_eval_path, 'w') as detok_eval_file, \
                    open(eval_path, 'r') as eval_file:
                subprocess.run(['perl', f'{detok_path}'],
                               stdin=eval_file,
                               stdout=detok_eval_file,
                               stderr=subprocess.DEVNULL)

            # run sacrebleu; an argument list (no shell) keeps paths with
            # spaces or shell metacharacters intact
            reference_path = os.path.join(args.dataset_dir,
                                          config.TGT_TEST_TARGET_FNAME)
            sacrebleu = subprocess.run(
                ['sacrebleu', '--input', detok_eval_path, reference_path,
                 '--score-only', '-lc', '--tokenize', 'intl'],
                stdout=subprocess.PIPE)
            bleu = float(sacrebleu.stdout.strip())
            logging.info('Finished evaluation on test set')
            logging.info(f'BLEU on test dataset: {bleu}')

            if args.target_bleu:
                if bleu >= args.target_bleu:
                    logging.info('Target accuracy reached')
                    break_training[0] = 1

            torch.cuda.empty_cache()
            mlperf_log.gnmt_print(key=mlperf_log.EVAL_ACCURACY,
                                  value={"epoch": epoch, "value": bleu})
            mlperf_log.gnmt_print(key=mlperf_log.EVAL_TARGET,
                                  value=args.target_bleu)
            mlperf_log.gnmt_print(key=mlperf_log.EVAL_STOP)

        # propagate rank 0's stop decision to every worker
        if distributed:
            dist.broadcast(break_training, 0)

        logging.info(f'Finished epoch {epoch}')
        if break_training:
            break

    mlperf_log.gnmt_print(key=mlperf_log.RUN_STOP,
                          value={"success": bool(break_training)})
    mlperf_log.gnmt_print(key=mlperf_log.RUN_FINAL)
def main():
    """
    Translates the test set with a trained GNMT checkpoint and prints timing.

    The checkpoint at ``args.model`` is loaded, the model is rebuilt from the
    stored configuration and every test batch is decoded with greedy search
    (``beam_size == 1``) or beam search. Per-batch and summary throughput
    statistics are printed. When ``args.mode == 'accuracy'`` the translations
    are additionally written to ``args.output``, detokenized with the Moses
    perl script and scored with sacrebleu.

    Raises:
        RuntimeError: if fp16 math is requested without CUDA.
        ValueError: if ``args.math`` is neither ``'fp32'`` nor ``'fp16'``.
    """
    args = parse_args()
    print(args)

    if args.cuda:
        torch.cuda.set_device(0)
    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if args.math == 'fp16' and not args.cuda:
        raise RuntimeError('fp16 requires cuda')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # tensors stored on cuda:0 are remapped to the CPU while loading
    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    vocab_size = checkpoint['tokenizer'].vocab_size
    model_config = dict(vocab_size=vocab_size,
                        math=checkpoint['config'].math,
                        **literal_eval(checkpoint['config'].model_config))
    model_config['batch_first'] = args.batch_first
    model = models.GNMT(**model_config)

    state_dict = checkpoint['state_dict']
    if checkpoint_from_distributed(state_dict):
        state_dict = unwrap_distributed(state_dict)
    model.load_state_dict(state_dict)

    # fail with a clear message instead of an unbound local further down
    if args.math == 'fp32':
        dtype = torch.FloatTensor
    elif args.math == 'fp16':
        dtype = torch.HalfTensor
    else:
        raise ValueError(f'unsupported math mode: {args.math!r}')

    model.type(dtype)
    if args.cuda:
        model = model.cuda()
    model.eval()

    tokenizer = checkpoint['tokenizer']

    test_data = ParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TEST_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TEST_FNAME),
        tokenizer=tokenizer,
        min_len=0,
        max_len=150,
        sort=False)

    test_loader = test_data.get_loader(batch_size=args.batch_size,
                                       batch_first=True,
                                       shuffle=False,
                                       num_workers=0,
                                       drop_last=False,
                                       distributed=False)

    translator = Translator(model,
                            tokenizer,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_seq_len,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda)

    model.eval()
    torch.cuda.empty_cache()

    # only write the output to file in accuracy mode
    accuracy_mode = args.mode == 'accuracy'
    test_file = None
    if accuracy_mode:
        test_file = open(args.output, 'w', encoding='UTF-8')

    batch_time = AverageMeter(False)
    tot_tok_per_sec = AverageMeter(False)
    iterations = AverageMeter(False)
    enc_seq_len = AverageMeter(False)
    dec_seq_len = AverageMeter(False)
    stats = {}

    try:
        for i, (src, tgt, indices) in enumerate(test_loader):
            translate_timer = time.time()
            src, src_length = src

            if translator.batch_first:
                batch_size = src.size(0)
            else:
                batch_size = src.size(1)
            beam_size = args.beam_size

            # one start-of-sequence token per beam of every sentence
            bos = [translator.insert_target_start] * (batch_size * beam_size)
            bos = torch.LongTensor(bos)
            if translator.batch_first:
                bos = bos.view(-1, 1)
            else:
                bos = bos.view(1, -1)

            src_length = torch.LongTensor(src_length)
            stats['total_enc_len'] = int(src_length.sum())

            if args.cuda:
                src = src.cuda()
                src_length = src_length.cuda()
                bos = bos.cuda()

            with torch.no_grad():
                context = translator.model.encode(src, src_length)
                context = [context, src_length, None]

                if beam_size == 1:
                    generator = translator.generator.greedy_search
                else:
                    generator = translator.generator.beam_search
                preds, lengths, counter = generator(batch_size, bos, context)

            stats['total_dec_len'] = lengths.sum().item()
            stats['iters'] = counter

            preds = preds.cpu()
            lengths = lengths.cpu()

            output = []
            for idx, pred in enumerate(preds):
                # drop the leading start token and the trailing end token
                end = lengths[idx] - 1
                pred = pred[1:end]
                pred = pred.tolist()
                out = translator.tok.detokenize(pred)
                output.append(out)

            # only write the output to file in accuracy mode
            if accuracy_mode:
                # restore the original sentence order within the batch
                output = [output[indices.index(k)]
                          for k in range(len(output))]
                for line in output:
                    test_file.write(line)
                    test_file.write('\n')

            # Get timing
            elapsed = time.time() - translate_timer
            batch_time.update(elapsed, batch_size)

            total_tokens = stats['total_dec_len'] + stats['total_enc_len']
            ttps = total_tokens / elapsed
            tot_tok_per_sec.update(ttps, batch_size)

            iterations.update(stats['iters'])
            enc_seq_len.update(stats['total_enc_len'] / batch_size,
                               batch_size)
            dec_seq_len.update(stats['total_dec_len'] / batch_size,
                               batch_size)

            if i % 5 == 0:
                log = [
                    'TEST ',
                    'Time {:.3f} ({:.3f})\t'.format(batch_time.val,
                                                    batch_time.avg),
                    'Decoder iters {:.1f} ({:.1f})\t'.format(
                        iterations.val, iterations.avg),
                    'Tok/s {:.0f} ({:.0f})'.format(tot_tok_per_sec.val,
                                                   tot_tok_per_sec.avg),
                ]
                print(''.join(log))
    finally:
        # the output file is closed even when decoding fails part-way
        if test_file is not None:
            test_file.close()

    # summary timing (batch_size is the size of the last processed batch)
    time_per_sentence = (batch_time.avg / batch_size)
    log = [
        'TEST SUMMARY:\n',
        'Lines translated: {}\t'.format(len(test_loader.dataset)),
        'Avg total tokens/s: {:.0f}\n'.format(tot_tok_per_sec.avg),
        'Avg time per batch: {:.3f} s\t'.format(batch_time.avg),
        'Avg time per sentence: {:.3f} ms\n'.format(1000 * time_per_sentence),
        'Avg encoder seq len: {:.2f}\t'.format(enc_seq_len.avg),
        'Avg decoder seq len: {:.2f}\t'.format(dec_seq_len.avg),
        'Total decoder iterations: {}'.format(int(iterations.sum)),
    ]
    print(''.join(log))

    # BLEU is only computed in accuracy mode
    if accuracy_mode:
        test_path = args.output

        # run moses detokenizer
        detok_path = os.path.join(args.dataset_dir, config.DETOKENIZER)
        detok_test_path = test_path + '.detok'

        with open(detok_test_path, 'w') as detok_test_file, \
                open(test_path, 'r') as tok_file:
            subprocess.run(['perl', detok_path],
                           stdin=tok_file,
                           stdout=detok_test_file,
                           stderr=subprocess.DEVNULL)

        # run sacrebleu; an argv list (no shell) keeps paths containing
        # spaces or shell metacharacters from being re-interpreted
        reference_path = os.path.join(args.dataset_dir,
                                      config.TGT_TEST_TARGET_FNAME)
        sacrebleu = subprocess.run(['sacrebleu', '--input', detok_test_path,
                                    reference_path, '--score-only', '-lc',
                                    '--tokenize', 'intl'],
                                   stdout=subprocess.PIPE)
        bleu = float(sacrebleu.stdout.strip())

        print('BLEU on test dataset: {}'.format(bleu))

        print('Finished evaluation on test set')
def main():
    """
    Trains GNMT for ``args.repeat`` epochs and reports timing through ``exp``.

    Sets up the optional distributed process group, the results directory,
    per-rank logging, tokenizer, training dataset, model, criterion, trainer
    and data loader, then runs the training loop. Each epoch is timed with
    ``chrono.time('train')`` and its loss is handed to ``exp``.
    """
    mlperf_log.ROOT_DIR_GNMT = os.path.dirname(os.path.abspath(__file__))
    mlperf_log.LOGGER.propagate = False
    mlperf_log.gnmt_print(key=mlperf_log.RUN_START)

    args = exp.get_arguments(parse_args(), show=True)
    # return value is not needed here; the call is kept in case it has
    # side effects -- NOTE(review): confirm and drop if it is a pure getter
    exp.get_device()
    chrono = exp.chrono()

    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # initialize distributed backend
    distributed = args.world_size > 1
    if distributed:
        backend = 'nccl' if args.cuda else 'gloo'
        dist.init_process_group(backend=backend,
                                rank=args.rank,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # results directory: --save wins over $OUTPUT_DIRECTORY, which wins
    # over the /tmp fallback
    save_path = os.environ.get('OUTPUT_DIRECTORY')
    if save_path is None:
        save_path = '/tmp'
    if args.save is not None:
        save_path = os.path.join(args.results_dir, args.save)
    # make sure the directory exists before the log file is opened in it,
    # whichever of the three sources it came from
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    # setup logging
    log_filename = f'log_gpu_{args.rank}.log'
    setup_logging(os.path.join(save_path, log_filename))

    if args.cuda:
        torch.cuda.set_device(args.rank)

    # build tokenizer
    tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME))

    train_data = ParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        sort=False,
        max_size=args.max_size)

    mlperf_log.gnmt_print(key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES,
                          value=len(train_data))

    vocab_size = tokenizer.vocab_size
    mlperf_log.gnmt_print(key=mlperf_log.PREPROC_VOCAB_SIZE,
                          value=vocab_size)

    # build GNMT model
    model_config = dict(vocab_size=vocab_size,
                        math=args.math,
                        **literal_eval(args.model_config))
    model = models.GNMT(**model_config)
    logging.info(model)

    batch_first = model.batch_first

    # define loss function (criterion) and optimizer
    criterion = build_criterion(vocab_size, config.PAD, args.smoothing)
    opt_config = literal_eval(args.optimization_config)

    # create trainer
    trainer_options = dict(criterion=criterion,
                           grad_clip=args.grad_clip,
                           save_path=save_path,
                           save_freq=args.save_freq,
                           save_info={
                               'config': args,
                               'tokenizer': tokenizer
                           },
                           opt_config=opt_config,
                           batch_first=batch_first,
                           keep_checkpoints=args.keep_checkpoints,
                           math=args.math,
                           print_freq=args.print_freq,
                           cuda=args.cuda,
                           distributed=distributed)

    trainer_options['model'] = model
    trainer = trainers.Seq2SeqTrainer(**trainer_options, number=args.number)

    # NOTE(review): the translator is constructed but never used by this
    # loop; kept because the constructor's side effects are not visible here
    translator = Translator(model,
                            tokenizer,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_length_val,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda)

    # get data loaders
    train_loader = train_data.get_loader(batch_size=args.batch_size,
                                         batch_first=batch_first,
                                         shuffle=True,
                                         bucket=args.bucketing,
                                         num_workers=args.workers,
                                         drop_last=True,
                                         distributed=distributed)

    mlperf_log.gnmt_print(key=mlperf_log.INPUT_BATCH_SIZE,
                          value=args.batch_size * args.world_size)
    mlperf_log.gnmt_print(key=mlperf_log.INPUT_SIZE,
                          value=train_loader.sampler.num_samples)

    # training loop
    mlperf_log.gnmt_print(key=mlperf_log.TRAIN_LOOP)

    for epoch in range(0, args.repeat):

        with chrono.time('train') as t:
            if distributed:
                train_loader.sampler.set_epoch(epoch)

            trainer.epoch = epoch
            train_loss = trainer.optimize(train_loader)
            exp.log_epoch_loss(train_loss)

        exp.show_eta(epoch, t)

    exp.report()
def main():
    """
    Entry point for pipeline-parallel GNMT training on top of StageRuntime.

    Steps, in order:
      1. parse arguments and select the local CUDA device;
      2. build the tokenizer, the criterion and the per-stage model list;
      3. push synthetic all-ones inputs through every stage except the last
         (the loss) to record the shape and dtype of each named tensor;
      4. read the optional JSON configuration maps and create the
         ``runtime.StageRuntime``;
      5. optionally restore runtime and optimizer state from a checkpoint;
      6. build the datasets and loaders and run the epoch loop, saving a
         checkpoint after each trained epoch when configured.

    Uses and updates the module-level globals ``args`` and ``best_prec1``.
    """
    global args, best_prec1
    args = parser.parse_args()

    # Special case handling for GNMT model
    l2_promote()

    torch.cuda.set_device(args.local_rank)

    # build tokenizer
    tokenizer = Tokenizer(os.path.join(args.data_dir, config.VOCAB_FNAME))

    # define loss function
    criterion = build_gnmt_criterion(vocab_size=tokenizer.vocab_size,
                                     padding_idx=config.PAD,
                                     smoothing=0.1)

    # create stages of the model
    module = importlib.import_module(args.module)
    args.arch = module.arch()
    model = module.model(criterion)

    input_size = [args.max_length_train, args.batch_size]
    training_tensor_shapes = {
        "input0": input_size,
        "input1": [args.batch_size],
        "input2": input_size,
        "target": [args.max_length_train * args.batch_size],
        "target_length": [args.batch_size]
    }
    dtypes = {
        "input0": torch.int64,
        "input1": torch.int64,
        "input2": torch.int64,
        "target": torch.int64,
        "target_length": torch.int32
    }
    inputs_module_destinations = {"input0": 0, "input1": 0, "input2": 0}
    target_tensor_names = {"target", "target_length"}

    # Skip last layer (loss).
    for module_id, (stage, inputs, outputs) in enumerate(model[:-1]):
        input_tensors = []
        for module_input in inputs:
            # the destination recorded for a raw input ends up being the
            # last stage that consumes it
            if module_input in inputs_module_destinations:
                inputs_module_destinations[module_input] = module_id

            input_tensor = torch.ones(
                tuple(training_tensor_shapes[module_input]),
                dtype=dtypes[module_input]).cuda()
            input_tensors.append(input_tensor)
        stage.cuda()
        # PyTorch should not maintain metadata for a backward pass on
        # synthetic inputs. Without the following line, the runtime is
        # as much as 1.5x slower in a full DP configuration.
        with torch.no_grad():
            output_tensors = stage(*tuple(input_tensors))
        # exact type check kept on purpose: only a plain tuple is treated
        # as "several outputs"
        if type(output_tensors) is not tuple:
            output_tensors = [output_tensors]
        for output, output_tensor in zip(outputs, list(output_tensors)):
            training_tensor_shapes[output] = list(output_tensor.size())
            dtypes[output] = output_tensor.dtype

    # freeze every recorded shape into a tuple; eval shapes mirror training
    eval_tensor_shapes = {}
    for key in training_tensor_shapes:
        shape = tuple(training_tensor_shapes[key])
        eval_tensor_shapes[key] = shape
        training_tensor_shapes[key] = shape

    configuration_maps = {
        'module_to_stage_map': None,
        'stage_to_rank_map': None,
        'stage_to_depth_map': None
    }
    if args.config_path is not None:
        # context manager so the config file handle is not leaked
        with open(args.config_path, 'r') as config_file:
            json_config_file = json.load(config_file)
        configuration_maps['module_to_stage_map'] = json_config_file.get(
            "module_to_stage_map", None)
        stage_to_rank_map = json_config_file.get("stage_to_rank_map", None)
        # JSON object keys are strings; stages are looked up by int below.
        # A config without this key keeps the map as None instead of
        # failing on None.items().
        if stage_to_rank_map is not None:
            stage_to_rank_map = {
                int(k): v for (k, v) in stage_to_rank_map.items()
            }
        configuration_maps['stage_to_rank_map'] = stage_to_rank_map
        configuration_maps['stage_to_depth_map'] = json_config_file.get(
            "stage_to_depth_map", None)

    r = runtime.StageRuntime(
        model=model,
        distributed_backend=args.distributed_backend,
        fp16=args.fp16,
        loss_scale=args.loss_scale,
        training_tensor_shapes=training_tensor_shapes,
        eval_tensor_shapes=eval_tensor_shapes,
        training_tensor_dtypes=dtypes,
        inputs_module_destinations=inputs_module_destinations,
        target_tensor_names=target_tensor_names,
        configuration_maps=configuration_maps,
        master_addr=args.master_addr,
        rank=args.rank,
        local_rank=args.local_rank,
        num_ranks_in_server=args.num_ranks_in_server,
        verbose_freq=args.verbose_frequency,
        model_type=runtime.TRANSLATION,
        enable_recompute=args.recompute)

    # stage needed to determine if current stage is the first stage
    # num_stages needed to determine if current stage is the last stage
    # num_ranks needed to determine number of warmup_minibatches in case of
    # pipelining
    args.stage = r.stage
    args.num_stages = r.num_stages
    args.num_ranks = r.num_ranks
    if not is_first_stage():
        args.synthetic_data = True

    # define optimizer
    if args.no_input_pipelining:
        num_versions = 1
    else:
        # number of versions is the total number of machines following the
        # current stage, shared amongst all replicas in this stage
        num_versions = r.num_warmup_minibatches + 1

    # if specified, resume from checkpoint
    if args.resume:
        checkpoint_file_path = "%s.%d.pth.tar" % (args.resume, r.stage)
        assert os.path.isfile(checkpoint_file_path)
        print("=> loading checkpoint '{}'".format(checkpoint_file_path))
        checkpoint = torch.load(checkpoint_file_path)
        args.start_epoch = checkpoint['epoch']
        best_prec1 = checkpoint['best_prec1']
        r.load_state_dict(checkpoint['state_dict'])
        print("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file_path, checkpoint['epoch']))

    # TODO: make this configurable by args
    use_adam_optimizer = True
    if use_adam_optimizer:
        optimizer = adam.AdamWithWeightStashing(
            modules=r.modules(),
            master_parameters=r.master_parameters,
            model_parameters=r.model_parameters,
            loss_scale=args.loss_scale,
            num_versions=num_versions,
            lr=args.lr,
            betas=(0.9, 0.999),
            weight_decay=args.weight_decay,
            verbose_freq=args.verbose_frequency,
            macrobatch=args.macrobatch)
    else:
        optimizer = sgd.SGDWithWeightStashing(
            modules=r.modules(),
            master_parameters=r.master_parameters,
            model_parameters=r.model_parameters,
            loss_scale=args.loss_scale,
            num_versions=num_versions,
            lr=args.lr,
            momentum=args.momentum,
            weight_decay=args.weight_decay,
            verbose_freq=args.verbose_frequency)

    if args.resume:
        optimizer.load_state_dict(checkpoint['optimizer'])

    cudnn.benchmark = True

    train_dataset = LazyParallelDataset(
        src_fname=os.path.join(args.data_dir, config.SRC_TRAIN_FNAME),
        tgt_fname=os.path.join(args.data_dir, config.TGT_TRAIN_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        sort=False,
        max_size=None)

    # the validation set is filtered with the training length limits
    val_dataset = ParallelDataset(
        src_fname=os.path.join(args.data_dir, config.SRC_VAL_FNAME),
        tgt_fname=os.path.join(args.data_dir, config.TGT_VAL_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        sort=True)

    # the sampler only needs per-epoch reseeding when the first stage is
    # replicated over more than one rank
    distributed_sampler = False
    if configuration_maps['stage_to_rank_map'] is not None:
        num_ranks_in_first_stage = len(
            configuration_maps['stage_to_rank_map'][0])
        if num_ranks_in_first_stage > 1:
            distributed_sampler = True

    # TODO: fix random seeds
    train_loader = train_dataset.get_loader(
        batch_size=args.batch_size,
        seeds=range(args.epochs),
        batch_first=False,
        shuffle=True,
        bucketing=not args.no_bucketing,
        num_workers=args.workers,
        world_size=r.num_ranks_in_first_stage,
        rank=r.rank_in_stage if r.stage == 0 else 0)

    val_loader = val_dataset.get_loader(
        batch_size=args.batch_size,
        batch_first=False,
        shuffle=True,
        num_workers=args.workers,
        world_size=r.num_ranks_in_first_stage,
        seeds=range(args.epochs),
        rank=r.rank_in_stage if r.stage == 0 else 0)

    # if checkpoint is loaded, start by running validation
    if args.resume:
        assert args.start_epoch > 0
        validate(val_loader, r, args.start_epoch - 1)

    for epoch in range(args.start_epoch, args.epochs):
        if distributed_sampler:
            train_loader.sampler.set_epoch(epoch)

        adjust_learning_rate(optimizer, epoch, args.epochs, r, args.lr_policy)

        # train or run forward pass only for one epoch
        if args.forward_only:
            validate(val_loader, r, epoch)
        else:
            train(train_loader, r, optimizer, epoch)

            # evaluate on validation set
            prec1 = validate(val_loader, r, epoch)
            # NOTE(review): if stages are numbered from 0 this comparison is
            # always true and prec1 is always reset to 0 -- confirm whether
            # `r.num_stages - 1` was intended. Behaviour left unchanged.
            if r.stage != r.num_stages:
                prec1 = 0

            # remember best prec@1 and save checkpoint
            best_prec1 = max(prec1, best_prec1)

            should_save_checkpoint = (args.checkpoint_dir_not_nfs
                                      or r.rank_in_stage == 0)
            if args.checkpoint_dir and should_save_checkpoint:
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': args.arch,
                        'state_dict': r.state_dict(),
                        'best_prec1': best_prec1,
                        'optimizer': optimizer.state_dict(),
                        'tokenizer': tokenizer.get_state()
                    }, args.checkpoint_dir, r.stage, epoch)
def main():
    """
    Launches data-parallel multi-gpu training.

    Prepares the device, distributed state, results directory, logging,
    seeds, tokenizer, datasets, model, loaders, translator and trainer, then
    runs the epoch loop. With ``args.eval`` set, every epoch is followed by
    validation, a checkpoint on rank 0 and a BLEU evaluation on the test set
    whose result may end training early.
    """
    mlperf_log.ROOT_DIR_GNMT = os.path.dirname(os.path.abspath(__file__))
    mlperf_log.LOGGER.propagate = False

    args = parse_args()
    device = utils.set_device(args.cuda, args.local_rank)
    distributed = utils.init_distributed(args.cuda)
    gnmt_print(key=mlperf_log.RUN_START, sync=True)
    args.rank = utils.get_rank()

    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # create directory for results
    args.save_path = os.path.join(args.results_dir, args.save)
    save_path = args.save_path
    os.makedirs(save_path, exist_ok=True)

    # setup logging (one log file per rank)
    rank_log = os.path.join(save_path, f'log_rank_{utils.get_rank()}.log')
    utils.setup_logging(rank_log)

    if args.env:
        utils.log_env_info()

    logging.info(f'Saving results to: {save_path}')
    logging.info(f'Run arguments: {args}')

    # automatically set train_iter_size based on train_global_batch_size,
    # world_size and per-worker train_batch_size
    if args.train_global_batch_size is not None:
        per_step_batch = args.train_batch_size * utils.get_world_size()
        assert args.train_global_batch_size % per_step_batch == 0
        args.train_iter_size = args.train_global_batch_size // per_step_batch
        logging.info(f'Global batch size was set in the config, '
                     f'Setting train_iter_size to {args.train_iter_size}')

    worker_seeds, shuffling_seeds = utils.setup_seeds(args.seed, args.epochs,
                                                      device)
    worker_seed = worker_seeds[args.rank]
    logging.info(f'Worker {args.rank} is using worker seed: {worker_seed}')
    torch.manual_seed(worker_seed)

    # build tokenizer
    pad_vocab = utils.pad_vocabulary(args.math)
    vocab_file = os.path.join(args.dataset_dir, config.VOCAB_FNAME)
    tokenizer = Tokenizer(vocab_file, pad_vocab)

    # build datasets
    gnmt_print(key=mlperf_log.PREPROC_TOKENIZE_TRAINING, sync=False)
    gnmt_print(key=mlperf_log.TRAIN_HP_MAX_SEQ_LEN,
               value=args.max_length_train,
               sync=False)

    train_data = LazyParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        sort=False,
        max_size=args.max_size)

    gnmt_print(key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES,
               value=len(train_data),
               sync=False)

    val_data = ParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_VAL_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_VAL_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_val,
        max_len=args.max_length_val,
        sort=True)

    gnmt_print(key=mlperf_log.PREPROC_TOKENIZE_EVAL, sync=False)

    test_data = TextDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TEST_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_test,
        max_len=args.max_length_test,
        sort=True)

    gnmt_print(key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES,
               value=len(test_data),
               sync=False)

    vocab_size = tokenizer.vocab_size
    gnmt_print(key=mlperf_log.PREPROC_VOCAB_SIZE,
               value=vocab_size,
               sync=False)

    # build GNMT model
    model_config = dict(hidden_size=args.hidden_size,
                        num_layers=args.num_layers,
                        dropout=args.dropout,
                        batch_first=False,
                        share_embedding=args.share_embedding)
    model = GNMT(vocab_size=vocab_size, **model_config)
    logging.info(model)

    batch_first = model.batch_first

    # define loss function (criterion) and optimizer
    criterion = build_criterion(vocab_size, config.PAD, args.smoothing)

    opt_config = dict(optimizer=args.optimizer, lr=args.lr)
    opt_config.update(literal_eval(args.optimizer_extra))
    logging.info(f'Training optimizer config: {opt_config}')

    scheduler_config = dict(warmup_steps=args.warmup_steps,
                            remain_steps=args.remain_steps,
                            decay_interval=args.decay_interval,
                            decay_steps=args.decay_steps,
                            decay_factor=args.decay_factor)
    logging.info(f'Training LR schedule config: {scheduler_config}')

    num_parameters = sum(param.nelement() for param in model.parameters())
    logging.info(f'Number of parameters: {num_parameters}')

    batching_opt = dict(shard_size=args.shard_size,
                        num_buckets=args.num_buckets)

    # get data loaders
    train_loader = train_data.get_loader(
        batch_size=args.train_batch_size,
        seeds=shuffling_seeds,
        batch_first=batch_first,
        shuffle=True,
        batching=args.batching,
        batching_opt=batching_opt,
        num_workers=args.train_loader_workers)

    gnmt_print(key=mlperf_log.INPUT_BATCH_SIZE,
               value=args.train_batch_size * utils.get_world_size(),
               sync=False)
    gnmt_print(key=mlperf_log.INPUT_SIZE,
               value=train_loader.sampler.num_samples,
               sync=False)

    val_loader = val_data.get_loader(batch_size=args.val_batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     num_workers=args.val_loader_workers)

    test_loader = test_data.get_loader(batch_size=args.test_batch_size,
                                       batch_first=batch_first,
                                       shuffle=False,
                                       pad=True,
                                       num_workers=args.test_loader_workers)

    gnmt_print(key=mlperf_log.EVAL_SIZE,
               value=len(test_loader.dataset),
               sync=False)

    translator = Translator(model=model,
                            tokenizer=tokenizer,
                            loader=test_loader,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_length_test,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda,
                            print_freq=args.print_freq,
                            dataset_dir=args.dataset_dir,
                            target_bleu=args.target_bleu,
                            save_path=args.save_path)

    # create trainer
    total_train_iters = len(train_loader) // args.train_iter_size * args.epochs
    trainer = trainers.Seq2SeqTrainer(
        criterion=criterion,
        grad_clip=args.grad_clip,
        iter_size=args.train_iter_size,
        save_path=save_path,
        save_freq=args.save_freq,
        save_info={
            'model_config': model_config,
            'config': args,
            'tokenizer': tokenizer.get_state()
        },
        opt_config=opt_config,
        scheduler_config=scheduler_config,
        train_iterations=total_train_iters,
        batch_first=batch_first,
        keep_checkpoints=args.keep_checkpoints,
        math=args.math,
        print_freq=args.print_freq,
        cuda=args.cuda,
        distributed=distributed,
        intra_epoch_eval=args.intra_epoch_eval,
        translator=translator,
        model=model)

    # optionally resume from a checkpoint; a directory means its best model
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(checkpoint_file, 'model_best.pth')
        if not os.path.isfile(checkpoint_file):
            logging.error(f'No checkpoint found at {args.resume}')
        else:
            trainer.load(checkpoint_file)

    # training loop
    best_loss = float('inf')
    break_training = False
    test_bleu = None
    gnmt_print(key=mlperf_log.TRAIN_LOOP, sync=True)
    for epoch in range(args.start_epoch, args.epochs):
        logging.info(f'Starting epoch {epoch}')
        gnmt_print(key=mlperf_log.TRAIN_EPOCH, value=epoch, sync=True)

        train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss, train_perf = trainer.optimize(train_loader)

        if args.eval:
            # evaluate on validation set
            logging.info('Running validation on dev set')
            val_loss, val_perf = trainer.evaluate(val_loader)

            # remember the best validation loss and save checkpoint
            gnmt_print(key=mlperf_log.TRAIN_CHECKPOINT, sync=False)
            if args.rank == 0:
                is_best = val_loss < best_loss
                best_loss = min(val_loss, best_loss)
                trainer.save(save_all=args.save_all, is_best=is_best)

            # BLEU on the test set; may request an early stop
            gnmt_print(key=mlperf_log.EVAL_START, value=epoch, sync=True)
            test_bleu, break_training = translator.run(calc_bleu=True,
                                                       epoch=epoch)
            accuracy = {"epoch": epoch, "value": round(test_bleu, 2)}
            gnmt_print(key=mlperf_log.EVAL_ACCURACY,
                       value=accuracy,
                       sync=False)
            gnmt_print(key=mlperf_log.EVAL_TARGET,
                       value=args.target_bleu,
                       sync=False)
            gnmt_print(key=mlperf_log.EVAL_STOP, sync=True)

        acc_log = [
            f'Summary: Epoch: {epoch}',
            f'Training Loss: {train_loss:.4f}',
        ]
        perf_log = [
            f'Performance: Epoch: {epoch}',
            f'Training: {train_perf:.0f} Tok/s',
        ]
        if args.eval:
            acc_log.append(f'Validation Loss: {val_loss:.4f}')
            acc_log.append(f'Test BLEU: {test_bleu:.2f}')
            perf_log.append(f'Validation: {val_perf:.0f} Tok/s')

        if args.rank == 0:
            logging.info('\t'.join(acc_log))
            logging.info('\t'.join(perf_log))

        logging.info(f'Finished epoch {epoch}')
        if break_training:
            break

    gnmt_print(key=mlperf_log.RUN_STOP,
               value={"success": bool(break_training)},
               sync=True)
    gnmt_print(key=mlperf_log.RUN_FINAL, sync=False)
def main():
    """
    Translates the test set with a GNMT checkpoint, optionally under fault
    injection and/or post-training quantization, and records BLEU scores.

    The function loads the checkpoint from ``args.model``, builds the model
    and test loader, optionally wraps the model with the ``FI`` fault
    injector (``args.faulty``) or converts it for distiller
    (``args.golden``), optionally prepares a quantizer (``args.quantize``),
    then decodes every batch, scores each sentence, writes the tokenized
    output to a ``.tok`` file and finally saves the record.

    NOTE(review): the block nesting below was reconstructed from a flattened
    source; the placement of the ``args.quantize`` section and of the
    statements following it should be confirmed against the original file.
    """
    execution_timer = time.time()
    tfiargs = tfiParser.getParser()
    args = tfiargs.parse_args()

    # Debugging aid (disabled): force synchronous CUDA kernel launches.
    # import os
    # os.environ['CUDA_LAUNCH_BLOCKING']='1'

    # seed every RNG used here when a seed is given
    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True

    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        cudnn.benchmark = True
        print("Use GPU: {} for training".format(args.gpu))

    # tensors stored on cuda:0 are remapped to the CPU while loading
    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    vocab_size = checkpoint['tokenizer'].vocab_size
    model_config = dict(vocab_size=vocab_size,
                        math=checkpoint['config'].math,
                        **literal_eval(checkpoint['config'].model_config))
    model_config['batch_first'] = args.batch_first
    model = models.GNMT(**model_config)

    state_dict = checkpoint['state_dict']
    if checkpoint_from_distributed(state_dict):
        state_dict = unwrap_distributed(state_dict)

    model.load_state_dict(state_dict)

    if args.gpu is not None:
        model = model.cuda()

    tokenizer = checkpoint['tokenizer']

    test_data = ParallelDataset(src_fname=os.path.join(args.data,
                                                       config.SRC_TEST_FNAME),
                                tgt_fname=os.path.join(args.data,
                                                       config.TGT_TEST_FNAME),
                                tokenizer=tokenizer,
                                min_len=0,
                                max_len=150,
                                sort=False)

    test_loader = test_data.get_loader(batch_size=args.batch_size,
                                       batch_first=True,
                                       shuffle=False,
                                       num_workers=0,
                                       drop_last=False,
                                       distributed=False)

    translator = Translator(model,
                            tokenizer,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_seq_len,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.gpu is not None)

    model.eval()
    # torch.cuda.empty_cache()

    # NOTE(review): `record` is only bound when a record prefix is given,
    # but it is passed to FI and used unconditionally further down
    # (addBleuScores, setBleuScoreAvg, saveRecord), and args.record_prefix
    # is also concatenated into file names -- the prefix is effectively
    # required.
    if args.record_prefix is not None:
        record = Record('GNMTv2',
                        batch_size=args.batch_size,
                        injection=args.injection,
                        fiLayer=args.layer,
                        fiFeatures=args.fiFeats,
                        fiWeights=args.fiWeights)
    # Faulty Run
    if args.faulty:
        fi = FI(model,
                record=record,
                fiMode=args.injection,
                fiLayer=args.layer,
                fiBit=args.bit,
                fiFeatures=args.fiFeats,
                fiWeights=args.fiWeights,
                log=args.log)

        # time how long the injector takes to traverse the model
        traverse_time = AverageMeter()
        start = time.time()
        fi.traverseModel(model)
        traverse_time.update(time.time() - start)

        displayConfig(args)
        fi.injectionMode = True
        print("\n Number of new layers: #%d \n" % fi.numNewLayers)

    elif args.golden:
        import distiller.modules as dist
        model = dist.convert_model_to_distiller_lstm(model)

    if args.quantize:
        # NOTE(review): this YAML literal appears to have lost its line
        # breaks and indentation (two mapping keys, each with a nested
        # `clip_acts: NONE`); as written it is a single line -- restore the
        # multi-line layout before relying on the overrides.
        overrides_yaml = """ .*att_rnn.attn.*: clip_acts: NONE # Quantize without clipping decoder.classifier.classifier: clip_acts: NONE # Quantize without clipping """
        from distiller.utils import yaml_ordered_load
        overrides = yaml_ordered_load(
            overrides_yaml)
        # Basic quantizer definition
        # NOTE(review): `stats_file` is never read below; the quantizer gets
        # args.quant_stats_file instead.
        stats_file = '/home/bfgoldstein/torchfi/examples/wmt16/model_stats.yaml'
        quantizer = tfi.FIPostTraLinearQuantizer(
            model,
            mode=args.quant_mode,
            bits_activations=args.quant_bacts,
            bits_parameters=args.quant_bwts,
            bits_accum=args.quant_baccum,
            per_channel_wts=args.quant_channel,
            clip_acts=args.quant_cacts,
            model_activation_stats=args.quant_stats_file,
            overrides=overrides,
            clip_n_stds=args.quant_cnstds,
            scale_approx_mult_bits=args.quant_scalebits)
        quantizer.prepare_model()
        # model = quantizer.model
        if args.faulty:
            fi.setQuantParams(args)

    print(model._modules.items())

    # Setting model to evaluation mode and cuda (if enabled) after FI traverse
    model.eval()
    if args.gpu is not None:
        model = model.cuda()

    # tokenized translations are written here; the handle is closed after
    # the summary is printed (it is not closed if decoding raises)
    test_file = open(args.record_prefix +
                     getRecordPrefix(args, 'fp32', faulty=args.faulty) +
                     ".tok",
                     'w',
                     encoding='UTF-8')

    batch_time = AverageMeter(False)
    tot_tok_per_sec = AverageMeter(False)
    iterations = AverageMeter(False)
    enc_seq_len = AverageMeter(False)
    dec_seq_len = AverageMeter(False)
    bleu_score = AverageMeter(False)
    score_time = AverageMeter(False)
    stats = {}

    reference_content = readReferenceFile(args)

    # note: `input` shadows the builtin inside this loop
    for batch_idx, (input, target, indices) in enumerate(test_loader):
        translate_timer = time.time()
        input_data, input_lenght = input

        if translator.batch_first:
            batch_size = input_data.size(0)
        else:
            batch_size = input_data.size(1)
        beam_size = args.beam_size

        # one start-of-sequence token per beam of every sentence
        bos = [translator.insert_target_start] * (batch_size * beam_size)
        bos = torch.LongTensor(bos)
        if translator.batch_first:
            bos = bos.view(-1, 1)
        else:
            bos = bos.view(1, -1)

        input_lenght = torch.LongTensor(input_lenght)
        stats['total_enc_len'] = int(input_lenght.sum())

        if args.gpu is not None:
            input_data = input_data.cuda(args.gpu, non_blocking=True)
            input_lenght = input_lenght.cuda(args.gpu, non_blocking=True)
            bos = bos.cuda(args.gpu, non_blocking=True)

        with torch.no_grad():
            context = translator.model.encode(input_data, input_lenght)
            context = [context, input_lenght, None]

            if beam_size == 1:
                generator = translator.generator.greedy_search
            else:
                generator = translator.generator.beam_search

            preds, lengths, counter = generator(batch_size, bos, context)

        # re-enable injection for the next batch
        # (presumably the injector switches it off after firing -- TODO confirm)
        if args.faulty:
            fi.injectionMode = True

        stats['total_dec_len'] = lengths.sum().item()
        stats['iters'] = counter

        preds = preds.cpu()
        lengths = lengths.cpu()

        output = []
        for idx, pred in enumerate(preds):
            # drop the first token and everything from the last token on
            end = lengths[idx] - 1
            pred = pred[1:end]
            pred = pred.tolist()
            out = translator.tok.detokenize(pred)
            output.append(out)

        # restore the original sentence order within the batch
        output = [output[indices.index(i)] for i in range(len(output))]

        for line_idx, line in enumerate(output):
            score_timer = time.time()
            detok_sentence = detokenizeSentence(args, line)
            # NOTE(review): the offset uses the *current* batch's size; with
            # drop_last=False a shorter final batch would map to the wrong
            # reference lines -- verify against args.batch_size.
            chunk = (batch_idx * batch_size) + line_idx
            score = scoreBleuSentence(args, detok_sentence,
                                      reference_content[chunk])
            bleu_score.update(score)
            record.addBleuScores(score)
            # Get timing
            elapsed = time.time() - score_timer
            score_time.update(elapsed)
            test_file.write(line)
            test_file.write('\n')

        # Get timing
        elapsed = time.time() - translate_timer
        batch_time.update(elapsed, batch_size)

        total_tokens = stats['total_dec_len'] + stats['total_enc_len']
        ttps = total_tokens / elapsed
        tot_tok_per_sec.update(ttps, batch_size)

        iterations.update(stats['iters'])
        enc_seq_len.update(stats['total_enc_len'] / batch_size, batch_size)
        dec_seq_len.update(stats['total_dec_len'] / batch_size, batch_size)

        if batch_idx % args.print_freq == 0:
            # NOTE(review): the `\ ` sequences in this literal look like
            # flattened backslash-newline continuations -- confirm layout.
            print('[Test {}] Time: {:.3f} ({:.3f})\t \ Decoder iters {:.1f} ({:.1f})\t \ Tok/s {:.0f} ({:.0f})\n \ Bleu score: {:.2f} ({:.2f})\t \ Bleu time: {:.3f} ({:.3f})'.format(
                batch_idx, batch_time.val, batch_time.avg, iterations.val,
                iterations.avg, tot_tok_per_sec.val, tot_tok_per_sec.avg,
                bleu_score.val, bleu_score.avg, score_time.val,
                score_time.avg))

    # summary timing (batch_size is the size of the last processed batch)
    time_per_sentence = (batch_time.avg / batch_size)

    # NOTE(review): same flattened-continuation caveat as the per-batch print.
    print('[Test] Summary \n \ Lines translated: {}\t \ Avg total tokens/s: {:.0f}\n \ Avg time per batch: {:.3f} s\t \ Avg time per sentence: {:.3f} ms\n \ Avg encoder seq len: {:.2f}\t \ Avg decoder seq len: {:.2f}\t \ Total decoder iterations: {}\n \ Traverse time : {:.3f} s\t \ Total number of injections: {}'.format(
        len(test_loader.dataset), tot_tok_per_sec.avg, batch_time.avg,
        1000 * time_per_sentence, enc_seq_len.avg, dec_seq_len.avg,
        int(iterations.sum), traverse_time.val if args.faulty else 0.0,
        int(fi.numInjections) if args.faulty else 0))

    test_file.close()

    # corpus-level BLEU on the detokenized output, stored with the record
    detok = detokenizeFile(args)
    bleu = scoreBleuFile(args, detok)

    record.setBleuScoreAvg(bleu)
    saveRecord(
        args.record_prefix + getRecordPrefix(args, 'fp32', faulty=args.faulty),
        record)

    print('BLEU on test dataset: {}'.format(bleu))
    # Get timing
    execution_elapsed = time.time() - execution_timer
    print('Finished evaluation on test set in {:.2f} seconds'.format(
        execution_elapsed))