def mlperf_submission_log(benchmark):
    num_nodes = os.environ.get('SLURM_NNODES', 1)

    configure_logger(benchmark)

    log_event(
        key=constants.SUBMISSION_BENCHMARK,
        value=benchmark,
        )

    log_event(
        key=constants.SUBMISSION_ORG,
        value='Inspur')

    log_event(
        key=constants.SUBMISSION_DIVISION,
        value='closed')

    log_event(
        key=constants.SUBMISSION_STATUS,
        value='onprem')

    log_event(
        key=constants.SUBMISSION_PLATFORM,
        value=f'{num_nodes}xNF5488')
def mlperf_submission_log(benchmark):
    num_nodes = os.environ.get('SLURM_NNODES', 1)
    if int(num_nodes) > 1:
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    configure_logger(benchmark)

    log_event(
        key=constants.SUBMISSION_BENCHMARK,
        value=benchmark,
        )

    log_event(key=constants.SUBMISSION_ORG, value='Fujitsu')

    log_event(key=constants.SUBMISSION_DIVISION, value='closed')

    log_event(key=constants.SUBMISSION_STATUS, value='onprem')

    log_event(key=constants.SUBMISSION_PLATFORM, value='1xGX2570M5')
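# Note (not part of the original sources): the two definitions above are
# per-submitter variants of the same helper (Inspur NF5488 vs. Fujitsu
# GX2570M5); only one exists in a given submission package. A hedged usage
# sketch, assuming the mlperf_logging constants/log_event wrappers are
# importable, is a single call at startup so the SUBMISSION_* metadata
# appears before any training events:
#
#     mlperf_submission_log('gnmt')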
def __init__(self, optimizer, iterations, warmup_steps=0, remain_steps=1.0,
             decay_interval=None, decay_steps=4, decay_factor=0.5,
             last_epoch=-1):
    """
    Constructor of WarmupMultiStepLR.

    Parameters: warmup_steps, remain_steps and decay_interval accept both
    integers and floats as an input. Integer input is interpreted as an
    absolute index of iteration, float input is interpreted as a fraction
    of total training iterations (epochs * steps_per_epoch).

    If decay_interval is None then the decay will happen at regularly
    spaced intervals ('decay_steps' decays between iteration indices
    'remain_steps' and 'iterations').

    :param optimizer: instance of optimizer
    :param iterations: total number of training iterations
    :param warmup_steps: number of warmup iterations
    :param remain_steps: start decay at 'remain_steps' iteration
    :param decay_interval: interval between LR decay steps
    :param decay_steps: max number of decay steps
    :param decay_factor: decay factor
    :param last_epoch: the index of last iteration
    """
    # iterations before learning rate reaches base LR
    self.warmup_steps = perhaps_convert_float(warmup_steps, iterations)
    logging.info(f'Scheduler warmup steps: {self.warmup_steps}')

    # iteration at which decay starts
    self.remain_steps = perhaps_convert_float(remain_steps, iterations)
    logging.info(f'Scheduler remain steps: {self.remain_steps}')

    # number of steps between each decay
    if decay_interval is None:
        # decay at regularly spaced intervals
        decay_iterations = iterations - self.remain_steps
        self.decay_interval = decay_iterations // decay_steps
        self.decay_interval = max(self.decay_interval, 1)
    else:
        self.decay_interval = perhaps_convert_float(decay_interval,
                                                    iterations)
    logging.info(f'Scheduler decay interval: {self.decay_interval}')

    # multiplicative decay factor
    self.decay_factor = decay_factor
    logging.info(f'Scheduler decay factor: {self.decay_factor}')

    # max number of decay steps
    self.decay_steps = decay_steps
    logging.info(f'Scheduler max decay steps: {self.decay_steps}')

    if self.warmup_steps > self.remain_steps:
        logging.warn(f'warmup_steps should not be larger than '
                     f'remain_steps, setting warmup_steps=remain_steps')
        self.warmup_steps = self.remain_steps

    log_event(key=constants.OPT_LR_ALT_DECAY_FUNC, value=True)
    log_event(key=constants.OPT_LR_ALT_WARMUP_FUNC, value=True)
    log_event(key=constants.OPT_LR_DECAY_INTERVAL, value=self.decay_interval)
    log_event(key=constants.OPT_LR_DECAY_FACTOR, value=self.decay_factor)
    log_event(key=constants.OPT_LR_DECAY_STEPS, value=self.decay_steps)
    log_event(key=constants.OPT_LR_REMAIN_STEPS, value=self.remain_steps)
    log_event(key=constants.OPT_LR_WARMUP_STEPS, value=self.warmup_steps)

    super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch)
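# A minimal sketch of `perhaps_convert_float`, which the constructor above
# relies on but which is defined elsewhere in the repository. The assumed
# behaviour follows the docstring: a float is treated as a fraction of the
# total iteration count, an integer as an absolute iteration index.
def perhaps_convert_float(param, total):
    if isinstance(param, float):
        param = int(param * total)
    return param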
def __init__(self, model, criterion, opt_config,
             print_freq=10, save_freq=1000, grad_clip=float('inf'),
             batch_first=False, save_info={}, save_path='.',
             train_iterations=0, checkpoint_filename='checkpoint%s.pth',
             keep_checkpoints=5, math='fp32', loss_scaling={},
             cuda=True, distributed=False,
             distributed_overlap_allreduce=False,
             distributed_overlap_num_allreduce_streams=1,
             distributed_overlap_allreduce_messagesize=1e7,
             distributed_overlap_allreduce_communicators=None,
             intra_epoch_eval=0, prealloc_mode='always',
             iter_size=1, verbose=False, args=None):
    """
    Constructor for the Seq2SeqTrainer.

    :param model: model to train
    :param criterion: criterion (loss function)
    :param opt_config: dictionary with options for the optimizer
    :param print_freq: prints short summary every 'print_freq' iterations
    :param save_freq: saves checkpoint every 'save_freq' iterations
    :param grad_clip: coefficient for gradient clipping
    :param batch_first: if True the model uses (batch, seq, feature) tensors,
        if False the model uses (seq, batch, feature) tensors
    :param save_info: dict with additional state stored in each checkpoint
    :param save_path: path to the directory for checkpoints
    :param train_iterations: total number of training iterations to execute
    :param checkpoint_filename: name of files with checkpoints
    :param keep_checkpoints: max number of checkpoints to keep
    :param math: arithmetic type
    :param loss_scaling: options for dynamic loss scaling
    :param cuda: if True use cuda, if False train on cpu
    :param distributed: if True run distributed training
    :param intra_epoch_eval: number of additional eval runs within each
        training epoch
    :param prealloc_mode: controls preallocation,
        choices=['off', 'once', 'always']
    :param iter_size: number of iterations between weight updates
    :param verbose: enables verbose logging
    """
    super(Seq2SeqTrainer, self).__init__()
    self.model = model
    self.criterion = criterion
    self.epoch = 0
    self.save_info = save_info
    self.save_path = save_path
    self.save_freq = save_freq
    self.save_counter = 0
    self.checkpoint_filename = checkpoint_filename
    self.checkpoint_counter = cycle(range(keep_checkpoints))
    self.opt_config = opt_config
    self.cuda = cuda
    self.distributed = distributed
    self.print_freq = print_freq
    self.batch_first = batch_first
    self.verbose = verbose
    self.loss = None
    self.translator = None
    self.scheduler = None
    self.intra_epoch_eval = intra_epoch_eval
    self.iter_size = iter_size
    self.prealloc_mode = prealloc_mode
    self.preallocated = False

    # Assume multi-tensor apply when using APEX DDP
    self.args = args
    self.use_mt = (distributed and iter_size == 1 and
                   opt_config['optimizer'] == 'FusedAdam')

    # Use APEX gradient average if gradient accumulation option enabled
    self.retain_allreduce_buffers = True if iter_size == 1 else False
    self.gradient_average = False if iter_size == 1 else True

    if cuda:
        self.model = self.model.cuda()
        self.criterion = self.criterion.cuda()

    params = self.model.parameters()

    if math == 'fp16':
        self.model = self.model.half()
        if distributed and self.args.distributed_weight_update != 2:
            self.model = DDP(
                self.model,
                message_size=distributed_overlap_allreduce_messagesize,
                delay_allreduce=(not distributed_overlap_allreduce),
                num_allreduce_streams=distributed_overlap_num_allreduce_streams,
                allreduce_communicators=distributed_overlap_allreduce_communicators,
                retain_allreduce_buffers=self.retain_allreduce_buffers,
                gradient_average=self.gradient_average)
        if self.args.distributed_weight_update == 2:
            # gradient clipping maintained by DistributedFusedAdam
            self.fp_optimizer = DwuFp16Optimizer(
                self.model,
                loss_scale=loss_scaling['init_scale'],
                dls_upscale_interval=loss_scaling['upscale_interval'])
            params = list(self.model.parameters())
        else:
            self.fp_optimizer = Fp16Optimizer(
                self.model, grad_clip,
                use_mt=self.use_mt,
                loss_scale=loss_scaling['init_scale'],
                dls_upscale_interval=loss_scaling['upscale_interval'])
            params = self.fp_optimizer.fp32_params \
                if isinstance(self.fp_optimizer.fp32_params, list) \
                else [self.fp_optimizer.fp32_params]
    elif math == 'fp32':
        if distributed:
            self.model = DDP(
                self.model,
                message_size=distributed_overlap_allreduce_messagesize,
                delay_allreduce=(not distributed_overlap_allreduce))
        self.fp_optimizer = Fp32Optimizer(self.model, grad_clip)
        # params = self.model.parameters()

    opt_name = opt_config.pop('optimizer')
    if opt_name == 'FusedAdam':
        if math == 'fp16' or math == 'fp32':
            if self.args.distributed_weight_update == 2:
                dwu_args = self.distributed_weight_update_config
                self.optimizer = DistributedFusedAdam(
                    params, max_grad_norm=grad_clip,
                    **dwu_args, **opt_config)
                # used for grad norm clipping in step function
                self.optimizer.set_global_scale(1.0)
            else:
                # Maintain grad norm and scaling by ourselves
                self.optimizer = FusedAdam(params, use_mt=self.use_mt,
                                           **opt_config)
        else:
            self.optimizer = FusedAdam(params, use_mt=self.use_mt,
                                       max_grad_norm=grad_clip,
                                       amp_scale_adjustment=get_world_size(),
                                       **opt_config)
    else:
        self.optimizer = torch.optim.__dict__[opt_name](params, **opt_config)
    logging.info(f'Using optimizer: {self.optimizer}')

    log_event(key=constants.OPT_NAME,
              value=constants.ADAM, sync=False)
    log_event(key=constants.OPT_BASE_LR,
              value=opt_config['lr'], sync=False)
    log_event(key=constants.OPT_ADAM_BETA_1,
              value=self.optimizer.defaults['betas'][0], sync=False)
    log_event(key=constants.OPT_ADAM_BETA_2,
              value=self.optimizer.defaults['betas'][1], sync=False)
    log_event(key=constants.OPT_ADAM_EPSILON,
              value=self.optimizer.defaults['eps'], sync=False)
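# Illustrative only: the shape of `opt_config` the constructor above expects.
# The 'optimizer' key is popped before the remaining keys are forwarded to the
# optimizer constructor; the concrete values below are placeholders for
# illustration, not tuned hyperparameters from any submission.
example_opt_config = {
    'optimizer': 'FusedAdam',   # consumed by opt_config.pop('optimizer')
    'lr': 2.0e-3,               # logged as OPT_BASE_LR
    'betas': (0.9, 0.999),      # logged as OPT_ADAM_BETA_1 / OPT_ADAM_BETA_2
    'eps': 1e-9,                # logged as OPT_ADAM_EPSILON
}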
def main(): """ Launches data-parallel multi-gpu training. """ configure_logger(constants.GNMT) log_start(key=constants.INIT_START, log_all_ranks=True) args = parse_args() device = utils.set_device(args.cuda, args.local_rank) distributed = utils.init_distributed(args.cuda) # preinit and warmup streams/ groups for apex DDP communicators # distributed weight update doesn't require this allreduce_communicators=None if distributed and args.distributed_weight_update == 0 and \ args.apex_num_allreduce_streams > 1: bucket_pgs = [torch.distributed.new_group() for _ in range(args.apex_num_allreduce_streams)] bucket_streams = [torch.cuda.Stream() for _ in range(args.apex_num_allreduce_streams)] for pg, stream in zip(bucket_pgs,bucket_streams): with torch.cuda.stream(stream): torch.distributed.all_reduce(torch.cuda.FloatTensor(1), group=pg) allreduce_communicators=(bucket_pgs,bucket_streams) args.rank = utils.get_rank() if not args.cudnn: torch.backends.cudnn.enabled = False # create directory for results save_path = os.path.join(args.results_dir, args.save) args.save_path = save_path os.makedirs(save_path, exist_ok=True) # setup logging log_filename = f'log_rank_{utils.get_rank()}.log' utils.setup_logging(args.log_all_ranks, os.path.join(save_path, log_filename)) if args.env: utils.log_env_info() logging.info(f'Saving results to: {save_path}') logging.info(f'Run arguments: {args}') # additional argument check if args.math == 'fp32' and (args.fused_attention or args.fused_xentropy): logging.warn(f'Only support FP16 `--fused-attention` and ' '`--fused-xentropy`, disabling them') args.fused_attention = args.fused_xentropy = False # automatically set train_iter_size based on train_global_batch_size, # world_size and per-worker train_batch_size if args.train_global_batch_size is not None: global_bs = args.train_global_batch_size bs = args.train_batch_size world_size = utils.get_world_size() assert global_bs % (bs * world_size) == 0 args.train_iter_size = global_bs // (bs * world_size) logging.info(f'Global batch size was set in the config, ' f'Setting train_iter_size to {args.train_iter_size}') # setup L2 promotion if args.cuda: utils.l2_promote() worker_seeds, shuffling_seeds = utils.setup_seeds(args.seed, args.epochs, device) worker_seed = worker_seeds[args.rank] logging.info(f'Worker {args.rank} is using worker seed: {worker_seed}') torch.manual_seed(worker_seed) # build tokenizer # https://github.com/mlperf/policies/issues/201 pad_vocab = utils.pad_vocabulary(args.math) tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME), pad_vocab) vocab_size = tokenizer.vocab_size # build GNMT model model_config = {'hidden_size': args.hidden_size, 'num_layers': args.num_layers, 'dropout': args.dropout, 'batch_first': False, 'share_embedding': args.share_embedding, 'fusion': args.fused_attention} model = GNMT(vocab_size=vocab_size, **model_config) logging.info(model) # define loss function (criterion) and optimizer criterion = build_criterion(vocab_size, config.PAD, args.smoothing, args.fused_xentropy) opt_config = {'optimizer': args.optimizer, 'lr': args.lr} opt_config.update(literal_eval(args.optimizer_extra)) logging.info(f'Training optimizer config: {opt_config}') num_parameters = sum([l.nelement() for l in model.parameters()]) logging.info(f'Number of parameters: {num_parameters}') # create trainer save_info = {'model_config': model_config, 'config': args, 'tokenizer': tokenizer.get_state()} loss_scaling = {'init_scale': args.init_scale, 'upscale_interval': args.upscale_interval} 
    trainer_options = dict(
        criterion=criterion,
        grad_clip=args.grad_clip,
        iter_size=args.train_iter_size,
        save_path=save_path,
        save_freq=args.save_freq,
        save_info=save_info,
        opt_config=opt_config,
        batch_first=model.batch_first,
        keep_checkpoints=args.keep_checkpoints,
        math=args.math,
        loss_scaling=loss_scaling,
        print_freq=args.print_freq,
        cuda=args.cuda,
        distributed=distributed,
        distributed_overlap_allreduce=args.enable_apex_allreduce_overlap,
        distributed_overlap_num_allreduce_streams=args.apex_num_allreduce_streams,
        distributed_overlap_allreduce_messagesize=args.apex_message_size,
        distributed_overlap_allreduce_communicators=allreduce_communicators,
        intra_epoch_eval=args.intra_epoch_eval,
        prealloc_mode=args.prealloc_mode)

    trainer_options['model'] = model
    trainer = trainers.Seq2SeqTrainer(args=args, **trainer_options)

    trainer.preallocate(args.train_batch_size, args.max_length_train,
                        training=True)

    log_end(key=constants.INIT_STOP, sync=False)
    log_start(key=constants.RUN_START, sync=True)
    utils.barrier()

    log_event(key=constants.MAX_SEQUENCE_LENGTH,
              value=args.max_length_train,
              metadata={'method': 'discard'})

    if args.use_preproc_data:
        train_data = PreprocessedDataset(
            min_len=args.min_length_train,
            max_len=args.max_length_train,
            vocab_size=tokenizer.vocab_size,
            )
        train_data.read_data(
            os.path.join(args.preproc_data_dir, 'training.bin'),
            tokenizer.vocab_size,
            )
        train_data.prepare()
    else:
        train_data = LazyParallelDataset(
            src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
            tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
            tokenizer=tokenizer,
            min_len=args.min_length_train,
            max_len=args.max_length_train,
            sort=False,
            max_size=args.max_size,
            )

    test_data = TextDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TEST_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_test,
        max_len=args.max_length_test,
        sort=True)

    batching_opt = {'shard_size': args.shard_size,
                    'num_buckets': args.num_buckets}

    # get data loaders
    train_loader = train_data.get_loader(batch_size=args.train_batch_size,
                                         seeds=shuffling_seeds,
                                         batch_first=model.batch_first,
                                         shuffle=True,
                                         batching=args.batching,
                                         batching_opt=batching_opt,
                                         num_workers=args.train_loader_workers)

    log_event(key=constants.GLOBAL_BATCH_SIZE,
              value=args.train_batch_size * utils.get_world_size(),
              sync=False)

    test_loader = test_data.get_loader(batch_size=args.test_batch_size,
                                       batch_first=model.batch_first,
                                       shuffle=False,
                                       num_workers=args.test_loader_workers)

    log_event(key=constants.TRAIN_SAMPLES,
              value=train_loader.sampler.num_samples, sync=False)
    log_event(key=constants.EVAL_SAMPLES,
              value=len(test_loader.dataset), sync=False)

    translator = Translator(model=model,
                            tokenizer=tokenizer,
                            loader=test_loader,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_length_test,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda,
                            print_freq=args.print_freq,
                            dataset_dir=args.dataset_dir,
                            target_bleu=args.target_bleu,
                            save_path=args.save_path)

    total_train_iters = len(train_loader) // args.train_iter_size * args.epochs

    scheduler_config = {'warmup_steps': args.warmup_steps,
                        'remain_steps': args.remain_steps,
                        'decay_interval': args.decay_interval,
                        'decay_steps': args.decay_steps,
                        'decay_factor': args.decay_factor}

    logging.info(f'Training LR schedule config: {scheduler_config}')
    scheduler = WarmupMultiStepLR(trainer.optimizer, total_train_iters,
                                  **scheduler_config)
    trainer.scheduler = scheduler
    trainer.translator = translator

    # optionally resume from a checkpoint
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(checkpoint_file, 'model_best.pth')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            logging.error(f'No checkpoint found at {args.resume}')

    # training loop
    break_training = False
    test_bleu = None
    for epoch in range(args.start_epoch, args.epochs):
        log_start(key=constants.BLOCK_START,
                  metadata={'first_epoch_num': epoch + 1,
                            'epoch_count': 1},
                  sync=False)
        log_start(key=constants.EPOCH_START,
                  metadata={'epoch_num': epoch + 1}, sync=False)

        logging.info(f'Starting epoch {epoch}')

        train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss, train_perf = trainer.optimize(train_loader)

        log_end(key=constants.EPOCH_STOP,
                metadata={'epoch_num': epoch + 1}, sync=False)

        if args.eval:
            log_start(key=constants.EVAL_START,
                      metadata={'epoch_num': epoch + 1}, sync=False)
            test_bleu, break_training = translator.run(calc_bleu=True,
                                                       epoch=epoch)
            log_event(key=constants.EVAL_ACCURACY,
                      value=test_bleu / 100,
                      metadata={'epoch_num': epoch + 1}, sync=False)
            log_end(key=constants.EVAL_STOP,
                    metadata={'epoch_num': epoch + 1}, sync=False)

        acc_log = []
        acc_log += [f'Summary: Epoch: {epoch}']
        acc_log += [f'Training Loss: {train_loss:.4f}']
        if args.eval:
            acc_log += [f'Test BLEU: {test_bleu:.2f}']

        perf_log = []
        perf_log += [f'Performance: Epoch: {epoch}']
        perf_log += [f'Training: {train_perf:.0f} Tok/s']

        if args.rank == 0:
            logging.info('\t'.join(acc_log))
            logging.info('\t'.join(perf_log))

        logging.info(f'Finished epoch {epoch}')
        log_end(key=constants.BLOCK_STOP,
                metadata={'first_epoch_num': epoch + 1}, sync=False)

        if break_training:
            break

    if args.use_preproc_data:
        train_data.finalize()

    status = 'success' if break_training else 'aborted'
    log_end(key=constants.RUN_STOP,
            metadata={'status': status}, sync=False)
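# A worked example (hypothetical numbers, not from any particular config) of
# the batch-size arithmetic in main() above: gradient-accumulation factor
# from the global batch size, and the total optimizer-step count handed to
# WarmupMultiStepLR.
def _example_iteration_math():
    train_global_batch_size = 2048   # assumed --train-global-batch-size
    train_batch_size = 32            # assumed per-worker --train-batch-size
    world_size = 16                  # assumed number of workers
    assert train_global_batch_size % (train_batch_size * world_size) == 0
    train_iter_size = train_global_batch_size // (train_batch_size * world_size)

    batches_per_epoch = 1000         # assumed len(train_loader)
    epochs = 2                       # assumed args.epochs
    total_train_iters = batches_per_epoch // train_iter_size * epochs
    return train_iter_size, total_train_iters   # -> (4, 500)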