def __init__(self, model, criterion, opt_config, print_freq=10,
             save_freq=1000, grad_clip=float('inf'), batch_first=False,
             save_info={}, save_path='.',
             checkpoint_filename='checkpoint%s.pth', keep_checkpoints=5,
             math='fp32', cuda=True, distributed=False, verbose=False):
    super(Seq2SeqTrainer, self).__init__()
    self.model = model
    self.criterion = criterion
    self.epoch = 0
    self.save_info = save_info
    self.save_path = save_path
    self.save_freq = save_freq
    self.save_counter = 0
    self.checkpoint_filename = checkpoint_filename
    self.checkpoint_counter = cycle(range(keep_checkpoints))
    self.opt_config = opt_config
    self.cuda = cuda
    self.distributed = distributed
    self.print_freq = print_freq
    self.batch_first = batch_first
    self.verbose = verbose
    self.loss = None

    if cuda:
        self.model = self.model.cuda()
        self.criterion = self.criterion.cuda()

    if distributed:
        self.model = DDP(self.model)

    if math == 'fp16':
        self.model = self.model.half()
        self.fp_optimizer = Fp16Optimizer(self.model, grad_clip)
        params = self.fp_optimizer.fp32_params
    elif math == 'fp32':
        self.fp_optimizer = Fp32Optimizer(self.model, grad_clip)
        params = self.model.parameters()

    opt_name = opt_config['optimizer']
    lr = opt_config['lr']
    self.optimizer = torch.optim.__dict__[opt_name](params, lr=lr)

    mlperf_log.gnmt_print(key=mlperf_log.OPT_NAME, value=opt_name)
    mlperf_log.gnmt_print(key=mlperf_log.OPT_LR, value=lr)
    mlperf_log.gnmt_print(key=mlperf_log.OPT_HP_ADAM_BETA1,
                          value=self.optimizer.defaults['betas'][0])
    mlperf_log.gnmt_print(key=mlperf_log.OPT_HP_ADAM_BETA2,
                          value=self.optimizer.defaults['betas'][1])
    mlperf_log.gnmt_print(key=mlperf_log.OPT_HP_ADAM_EPSILON,
                          value=self.optimizer.defaults['eps'])
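# Illustrative sketch (not part of the original trainer): how the rotating
# checkpoint counter built from cycle(range(keep_checkpoints)) pairs with the
# 'checkpoint%s.pth' filename template used above. keep_checkpoints=5 matches
# the constructor default; the helper name below is hypothetical.
from itertools import cycle as _cycle, islice as _islice

def _demo_checkpoint_names(keep_checkpoints=5, n=7):
    counter = _cycle(range(keep_checkpoints))
    # yields checkpoint0.pth ... checkpoint4.pth, then wraps back to checkpoint0.pth
    return ['checkpoint%s.pth' % i for i in _islice(counter, n)]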
def __init__(self, model, criterion, opt_config, print_freq=10,
             save_freq=1000, grad_clip=float('inf'), batch_first=False,
             save_info={}, save_path='.',
             checkpoint_filename='checkpoint%s.pth', keep_checkpoints=5,
             math='fp32', cuda=True, distributed=False, verbose=False,
             log_dir=None, num_minibatches=20, cupti=False):
    super(Seq2SeqTrainer, self).__init__()
    self.model = model
    self.criterion = criterion
    self.epoch = 0
    self.save_info = save_info
    self.save_path = save_path
    self.save_freq = save_freq
    self.save_counter = 0
    self.checkpoint_filename = checkpoint_filename
    self.checkpoint_counter = cycle(range(keep_checkpoints))
    self.opt_config = opt_config
    self.cuda = cuda
    self.distributed = distributed
    self.print_freq = print_freq
    self.batch_first = batch_first
    self.verbose = verbose
    self.loss = None
    self.cupti = cupti
    self.log_dir = log_dir
    self.num_steps = num_minibatches
    self.math = math
    self.grad_clip = grad_clip

    if cuda:
        self.model = self.model.cuda()
        self.criterion = self.criterion.cuda()

    if distributed:
        self.model = DDP(self.model, log_dir=self.log_dir)

    if math == 'fp16':
        self.model = self.model.half()
        self.fp_optimizer = Fp16Optimizer(self.model, grad_clip)
        params = self.fp_optimizer.fp32_params
    elif math == 'fp32':
        self.fp_optimizer = Fp32Optimizer(self.model, grad_clip)
        params = self.model.parameters()

    opt_name = opt_config['optimizer']
    lr = opt_config['lr']
    self.optimizer = torch.optim.__dict__[opt_name](params, lr=lr)
    print("optimizer name " + opt_name)
    print(type(self.optimizer))
def __init__(self, model, criterion, opt_config, print_freq=10,
             save_freq=1000, grad_clip=float('inf'), batch_first=False,
             save_info={}, save_path='.',
             checkpoint_filename='checkpoint%s.pth', keep_checkpoints=5,
             math='fp32', cuda=True, distributed=False,
             intra_epoch_eval=0, translator=None, verbose=False):
    super(Seq2SeqTrainer, self).__init__()
    self.model = model
    self.criterion = criterion
    self.epoch = 0
    self.save_info = save_info
    self.save_path = save_path
    self.save_freq = save_freq
    self.save_counter = 0
    self.checkpoint_filename = checkpoint_filename
    self.checkpoint_counter = cycle(range(keep_checkpoints))
    self.opt_config = opt_config
    self.cuda = cuda
    self.distributed = distributed
    self.print_freq = print_freq
    self.batch_first = batch_first
    self.verbose = verbose
    self.loss = None
    self.translator = translator
    self.intra_epoch_eval = intra_epoch_eval

    if cuda:
        self.model = self.model.cuda()
        self.criterion = self.criterion.cuda()

    if distributed:
        self.model = DDP(self.model)

    if math == 'fp16':
        self.model = self.model.half()
        self.fp_optimizer = Fp16Optimizer(self.model, grad_clip)
        params = self.fp_optimizer.fp32_params
    elif math == 'fp32':
        self.fp_optimizer = Fp32Optimizer(self.model, grad_clip)
        params = self.model.parameters()

    opt_name = opt_config.pop('optimizer')
    self.optimizer = torch.optim.__dict__[opt_name](params, **opt_config)
    logging.info(f'Using optimizer: {self.optimizer}')
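# Illustrative sketch (not repository code): the opt_config dict consumed above
# names a torch.optim class via its 'optimizer' key and forwards the remaining
# keys as keyword arguments. The values below are placeholders, not the
# reference hyperparameters, and the helper name is hypothetical.
def _demo_build_optimizer(params):
    import torch
    opt_config = {'optimizer': 'Adam', 'lr': 1.0e-3,
                  'betas': (0.9, 0.999), 'eps': 1e-8}
    opt_name = opt_config.pop('optimizer')
    # same dispatch pattern as the constructor above
    return torch.optim.__dict__[opt_name](params, **opt_config)
    # e.g. _demo_build_optimizer(torch.nn.Linear(4, 4).parameters())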
def __init__(self, model, criterion, opt_config, scheduler_config,
             print_freq=10, save_freq=1000, grad_clip=float('inf'),
             batch_first=False, save_info={}, save_path='.',
             train_iterations=0, checkpoint_filename='checkpoint%s.pth',
             keep_checkpoints=5, math='fp32', cuda=True, distributed=False,
             intra_epoch_eval=0, iter_size=1, translator=None, verbose=False):
    """
    Constructor for the Seq2SeqTrainer.

    :param model: model to train
    :param criterion: criterion (loss function)
    :param opt_config: dictionary with options for the optimizer
    :param scheduler_config: dictionary with options for the learning rate
        scheduler
    :param print_freq: prints short summary every 'print_freq' iterations
    :param save_freq: saves checkpoint every 'save_freq' iterations
    :param grad_clip: coefficient for gradient clipping
    :param batch_first: if True the model uses (batch, seq, feature) tensors,
        if False the model uses (seq, batch, feature) tensors
    :param save_info: dict with additional state stored in each checkpoint
    :param save_path: path to the directory for checkpoints
    :param train_iterations: total number of training iterations to execute
    :param checkpoint_filename: name of files with checkpoints
    :param keep_checkpoints: max number of checkpoints to keep
    :param math: arithmetic type
    :param cuda: if True use cuda, if False train on cpu
    :param distributed: if True run distributed training
    :param intra_epoch_eval: number of additional eval runs within each
        training epoch
    :param iter_size: number of iterations between weight updates
    :param translator: instance of Translator, runs inference on test set
    :param verbose: enables verbose logging
    """
    super(Seq2SeqTrainer, self).__init__()
    self.model = model
    self.criterion = criterion
    self.epoch = 0
    self.save_info = save_info
    self.save_path = save_path
    self.save_freq = save_freq
    self.save_counter = 0
    self.checkpoint_filename = checkpoint_filename
    self.checkpoint_counter = cycle(range(keep_checkpoints))
    self.opt_config = opt_config
    self.cuda = cuda
    self.distributed = distributed
    self.print_freq = print_freq
    self.batch_first = batch_first
    self.verbose = verbose
    self.loss = None
    self.translator = translator
    self.intra_epoch_eval = intra_epoch_eval
    self.iter_size = iter_size

    if cuda:
        self.model = self.model.cuda()
        self.criterion = self.criterion.cuda()

    if math == 'fp16':
        self.model = self.model.half()

    if distributed:
        self.model = DDP(self.model)

    if math == 'fp16':
        self.fp_optimizer = Fp16Optimizer(self.model, grad_clip)
        params = self.fp_optimizer.fp32_params
    elif math == 'fp32':
        self.fp_optimizer = Fp32Optimizer(self.model, grad_clip)
        params = self.model.parameters()

    opt_name = opt_config.pop('optimizer')
    self.optimizer = torch.optim.__dict__[opt_name](params, **opt_config)
    logging.info(f'Using optimizer: {self.optimizer}')

    self.scheduler = WarmupMultiStepLR(self.optimizer, train_iterations,
                                       **scheduler_config)
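# Illustrative sketch (an assumption, not a confirmed API): one possible shape
# of the scheduler_config dict splatted into WarmupMultiStepLR above. The key
# names are borrowed from the variant at the end of this file that passes them
# explicitly; the values are placeholders only.
_example_scheduler_config = {
    'lr_method': 'mlperf',   # hypothetical warmup/decay policy name
    'warmup_iters': 200,     # iterations of learning-rate warmup
    'remain_steps': 0.666,   # point (fraction or step count) where decay begins
    'decay_steps': 4,        # number of decay intervals after remain_steps
}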
def __init__(self, model, criterion, opt_config, print_freq=10,
             save_freq=1000, grad_clip=float('inf'), batch_first=False,
             save_info={}, save_path='.', train_iterations=0,
             checkpoint_filename='checkpoint%s.pth', keep_checkpoints=5,
             math='fp32', loss_scaling={}, cuda=True, distributed=False,
             distributed_overlap_allreduce=False,
             distributed_overlap_num_allreduce_streams=1,
             distributed_overlap_allreduce_messagesize=1e7,
             distributed_overlap_allreduce_communicators=None,
             intra_epoch_eval=0, prealloc_mode='always', iter_size=1,
             verbose=False, args=None):
    """
    Constructor for the Seq2SeqTrainer.

    :param model: model to train
    :param criterion: criterion (loss function)
    :param opt_config: dictionary with options for the optimizer
    :param print_freq: prints short summary every 'print_freq' iterations
    :param save_freq: saves checkpoint every 'save_freq' iterations
    :param grad_clip: coefficient for gradient clipping
    :param batch_first: if True the model uses (batch, seq, feature) tensors,
        if False the model uses (seq, batch, feature) tensors
    :param save_info: dict with additional state stored in each checkpoint
    :param save_path: path to the directory for checkpoints
    :param train_iterations: total number of training iterations to execute
    :param checkpoint_filename: name of files with checkpoints
    :param keep_checkpoints: max number of checkpoints to keep
    :param math: arithmetic type
    :param loss_scaling: options for dynamic loss scaling
    :param cuda: if True use cuda, if False train on cpu
    :param distributed: if True run distributed training
    :param intra_epoch_eval: number of additional eval runs within each
        training epoch
    :param prealloc_mode: controls preallocation,
        choices=['off', 'once', 'always']
    :param iter_size: number of iterations between weight updates
    :param verbose: enables verbose logging
    """
    super(Seq2SeqTrainer, self).__init__()
    self.model = model
    self.criterion = criterion
    self.epoch = 0
    self.save_info = save_info
    self.save_path = save_path
    self.save_freq = save_freq
    self.save_counter = 0
    self.checkpoint_filename = checkpoint_filename
    self.checkpoint_counter = cycle(range(keep_checkpoints))
    self.opt_config = opt_config
    self.cuda = cuda
    self.distributed = distributed
    self.print_freq = print_freq
    self.batch_first = batch_first
    self.verbose = verbose
    self.loss = None
    self.translator = None
    self.scheduler = None
    self.intra_epoch_eval = intra_epoch_eval
    self.iter_size = iter_size
    self.prealloc_mode = prealloc_mode
    self.preallocated = False

    self.args = args
    # Assume multi-tensor apply if with APEX DDP
    self.use_mt = (distributed and iter_size == 1 and
                   opt_config['optimizer'] == 'FusedAdam')
    # Use APEX gradient average if gradient accumulation option enabled
    self.retain_allreduce_buffers = True if iter_size == 1 else False
    self.gradient_average = False if iter_size == 1 else True

    if cuda:
        self.model = self.model.cuda()
        self.criterion = self.criterion.cuda()

    params = self.model.parameters()

    if math == 'fp16':
        self.model = self.model.half()
        if distributed and self.args.distributed_weight_update != 2:
            self.model = DDP(
                self.model,
                message_size=distributed_overlap_allreduce_messagesize,
                delay_allreduce=(not distributed_overlap_allreduce),
                num_allreduce_streams=distributed_overlap_num_allreduce_streams,
                allreduce_communicators=distributed_overlap_allreduce_communicators,
                retain_allreduce_buffers=self.retain_allreduce_buffers,
                gradient_average=self.gradient_average)
        if self.args.distributed_weight_update == 2:
            # gradient clipping maintained by DistributedFusedAdam
            self.fp_optimizer = DwuFp16Optimizer(
                self.model,
                loss_scale=loss_scaling['init_scale'],
                dls_upscale_interval=loss_scaling['upscale_interval'])
            params = list(self.model.parameters())
        else:
            self.fp_optimizer = Fp16Optimizer(
                self.model, grad_clip,
                use_mt=self.use_mt,
                loss_scale=loss_scaling['init_scale'],
                dls_upscale_interval=loss_scaling['upscale_interval'])
            params = (self.fp_optimizer.fp32_params
                      if isinstance(self.fp_optimizer.fp32_params, list)
                      else [self.fp_optimizer.fp32_params])
    elif math == 'fp32':
        if distributed:
            self.model = DDP(
                self.model,
                message_size=distributed_overlap_allreduce_messagesize,
                delay_allreduce=(not distributed_overlap_allreduce))
        self.fp_optimizer = Fp32Optimizer(self.model, grad_clip)
        # params = self.model.parameters()

    opt_name = opt_config.pop('optimizer')
    if opt_name == 'FusedAdam':
        if math == 'fp16' or math == 'fp32':
            if self.args.distributed_weight_update == 2:
                dwu_args = self.distributed_weight_update_config
                self.optimizer = DistributedFusedAdam(params,
                                                      max_grad_norm=grad_clip,
                                                      **dwu_args, **opt_config)
                self.optimizer.set_global_scale(1.0)  # used for grad norm clipping in step function
            else:
                # Maintain grad norm and scaling by ourselves
                self.optimizer = FusedAdam(params, use_mt=self.use_mt,
                                           **opt_config)
        else:
            self.optimizer = FusedAdam(params, use_mt=self.use_mt,
                                       max_grad_norm=grad_clip,
                                       amp_scale_adjustment=get_world_size(),
                                       **opt_config)
    else:
        self.optimizer = torch.optim.__dict__[opt_name](params, **opt_config)
    logging.info(f'Using optimizer: {self.optimizer}')

    log_event(key=constants.OPT_NAME, value=constants.ADAM, sync=False)
    log_event(key=constants.OPT_BASE_LR, value=opt_config['lr'], sync=False)
    log_event(key=constants.OPT_ADAM_BETA_1,
              value=self.optimizer.defaults['betas'][0], sync=False)
    log_event(key=constants.OPT_ADAM_BETA_2,
              value=self.optimizer.defaults['betas'][1], sync=False)
    log_event(key=constants.OPT_ADAM_EPSILON,
              value=self.optimizer.defaults['eps'], sync=False)
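# Illustrative sketch (values are placeholders, not the reference defaults):
# the loss_scaling dict read above only needs the two keys accessed by this
# constructor.
_example_loss_scaling = {
    'init_scale': 2 ** 17,     # initial loss scale for fp16 training
    'upscale_interval': 128,   # stable iterations before the scale is raised
}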
def __init__(self, model, criterion, opt_config, print_freq=10,
             save_freq=1000, grad_clip=float('inf'), batch_first=False,
             save_info={}, save_path='.', train_iterations=0,
             checkpoint_filename='checkpoint%s.pth', keep_checkpoints=5,
             math='fp32', loss_scaling={}, cuda=True, distributed=False,
             distributed_overlap_allreduce=False,
             distributed_overlap_num_allreduce_streams=1,
             distributed_overlap_allreduce_messagesize=1e7,
             distributed_overlap_allreduce_communicators=None,
             intra_epoch_eval=0, prealloc_mode='always', iter_size=1,
             verbose=False):
    """
    Constructor for the Seq2SeqTrainer.

    :param model: model to train
    :param criterion: criterion (loss function)
    :param opt_config: dictionary with options for the optimizer
    :param print_freq: prints short summary every 'print_freq' iterations
    :param save_freq: saves checkpoint every 'save_freq' iterations
    :param grad_clip: coefficient for gradient clipping
    :param batch_first: if True the model uses (batch, seq, feature) tensors,
        if False the model uses (seq, batch, feature) tensors
    :param save_info: dict with additional state stored in each checkpoint
    :param save_path: path to the directory for checkpoints
    :param train_iterations: total number of training iterations to execute
    :param checkpoint_filename: name of files with checkpoints
    :param keep_checkpoints: max number of checkpoints to keep
    :param math: arithmetic type
    :param loss_scaling: options for dynamic loss scaling
    :param cuda: if True use cuda, if False train on cpu
    :param distributed: if True run distributed training
    :param intra_epoch_eval: number of additional eval runs within each
        training epoch
    :param prealloc_mode: controls preallocation,
        choices=['off', 'once', 'always']
    :param iter_size: number of iterations between weight updates
    :param verbose: enables verbose logging
    """
    super(Seq2SeqTrainer, self).__init__()
    self.model = model
    self.criterion = criterion
    self.epoch = 0
    self.save_info = save_info
    self.save_path = save_path
    self.save_freq = save_freq
    self.save_counter = 0
    self.checkpoint_filename = checkpoint_filename
    self.checkpoint_counter = cycle(range(keep_checkpoints))
    self.opt_config = opt_config
    self.cuda = cuda
    self.distributed = distributed
    self.print_freq = print_freq
    self.batch_first = batch_first
    self.verbose = verbose
    self.loss = None
    self.translator = None
    self.scheduler = None
    self.intra_epoch_eval = intra_epoch_eval
    self.iter_size = iter_size
    self.prealloc_mode = prealloc_mode
    self.preallocated = False
    self.retain_allreduce_buffers = True
    self.gradient_average = False

    if cuda:
        self.model = self.model.cuda()
        self.criterion = self.criterion.cuda()

    params = self.model.parameters()

    if math == 'fp16':
        self.model = self.model.half()
        if distributed:
            self.model = DDP(
                self.model,
                message_size=distributed_overlap_allreduce_messagesize,
                delay_allreduce=(not distributed_overlap_allreduce),
                retain_allreduce_buffers=self.retain_allreduce_buffers,
                gradient_average=self.gradient_average)
        self.fp_optimizer = Fp16Optimizer(
            self.model, grad_clip,
            loss_scale=loss_scaling['init_scale'],
            dls_upscale_interval=loss_scaling['upscale_interval'])
        params = [self.fp_optimizer.fp32_params]
    elif math == 'fp32':
        if distributed:
            self.model = DDP(
                self.model,
                message_size=distributed_overlap_allreduce_messagesize,
                delay_allreduce=(not distributed_overlap_allreduce))
        self.fp_optimizer = Fp32Optimizer(self.model, grad_clip)
        # params = self.model.parameters()

    opt_name = opt_config.pop('optimizer')
    if opt_name == 'FusedAdam':
        if math == 'fp16' or math == 'fp32':
            self.optimizer = FusedAdam(params, **opt_config)
        else:
            self.optimizer = FusedAdam(
                params, use_mt=True, max_grad_norm=grad_clip,
                amp_scale_adjustment=get_world_size(), **opt_config)
    else:
        self.optimizer = torch.optim.__dict__[opt_name](params, **opt_config)

    if math == 'amp_fp16':
        self.model, self.optimizer = amp.initialize(
            self.model, self.optimizer,
            cast_model_outputs=torch.float16,
            keep_batchnorm_fp32=False,
            opt_level='O2')
        self.fp_optimizer = AMPOptimizer(
            self.model, grad_clip,
            loss_scale=loss_scaling['init_scale'],
            dls_upscale_interval=loss_scaling['upscale_interval'])
        if distributed:
            self.model = DDP(
                self.model,
                message_size=distributed_overlap_allreduce_messagesize,
                delay_allreduce=(not distributed_overlap_allreduce),
                num_allreduce_streams=distributed_overlap_num_allreduce_streams,
                allreduce_communicators=distributed_overlap_allreduce_communicators,
                retain_allreduce_buffers=self.retain_allreduce_buffers,
                gradient_average=self.gradient_average)

    logging.info(f'Using optimizer: {self.optimizer}')
    mlperf_print(key=mlperf_compliance.constants.OPT_BASE_LR,
                 value=opt_config['lr'])
def __init__(self, model, criterion, opt_config, scheduler_config,
             print_freq=10, save_freq=1000, grad_clip=float('inf'),
             batch_first=False, save_info={}, save_path='.',
             checkpoint_filename='checkpoint%s.pth', keep_checkpoints=5,
             math='fp32', cuda=True, distributed=False,
             distributed_overlap_allreduce=False,
             distributed_overlap_allreduce_messagesize=1e7,
             intra_epoch_eval=0, translator=None, verbose=False, arch="gnmt"):
    super(Seq2SeqTrainer, self).__init__()
    self.model = model
    self.criterion = criterion
    self.epoch = 0
    self.save_info = save_info
    self.save_path = save_path
    self.save_freq = save_freq
    self.save_counter = 0
    self.checkpoint_filename = checkpoint_filename
    self.checkpoint_counter = cycle(range(keep_checkpoints))
    self.opt_config = opt_config
    self.cuda = cuda
    self.distributed = distributed
    self.print_freq = print_freq
    self.batch_first = batch_first
    self.verbose = verbose
    self.loss = None
    self.translator = translator
    self.intra_epoch_eval = intra_epoch_eval
    self.arch = arch
    self.retain_allreduce_buffers = True
    self.gradient_average = False

    if cuda:
        self.model = self.model.cuda()
        self.criterion = self.criterion.cuda()

    if math == 'fp16':
        self.model = self.model.half()
        if distributed:
            # self.model = apex.parallel.DistributedDataParallel(self.model, message_size=10000000, delay_allreduce=True)
            self.model = apex.parallel.DistributedDataParallel(
                self.model,
                message_size=distributed_overlap_allreduce_messagesize,
                delay_allreduce=(not distributed_overlap_allreduce),
                retain_allreduce_buffers=self.retain_allreduce_buffers,
                gradient_average=self.gradient_average)
        self.fp_optimizer = Fp16Optimizer(self.model, grad_clip)
        params = [self.fp_optimizer.fp32_params]
    elif math == 'fp32':
        if distributed:
            # self.model = apex.parallel.DistributedDataParallel(self.model, message_size=10000000, delay_allreduce=True)
            self.model = apex.parallel.DistributedDataParallel(
                self.model,
                message_size=distributed_overlap_allreduce_messagesize,
                delay_allreduce=(not distributed_overlap_allreduce))
        self.fp_optimizer = Fp32Optimizer(self.model, grad_clip)
        params = self.model.parameters()

    opt_name = opt_config.pop('optimizer')
    if opt_name == 'FusedAdam':
        self.optimizer = apex.optimizers.FusedAdam(params, **opt_config)
    else:
        self.optimizer = torch.optim.__dict__[opt_name](params, **opt_config)

    gnmt_print(key=mlperf_log.OPT_NAME, value=mlperf_log.ADAM)
    gnmt_print(key=mlperf_log.OPT_LR, value=opt_config['lr'])
    gnmt_print(key=mlperf_log.OPT_HP_ADAM_BETA1,
               value=self.optimizer.defaults['betas'][0])
    gnmt_print(key=mlperf_log.OPT_HP_ADAM_BETA2,
               value=self.optimizer.defaults['betas'][1])
    gnmt_print(key=mlperf_log.OPT_HP_ADAM_EPSILON,
               value=self.optimizer.defaults['eps'])

    self.scheduler = WarmupMultiStepLR(
        self.optimizer,
        lr_method=scheduler_config["lr_method"],
        warmup_iters=scheduler_config["warmup_iters"],
        remain_steps=scheduler_config["remain_steps"],
        decay_steps=scheduler_config["decay_steps"])

    logging.info(f'Using optimizer: {self.optimizer}')
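# Illustrative sketch (standalone, uses only torch.optim): the gnmt_print /
# log_event calls above read the Adam hyperparameters back from the optimizer's
# `defaults` dict, which is plain torch.optim behaviour and can be checked in
# isolation. The helper name is hypothetical.
def _demo_read_adam_defaults():
    import torch
    opt = torch.optim.Adam(torch.nn.Linear(2, 2).parameters(), lr=1e-3)
    return opt.defaults['betas'], opt.defaults['eps']   # ((0.9, 0.999), 1e-08)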