def tpu_data_loader(args, itr):
    import torch_xla.core.xla_model as xm
    import torch_xla.distributed.parallel_loader as pl

    xm.rendezvous('tpu_data_loader')  # wait for all workers
    xm.mark_step()
    device = utils.get_tpu_device(args)
    return iterators.CountingIterator(
        pl.ParallelLoader(itr, [device]).per_device_loader(device),
        start=getattr(itr, 'n', 0),
        total=len(itr),
    )
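# Usage sketch (assumption, not part of the original source): tpu_data_loader() is
# meant to wrap a fairseq epoch iterator when running on TPUs. `args` and `epoch_itr`
# below stand in for the objects produced by fairseq's standard training entry point;
# only the wrapping pattern is illustrated here.
def _wrapped_epoch_itr(args, epoch_itr):
    itr = epoch_itr.next_epoch_itr(shuffle=True)
    if getattr(args, 'tpu', False):
        itr = tpu_data_loader(args, itr)
    for samples in itr:
        yield samples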
def __init__(self, args, task, model, criterion, quantizer=None):
    self.args = args
    self.task = task

    # catalog shared parameters
    shared_params = _catalog_shared_params(model)

    self.tpu = getattr(args, 'tpu', False)
    self.cuda = torch.cuda.is_available() and not args.cpu and not self.tpu
    if self.cuda:
        self.device = torch.device('cuda')
    elif self.tpu:
        self.device = utils.get_tpu_device(args)
    else:
        self.device = torch.device('cpu')

    device = ort_supplement.setup_onnxruntime_with_mpi(args)

    # copy model and criterion to current device/dtype
    self._criterion = criterion
    self._ptmodel = model
    self._model = model

    # Shape inference
    # self._infer_model = SymbolicShapeInference.infer_shapes(self._model, auto_merge=True, int_max=100000)

    self._model = bart_model_with_loss(self._model, criterion)
    self._model = ort_supplement.create_ort_trainer(args, device, self._model)

    self._dummy_batch = "DUMMY"  # indicates we don't have a dummy batch at first
    self._lr_scheduler = None
    self._num_updates = 0
    self._num_xla_compiles = 0  # for TPUs
    self._optim_history = None
    self._optimizer = None
    self._warn_once = set()
    self._wrapped_criterion = None
    self._wrapped_model = None

    # TODO(myleott): support tpu
    if self.cuda and self.data_parallel_world_size > 1:
        self._grad_norm_buf = torch.cuda.DoubleTensor(self.data_parallel_world_size)
    else:
        self._grad_norm_buf = None

    metrics.log_start_time("wall", priority=790, round=0)
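# Sketch (assumption): `bart_model_with_loss` and the ort_supplement helpers are not
# shown in this file. ONNX Runtime training wraps a single module whose forward()
# returns the loss, so a wrapper roughly like the one below is one plausible shape
# for that helper; the real implementation may differ in its input signature.
import torch.nn as nn

class _BartModelWithLossSketch(nn.Module):
    def __init__(self, model, criterion):
        super().__init__()
        self.model = model
        self.criterion = criterion

    def forward(self, sample):
        # fairseq criterions take (model, sample) and return (loss, sample_size, logging_output)
        loss, sample_size, logging_output = self.criterion(self.model, sample)
        return loss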
def __init__(self, cfg: FairseqConfig, task, model, criterion, quantizer=None):
    if isinstance(cfg, Namespace):
        logger.warning(
            "argparse.Namespace configuration is deprecated! Automatically converting to OmegaConf"
        )
        cfg = convert_namespace_to_omegaconf(cfg)

    self.cfg = cfg
    self.task = task

    # catalog shared parameters
    shared_params = _catalog_shared_params(model)

    self.tpu = cfg.common.tpu
    self.cuda = torch.cuda.is_available() and not cfg.common.cpu and not self.tpu
    if self.cuda:
        self.device = torch.device("cuda")
    elif self.tpu:
        self.device = utils.get_tpu_device()
    else:
        self.device = torch.device("cpu")

    # copy model and criterion to current device/dtype
    self._criterion = criterion
    self._model = model
    if cfg.common.fp16:
        self._criterion = self._criterion.half()
        self._model = self._model.half()
    elif cfg.common.bf16:
        self._criterion = self._criterion.to(dtype=torch.bfloat16)
        self._model = self._model.to(dtype=torch.bfloat16)
    if not cfg.distributed_training.pipeline_model_parallel:
        self._criterion = self._criterion.to(device=self.device)
        self._model = self._model.to(device=self.device)
    self.pipeline_model_parallel = cfg.distributed_training.pipeline_model_parallel
    self.last_device = None
    if self.cuda and self.pipeline_model_parallel:
        self.last_device = torch.device(cfg.distributed_training.pipeline_devices[-1])

    # check that shared parameters are preserved after device transfer
    for shared_param in shared_params:
        ref = _get_module_by_path(self._model, shared_param[0])
        for path in shared_param[1:]:
            logger.info(
                "detected shared parameter: {} <- {}".format(shared_param[0], path)
            )
            _set_module_by_path(self._model, path, ref)

    self._dummy_batch = None  # indicates we don't have a dummy batch at first
    self._lr_scheduler = None
    self._num_updates = 0
    self._num_xla_compiles = 0  # for TPUs
    self._optim_history = None
    self._optimizer = None
    self._warn_once = set()
    self._wrapped_criterion = None
    self._wrapped_model = None

    # TODO(myleott): support tpu
    if self.cuda and self.data_parallel_world_size > 1:
        self._grad_norm_buf = torch.cuda.DoubleTensor(self.data_parallel_world_size)
    else:
        self._grad_norm_buf = None

    self.quantizer = quantizer
    if self.quantizer is not None:
        self.quantizer.set_trainer(self)

    # get detailed cuda environment
    if self.cuda:
        self.cuda_env = utils.CudaEnvironment()
        if self.data_parallel_world_size > 1:
            self.cuda_env_arr = distributed_utils.all_gather_list(
                self.cuda_env, group=distributed_utils.get_global_group()
            )
        else:
            self.cuda_env_arr = [self.cuda_env]
        if self.data_parallel_rank == 0:
            utils.CudaEnvironment.pretty_print_cuda_env_list(self.cuda_env_arr)
    else:
        self.cuda_env = None
        self.cuda_env_arr = None

    metrics.log_start_time("wall", priority=790, round=0)

    self._start_time = time.time()
    self._previous_training_time = 0
    self._cumulative_training_time = None
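# Usage sketch (assumption, not from the original source): the cfg-based constructor
# takes an OmegaConf FairseqConfig and converts a legacy argparse.Namespace on the fly.
# The factory calls below are the standard fairseq ones; `Trainer` refers to the class
# this constructor is assumed to belong to.
from fairseq import tasks
from fairseq.dataclass.utils import convert_namespace_to_omegaconf

def _build_trainer_from_namespace(args):
    cfg = convert_namespace_to_omegaconf(args)  # Namespace -> OmegaConf DictConfig
    task = tasks.setup_task(cfg.task)
    model = task.build_model(cfg.model)
    criterion = task.build_criterion(cfg.criterion)
    return Trainer(cfg, task, model, criterion)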
def __init__(self, args, task, model, criterion, quantizer=None):
    self.args = args
    self.task = task

    # catalog shared parameters
    shared_params = _catalog_shared_params(model)

    self.tpu = getattr(args, 'tpu', False)
    self.cuda = torch.cuda.is_available() and not args.cpu and not self.tpu
    if self.cuda:
        self.device = torch.device('cuda')
    elif self.tpu:
        self.device = utils.get_tpu_device(args)
    else:
        self.device = torch.device('cpu')

    # copy model and criterion to current device/dtype
    self._criterion = criterion
    self._model = model
    if self.tpu:
        import torch_xla.core.xla_model as xm
        self._model = xm.send_cpu_data_to_device(self._model, self.device)
    if args.fp16:
        self._criterion = self._criterion.half()
        self._model = self._model.half()
    elif args.bf16:
        self._criterion = self._criterion.to(dtype=torch.bfloat16)
        self._model = self._model.to(dtype=torch.bfloat16)
    self._criterion = self._criterion.to(device=self.device)
    self._model = self._model.to(device=self.device)

    # check that shared parameters are preserved after device transfer
    for shared_param in shared_params:
        ref = _get_module_by_path(self._model, shared_param[0])
        for path in shared_param[1:]:
            logger.info(
                'detected shared parameter: {} <- {}'.format(shared_param[0], path)
            )
            _set_module_by_path(self._model, path, ref)

    self._dummy_batch = "DUMMY"  # indicates we don't have a dummy batch at first
    self._lr_scheduler = None
    self._num_updates = 0
    self._num_xla_compiles = 0  # for TPUs
    self._optim_history = None
    self._optimizer = None
    self._warn_once = set()
    self._wrapped_criterion = None
    self._wrapped_model = None

    # TODO(myleott): support tpu
    if self.cuda and self.data_parallel_world_size > 1:
        self._grad_norm_buf = torch.cuda.DoubleTensor(self.data_parallel_world_size)
    else:
        self._grad_norm_buf = None

    self.quantizer = quantizer
    if self.quantizer is not None:
        self.quantizer.set_trainer(self)

    # get detailed cuda environment
    if self.cuda:
        self.cuda_env = utils.CudaEnvironment()
        if self.data_parallel_world_size > 1:
            self.cuda_env_arr = distributed_utils.all_gather_list(self.cuda_env)
        else:
            self.cuda_env_arr = [self.cuda_env]
        if self.data_parallel_rank == 0:
            utils.CudaEnvironment.pretty_print_cuda_env_list(self.cuda_env_arr)
    else:
        self.cuda_env = None
        self.cuda_env_arr = None

    metrics.log_start_time("wall", priority=790, round=0)

    self._start_time = time.time()
    self._previous_training_time = 0
    self._cumulative_training_time = None
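# All three constructors above rely on _catalog_shared_params, _get_module_by_path and
# _set_module_by_path to re-tie parameters that are shared between submodules after the
# model has been moved or cast (a .to()/.half() call can otherwise break the sharing).
# A minimal sketch of such helpers is given below; the actual implementations live
# elsewhere in the trainer module and may differ in detail.
def _catalog_shared_params_sketch(module, memo=None, prefix=''):
    if memo is None:
        first_call = True
        memo = {}
    else:
        first_call = False
    for name, param in module._parameters.items():
        if param is None:
            continue
        param_prefix = prefix + ('.' if prefix else '') + name
        memo.setdefault(param, []).append(param_prefix)
    for name, submodule in module._modules.items():
        if submodule is None:
            continue
        submodule_prefix = prefix + ('.' if prefix else '') + name
        _catalog_shared_params_sketch(submodule, memo, submodule_prefix)
    if first_call:
        # only parameters reachable through more than one path are "shared"
        return [paths for paths in memo.values() if len(paths) > 1]


def _get_module_by_path_sketch(module, path):
    for name in path.split('.'):
        module = getattr(module, name)
    return module


def _set_module_by_path_sketch(module, path, value):
    parts = path.split('.')
    for name in parts[:-1]:
        module = getattr(module, name)
    setattr(module, parts[-1], value)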