def __init__(self,
             args,
             model,
             optimizer=None,
             model_parameters=None,
             training_data=None,
             lr_scheduler=None,
             mpu=None,
             dist_init_required=True,
             collate_fn=None):
    super(DeepSpeedLight, self).__init__()

    logging.basicConfig(level=logging.INFO,
                        format="[%(levelname)s %(asctime)s] %(message)s",
                        datefmt="%Y-%m-%d %H:%M:%S")

    self.client_optimizer = optimizer
    self.client_model_parameters = model_parameters
    self.client_lr_scheduler = lr_scheduler
    self.training_data = training_data
    self.collate_fn = collate_fn
    self.mpu = mpu
    self.data_parallel_group = None
    self.global_steps = 0
    self.micro_steps = 0
    self.skipped_steps = 0
    self.gradient_predivide_factor = 1.0
    self.gradient_average = True
    self.warn_unscaled_loss = True

    if dist_init_required:
        dist.init_process_group(backend="nccl")

    self._do_args_sanity_check(args)
    self._configure_with_arguments(args, mpu)
    self._do_sanity_check()

    self.sample_count = 0
    if self.tensorboard_enabled():
        self.summary_writer = self.get_summary_writer()

    self._init_distributed(dist_init_required)

    # Throughput timer
    self.tput_timer = ThroughputTimer(
        batch_size=self.train_micro_batch_size_per_gpu(),
        num_workers=self.world_size,
        monitor_memory=False)

    self.training_dataloader = self.deepspeed_io(
        training_data) if training_data else None

    # Configure distributed model
    self._configure_distributed_model(model)

    # Configure optimizer and scheduler
    self.optimizer = None
    self.lr_scheduler = None
    if model_parameters or optimizer:
        self._configure_optimizer(optimizer, model_parameters)
        self._configure_lr_scheduler(lr_scheduler)
        self._report_progress(0)

    # Configure wall clock timer
    self.timers = SynchronizedWallClockTimer()

    # Bookkeeping for csr support
    self.csr_tensor_module_names = set()
    if self.sparse_gradients_enabled():
        for name, module in self.module.named_modules():
            if isinstance(module, torch.nn.Embedding):
                self.csr_tensor_module_names.add(name)
                logging.info("Will convert {} to sparse (csr) "
                             "tensor during training".format(name))

    self.save_non_zero_checkpoint = False
    self.save_zero_checkpoint = False
    self._configure_checkpointing(dist_init_required)

    if self.global_rank == 0:
        self._config.print('DeepSpeedLight configuration')
        if self.dump_state():
            print_configuration(self, 'DeepSpeedLight')
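Because this older constructor initializes torch.distributed itself whenever dist_init_required is left at its default of True, the caller only needs to make the usual rendezvous environment variables visible before building the engine. The sketch below illustrates that, assuming a single-process run; MyModel and args are hypothetical placeholders, not names defined in the code above.

import os

# Minimal sketch (assumptions: single process, one GPU, hypothetical MyModel/args).
# The env:// rendezvous used by dist.init_process_group expects these variables.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
os.environ.setdefault("RANK", "0")
os.environ.setdefault("WORLD_SIZE", "1")

model = MyModel()                      # assumed torch.nn.Module defined elsewhere
engine = DeepSpeedLight(args=args,     # argparse-style namespace with the DeepSpeed config
                        model=model,
                        model_parameters=model.parameters())
# dist_init_required defaults to True here, so the constructor itself calls
# dist.init_process_group(backend="nccl").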
def __init__(self,
             args,
             model,
             optimizer=None,
             model_parameters=None,
             training_data=None,
             lr_scheduler=None,
             mpu=None,
             dist_init_required=None,
             collate_fn=None,
             config_params=None):
    super(DeepSpeedLight, self).__init__()

    self.client_optimizer = optimizer
    self.client_model_parameters = model_parameters
    self.client_lr_scheduler = lr_scheduler
    self.training_data = training_data
    self.collate_fn = collate_fn
    self.mpu = mpu
    self.data_parallel_group = None
    self.global_steps = 0
    self.micro_steps = 0
    self.skipped_steps = 0
    self.gradient_average = True
    self.warn_unscaled_loss = True
    self.config_params = config_params

    if dist_init_required is None:
        dist_init_required = not dist.is_initialized()

    self._mpi_check(args, dist_init_required)

    self.dist_backend = "nccl"
    if dist_init_required:
        if not dist.is_initialized():
            logger.info("Initializing torch distributed with backend: {}".format(
                self.dist_backend))
            dist.init_process_group(backend=self.dist_backend)
        else:
            logger.warning(
                "Was given dist_init_required=True but detected that torch "
                "distributed was already initialized, cannot initialize twice.")

    self._do_args_sanity_check(args)
    self._configure_with_arguments(args, mpu)
    self._do_sanity_check()

    self.sample_count = 0
    if self.tensorboard_enabled():
        self.summary_writer = self.get_summary_writer()

    self._init_distributed(dist_init_required)

    # Configure distributed model
    self._configure_distributed_model(model)

    # Configure wall clock timer
    self.timers = SynchronizedWallClockTimer()

    # Throughput timer
    self.tput_timer = ThroughputTimer(
        batch_size=self.train_micro_batch_size_per_gpu(),
        num_workers=self.dp_world_size,
        monitor_memory=False)

    self.training_dataloader = self.deepspeed_io(
        training_data) if training_data else None

    # Configure optimizer and scheduler
    self.optimizer = None
    self.lr_scheduler = None
    if model_parameters or optimizer:
        self._configure_optimizer(optimizer, model_parameters)
        self._configure_lr_scheduler(lr_scheduler)
        self._report_progress(0)

    # Bookkeeping for csr support
    self.csr_tensor_module_names = set()
    if self.sparse_gradients_enabled():
        for name, module in self.module.named_modules():
            if isinstance(module, torch.nn.Embedding):
                self.csr_tensor_module_names.add(name + ".weight")
                logger.info("Will convert {} to sparse (csr) "
                            "tensor during training".format(name))

    self.save_non_zero_checkpoint = False
    self.save_zero_checkpoint = False
    self._configure_checkpointing(dist_init_required)

    if self.global_rank == 0:
        self._config.print('DeepSpeedLight configuration')
        if self.dump_state():
            print_configuration(self, 'DeepSpeedLight')
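The revised constructor defaults dist_init_required to None and resolves it from dist.is_initialized(), so a process started by an external launcher that has already set up torch.distributed can simply omit the flag. A minimal usage sketch under that assumption follows; MyModel, args, and ds_config are hypothetical placeholders, not names from the snippet above.

import torch.distributed as dist

# Minimal sketch (assumption: a launcher such as mpirun or torch.distributed.launch
# has already initialized the process group before this code runs).
assert dist.is_initialized()

model = MyModel()                        # assumed torch.nn.Module
engine = DeepSpeedLight(args=args,       # argparse-style namespace
                        model=model,
                        model_parameters=model.parameters(),
                        config_params=ds_config)  # new argument; stored as self.config_params
# dist_init_required is left at None, so the constructor resolves it to
# not dist.is_initialized(), i.e. False here, and skips init_process_group.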