def _init_deepspeed_distributed(self) -> None:
    """Initialize the DeepSpeed distributed process group for this worker.

    On non-Windows platforms the node-level environment variables are
    exported first and the rank membership is logged; on Windows the
    env-var export is skipped so DeepSpeed can control its own setup.
    """
    on_windows = platform.system() == "Windows"
    if not on_windows:
        # Do not set env variables on Windows; allow DeepSpeed to control setup.
        self._set_node_environment_variables()
        log.info(
            "initializing deepspeed distributed: "
            f"GLOBAL_RANK: {self.global_rank}, "
            f"MEMBER: {self.global_rank + 1}/{self.world_size}"
        )
    deepspeed.init_distributed(
        self.torch_distributed_backend,
        distributed_port=self.cluster_environment.main_port,
    )
def init_ddp_connection(self, global_rank: Optional[int] = None, world_size: Optional[int] = None) -> None:
    """Set up the DeepSpeed distributed connection.

    Args:
        global_rank: Rank of this process; when ``None`` it is taken from
            ``self.cluster_environment.global_rank()``.
        world_size: Total number of processes; when ``None`` it is taken
            from ``self.cluster_environment.world_size()``.

    On non-Windows platforms the resolved rank/world-size are exported as
    node environment variables and logged; on Windows that step is skipped
    so DeepSpeed can control its own setup.
    """
    if platform.system() != "Windows":
        # Do not set env variables on Windows; allow DeepSpeed to control setup.
        if global_rank is None:
            global_rank = self.cluster_environment.global_rank()
        if world_size is None:
            world_size = self.cluster_environment.world_size()
        self._set_node_environment_variables(global_rank, world_size)
        log.info(
            "initializing deepspeed distributed: "
            f"GLOBAL_RANK: {global_rank}, "
            f"MEMBER: {global_rank + 1}/{world_size}"
        )
    deepspeed.init_distributed(
        self.torch_distributed_backend,
        distributed_port=self.cluster_environment.master_port(),
    )