def distributed_init(config):
    if config.distributed.world_size == 1:
        raise ValueError(
            "Cannot initialize distributed with distributed_world_size=1")

    logger.info(f"XLA Mode:{is_xla()}")

    if is_xla():
        config.device_id = xm.get_local_ordinal()
        config.distributed.rank = xm.get_ordinal()
    elif dist.is_initialized():
        warnings.warn(
            "Distributed is already initialized, cannot initialize twice!")
        config.distributed.rank = dist.get_rank()
    else:
        logger.info(f"Distributed Init (Rank {config.distributed.rank}): "
                    f"{config.distributed.init_method}")
        dist.init_process_group(
            backend=config.distributed.backend,
            init_method=config.distributed.init_method,
            world_size=config.distributed.world_size,
            rank=config.distributed.rank,
        )
        logger.info(f"Initialized Host {socket.gethostname()} as Rank "
                    f"{config.distributed.rank}")

        # perform a dummy all-reduce to initialize the NCCL communicator
        dist.all_reduce(torch.zeros(1).cuda())

        suppress_output(is_master())
        config.distributed.rank = dist.get_rank()
    return config.distributed.rank
def distributed_init(config):
    if config.distributed.world_size == 1:
        raise ValueError(
            "Cannot initialize distributed with distributed_world_size=1")

    logger.info(f"XLA Mode:{is_xla()}")

    if is_xla():
        config.device_id = xm.get_local_ordinal()
        config.distributed.rank = xm.get_ordinal()
    elif dist.is_initialized():
        warnings.warn(
            "Distributed is already initialized, cannot initialize twice!")
        config.distributed.rank = dist.get_rank()
    else:
        logger.info(f"Distributed Init (Rank {config.distributed.rank}): "
                    f"{config.distributed.init_method}")

        nccl_config = config.distributed.get("nccl", {})
        if nccl_config.get("nsocks_perthread", None):
            os.environ["NCCL_NSOCKS_PERTHREAD"] = str(
                nccl_config["nsocks_perthread"])
            logger.info(
                f"NCCL_NSOCKS_PERTHREAD: {os.environ['NCCL_NSOCKS_PERTHREAD']}"
            )
        if nccl_config.get("socket_nthreads", None):
            os.environ["NCCL_SOCKET_NTHREADS"] = str(
                nccl_config["socket_nthreads"])
            logger.info(
                f"NCCL_SOCKET_NTHREADS: {os.environ['NCCL_SOCKET_NTHREADS']}")

        dist.init_process_group(
            backend=config.distributed.backend,
            init_method=config.distributed.init_method,
            world_size=config.distributed.world_size,
            rank=config.distributed.rank,
        )
        logger.info(f"Initialized Host {socket.gethostname()} as Rank "
                    f"{config.distributed.rank}")

        if "MASTER_ADDR" not in os.environ or "MASTER_PORT" not in os.environ:
            # Set for onboxdataloader support
            split = config.distributed.init_method.split("//")
            assert len(split) == 2, (
                "host url for distributed should be split by '//' "
                + "into exactly two elements")

            split = split[1].split(":")
            assert (len(split) == 2
                    ), "host url should be of the form <host_url>:<host_port>"
            os.environ["MASTER_ADDR"] = split[0]
            os.environ["MASTER_PORT"] = split[1]

        # perform a dummy all-reduce to initialize the NCCL communicator
        dist.all_reduce(torch.zeros(1).cuda())

        suppress_output(is_main())
        config.distributed.rank = dist.get_rank()
    return config.distributed.rank
def new_process(self, process_idx: int, trainer, mp_queue) -> None:
    self.mp_queue = mp_queue

    reset_seed()

    self.tpu_local_core_rank = xm.get_local_ordinal()
    self.tpu_global_core_rank = xm.get_ordinal()

    # set warning rank
    rank_zero_only.rank = self.global_rank

    if self.tpu_global_core_rank != 0 and trainer.progress_bar_callback is not None:
        trainer.progress_bar_callback.disable()

    self.model_to_device()
    trainer.accelerator.setup_optimizers(trainer)
    trainer.precision_plugin.connect(self._model, None, None)

    self.barrier("pre-run-stage")

    results = trainer.run_stage()

    self.transfer_distrib_spawn_state_on_fit_end(results)

    # https://github.com/pytorch/xla/issues/1801#issuecomment-602799542
    self.barrier("end-process")

    # https://github.com/pytorch/xla/issues/2190#issuecomment-641665358
    if self.local_rank == 0:
        time.sleep(2)
def configure_device(self) -> None:
    if self.config.training.get("device", "cuda") == "xla":
        import torch_xla.core.xla_model as xm
        self.device = xm.xla_device()
        self.distributed = True
        self.local_rank = xm.get_local_ordinal()
        is_xla = True
    else:
        is_xla = False
        self.local_rank = self.config.device_id
        self.device = self.local_rank
        self.distributed = False

    # Will be updated later based on distributed setup
    registry.register("global_device", self.device)

    if self.config.distributed.init_method is not None:
        self.distributed = True
        self.device = torch.device("cuda", self.local_rank)
        torch.cuda.set_device(self.local_rank)
    elif torch.cuda.is_available():
        self.device = torch.device("cuda")
        torch.cuda.set_device(0)
    elif not is_xla:
        self.device = torch.device("cpu")

    registry.register("global_device", self.config.distributed.rank)
def _thread_fn(local_ordinal, global_ordinal):
    # pin this thread's PJRT ordinals, then read them back through xla_model
    pjrt.set_local_ordinal(local_ordinal)
    pjrt.set_global_ordinal(global_ordinal)

    time.sleep(1)

    return xm.get_local_ordinal(), xm.get_ordinal()
def __setup_tpu_training(self, model: LightningModule, trainer):
    # use the default device from the process
    # tpu_device = xm.xla_device()
    # if given an ordinal device, use this as the device
    if trainer.tpu_id is not None:
        tpu_device = xm.xla_device(trainer.tpu_id)
    else:
        tpu_device = xm.xla_device()

    # track the device and move model to it
    trainer._device = tpu_device
    model.to(trainer._device)

    # get the appropriate tpu ranks
    trainer.tpu_local_core_rank = xm.get_local_ordinal()
    trainer.tpu_global_core_rank = xm.get_ordinal()

    # avoid duplicating progress bar
    if trainer.tpu_global_core_rank != 0 and trainer.progress_bar_callback is not None:
        trainer.progress_bar_callback.disable()

    trainer.global_rank = trainer.tpu_local_core_rank
    rank_zero_only.rank = trainer.global_rank

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.setup_optimizers(model)

    # init 16 bit for TPU
    if trainer.precision == 16:
        os.environ['XLA_USE_BF16'] = str(1)

    log.info(f'INIT TPU local core: {trainer.tpu_local_core_rank},'
             f' global rank: {trainer.tpu_global_core_rank}'
             f' with XLA_USE_BF16={os.environ.get("XLA_USE_BF16")}')
def tpu_train(self, tpu_core_idx, model):
    # put model on tpu
    model.to(xm.xla_device())

    # get the appropriate tpu ranks
    self.tpu_local_core_rank = xm.get_local_ordinal()
    self.tpu_global_core_rank = xm.get_ordinal()

    # avoid duplicating progress bar
    self.show_progress_bar = self.show_progress_bar and self.tpu_global_core_rank == 0

    # track current tpu
    self.current_tpu_idx = tpu_core_idx
    self.proc_rank = self.tpu_local_core_rank

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(
        model)

    # init 16 bit for TPU
    if self.precision == 16:
        os.environ['XLA_USE_BF16'] = str(1)

    log.info(f'INIT TPU local core: {self.tpu_local_core_rank},'
             f' global rank: {self.tpu_global_core_rank}')

    # continue training routine
    self.run_pretrain_routine(model)

    self.save_spawn_weights(model)
def tpu_train(self, model):
    # put model on tpu
    self._device = xm.xla_device(
        self.tpu_id) if self.tpu_id is not None else xm.xla_device()
    model.to(self._device)

    # get the appropriate tpu ranks
    self.tpu_local_core_rank = xm.get_local_ordinal()
    self.tpu_global_core_rank = xm.get_ordinal()

    # avoid duplicating progress bar
    if self.tpu_global_core_rank != 0 and self.progress_bar_callback is not None:
        self.progress_bar_callback.disable()

    self.proc_rank = self.tpu_local_core_rank
    rank_zero_only.rank = self.proc_rank

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(
        model)

    # init 16 bit for TPU
    if self.precision == 16:
        os.environ['XLA_USE_BF16'] = str(1)

    log.info(f'INIT TPU local core: {self.tpu_local_core_rank},'
             f' global rank: {self.tpu_global_core_rank}')

    # continue training routine
    self.run_pretrain_routine(model)

    # when training ends on these platforms dump weights to get out of the main process
    if self.on_colab_kaggle:
        self.save_spawn_weights(model)
def distributed_init(args):
    if args.distributed_world_size == 1:
        raise ValueError(
            'Cannot initialize distributed with distributed_world_size=1')

    if not getattr(args, 'tpu', False):
        if torch.distributed.is_initialized():
            warnings.warn(
                'Distributed is already initialized, cannot initialize twice!')
        else:
            logger.info('distributed init (rank {}): {}'.format(
                args.distributed_rank,
                args.distributed_init_method,
            ))
            dist.init_process_group(
                backend=args.distributed_backend,
                init_method=args.distributed_init_method,
                world_size=args.distributed_world_size,
                rank=args.distributed_rank,
            )
            logger.info('initialized host {} as rank {}'.format(
                socket.gethostname(),
                args.distributed_rank,
            ))

            # perform a dummy all-reduce to initialize the NCCL communicator
            if torch.cuda.is_available():
                dist.all_reduce(torch.zeros(1).cuda())

        args.distributed_rank = torch.distributed.get_rank()
    else:
        import torch_xla.core.xla_model as xm
        assert xm.xrt_world_size() == args.distributed_world_size
        args.device_id = xm.get_local_ordinal()
        args.distributed_rank = xm.get_ordinal()
        xm.rendezvous('distributed_init')  # wait for all workers
        xm.mark_step()

    if is_master(args):
        logging.getLogger().setLevel(logging.INFO)
    else:
        logging.getLogger().setLevel(logging.WARNING)

    if args.model_parallel_size > 1:
        try:
            from fairseq.model_parallel.megatron.mpu import (
                get_model_parallel_rank,
                initialize_model_parallel,
                model_parallel_cuda_manual_seed,
            )
        except ImportError:
            raise ImportError('\n\nPlease install the megatron submodule:'
                              '\n\n git submodule update --init '
                              'fairseq/model_parallel/megatron')
        initialize_model_parallel(args.model_parallel_size)
        model_parallel_cuda_manual_seed(args.seed)
        model_part_number = get_model_parallel_rank()
        args.checkpoint_suffix += '-model_part-{0}'.format(model_part_number)

    return args.distributed_rank
def _extract_metrics_file():
    # Delay xla_model import to avoid cross dependencies.
    import torch_xla.core.xla_model as xm

    metrics_file = os.environ.get('XLA_METRICS_FILE', None)
    if metrics_file is not None:
        ordinal = xm.get_local_ordinal(defval=-1)
        if ordinal >= 0 and xm.xrt_world_size() > 1:
            metrics_file = '{}.{}'.format(metrics_file, ordinal)
    return metrics_file
def pre_dispatch(self) -> None:
    if isinstance(self.device, int):
        self.device = xm.xla_device(self.device)

    if self.debug:
        os.environ["PT_XLA_DEBUG"] = str(1)

    self.tpu_local_core_rank = xm.get_local_ordinal()
    self.tpu_global_core_rank = xm.get_ordinal()
def local_process_index(self):
    """
    The index of the local process used.
    """
    if is_torch_tpu_available():
        return xm.get_local_ordinal()
    elif is_sagemaker_mp_enabled():
        return smp.local_rank()
    elif is_sagemaker_dp_enabled():
        return sm_dist.get_rank()
    elif self.local_rank != -1:
        return self.local_rank
    return 0
def setup(self, trainer: "pl.Trainer") -> None:
    shared_params = find_shared_parameters(self.model)
    self.model_to_device()
    if is_overridden("on_post_move_to_device", self.lightning_module):
        self.model.on_post_move_to_device()
    else:
        set_shared_parameters(self.model, shared_params)

    super().setup(trainer)

    if self.debug:
        os.environ["PT_XLA_DEBUG"] = str(1)

    self.tpu_local_core_rank = xm.get_local_ordinal()
    self.tpu_global_core_rank = xm.get_ordinal()
def configure_device(self) -> None:
    if self.config.training.get("device", "cuda") == "xla":
        import torch_xla.core.xla_model as xm
        self.device = xm.xla_device()
        self.distributed = True
        self.local_rank = xm.get_local_ordinal()
        is_xla = True
    else:
        is_xla = False
        if "device_id" not in self.config:
            warnings.warn(
                "No 'device_id' in 'config', setting to -1. "
                "This can cause issues later in training. Ensure that "
                "distributed setup is properly initialized.")
            self.local_rank = -1
        else:
            self.local_rank = self.config.device_id
        self.device = self.local_rank
        self.distributed = False

    # Will be updated later based on distributed setup
    registry.register("global_device", self.device)

    if self.config.distributed.init_method is not None:
        self.distributed = True
        self.device = torch.device("cuda", self.local_rank)
        torch.cuda.set_device(self.local_rank)
    elif torch.cuda.is_available():
        self.device = torch.device("cuda")
        torch.cuda.set_device(0)
    elif not is_xla:
        self.device = torch.device("cpu")

    if "rank" not in self.config.distributed:
        if torch.distributed.is_available() and torch.distributed.is_initialized():
            global_rank = torch.distributed.get_rank()
        else:
            global_rank = -1
        with open_dict(self.config.distributed):
            self.config.distributed.rank = global_rank

    registry.register("global_device", self.config.distributed.rank)
def build_progress_bar(args, iterator, epoch=None, prefix=None, default='tqdm',
                       no_progress_bar='none'):
    if args.log_format is None:
        args.log_format = no_progress_bar if args.no_progress_bar else default
    if args.log_format == 'tqdm' and not sys.stderr.isatty():
        args.log_format = 'simple'

    if args.log_format == 'json':
        bar = json_progress_bar(iterator, epoch, prefix, args.log_interval)
    elif args.log_format == 'none':
        bar = noop_progress_bar(iterator, epoch, prefix)
    elif args.log_format == 'simple':
        bar = simple_progress_bar(iterator, epoch, prefix, args.log_interval)
    elif args.log_format == 'tqdm':
        bar = tqdm_progress_bar(iterator, epoch, prefix)
    else:
        raise ValueError('Unknown log format: {}'.format(args.log_format))

    if args.tbmf_wrapper and distributed_utils.is_master(args):
        global g_tbmf_wrapper
        if g_tbmf_wrapper is None:
            try:
                from fairseq.fb_tbmf_wrapper import fb_tbmf_wrapper
            except Exception:
                raise ImportError("fb_tbmf_wrapper package not found.")
            g_tbmf_wrapper = fb_tbmf_wrapper
        bar = g_tbmf_wrapper(bar, args, args.log_interval)
    elif (
        args.tensorboard_logdir
        and getattr(args, 'use_gpu', True)
        and distributed_utils.is_master(args)
    ):
        bar = tensorboard_log_wrapper(bar, args.tensorboard_logdir, args)
    elif args.tensorboard_logdir and not getattr(args, 'use_gpu', True):
        # tpu-comment: making every core have a tensorboard writer guarantees
        # the same work across cores.
        logdir = os.path.join(
            args.tensorboard_logdir, str(xm.get_local_ordinal())
        )
        bar = tensorboard_log_wrapper_xla(bar, logdir, args)

    return bar
def _mp_fn(index, temp_file):
    device = xm.xla_device()
    dd = _create_state_dict(device)
    xm.save(dd, temp_file)
    ldd = torch.load(temp_file)
    pdd = _get_data_str(ldd)
    data = xm.rendezvous('xm_save_test', pdd)
    if xm.get_local_ordinal() == 0:
        os.remove(temp_file)
    # compare the locally loaded state dict against the payloads gathered
    # from the other ordinals at the rendezvous
    for i in range(1, len(data)):
        bio = io.BytesIO(data[i])
        ildd = torch.load(bio)
        for k, v in ldd.items():
            if isinstance(v, torch.Tensor):
                assert v.allclose(ildd[k])
            elif isinstance(v, (list, tuple)):
                iv = ildd[k]
                for a, b in zip(v, iv):
                    assert a.allclose(b)
            else:
                raise RuntimeError('Invalid data type')
def __init__(self, fp16: bool = None, cpu: bool = False,
             _from_accelerator: bool = False):
    self.__dict__ = self._shared_state
    if not getattr(self, "initialized", False):
        if not _from_accelerator:
            raise ValueError(
                "Please make sure to properly initialize your accelerator via `accelerator = Accelerator()` "
                "before using any functionality from the `accelerate` library."
            )
        elif is_tpu_available() and not cpu:
            self.distributed_type = DistributedType.TPU
            self.num_processes = xm.xrt_world_size()
            self.process_index = xm.get_ordinal()
            self.local_process_index = xm.get_local_ordinal()
            self.device = xm.xla_device()
            self.use_fp16 = False
        elif int(os.environ.get("LOCAL_RANK", -1)) != -1 and not cpu:
            self.distributed_type = DistributedType.MULTI_GPU
            if not torch.distributed.is_initialized():
                torch.distributed.init_process_group(backend="nccl")
            self.num_processes = torch.distributed.get_world_size()
            self.process_index = torch.distributed.get_rank()
            self.local_process_index = int(os.environ.get("LOCAL_RANK", -1))
            self.device = torch.device("cuda", self.local_process_index)
            torch.cuda.set_device(self.device)
            self.use_fp16 = parse_flag_from_env(
                "USE_FP16", False) if fp16 is None else fp16
        else:
            self.distributed_type = DistributedType.NO
            self.num_processes = 1
            self.process_index = self.local_process_index = 0
            self.device = torch.device(
                "cuda" if torch.cuda.is_available() and not cpu else "cpu")
            self.use_fp16 = parse_flag_from_env(
                "USE_FP16", False) if fp16 is None else fp16
        self.initialized = True
def __setup_tpu_training(self, model):
    # use the default device from the process
    tpu_device = xm.xla_device()

    # if given an ordinal device, use this as the device
    if self.trainer.tpu_id is not None:
        tpu_device = xm.xla_device(self.trainer.tpu_id)

    # track the device and move model to it
    self.trainer._device = tpu_device
    model.to(self.trainer._device)

    # get the appropriate tpu ranks
    self.trainer.tpu_local_core_rank = xm.get_local_ordinal()
    self.trainer.tpu_global_core_rank = xm.get_ordinal()

    # avoid duplicating progress bar
    if self.trainer.tpu_global_core_rank != 0 and self.trainer.progress_bar_callback is not None:
        self.trainer.progress_bar_callback.disable()

    self.trainer.global_rank = self.trainer.tpu_local_core_rank
    rank_zero_only.rank = self.trainer.global_rank

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model)
    self.trainer.optimizers = optimizers
    self.trainer.lr_schedulers = lr_schedulers
    self.trainer.optimizer_frequencies = optimizer_frequencies

    # init 16 bit for TPU
    if self.trainer.precision == 16:
        os.environ['XLA_USE_BF16'] = str(1)

    log.info(f'INIT TPU local core: {self.trainer.tpu_local_core_rank},'
             f' global rank: {self.trainer.tpu_global_core_rank}')
def distributed_init(cfg: FairseqConfig):
    if isinstance(cfg, Namespace):
        from fairseq.dataclass.utils import convert_namespace_to_omegaconf
        cfg = convert_namespace_to_omegaconf(cfg)

    if not cfg.common.tpu:
        if torch.distributed.is_available() and torch.distributed.is_initialized():
            warnings.warn(
                "Distributed is already initialized, cannot initialize twice!")
        else:
            logger.info("distributed init (rank {}): {}".format(
                cfg.distributed_training.distributed_rank,
                cfg.distributed_training.distributed_init_method,
            ))
            dist.init_process_group(
                backend=cfg.distributed_training.distributed_backend,
                init_method=cfg.distributed_training.distributed_init_method,
                world_size=cfg.distributed_training.distributed_world_size,
                rank=cfg.distributed_training.distributed_rank,
            )
            logger.info("initialized host {} as rank {}".format(
                socket.gethostname(),
                cfg.distributed_training.distributed_rank,
            ))

            # perform a dummy all-reduce to initialize the NCCL communicator
            if torch.cuda.is_available():
                dist.all_reduce(torch.zeros(1).cuda())

        cfg.distributed_training.distributed_rank = torch.distributed.get_rank()
    else:
        assert xm.xrt_world_size() == cfg.distributed_training.distributed_world_size
        global _USE_XLA
        _USE_XLA = True
        cfg.distributed_training.device_id = xm.get_local_ordinal()
        cfg.distributed_training.distributed_rank = xm.get_ordinal()
        xm.rendezvous("distributed_init")  # wait for all workers
        xm.mark_step()

    if is_master(cfg.distributed_training):
        logging.getLogger().setLevel(logging.INFO)
    else:
        logging.getLogger().setLevel(logging.WARNING)

    if cfg.common.model_parallel_size > 1:
        try:
            from fairseq.model_parallel.megatron.mpu import (
                initialize_model_parallel,
                model_parallel_cuda_manual_seed,
            )
        except ImportError:
            raise ImportError("\n\nPlease install the megatron submodule:"
                              "\n\n git submodule update --init "
                              "fairseq/model_parallel/megatron")
        global _USE_MEGATRON
        _USE_MEGATRON = True
        initialize_model_parallel(cfg.common.model_parallel_size)
        model_parallel_cuda_manual_seed(cfg.common.seed)
        model_part_number = get_model_parallel_rank()
        cfg.checkpoint.checkpoint_suffix += "-model_part-{0}".format(
            model_part_number)

    return cfg.distributed_training.distributed_rank
def get_local_rank(self) -> int:
    return xm.get_local_ordinal()
def _worker_setup(self, process_idx: int):
    reset_seed()
    self.tpu_local_core_rank = xm.get_local_ordinal()
    self.tpu_global_core_rank = xm.get_ordinal()
    rank_zero_only.rank = self.global_rank
def set_world_ranks(self, process_idx: int) -> None:
    self.tpu_local_core_rank = xm.get_local_ordinal()
    self.tpu_global_core_rank = xm.get_ordinal()
    self.global_rank = self.tpu_local_core_rank
    self.world_size = self.num_nodes * self.num_processes
def __init__(self, fp16: bool = None, cpu: bool = False, deepspeed_plugin=None,
             _from_accelerator: bool = False, **kwargs):
    self.__dict__ = self._shared_state
    if not getattr(self, "initialized", False):
        self.backend = None
        self.deepspeed_plugin = None
        if not _from_accelerator:
            raise ValueError(
                "Please make sure to properly initialize your accelerator via `accelerator = Accelerator()` "
                "before using any functionality from the `accelerate` library."
            )
        elif is_tpu_available() and not cpu:
            self.distributed_type = DistributedType.TPU
            self.num_processes = xm.xrt_world_size()
            self.process_index = xm.get_ordinal()
            self.local_process_index = xm.get_local_ordinal()
            self.device = xm.xla_device()
            self.use_fp16 = False
        elif os.environ.get("USE_DEEPSPEED", "false") == "true" and not cpu:
            assert (
                is_deepspeed_available()
            ), "DeepSpeed is not available => install it using `pip3 install deepspeed` or build it from source"
            self.distributed_type = DistributedType.DEEPSPEED
            if not torch.distributed.is_initialized():
                torch.distributed.init_process_group(backend="nccl", **kwargs)
                self.backend = "nccl"
            self.num_processes = torch.distributed.get_world_size()
            self.process_index = torch.distributed.get_rank()
            self.local_process_index = int(os.environ.get("LOCAL_RANK", -1))
            self.device = torch.device("cuda", self.local_process_index)
            torch.cuda.set_device(self.device)
            self.use_fp16 = False  # deepspeed handles fp16 using deepspeed_config
            fp16 = parse_flag_from_env("USE_FP16", False) if fp16 is None else fp16
            deepspeed_plugin.deepspeed_config.update({"fp16": {"enabled": fp16}})
            self.deepspeed_plugin = deepspeed_plugin
        elif int(os.environ.get("LOCAL_RANK", -1)) != -1 and not cpu:
            self.distributed_type = DistributedType.MULTI_GPU
            if not torch.distributed.is_initialized():
                torch.distributed.init_process_group(backend="nccl", **kwargs)
                self.backend = "nccl"
            self.num_processes = torch.distributed.get_world_size()
            self.process_index = torch.distributed.get_rank()
            self.local_process_index = int(os.environ.get("LOCAL_RANK", -1))
            self.device = torch.device("cuda", self.local_process_index)
            torch.cuda.set_device(self.device)
            self.use_fp16 = parse_flag_from_env(
                "USE_FP16", False) if fp16 is None else fp16
        elif get_int_from_env(
                ["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"],
                1) > 1:
            self.distributed_type = DistributedType.MULTI_CPU
            if is_ccl_available() and get_int_from_env(["CCL_WORKER_COUNT"], 0) > 0:
                backend = "ccl"
            elif torch.distributed.is_mpi_available():
                backend = "mpi"
            else:
                backend = "gloo"
            # Try to get launch configuration from environment variables set by
            # MPI launcher - works for Intel MPI, OpenMPI and MVAPICH
            rank = get_int_from_env(
                ["RANK", "PMI_RANK", "OMPI_COMM_WORLD_RANK", "MV2_COMM_WORLD_RANK"], 0)
            size = get_int_from_env(
                ["WORLD_SIZE", "PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE"], 1)
            local_rank = get_int_from_env(
                ["LOCAL_RANK", "MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK",
                 "MV2_COMM_WORLD_LOCAL_RANK"], 0)
            local_size = get_int_from_env(
                ["MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE",
                 "MV2_COMM_WORLD_LOCAL_SIZE"], 1)
            self.local_process_index = local_rank
            os.environ["RANK"] = str(rank)
            os.environ["WORLD_SIZE"] = str(size)
            os.environ["LOCAL_RANK"] = str(local_rank)
            if not os.environ.get("MASTER_PORT", None):
                os.environ["MASTER_PORT"] = "29500"
            if not os.environ.get("MASTER_ADDR", None):
                if local_size != size and backend != "mpi":
                    raise ValueError(
                        "Looks like distributed multinode run but MASTER_ADDR env not set, "
                        "please try exporting rank 0's hostname as MASTER_ADDR")
            if not torch.distributed.is_initialized():
                torch.distributed.init_process_group(backend, rank=rank,
                                                     world_size=size, **kwargs)
                self.backend = backend
            self.num_processes = torch.distributed.get_world_size()
            self.process_index = torch.distributed.get_rank()
            self.local_process_index = local_rank
            self.device = torch.device("cpu")
            self.use_fp16 = False
        else:
            self.distributed_type = DistributedType.NO
            self.num_processes = 1
            self.process_index = self.local_process_index = 0
            self.device = torch.device(
                "cuda" if torch.cuda.is_available() and not cpu else "cpu")
            self.use_fp16 = parse_flag_from_env(
                "USE_FP16", False) if fp16 is None else fp16
        self.initialized = True
def set_world_ranks(self, process_idx: int = 0) -> None:
    self.tpu_local_core_rank = xm.get_local_ordinal()
    self.tpu_global_core_rank = xm.get_ordinal()
def test_default_ordinals(self):
    global_ordinal = xm.get_ordinal()
    self.assertEqual(global_ordinal, 0)

    local_ordinal = xm.get_local_ordinal()
    self.assertEqual(local_ordinal, 0)
def __init__(self, fp16: bool = None, cpu: bool = False,
             _from_accelerator: bool = False):
    self.__dict__ = self._shared_state
    if not getattr(self, "initialized", False):
        self.backend = None
        if not _from_accelerator:
            raise ValueError(
                "Please make sure to properly initialize your accelerator via `accelerator = Accelerator()` "
                "before using any functionality from the `accelerate` library."
            )
        elif is_tpu_available() and not cpu:
            self.distributed_type = DistributedType.TPU
            self.num_processes = xm.xrt_world_size()
            self.process_index = xm.get_ordinal()
            self.local_process_index = xm.get_local_ordinal()
            self.device = xm.xla_device()
            self.use_fp16 = False
        elif int(os.environ.get("LOCAL_RANK", -1)) != -1 and not cpu:
            self.distributed_type = DistributedType.MULTI_GPU
            if not torch.distributed.is_initialized():
                torch.distributed.init_process_group(backend="nccl")
                self.backend = "nccl"
            self.num_processes = torch.distributed.get_world_size()
            self.process_index = torch.distributed.get_rank()
            self.local_process_index = int(os.environ.get("LOCAL_RANK", -1))
            self.device = torch.device("cuda", self.local_process_index)
            torch.cuda.set_device(self.device)
            self.use_fp16 = parse_flag_from_env("USE_FP16", False) if fp16 is None else fp16
        elif env2int(["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"]) > 1:
            self.distributed_type = DistributedType.MULTI_CPU
            if is_ccl_available() and env2int(["CCL_WORKER_COUNT"]) > 0:
                backend = "ccl"
            elif torch.distributed.is_mpi_available():
                backend = "mpi"
            else:
                backend = "gloo"
            rank = env2int(["RANK", "PMI_RANK", "OMPI_COMM_WORLD_RANK", "MV2_COMM_WORLD_RANK"], 0)
            size = env2int(["WORLD_SIZE", "PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE"], 1)
            local_rank = env2int(
                ["LOCAL_RANK", "MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK"], 0)
            local_size = env2int(
                ["MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"], 1)
            self.local_process_index = local_rank
            os.environ["RANK"] = str(rank)
            os.environ["WORLD_SIZE"] = str(size)
            if not os.environ.get("MASTER_PORT", None):
                os.environ["MASTER_PORT"] = "29500"
            if not os.environ.get("MASTER_ADDR", None):
                if local_size != size and backend != "mpi":
                    print("Warning: Looks like distributed multinode run but MASTER_ADDR env not set, "
                          "using '127.0.0.1' as default")
                    print("If this run hangs, try exporting rank 0's hostname as MASTER_ADDR")
                os.environ["MASTER_ADDR"] = "127.0.0.1"
            if not torch.distributed.is_initialized():
                torch.distributed.init_process_group(backend, rank=rank, world_size=size)
                self.backend = backend
            self.num_processes = torch.distributed.get_world_size()
            self.process_index = torch.distributed.get_rank()
            self.local_process_index = local_rank
            self.local_num_processes = local_size
            self.device = torch.device("cpu")
            self.use_fp16 = False
        else:
            self.distributed_type = DistributedType.NO
            self.num_processes = 1
            self.process_index = self.local_process_index = 0
            self.device = torch.device("cuda" if torch.cuda.is_available() and not cpu else "cpu")
            self.use_fp16 = parse_flag_from_env("USE_FP16", False) if fp16 is None else fp16
        self.initialized = True
def pre_training(self) -> None:
    if isinstance(self.device, int):
        self.device = xm.xla_device(self.device)

    self.tpu_local_core_rank = xm.get_local_ordinal()
    self.tpu_global_core_rank = xm.get_ordinal()
def get_local_rank(self):
    return xm.get_local_ordinal()