def _create_ep_parallel_group(self, moe_experts):
    # Call the init process
    self.ep_group = {}
    self.expert_mp_group = {}
    moe_experts = moe_experts if type(moe_experts) is list else [moe_experts]
    for e in moe_experts:
        self.ep_group.update({e: None})
        self.expert_mp_group.update({e: None})
    for moe_ep_size in self.ep_group.keys():
        num_ep_groups = dist.get_world_size() // moe_ep_size
        for i in range(num_ep_groups):
            ep_cnt = i * moe_ep_size
            size = dist.get_world_size() if moe_ep_size > dist.get_world_size() else moe_ep_size
            ranks = list(range(ep_cnt, ep_cnt + size))
            _ep_group = dist.new_group(ranks)
            if dist.get_rank() in ranks:
                self.ep_group.update({moe_ep_size: _ep_group})

        if dist.get_world_size() > moe_ep_size:
            num_expert_mp_groups = dist.get_world_size() // num_ep_groups
            expert_mp_size = dist.get_world_size() // moe_ep_size
            for i in range(num_expert_mp_groups):
                expert_mp_comm_ranks = [i + nr * moe_ep_size for nr in range(expert_mp_size)]
                _expert_mp_group = dist.new_group(expert_mp_comm_ranks)
                if dist.get_rank() in expert_mp_comm_ranks:
                    self.expert_mp_group.update({moe_ep_size: _expert_mp_group})
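# A minimal, dependency-free sketch (not part of DeepSpeed) mirroring the rank
# arithmetic of _create_ep_parallel_group above, so the expert-parallel layout
# can be inspected without initializing torch.distributed. The helper name and
# its return shape are illustrative assumptions.
def _sketch_ep_group_ranks(world_size, moe_ep_size):
    """Return the expert-parallel rank groups for one expert-parallel size."""
    # Matches `size = world if ep > world else ep` above, i.e. min(ep, world).
    size = min(moe_ep_size, world_size)
    num_ep_groups = world_size // moe_ep_size
    return [list(range(i * moe_ep_size, i * moe_ep_size + size))
            for i in range(num_ep_groups)]

# Example: _sketch_ep_group_ranks(8, 4) -> [[0, 1, 2, 3], [4, 5, 6, 7]]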
def _create_expert_data_and_model_parallel(expert_parallel_size_, mpu):
    """
        Create expert and data parallel groups based on MPU (model parallel) group.

        Note: Caller of this function is responsible to check if the groups already exist.

        Example - E + M + D parallel
        world_size = 16
        model_degree = 2
        expert_degree = 4 # number of experts in same group
        mp_group = [0, 1], [2,3], [4,5] ...
        data_parallel_group = [0,2,4,6,8,10,12,14],              [1,3,5,7,9,11,13,15]
        expert_parallel_group = [0,2,4,6], [8,10,12,14]          [1,3,5,7], [9,11,13,15]
        expert_data_parallel_group = [0,8],[2,10],[4,12],[6,14], [1,9],[3,11],[5,13],[7,15]
    """
    assert dist.is_initialized(), "dist is not initialized"
    model_parallel_size_ = mpu.get_model_parallel_world_size()

    global expert_tensor_parallel_world_size
    expert_tensor_parallel_world_size = model_parallel_size_

    world_size = dist.get_world_size()
    rank = dist.get_rank()
    dp_world_size = mpu.get_data_parallel_world_size()
    dp_rank = mpu.get_data_parallel_rank()

    _ensure_divisibility(world_size, model_parallel_size_)
    _ensure_divisibility(dp_world_size, expert_parallel_size_)

    log_dist(
        f"Creating deepspeed groups with model parallel size {model_parallel_size_}, "
        f"expert parallel size {expert_parallel_size_}, world size {world_size}, "
        f"dp world size {dp_world_size}",
        [0])

    global _EXPERT_PARALLEL_GROUP, _EXPERT_DATA_PARALLEL_GROUP

    # Get world size and rank. Ensure some consistencies.
    _DATA_PARALLEL_GROUP = mpu.get_data_parallel_group()
    _MODEL_PARALLEL_GROUP = mpu.get_model_parallel_group()

    group_name = f"ep_size_{expert_parallel_size_}"

    # Only create groups if they don't already exist.
    # Need to check conditions outside the group creation loop because of the way
    # torch.dist group creation works.
    if group_name not in _EXPERT_DATA_PARALLEL_GROUP and group_name not in _EXPERT_PARALLEL_GROUP:
        expert_parallel_groups, expert_data_parallel_groups = _get_expert_parallel_ranks(
            world_size, model_parallel_size_, expert_parallel_size_)
        for ranks in expert_parallel_groups:
            group = dist.new_group(ranks)
            if rank in list(ranks):
                _EXPERT_PARALLEL_GROUP[group_name] = group

        for ranks in expert_data_parallel_groups:
            group = dist.new_group(ranks)
            if rank in list(ranks):
                _EXPERT_DATA_PARALLEL_GROUP[group_name] = group
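# A hand-rolled sketch that reproduces the E + M + D example from the docstring
# above. This is an assumption about the layout, not DeepSpeed's actual
# _get_expert_parallel_ranks implementation; it presumes world_size is divisible
# by model_parallel_size and that each DP group's size is divisible by
# expert_parallel_size (the _ensure_divisibility checks above).
def _sketch_expert_parallel_ranks(world_size, model_parallel_size, expert_parallel_size):
    expert_parallel_groups = []
    expert_data_parallel_groups = []
    for mp_rank in range(model_parallel_size):
        # Data-parallel peers of this model-parallel rank, e.g. [0,2,4,...,14].
        dp_ranks = list(range(mp_rank, world_size, model_parallel_size))
        # Contiguous slices of the DP group form expert-parallel groups.
        for i in range(0, len(dp_ranks), expert_parallel_size):
            expert_parallel_groups.append(dp_ranks[i:i + expert_parallel_size])
        # Strided slices of the DP group form expert-data-parallel groups.
        for i in range(expert_parallel_size):
            expert_data_parallel_groups.append(dp_ranks[i::expert_parallel_size])
    return expert_parallel_groups, expert_data_parallel_groups

# _sketch_expert_parallel_ranks(16, 2, 4) reproduces the docstring:
#   expert parallel:      [0,2,4,6], [8,10,12,14], [1,3,5,7], [9,11,13,15]
#   expert data parallel: [0,8], [2,10], [4,12], [6,14], [1,9], [3,11], [5,13], [7,15]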
def _create_expert_and_data_parallel(expert_parallel_size_):
    """
        Create expert and data parallel groups.

        Note: Caller of this function is responsible to check if the groups already exist.

        Example - E + D parallel
        world_size = 16
        expert_parallel_size = 2 # number of experts in same group
        expert_data_parallel_group = [0,2,4,6,8,10,12,14], [1,3,5,7,9,11,13,15] - all reduce is only on MoE params
        expert_parallel_group = [0, 1], [2,3], [4,5], [6,7], [8,9] - no all reduce, but all to all
        data_parallel_group = [0,1,...,15] - all reduce is only on non-MoE
    """
    assert dist.is_initialized()

    log_dist(
        f'Creating expert and data parallel groups with size {expert_parallel_size_}',
        ranks=[0])
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    _ensure_divisibility(world_size, expert_parallel_size_)

    group_name = f"ep_size_{expert_parallel_size_}"

    # Build the expert data parallel groups.
    global _EXPERT_DATA_PARALLEL_GROUP

    # Only create group if it does not already exist.
    if group_name not in _EXPERT_DATA_PARALLEL_GROUP:
        for i in range(expert_parallel_size_):
            ranks = range(i, world_size, expert_parallel_size_)
            group = dist.new_group(ranks)
            log_dist(
                f'Creating expert data parallel process group named {group_name} with ranks: {list(ranks)}',
                [0])
            if i == (rank % expert_parallel_size_):
                _EXPERT_DATA_PARALLEL_GROUP[group_name] = group

    # Build the expert parallel groups.
    global _EXPERT_PARALLEL_GROUP

    # Only create group if it does not already exist.
    if group_name not in _EXPERT_PARALLEL_GROUP:
        for i in range(world_size // expert_parallel_size_):
            ranks = range(i * expert_parallel_size_, (i + 1) * expert_parallel_size_)
            group = dist.new_group(ranks)
            log_dist(
                f'Creating expert parallel process group named {group_name} with ranks: {list(ranks)}',
                [0])
            if i == (rank // expert_parallel_size_):
                _EXPERT_PARALLEL_GROUP[group_name] = group
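# Illustrative sketch (hypothetical helper, not DeepSpeed API) of the E + D
# layout built above: expert groups are contiguous blocks of size
# expert_parallel_size, expert-data groups stride across the world by that size.
def _sketch_e_d_groups(world_size, expert_parallel_size):
    expert_groups = [list(range(i * expert_parallel_size, (i + 1) * expert_parallel_size))
                     for i in range(world_size // expert_parallel_size)]
    expert_data_groups = [list(range(i, world_size, expert_parallel_size))
                          for i in range(expert_parallel_size)]
    return expert_groups, expert_data_groups

# _sketch_e_d_groups(16, 2) -> expert groups [0,1], [2,3], ..., [14,15];
# expert data groups [0,2,...,14], [1,3,...,15], matching the docstring.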
def _create_model_parallel(model_parallel_size_):
    """
    Initialize model data parallel groups.

    Arguments:
        model_parallel_size: number of GPUs used to parallelize model.

    Returns:
        Tuple of data parallel group and model parallel group

    Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
    use 2 GPUs to parallelize the model. The present function will
    create 4 model parallel groups and 2 data parallel groups as:
        4 model parallel groups:
            [g0, g1], [g2, g3], [g4, g5], [g6, g7]
        2 data parallel groups:
            [g0, g2, g4, g6], [g1, g3, g5, g7]
    Note that for efficiency, the caller should make sure adjacent ranks
    are on the same DGX box. For example if we are using 2 DGX-1 boxes
    with a total of 16 GPUs, rank 0 to 7 belong to the first box and
    ranks 8 to 15 belong to the second box.
    """
    log_dist(f'Creating model parallel group with size {model_parallel_size_}',
             ranks=[0])
    # Get world size and rank. Ensure some consistencies.
    assert dist.is_initialized()
    world_size = dist.get_world_size()
    model_parallel_size = min(model_parallel_size_, world_size)
    _ensure_divisibility(world_size, model_parallel_size)
    rank = dist.get_rank()

    _DATA_PARALLEL_GROUP = None
    _MODEL_PARALLEL_GROUP = None
    # Build the data parallel groups.
    for i in range(model_parallel_size):
        ranks = range(i, world_size, model_parallel_size)
        group = dist.new_group(ranks)
        if i == (rank % model_parallel_size):
            _DATA_PARALLEL_GROUP = group

    # Build the model parallel groups.
    for i in range(world_size // model_parallel_size):
        ranks = range(i * model_parallel_size, (i + 1) * model_parallel_size)
        group = dist.new_group(ranks)
        if i == (rank // model_parallel_size):
            _MODEL_PARALLEL_GROUP = group

    return _DATA_PARALLEL_GROUP, _MODEL_PARALLEL_GROUP
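# Quick sanity sketch (hypothetical helper) of which groups a given rank lands
# in under the scheme above: its data-parallel group strides the world by
# model_parallel_size, its model-parallel group is a contiguous block.
def _sketch_groups_for_rank(rank, world_size, model_parallel_size):
    dp_ranks = list(range(rank % model_parallel_size, world_size, model_parallel_size))
    mp_start = (rank // model_parallel_size) * model_parallel_size
    mp_ranks = list(range(mp_start, mp_start + model_parallel_size))
    return dp_ranks, mp_ranks

# _sketch_groups_for_rank(3, 8, 2) -> dp [1, 3, 5, 7], mp [2, 3],
# matching the 8-GPU example in the docstring.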
def init_process_groups(grid):
    global _groups, _grid
    _grid = grid

    assert _grid.pipe_parallel_size > 1, "There is no pipeline parallelism"

    if not can_send_recv():
        _groups = [dist.new_group(ranks=group) for group in _grid.p2p_groups]
def _index_tied_modules(self):
    ''' Build communication structures for tied modules. '''
    tied_comms = {}
    if self._topo.get_dim('pipe') == 1:
        return tied_comms

    specs = self._layer_specs
    tie_keys = set(s.key for s in specs if isinstance(s, TiedLayerSpec))
    for key in tie_keys:
        # Find the layers that the tied module appears in
        tied_layers = []
        for idx, layer in enumerate(specs):
            if isinstance(layer, TiedLayerSpec) and layer.key == key:
                tied_layers.append(idx)
        # Find all stages with this tied module
        # TODO: Would be nice to remove the nested data/model parallelism loops and
        # TODO: instead generalize in some way, since we really just care about the
        # TODO: stage that owns the tied layer. Then loop over each (dp, mp, ...)
        # TODO: fiber to generate process groups.
        tied_stages = set(self.stage_owner(idx) for idx in tied_layers)
        for dp in range(self._grid.data_parallel_size):
            for mp in range(self._grid.get_slice_parallel_world_size()):
                tied_ranks = []
                for s in sorted(tied_stages):
                    if self._grid.get_slice_parallel_world_size() > 1:
                        tied_ranks.append(
                            self._grid.stage_to_global(stage_id=s, data=dp, model=mp))
                    else:
                        tied_ranks.append(
                            self._grid.stage_to_global(stage_id=s, data=dp))
                group = dist.new_group(ranks=tied_ranks)

                # Record this tied module if we own a local copy of it.
                if self.global_rank in tied_ranks:
                    assert key in self.tied_modules
                    if key in self.tied_modules:
                        tied_comms[key] = {
                            'ranks': tied_ranks,
                            'group': group,
                            'weight_attr': self.tied_weight_attrs[key],
                            'module': self.tied_modules[key],
                        }
                        # Only count the tied module once in the eyes of the FP16 optimizer
                        if self.global_rank != tied_ranks[0]:
                            for p in self.tied_modules[key].parameters():
                                p.ds_pipe_replicated = True
    '''
    if len(tied_comms) > 0:
        print(f'RANK={self.global_rank} tied_comms={tied_comms}')
    '''
    return tied_comms
def _clone_world_group():
    """Create a clone of the world group.

    Note: We need to clone the dist world group because we use
    dist.get_global_rank() utility function in DeepSpeed at many places.
    As that function does not work on dist.group.WORLD, we need to keep a
    clone of it.
    """
    assert dist.is_initialized(), "dist is not initialized"
    global _WORLD_GROUP
    if _WORLD_GROUP is None:
        # If not cloned already, clone the world group
        _WORLD_GROUP = dist.new_group(ranks=range(dist.get_world_size()))
    return _WORLD_GROUP
def __init__(self, mpu=None):
    if mpu is None:
        self.world_group = dist.new_group(ranks=range(dist.get_world_size()))
    else:
        self.mpu = mpu
        self.world_group = self.mpu.get_data_parallel_group()
    self.rank = dist.get_rank(group=self.world_group)
    self.size = dist.get_world_size(group=self.world_group)
    self.compression_backend = CupyBackend()
    self.bool_not_supported = False
    TORCH_MAJOR = int(torch.__version__.split('.')[0])
    TORCH_MINOR = int(torch.__version__.split('.')[1])
    # Compare (major, minor) as a tuple so that torch >= 1.10 is detected
    # correctly; a plain `major >= 1 and minor >= 10` check would wrongly
    # reject e.g. torch 2.0.
    if (TORCH_MAJOR, TORCH_MINOR) >= (1, 10):
        self.bool_not_supported = True
def _create_model_parallel_group(self):
    # Call the init process
    if InferenceEngine.inference_mp_group is None:
        init_distributed()

        local_rank = int(os.getenv('LOCAL_RANK', '0'))
        torch.cuda.set_device(local_rank)

        ranks = [i for i in range(self.mp_world_size)]
        self.mp_group = dist.new_group(ranks)
        InferenceEngine.inference_mp_group = self.mp_group
    else:
        self.mp_group = InferenceEngine.inference_mp_group
def _initialize_parameter_parallel_groups(parameter_parallel_size=None):
    data_parallel_size = int(dist.get_world_size())
    parameter_parallel_size = parameter_parallel_size or data_parallel_size
    logger.info("data_parallel_size: %s, parameter_parallel_size: %s",
                data_parallel_size,
                parameter_parallel_size)
    assert data_parallel_size % parameter_parallel_size == 0, \
        'world size should be divisible by parameter parallel size'
    rank = dist.get_rank()
    my_group = None
    for i in range(data_parallel_size // parameter_parallel_size):
        ranks = range(i * parameter_parallel_size, (i + 1) * parameter_parallel_size)
        group = dist.new_group(ranks)
        if rank in ranks:
            my_group = group
    return my_group
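# Tiny sketch (hypothetical helper) of the partitioning above: ranks split into
# data_parallel_size // parameter_parallel_size contiguous groups, and each rank
# keeps only the group it belongs to.
def _sketch_parameter_parallel_group(rank, parameter_parallel_size):
    group_index = rank // parameter_parallel_size
    return list(range(group_index * parameter_parallel_size,
                      (group_index + 1) * parameter_parallel_size))

# _sketch_parameter_parallel_group(5, 4) -> [4, 5, 6, 7]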
def test_partitioned_tensor_meta():
    world = dist.get_world_size()
    rank = dist.get_rank()

    group = dist.new_group(ranks=list(range(world)))

    rows = world * 7
    cols = 3

    full = torch.rand(rows, cols).cuda()
    dist.broadcast(full, src=0, group=group)
    part = PartitionedTensor(full, group=group)

    my_meta = PartitionedTensor.from_meta(part.to_meta(), part.local_data, group)
    assert torch.equal(full, my_meta.full())
def test_partitioned_tensor():
    world = dist.get_world_size()
    rank = dist.get_rank()

    group = dist.new_group(ranks=list(range(world)))

    rows = world * 4
    cols = 3

    full = torch.rand(rows, cols).cuda()
    dist.broadcast(full, src=0, group=group)
    part = PartitionedTensor(full, group=group)

    assert len(part.local_size()) == 1
    assert part.local_size()[0] * world == full.numel()

    reconstructed = part.full()
    assert torch.equal(full, reconstructed)
def __init__(self, topology=None, process_group=None):
    # TODO use process_group if provided
    self.global_rank = dist.get_rank()
    self.world_size = dist.get_world_size()
    if topology is not None:
        if self.global_rank == 0:
            print('Using topology:', topology)
        self._topo = topology
    else:
        # Infer a default pipe/data topology by alternating the prime factors
        # of the world size between the two axes.
        num_pp = 1
        num_dp = 1
        for idx, prime in enumerate(_prime_factors(self.world_size)):
            if idx % 2 == 0:
                num_pp *= prime
            else:
                num_dp *= prime
        self._topo = PipeDataParallelTopology(num_dp=num_dp, num_pp=num_pp)
    self.data_parallel_size = max(self._topo.get_dim('data'), 1)
    self.pipe_parallel_size = max(self._topo.get_dim('pipe'), 1)
    self.model_parallel_size = max(self._topo.get_dim('model'), 1)
    self.slice_parallel_size = self.model_parallel_size
    assert self._is_grid_valid(), "Invalid Grid"

    self.stage_id = self.get_stage_id()
    self.data_parallel_id = self.get_data_parallel_id()

    # Create new ProcessGroups for all model parallelism. DeepSpeed uses these
    # to detect overflow, etc.
    self.ds_model_proc_group = None
    self.ds_model_rank = -1
    for dp in range(self.data_parallel_size):
        ranks = sorted(self._topo.get_axis_list(axis='data', idx=dp))
        if self.global_rank == 0:
            #print(f'RANK={self.global_rank} building DeepSpeed model group: {ranks}')
            pass
        proc_group = dist.new_group(ranks=ranks)
        if self.global_rank in ranks:
            self.ds_model_proc_group = proc_group
            self.ds_model_world_size = len(ranks)
            self.ds_model_rank = ranks.index(self.global_rank)
    assert self.ds_model_rank > -1
    assert self.ds_model_proc_group is not None

    # Create new ProcessGroup for gradient all-reduces - these are the data parallel groups
    self.dp_group = []
    self.dp_groups = self._topo.get_axis_comm_lists('data')
    for g in self.dp_groups:
        proc_group = dist.new_group(ranks=g)
        if self.global_rank in g:
            self.dp_group = g
            self.dp_proc_group = proc_group

    self.is_first_stage = (self.stage_id == 0)
    self.is_last_stage = (self.stage_id == (self.pipe_parallel_size - 1))

    self.p2p_groups = self._build_p2p_groups()

    # Create new ProcessGroup for pipeline collectives - these are pipe parallel groups
    self.pp_group = []
    self.pp_proc_group = None
    self.pipe_groups = self._topo.get_axis_comm_lists('pipe')
    for ranks in self.pipe_groups:
        if self.global_rank == 0:
            #print(f'RANK={self.global_rank} building pipeline group: {ranks}')
            pass
        proc_group = dist.new_group(ranks=ranks)
        if self.global_rank in ranks:
            self.pp_group = ranks
            self.pp_proc_group = proc_group
    assert self.pp_proc_group is not None

    # Create new ProcessGroup for model (tensor-slicing) collectives

    # Short circuit case without model parallelism.
    # TODO: it would be nice if topology had bcast semantics to avoid this
    # branching case?
    if self.model_parallel_size == 1:
        for group_rank in range(self.world_size):
            group_rank = [group_rank]
            group = dist.new_group(ranks=group_rank)
            if group_rank[0] == self.global_rank:
                self.slice_group = group_rank
                self.slice_proc_group = group
        return
    else:
        self.mp_group = []
        self.model_groups = self._topo.get_axis_comm_lists('model')
        for g in self.model_groups:
            proc_group = dist.new_group(ranks=g)
            if self.global_rank in g:
                self.slice_group = g
                self.slice_proc_group = proc_group
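# A minimal sketch (assuming _prime_factors returns factors in ascending order)
# of how the default topology above splits the world size between pipeline and
# data parallelism by alternating prime factors. Self-contained for inspection;
# the factorization is inlined rather than calling the real _prime_factors.
def _sketch_default_topology(world_size):
    factors, n, d = [], world_size, 2
    while d * d <= n:
        while n % d == 0:
            factors.append(d)
            n //= d
        d += 1
    if n > 1:
        factors.append(n)
    num_pp, num_dp = 1, 1
    for idx, prime in enumerate(factors):
        if idx % 2 == 0:
            num_pp *= prime
        else:
            num_dp *= prime
    return num_pp, num_dp

# _sketch_default_topology(16): factors [2, 2, 2, 2] -> num_pp=4, num_dp=4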
def __init__(self,
             layers,
             num_stages=None,
             topology=None,
             loss_fn=None,
             seed_layers=False,
             seed_fn=None,
             base_seed=1234,
             partition_method='parameters',
             activation_checkpoint_interval=0,
             activation_checkpoint_func=checkpointing.checkpoint,
             checkpointable_layers=None):
    """Modules to be parallelized with pipeline parallelism.

    The key constraint that enables pipeline parallelism is the
    representation of the forward pass as a sequence of layers
    and the enforcement of a simple interface between them. The
    forward pass is implicitly defined by the module ``layers``. The key
    assumption is that the output of each layer can be directly fed as
    input to the next, like a ``torch.nn.Sequential``. The forward pass is
    implicitly:

    .. code-block:: python

        def forward(self, inputs):
            x = inputs
            for layer in self.layers:
                x = layer(x)
            return x

    .. note::
        Pipeline parallelism is not compatible with ZeRO-2 and ZeRO-3.

    Args:
        layers (Iterable): A sequence of layers defining pipeline structure. Can be a ``torch.nn.Sequential`` module.
        num_stages (int, optional): The degree of pipeline parallelism. If not specified, ``topology`` must be provided.
        topology (``deepspeed.runtime.pipe.ProcessTopology``, optional): Defines the axes of parallelism for training. Must be provided if ``num_stages`` is ``None``.
        loss_fn (callable, optional): Loss is computed ``loss = loss_fn(outputs, label)``
        base_seed (int, optional): The starting seed for layer seeding. Defaults to 1234.
        partition_method (str, optional): The method used to partition layers among pipeline stages, e.g. ``'parameters'`` (balance trainable parameters) or ``'uniform'`` (balance the number of layers). Defaults to 'parameters'.
        activation_checkpoint_interval (int, optional): The granularity of activation checkpointing in terms of number of layers. 0 disables activation checkpointing.
        activation_checkpoint_func (callable, optional): The function to use for activation checkpointing. Defaults to ``deepspeed.checkpointing.checkpoint``.
    """

    super().__init__()

    if num_stages is None and topology is None:
        raise RuntimeError('must provide num_stages or topology')

    self.micro_offset = 0

    self.loss_fn = loss_fn

    self.checkpointable_layers = checkpointable_layers
    if checkpointable_layers is not None:
        assert isinstance(checkpointable_layers, list), "param `checkpointable_layers` must be type of list."
    self.seed_layers = seed_layers
    self.seed_fn = seed_fn
    self.base_seed = base_seed
    if dist.get_rank() == 0:
        try:
            seed_str = self.seed_fn.__name__
        except AttributeError:
            seed_str = None
        print(f'SEED_LAYERS={self.seed_layers} BASE_SEED={self.base_seed} SEED_FN={seed_str}')

    # Setup world info
    self.world_group = dist.new_group(ranks=range(dist.get_world_size()))
    self.global_rank = dist.get_rank(group=self.world_group)
    self.world_size = dist.get_world_size(group=self.world_group)

    # Check LOCAL_RANK before converting; int(None) would raise a TypeError
    # ahead of the assert otherwise.
    local_rank = os.environ.get("LOCAL_RANK", None)
    assert local_rank is not None, "LOCAL_RANK must be set in the environment"
    self.local_rank = int(local_rank)

    if topology:
        self._topo = topology
        self.num_stages = self._topo.get_dim('pipe')
    else:
        self.num_stages = num_stages
        if topology is None:
            if self.world_size % self.num_stages != 0:
                raise RuntimeError(
                    f'num_stages ({self.num_stages}) must divide distributed world size ({self.world_size})'
                )
            dp = self.world_size // num_stages
            topology = PipeDataParallelTopology(num_pp=num_stages, num_dp=dp)
        self._topo = topology

    # Construct communicators for pipeline topology
    self._grid = PipelineParallelGrid(process_group=self.world_group,
                                      topology=self._topo)

    self.stage_id = self._topo.get_coord(self.global_rank).pipe

    # Initialize partition information
    self._layer_specs = list(layers)
    self._num_layers = len(self._layer_specs)
    self._local_start = 0
    self._local_stop = None
    self._partition_layers(method=partition_method)

    self.forward_funcs = []
    self.fwd_map = {}
    self.tied_modules = nn.ModuleDict()
    self.tied_weight_attrs = {}

    # Offset the random seed by the stage ID.
    #newseed = torch.cuda.initial_seed() + self._grid.get_stage_id()
    #ds_utils.set_random_seed(newseed)

    #with torch.random.fork_rng(devices=[torch.cuda.current_device()]):
    self._build()
    self.to(f'cuda:{self.local_rank}')

    self.tied_comms = self._index_tied_modules()
    self._synchronize_tied_weights()

    self.activation_checkpoint_interval = activation_checkpoint_interval
    self.activation_checkpoint_func = activation_checkpoint_func
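# Hedged usage sketch: assuming the constructor above is DeepSpeed's
# PipelineModule, a model is typically wrapped like this under an initialized
# distributed environment. Layer sizes and stage count are placeholder choices,
# not prescriptions from the source.
#
# import torch.nn as nn
# from deepspeed.pipe import PipelineModule
#
# layers = [nn.Linear(128, 128) for _ in range(8)]
# net = PipelineModule(layers=layers,
#                      num_stages=2,
#                      loss_fn=nn.CrossEntropyLoss(),
#                      partition_method='parameters')
# # `net` is then passed to deepspeed.initialize(...) as the model.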