def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks=False):
    """Split a tensor along its last dimension.

    Arguments:
        tensor: input tensor.
        num_partitions: number of partitions to split the tensor into.
        contiguous_split_chunks: If True, make each chunk contiguous in memory.
    """
    # Get the size and dimension.
    last_dim = tensor.dim() - 1
    last_dim_size = divide(tensor.size()[last_dim], num_partitions)
    # Split.
    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
    # Note: torch.split does not create contiguous tensors by default.
    if contiguous_split_chunks:
        return tuple(chunk.contiguous() for chunk in tensor_list)
    return tensor_list
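
# --- Illustrative sketch (not part of the original listing) ---
# `divide` is assumed here to be the usual Megatron-style helper that checks
# divisibility before doing integer division; a minimal version and a usage
# example for split_tensor_along_last_dim are sketched below.
import torch


def divide(numerator, denominator):
    # Assumed behavior: fail loudly if the split would be uneven.
    assert numerator % denominator == 0, f"{numerator} is not divisible by {denominator}"
    return numerator // denominator


# Example: split a [4, 12] tensor into 3 chunks of shape [4, 4] along the last dim.
x = torch.randn(4, 12)
chunks = split_tensor_along_last_dim(x, num_partitions=3, contiguous_split_chunks=True)
assert len(chunks) == 3 and all(c.shape == (4, 4) for c in chunks)
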
def _initialize_affine_weight_cpu(
    weight,
    output_size,
    input_size,
    per_partition_size,
    partition_dim,
    init_method,
    stride=1,
    return_master_weight=False,
    *,
    params_dtype=torch.float32,
):
    """Initialize affine weight for model parallel.

    Build the master weight on all processes and scatter the relevant chunk.
    """
    set_tensor_model_parallel_attributes(
        tensor=weight, is_parallel=True, dim=partition_dim, stride=stride
    )

    # Initialize master weight.
    master_weight = torch.empty(output_size, input_size, dtype=torch.float, requires_grad=False)
    init_method(master_weight)
    master_weight = master_weight.to(dtype=params_dtype)

    # Split and copy.
    per_partition_per_stride_size = divide(per_partition_size, stride)
    weight_list = torch.split(master_weight, per_partition_per_stride_size, dim=partition_dim)
    rank = get_tensor_model_parallel_rank()
    world_size = get_tensor_model_parallel_world_size()
    my_weight_list = weight_list[rank::world_size]

    with torch.no_grad():
        torch.cat(my_weight_list, dim=partition_dim, out=weight)
    if return_master_weight:
        return master_weight
    return None
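
# --- Illustrative sketch (not part of the original listing) ---
# Single-process illustration (hypothetical sizes) of the strided selection done
# above: each rank takes every `world_size`-th chunk starting at its own rank,
# so interleaved strides of the master weight land on the correct rank.
import torch

master = torch.arange(8 * 4, dtype=torch.float32).view(8, 4)  # [output_size, input_size]
world_size, stride = 2, 2
per_partition_size = 8 // world_size                    # rows owned by each rank
per_partition_per_stride_size = per_partition_size // stride
chunks = torch.split(master, per_partition_per_stride_size, dim=0)
rank0 = torch.cat(chunks[0::world_size], dim=0)         # chunks 0 and 2
rank1 = torch.cat(chunks[1::world_size], dim=0)         # chunks 1 and 3
assert rank0.shape == rank1.shape == (per_partition_size, 4)
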
def __init__(
    self,
    input_size,
    output_size,
    bias=True,
    input_is_parallel=False,
    init_method=init.xavier_normal_,
    stride=1,
    keep_master_weight_for_test=False,
    skip_bias_add=False,
    *,
    params_dtype=torch.float32,
    use_cpu_initialization=False,
):
    super(RowParallelLinear, self).__init__()

    # Keep input parameters.
    self.input_size = input_size
    self.output_size = output_size
    self.input_is_parallel = input_is_parallel
    # Divide the weight matrix along the last dimension (the input features).
    world_size = get_tensor_model_parallel_world_size()
    self.input_size_per_partition = divide(input_size, world_size)
    self.skip_bias_add = skip_bias_add

    # Parameters.
    # Note: torch.nn.functional.linear performs XA^T + b and as a result
    # we allocate the transpose.
    # Initialize weight.
    if use_cpu_initialization:
        self.weight = Parameter(
            torch.empty(self.output_size, self.input_size_per_partition, dtype=params_dtype)
        )
        self.master_weight = _initialize_affine_weight_cpu(
            self.weight,
            self.output_size,
            self.input_size,
            self.input_size_per_partition,
            1,
            init_method,
            stride=stride,
            return_master_weight=keep_master_weight_for_test,
            params_dtype=params_dtype,
        )
    else:
        self.weight = Parameter(
            torch.empty(
                self.output_size,
                self.input_size_per_partition,
                device=torch.cuda.current_device(),
                dtype=params_dtype,
            )
        )
        _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=1, stride=stride)

    if bias:
        if use_cpu_initialization:
            self.bias = Parameter(torch.empty(self.output_size, dtype=params_dtype))
        else:
            self.bias = Parameter(
                torch.empty(
                    self.output_size,
                    device=torch.cuda.current_device(),
                    dtype=params_dtype,
                )
            )
        # Always initialize bias to zero.
        with torch.no_grad():
            self.bias.zero_()
    else:
        self.register_parameter("bias", None)
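
# --- Illustrative sketch (not part of the original listing) ---
# Why a row-parallel layer only needs an all-reduce after the local GEMM: with
# the input features and the matching weight columns split across ranks (a
# world_size of 2 is assumed here), the per-rank partial products sum to the
# full result.
import torch

torch.manual_seed(0)
x = torch.randn(4, 8)                      # [batch, input_size]
w = torch.randn(6, 8)                      # [output_size, input_size]
full = torch.nn.functional.linear(x, w)
partials = [
    torch.nn.functional.linear(xp, wp)
    for xp, wp in zip(x.chunk(2, dim=-1), w.chunk(2, dim=-1))
]
assert torch.allclose(full, sum(partials), atol=1e-5)   # all-reduce == sum of partials
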
def __init__(
    self,
    input_size,
    output_size,
    bias=True,
    gather_output=True,
    init_method=init.xavier_normal_,
    stride=1,
    keep_master_weight_for_test=False,
    skip_bias_add=False,
    *,
    no_async_tensor_model_parallel_allreduce=False,
    params_dtype=torch.float32,
    use_cpu_initialization=False,
    gradient_accumulation_fusion=False,
    accumulation_in_fp16: bool = False,
):
    super(ColumnParallelLinear, self).__init__()

    # Keep input parameters.
    self.input_size = input_size
    self.output_size = output_size
    self.gather_output = gather_output
    # Divide the weight matrix along the output dimension.
    world_size = get_tensor_model_parallel_world_size()
    self.output_size_per_partition = divide(output_size, world_size)
    self.skip_bias_add = skip_bias_add

    # Parameters.
    # Note: torch.nn.functional.linear performs XA^T + b and as a result
    # we allocate the transpose.
    # Initialize weight.
    if use_cpu_initialization:
        self.weight = Parameter(
            torch.empty(self.output_size_per_partition, self.input_size, dtype=params_dtype)
        )
        self.master_weight = _initialize_affine_weight_cpu(
            self.weight,
            self.output_size,
            self.input_size,
            self.output_size_per_partition,
            0,
            init_method,
            stride=stride,
            return_master_weight=keep_master_weight_for_test,
            params_dtype=params_dtype,
        )
    else:
        self.weight = Parameter(
            torch.empty(
                self.output_size_per_partition,
                self.input_size,
                device=torch.cuda.current_device(),
                dtype=params_dtype,
            )
        )
        _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=0, stride=stride)

    if bias:
        if use_cpu_initialization:
            self.bias = Parameter(torch.empty(self.output_size_per_partition, dtype=params_dtype))
        else:
            self.bias = Parameter(
                torch.empty(
                    self.output_size_per_partition,
                    device=torch.cuda.current_device(),
                    dtype=params_dtype,
                )
            )
        set_tensor_model_parallel_attributes(self.bias, True, 0, stride)
        # Always initialize bias to zero.
        with torch.no_grad():
            self.bias.zero_()
    else:
        self.register_parameter("bias", None)

    self.async_tensor_model_parallel_allreduce = (
        not no_async_tensor_model_parallel_allreduce and world_size > 1
    )

    if gradient_accumulation_fusion:
        if not _grad_accum_fusion_available:
            # apex.transformer users are expected to install APEX with both the
            # `--cpp_ext` and `--cuda_ext` extensions, e.g.
            # `pip install --global-option="--cpp_ext" --global-option="--cuda_ext" .`
            # at the root of the APEX repository.
            import warnings

            warnings.warn(
                "`gradient_accumulation_fusion` is set to `True` but "
                "the custom CUDA extension `fused_weight_gradient_mlp_cuda` was not "
                "found, so `gradient_accumulation_fusion` is set to `False`. "
                "Note that the extension requires CUDA>=11."
            )
            gradient_accumulation_fusion = False
    self.gradient_accumulation_fusion = gradient_accumulation_fusion

    self._forward_impl = (
        linear_with_grad_accumulation_and_async_allreduce_in16bit
        if accumulation_in_fp16
        else linear_with_grad_accumulation_and_async_allreduce
    )
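
# --- Illustrative sketch (not part of the original listing) ---
# Why gather_output concatenates along the last dimension: with the weight rows
# (i.e. the output features) split across ranks (a world_size of 2 is assumed
# here), each rank produces a slice of the output, and concatenating the slices
# reproduces the full linear layer.
import torch

torch.manual_seed(0)
x = torch.randn(4, 8)                      # [batch, input_size], replicated on every rank
w = torch.randn(6, 8)                      # [output_size, input_size]
full = torch.nn.functional.linear(x, w)
shards = [torch.nn.functional.linear(x, wp) for wp in w.chunk(2, dim=0)]
assert torch.allclose(full, torch.cat(shards, dim=-1), atol=1e-5)
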
def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
    per_partition_vocab_size = divide(global_vocab_size, world_size)
    return VocabUtility.vocab_range_from_per_partition_vocab_size(
        per_partition_vocab_size, rank, world_size
    )
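
# --- Illustrative sketch (not part of the original listing) ---
# The companion vocab_range_from_per_partition_vocab_size is assumed here to map
# a rank to its half-open [first, last) slice of the vocabulary; a minimal
# version and a usage example are sketched below.
def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank, world_size):
    # world_size is kept for signature parity; it is not needed for the arithmetic.
    index_first = rank * per_partition_vocab_size
    index_last = index_first + per_partition_vocab_size
    return index_first, index_last


# Example: a vocab of 50304 split across 8 tensor-parallel ranks gives each rank
# a contiguous block of 6288 token ids; rank 1 owns ids [6288, 12576).
assert vocab_range_from_per_partition_vocab_size(6288, rank=1, world_size=8) == (6288, 12576)
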