def from_pretrained(cls, model_name_or_path, seq_len=512):
    init_method = 'tcp://' + os.getenv('MASTER_ADDR', 'localhost') + ':' + \
        os.getenv('MASTER_PORT', '6000')
    torch.distributed.init_process_group(backend='nccl',
                                         world_size=1,
                                         rank=0,
                                         init_method=init_method)
    mpu.initialize_model_parallel(1)

    seed = 1234
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    mpu.model_parallel_cuda_manual_seed(seed)

    tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
    logger.info("Checking cached model files...")
    weights_path, deepspeed_config_path = download_model_files(
        model_name_or_path)
    model = setup_model(weights_path, deepspeed_config_path)
    model.cuda()
    model = model.eval()
    return cls(model,
               tokenizer=tokenizer,
               seq_len=seq_len,
               model_path=model_name_or_path)
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
        mpu.initialize_model_parallel(args.model_parallel_size)

    # create model
    conf_dict = EasyDict(yaml.load(open(args.cfg, "r"), Loader=yaml.Loader))
    conf_dict.world_size = mpu.get_model_parallel_world_size()
    conf_dict.gpu = args.gpu
    conf_dict.device = torch.device(
        "cuda:" + str(args.gpu) if torch.cuda.is_available() else "cpu")

    solver = Solver(conf_dict)
    solver.train()
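# Worked example of the global-rank formula above (a standalone sketch,
# not part of the original script): on node 1 of a job with 8 GPUs per
# node, local GPU 3 becomes global rank 1 * 8 + 3 = 11.
def global_rank(node_rank, ngpus_per_node, gpu):
    return node_rank * ngpus_per_node + gpu

assert global_rank(1, 8, 3) == 11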
def initialize_distributed(args):
    """Initialize torch.distributed."""
    # Manually set the device ids.
    device = args.rank % torch.cuda.device_count()
    if args.local_rank is not None:
        device = args.local_rank
    torch.cuda.set_device(device)

    # Call the init process
    init_method = 'tcp://'
    master_ip = os.getenv('MASTER_ADDR', 'localhost')
    master_port = os.getenv('MASTER_PORT', '6000')
    init_method += master_ip + ':' + master_port
    torch.distributed.init_process_group(backend=args.distributed_backend,
                                         world_size=args.world_size,
                                         rank=args.rank,
                                         init_method=init_method)

    # Set the model-parallel / data-parallel communicators.
    mpu.initialize_model_parallel(args.model_parallel_size)
    groups.initialize(ep_size=args.expert_parallel_size, mpu=mpu)

    # Optional DeepSpeed Activation Checkpointing Features
    if args.deepspeed and args.deepspeed_activation_checkpointing:
        set_deepspeed_activation_checkpointing(args)
def parallel_self_attention(model_parallel_size, num_att_heads_per_partition,
                            hidden_size_per_att_head, dropout_prob, batch_size,
                            sequence_length):
    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()

    seed = 12345
    set_random_seed(seed)

    num_att_heads = num_att_heads_per_partition * \
        torch.distributed.get_world_size()
    hidden_size = hidden_size_per_att_head * num_att_heads

    # Network
    identity_layer = IdentityLayer3D(batch_size, sequence_length,
                                     hidden_size).cuda()
    attention_layer = mpu.BertParallelSelfAttention(hidden_size, num_att_heads,
                                                    dropout_prob).cuda()
    loss_weight = torch.randn([batch_size, sequence_length,
                               hidden_size]).cuda()
    attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
    # Forward
    input_ = identity_layer()
    output = attention_layer(input_, attention_mask)
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()

    rank = mpu.get_model_parallel_rank()
    mpu.destroy_model_parallel()
    return rank, hidden_size, model_parallel_size, loss, \
        attention_layer, identity_layer
def test_broadcast_data(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing broadcast_data with model parallel size {} ...'.
              format(model_parallel_size))

    mpu.initialize_model_parallel(model_parallel_size)
    torch.manual_seed(1234 + mpu.get_data_parallel_rank())
    model_parallel_size = mpu.get_model_parallel_world_size()

    key_size_t = {
        'key1': [7, 11],
        'key2': [8, 2, 1],
        'key3': [13],
        'key4': [5, 1, 2],
        'key5': [5, 12]
    }
    keys = list(key_size_t.keys())

    data = {}
    data_t = {}
    for key in key_size_t:
        data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
        data_t[key] = data[key].clone()
    data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
    data_t['keyX'] = data['keyX'].clone()
    if mpu.get_model_parallel_rank() != 0:
        data = None

    data_utils._check_data_types(keys, data_t, torch.int64)
    key_size, key_numel, \
        total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
    for key in keys:
        assert key_size[key] == key_size_t[key]
    total_numel_t = 0
    for key in keys:
        target_size = functools.reduce(operator.mul, key_size_t[key], 1)
        assert key_numel[key] == target_size
        total_numel_t += target_size
    assert total_numel == total_numel_t

    data_b = data_utils.broadcast_data(keys, data, torch.int64)
    for key in keys:
        tensor = data_t[key].cuda()
        assert data_b[key].sub(tensor).abs().max() == 0

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')
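# Standalone sketch of the size/numel bookkeeping the test verifies:
# broadcast_data flattens every tensor before broadcasting, so the helper
# needs each key's element count and the grand total. Key names here are
# illustrative.
import functools
import operator

key_size = {'key1': [7, 11], 'key2': [8, 2, 1]}
key_numel = {k: functools.reduce(operator.mul, v, 1)
             for k, v in key_size.items()}
assert key_numel == {'key1': 77, 'key2': 16}
assert sum(key_numel.values()) == 93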
def initialize_distributed_env(distributed_init_method, rank, local_rank,
                               world_size, model_parallel_size,
                               pipeline_parallel_size):
    torch.cuda.set_device(local_rank)
    dist.init_process_group(
        backend='nccl',
        init_method=distributed_init_method,
        world_size=world_size,
        rank=rank,
    )
    # A small all_reduce for warmup.
    dist.all_reduce(torch.zeros(1).cuda())
    mpu.initialize_model_parallel(model_parallel_size,
                                  pipeline_parallel_size)
    set_random_seed(0)
    mpu.model_parallel_cuda_manual_seed(0)
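# Hedged usage sketch for initialize_distributed_env: under torchrun,
# which exports RANK, LOCAL_RANK, and WORLD_SIZE (plus MASTER_ADDR and
# MASTER_PORT for the 'env://' rendezvous), each worker could call the
# function like this. The parallel sizes are illustrative; world_size
# must be divisible by their product.
import os

initialize_distributed_env(
    distributed_init_method='env://',
    rank=int(os.environ['RANK']),
    local_rank=int(os.environ['LOCAL_RANK']),
    world_size=int(os.environ['WORLD_SIZE']),
    model_parallel_size=2,
    pipeline_parallel_size=1,
)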
def initialize_distributed(args):
    """Initialize torch.distributed."""
    # Manually set the device ids.
    device = args.rank % torch.cuda.device_count()
    if args.local_rank is not None:
        device = args.local_rank
    torch.cuda.set_device(device)

    # Call the init process
    init_method = 'tcp://'
    master_ip = os.getenv('MASTER_ADDR', 'localhost')
    master_port = os.getenv('MASTER_PORT', '6000')
    init_method += master_ip + ':' + master_port
    torch.distributed.init_process_group(backend=args.distributed_backend,
                                         world_size=args.world_size,
                                         rank=args.rank,
                                         init_method=init_method)

    # Set the model-parallel / data-parallel communicators.
    mpu.initialize_model_parallel(args.model_parallel_size)
def test_get_model_parallel_src_rank(model_parallel_size_):

    if torch.distributed.get_rank() == 0:
        print('> testing get_model_parallel_src_rank with size {} ...'.format(
            model_parallel_size_))
    model_parallel_size = min(model_parallel_size_,
                              torch.distributed.get_world_size())
    assert not mpu.model_parallel_is_initialized()
    mpu.initialize_model_parallel(model_parallel_size)
    assert mpu.model_parallel_is_initialized()

    # Checks
    src_rank = torch.distributed.get_rank() - mpu.get_model_parallel_rank()
    assert mpu.get_model_parallel_src_rank() == src_rank

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')
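# Standalone sketch of the identity this test checks, assuming the
# Megatron convention that consecutive global ranks form one
# model-parallel group: the group's source rank is its first member.
def model_parallel_src_rank(global_rank, model_parallel_size):
    # Subtracting the within-group rank recovers the group's first rank.
    return global_rank - global_rank % model_parallel_size

# With model parallel size 2, rank 5 sits in group [4, 5]; src rank is 4.
assert model_parallel_src_rank(5, 2) == 4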
def test_cross_entropy(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing cross entropy with model parallel size {} ...'.format(
            model_parallel_size))

    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()

    batch_size = 13
    seq_length = 17
    vocab_size_per_partition = 11
    logits_scale = 1000.0
    vocab_size = vocab_size_per_partition * model_parallel_size
    seed = 1234

    loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length,
                                                 vocab_size, logits_scale,
                                                 seed)
    loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length, vocab_size,
                                           logits_scale, seed)

    error = loss_torch.sub_(loss_mpu).abs().max()
    print('   max error in loss on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    error = grad_torch.sub_(grad_mpu).abs().max()
    print('   max error in grad on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')
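# Single-process sketch of the vocab-parallel cross-entropy idea being
# tested: each shard owns a slice of the vocabulary, and the softmax
# normalizer is assembled from per-shard maxima and exp-sums. The real
# mpu implementation does this with all-reduces across the model-parallel
# group; here a plain sum/max over the shards stands in for them.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
logits = torch.randn(5, 8)               # [tokens, vocab]
target = torch.randint(0, 8, (5,))
shards = torch.chunk(logits, 2, dim=1)   # two "model-parallel" shards
global_max = torch.stack(
    [s.max(dim=1).values for s in shards]).max(dim=0).values
sum_exp = sum((s - global_max[:, None]).exp().sum(dim=1) for s in shards)
target_logit = logits.gather(1, target[:, None]).squeeze(1)
loss = (global_max + sum_exp.log() - target_logit).mean()
assert torch.allclose(loss, F.cross_entropy(logits, target))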
def test_initialize_model_parallel(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing initialize_model_parallel with size {} ...'.format(
            model_parallel_size))
    model_parallel_size_ = min(model_parallel_size,
                               torch.distributed.get_world_size())
    assert not mpu.model_parallel_is_initialized()
    mpu.initialize_model_parallel(model_parallel_size_)
    assert mpu.model_parallel_is_initialized()

    # Checks.
    def check(group, world_size, rank):
        assert world_size == torch.distributed.get_world_size(group=group)
        assert rank == torch.distributed.get_rank(group=group)

    # Model parallel.
    world_size = model_parallel_size_
    rank = torch.distributed.get_rank() % model_parallel_size_
    assert world_size == mpu.get_model_parallel_world_size()
    assert rank == mpu.get_model_parallel_rank()
    check(mpu.get_model_parallel_group(), world_size, rank)

    # Data parallel.
    world_size = torch.distributed.get_world_size() // model_parallel_size_
    rank = torch.distributed.get_rank() // model_parallel_size_
    assert world_size == mpu.get_data_parallel_world_size()
    assert rank == mpu.get_data_parallel_rank()
    check(mpu.get_data_parallel_group(), world_size, rank)

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')
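# For reference, the grouping convention this test exercises (from the
# Megatron-style initialize_model_parallel docstring): with 8 ranks and
# model parallel size 2, consecutive ranks share a model-parallel group
# and strided ranks share a data-parallel group. A sketch that reproduces
# the layout with no torch.distributed setup:
world_size, model_parallel_size = 8, 2
model_parallel_groups = [
    list(range(i, i + model_parallel_size))
    for i in range(0, world_size, model_parallel_size)
]
data_parallel_groups = [
    list(range(i, world_size, model_parallel_size))
    for i in range(model_parallel_size)
]
assert model_parallel_groups == [[0, 1], [2, 3], [4, 5], [6, 7]]
assert data_parallel_groups == [[0, 2, 4, 6], [1, 3, 5, 7]]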
def parallel_transformer(model_parallel_size, num_att_heads_per_partition,
                         hidden_size_per_att_head, batch_size,
                         sequence_length):
    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()

    seed = 12345
    set_random_seed(seed)

    num_att_heads = num_att_heads_per_partition * \
        torch.distributed.get_world_size()
    hidden_size = hidden_size_per_att_head * num_att_heads
    intermediate_size = 4 * hidden_size

    # Network
    identity_layer = IdentityLayer3D(batch_size, sequence_length,
                                     hidden_size).cuda()
    transformer_layer = mpu.BertParallelTransformerLayer(
        hidden_size, intermediate_size, num_att_heads, 0.0, 0.0,
        torch.nn.functional.relu, 1.0e-5).cuda()
    loss_weight = torch.randn([batch_size, sequence_length,
                               hidden_size]).cuda()
    attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
    # Forward
    input_ = identity_layer()
    output = transformer_layer(input_, attention_mask)
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()

    rank = mpu.get_model_parallel_rank()
    mpu.destroy_model_parallel()
    return rank, hidden_size, model_parallel_size, loss, \
        transformer_layer, identity_layer
def initialize_distributed(args):
    """Initialize torch.distributed."""
    # Manually set the device ids.
    device = args.rank % torch.cuda.device_count()
    if args.local_rank is not None:
        device = args.local_rank
    torch.cuda.set_device(device)

    # Call the init process
    init_method = 'tcp://'
    master_ip = os.getenv('MASTER_ADDR', 'localhost')
    master_port = os.getenv('MASTER_PORT', '6000')
    init_method += master_ip + ':' + master_port
    torch.distributed.init_process_group(backend=args.distributed_backend,
                                         world_size=args.world_size,
                                         rank=args.rank,
                                         init_method=init_method)

    # Set the model-parallel / data-parallel communicators.
    mpu.initialize_model_parallel(args.model_parallel_size)

    # Checkpointed activations are partitioned across the model-parallel
    # ranks instead of being replicated as in the original Megatron.
    mpu.partition_activations_in_checkpoint(args.partition_activations)
def test_model_parallel_cuda_manual_seed(tensor_model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing model parallel cuda manual seed with size {} ...'.
              format(tensor_model_parallel_size))

    mpu.initialize_model_parallel(tensor_model_parallel_size)
    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()

    mpu.model_parallel_cuda_manual_seed(12345)
    assert torch.cuda.initial_seed() == 12345
    with mpu.get_cuda_rng_tracker().fork():
        assert torch.cuda.initial_seed() == (
            12345 + 2718 + mpu.get_tensor_model_parallel_rank())

    # Reset the tracker
    mpu.get_cuda_rng_tracker().reset()

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')
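# Seed bookkeeping behind the assertion above (Megatron convention): the
# default CUDA RNG keeps the given seed, while the model-parallel tracker
# uses an offset seed so each tensor-parallel rank draws different
# numbers, e.g. for dropout on partitioned activations.
def model_parallel_seed(seed, tensor_model_parallel_rank):
    return seed + 2718 + tensor_model_parallel_rank

assert model_parallel_seed(12345, 0) == 15063
assert model_parallel_seed(12345, 3) == 15066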
def test_cuda_rng_tracker(tensor_model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing cuda rng tracker with size {} ...'.format(
            tensor_model_parallel_size))

    mpu.initialize_model_parallel(tensor_model_parallel_size)
    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()

    seed_1 = 1234
    seed_2 = 4321
    size = [12, 21]
    tensor = torch.cuda.FloatTensor(size)

    # Set to seed_1 and generate two tensors.
    torch.cuda.manual_seed(seed_1)
    torch.randn(size, out=tensor)
    target_11 = tensor.clone()
    torch.randn(size, out=tensor)
    target_12 = tensor.clone()

    # Set to seed_2 and generate two tensors.
    torch.cuda.manual_seed(seed_2)
    torch.randn(size, out=tensor)
    target_21 = tensor.clone()
    torch.randn(size, out=tensor)
    target_22 = tensor.clone()

    # Now if we interleave seed_1 and seed_2,
    # we should still get the same tensors
    torch.cuda.manual_seed(seed_1)
    mpu.get_cuda_rng_tracker().add('test', seed_2)

    torch.randn(size, out=tensor)
    result_11 = tensor.clone()

    with mpu.get_cuda_rng_tracker().fork('test'):
        torch.randn(size, out=tensor)
        result_21 = tensor.clone()

    torch.randn(size, out=tensor)
    result_12 = tensor.clone()

    with mpu.get_cuda_rng_tracker().fork('test'):
        torch.randn(size, out=tensor)
        result_22 = tensor.clone()

    diff = result_11.sub(result_21).abs().max()
    diff = min(diff, result_12.sub(result_22).abs().max())
    print('   max diff in generated tensors (should be non-zero) on '
          'global rank {}: {}'.format(torch.distributed.get_rank(), diff))
    assert diff > 1.0e-6
    error = max(result_11.sub(target_11).abs().max(),
                result_12.sub(target_12).abs().max())
    error = max(error, result_21.sub(target_21).abs().max())
    error = max(error, result_22.sub(target_22).abs().max())
    print('   max error in generated tensors (should be zero) on '
          'global rank {}: {}'.format(torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset the tracker
    mpu.get_cuda_rng_tracker().reset()

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')
def test_set_cuda_rng_state(tensor_model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing set_rng_state with size {} ...'.format(
            tensor_model_parallel_size))

    mpu.initialize_model_parallel(tensor_model_parallel_size)
    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()

    size = 123
    seed = 1234
    torch.cuda.manual_seed(seed)
    tensor = torch.cuda.FloatTensor(size)

    # Get the state
    rng_state = torch.cuda.get_rng_state()
    rng_state_copy = rng_state.clone()

    # Do some stuff.
    for _ in range(5):
        torch.randn(size, out=tensor)
    result_1 = tensor.clone()

    assert rng_state.sub(rng_state_copy).max() == 0
    assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0

    # State should be different.
    new_rng_state = torch.cuda.get_rng_state()
    max_diff = new_rng_state.sub(rng_state).max()
    print('   max diff in rng state (should be non-zero) on global rank {}: {}'.
          format(torch.distributed.get_rank(), max_diff))
    assert max_diff > 0

    # Reset the rng state and do the same stuff.
    mpu.random._set_cuda_rng_state(rng_state)
    for _ in range(5):
        torch.randn(size, out=tensor)
    mpu.random._set_cuda_rng_state(rng_state)
    for _ in range(5):
        torch.randn(size, out=tensor)
    result_2 = tensor.clone()

    # Results should be the same
    error = result_2.sub(result_1).abs().max()
    print('   max error in generated tensors (should be zero) on '
          'global rank {}: {}'.format(torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Input state should have remained intact.
    error = rng_state.sub(rng_state_copy).max()
    print('   max error in rng state (should be zero) on global rank {}: {}'.
          format(torch.distributed.get_rank(), error))
    assert error == 0

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')
                                       args.train_batch_size,
                                       args.max_seq_length)
eval_dataloader_loss = DynamicBatchingLoader(args.eval_input_file, enc,
                                             args.normalize_data,
                                             args.eval_batch_size,
                                             args.max_seq_length)
eval_dataloader_gen = get_eval_list_same_length(args.eval_input_file, enc,
                                                args.eval_batch_size, True)

#########################################################################
# Prepare Model and Optimizer
#########################################################################
mpu.initialize_model_parallel(1)
model = load_model(GPT2LMHeadModel(config),
                   args.init_checkpoint,
                   args,
                   verbose=True)
if args.local_rank != -1:
    # when from scratch make sure initial models are the same
    params = [p.data for p in model.parameters()]
    all_reduce_and_rescale_tensors(
        params, float(torch.distributed.get_world_size()))

model_parameters = filter(lambda p: p.requires_grad, model.parameters())
total_params = sum([np.prod(p.size()) for p in model_parameters])
logger.info('Number of parameters = {}'.format(total_params))
def __init__(self,
             config,
             batch_slices,
             seq_slices,
             distributed_init_method,
             world_size,
             data_parallel_size,
             model_parallel_size,
             pipeline_parallel_size,
             rank,
             local_rank,
             mixed_precision=False,
             use_mpi=False,
             init_process_group=False,
             checkpoint_gradients=False):
    self.config = config
    self.batch_slices = batch_slices
    self.seq_slices = seq_slices

    torch.cuda.set_device(local_rank)
    if init_process_group:
        dist.init_process_group(
            backend='nccl',
            init_method=distributed_init_method,
            world_size=world_size,
            rank=rank,
        )
    # A small all_reduce for warmup.
    dist.all_reduce(torch.zeros(1).cuda())
    mpu.initialize_model_parallel(model_parallel_size,
                                  pipeline_parallel_size)
    set_random_seed(0)
    mpu.model_parallel_cuda_manual_seed(0)

    self.rank = rank
    self.local_rank = local_rank
    self.world_size = world_size
    self.data_parallel_size = data_parallel_size
    self.model_parallel_size = model_parallel_size
    self.pipeline_parallel_size = pipeline_parallel_size
    self.pipeline_parallel_group_rank = mpu.get_pipeline_parallel_group_rank()
    self.data_parallel_group = mpu.get_data_parallel_group()
    self.model_parallel_group = mpu.get_model_parallel_group()
    self.pipeline_parallel_pred_group = mpu.get_pipeline_parallel_pred_group()
    self.pipeline_parallel_succ_group = mpu.get_pipeline_parallel_succ_group()
    self.model_parallel_src_rank = mpu.get_model_parallel_src_rank()
    self.model_parallel_dst_rank = mpu.get_model_parallel_dst_rank()
    self.model_parallel_next_src_rank = (
        self.model_parallel_src_rank + self.model_parallel_size
        if self.pipeline_parallel_group_rank < self.pipeline_parallel_size - 1
        else None)
    self.model_parallel_prev_dst_rank = (
        self.model_parallel_dst_rank - self.model_parallel_size
        if self.pipeline_parallel_group_rank > 0 else None)

    # Distribute the layers over the pipeline stages; the first
    # (n_layers % pipeline_parallel_size) stages get one extra layer.
    self.n_layers = (config.n_layers // pipeline_parallel_size
                     + int(rank < config.n_layers % pipeline_parallel_size))
    self.mixed_precision = mixed_precision
    self.checkpoint_gradients = checkpoint_gradients

    self.layers = []
    for _ in range(self.n_layers):
        layer = ModelParallelTransformerLayer(
            self.config.embedding_dim,
            self.config.ffn_embedding_dim,
            self.config.num_attention_heads,
            device="cuda",
            checkpoint_gradients=self.checkpoint_gradients)
        self.layers.append(layer.half() if self.mixed_precision else layer)

    self.all_parameters = []
    for layer in self.layers:
        self.all_parameters.extend(layer.parameters())
    self.n_params = len(self.all_parameters)

    if self.mixed_precision:
        # Keep fp32 master copies of the fp16 parameters for the optimizer.
        self.master_parameters = [
            p.clone().detach().float() for p in self.all_parameters
        ]
        for p in self.master_parameters:
            p.requires_grad_()
        self.optimizer = optimizers.FusedAdam(self.master_parameters,
                                              lr=1e-10)
    else:
        self.optimizer = torch.optim.Adam(self.all_parameters, lr=1e-10)
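# Worked example of the layer-assignment formula used for self.n_layers
# above (a standalone sketch; 'stage' plays the role of the pipeline
# rank): 10 layers over 4 stages gives the first two stages an extra layer.
def layers_on_stage(n_layers, pipeline_parallel_size, stage):
    return (n_layers // pipeline_parallel_size
            + int(stage < n_layers % pipeline_parallel_size))

assert [layers_on_stage(10, 4, s) for s in range(4)] == [3, 3, 2, 2]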
def test_initialize_affine_weight(model_parallel_size):

    mpu.initialize_model_parallel(model_parallel_size)
    if torch.distributed.get_rank() == 0:
        print('> testing initialize_affine_weight with model parallel '
              'size: {}'.format(model_parallel_size))
    model_parallel_size = mpu.get_model_parallel_world_size()

    seed = 12345
    input_size_coeff = 13
    input_size = input_size_coeff * model_parallel_size
    output_size_coeff = 17
    output_size = output_size_coeff * model_parallel_size

    # ---------------
    # Column parallel
    # ---------------
    weight = torch.empty(output_size_coeff, input_size)
    set_random_seed(seed)
    layers._initialize_affine_weight(weight, output_size, input_size,
                                     output_size_coeff, 0,
                                     torch.nn.init.normal_)
    # Target.
    set_random_seed(seed)
    master_weight = torch.empty(output_size, input_size)
    torch.nn.init.normal_(master_weight)
    rank = mpu.get_model_parallel_rank()
    my_weight = torch.split(master_weight, output_size_coeff,
                            dim=0)[rank].contiguous().clone()

    # Compare.
    error = weight.sub(my_weight).abs().max()
    torch.distributed.barrier()
    print('   column parallel max error (should be zero) on global rank '
          '{}: {}'.format(torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # ------------
    # Row parallel
    # ------------
    weight = torch.empty(output_size, input_size_coeff)
    set_random_seed(seed)
    mpu.layers._initialize_affine_weight(weight, output_size, input_size,
                                         input_size_coeff, 1,
                                         torch.nn.init.normal_)
    # Target.
    set_random_seed(seed)
    master_weight = torch.empty(output_size, input_size)
    torch.nn.init.normal_(master_weight)
    rank = mpu.get_model_parallel_rank()
    my_weight = torch.split(master_weight, input_size_coeff,
                            dim=1)[rank].contiguous().clone()

    # Compare.
    error = weight.sub(my_weight).abs().max()
    torch.distributed.barrier()
    print('   row parallel max error (should be zero) on global rank '
          '{}: {}'.format(torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')
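# Pure-CPU sketch of the slicing convention the test compares against:
# for a [output_size, input_size] weight, column-parallel layers shard
# dim 0 (the outputs) and row-parallel layers shard dim 1 (the inputs).
import torch

master = torch.arange(24.0).reshape(4, 6)      # [output_size, input_size]
column_shards = torch.split(master, 2, dim=0)  # two [2, 6] shards
row_shards = torch.split(master, 3, dim=1)     # two [4, 3] shards
assert column_shards[1].shape == (2, 6)
assert row_shards[1].shape == (4, 3)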
def test_parallel_embedding(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing parallel embedding with model parallel size {} ...'.
              format(model_parallel_size))

    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()

    batch_size = 17
    seq_length = 23
    vocab_size = 48
    hidden_size = 16
    seed = 1236

    set_random_seed(123)
    input_data = torch.LongTensor(size=(batch_size, seq_length)).random_(
        0, vocab_size).cuda()
    loss_weight = torch.randn([batch_size, seq_length, hidden_size]).cuda()

    set_random_seed(seed)
    embedding_original = torch.nn.Embedding(vocab_size, hidden_size).cuda()

    output = embedding_original(input_data)
    loss_original = torch.mul(output, loss_weight).sum()
    loss_original.backward()

    set_random_seed(seed)
    embedding_parallel = layers.ParallelEmbedding(
        vocab_size, hidden_size, init_method=init.normal_).cuda()
    output = embedding_parallel(input_data)
    loss_parallel = torch.mul(output, loss_weight).sum()
    loss_parallel.backward()

    set_random_seed(seed)
    embedding_vocab_parallel = layers.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init.normal_).cuda()
    output = embedding_vocab_parallel(input_data)
    loss_vocab_parallel = torch.mul(output, loss_weight).sum()
    loss_vocab_parallel.backward()

    torch.distributed.barrier()
    error = loss_parallel.sub(loss_original).abs()
    print('   error in loss (parallel) on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, 'error: {}'.format(error)

    torch.distributed.barrier()
    error = loss_vocab_parallel.sub(loss_original).abs()
    print('   error in loss (vocab parallel) on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, 'error: {}'.format(error)

    weight_grad_orig = torch.split(embedding_original.weight.grad,
                                   hidden_size // model_parallel_size,
                                   1)[mpu.get_model_parallel_rank()]
    error = embedding_parallel.weight.grad.sub(weight_grad_orig).abs().max()
    print('   error in grad (parallel) on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, 'error: {}'.format(error)

    weight_grad_orig = torch.split(embedding_original.weight.grad,
                                   vocab_size // model_parallel_size,
                                   0)[mpu.get_model_parallel_rank()]
    error = embedding_vocab_parallel.weight.grad.sub(
        weight_grad_orig).abs().max()
    print('   error in grad (vocab parallel) on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, 'error: {}'.format(error)

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')
def test_row_parallel_linear(model_parallel_size):

    mpu.initialize_model_parallel(model_parallel_size)
    if torch.distributed.get_rank() == 0:
        print('> testing RowParallelLinear with model parallel '
              'size: {}'.format(model_parallel_size))
    model_parallel_size = mpu.get_model_parallel_world_size()

    seed = 12345
    set_random_seed(seed)
    input_size_coeff = 13
    input_size = input_size_coeff * model_parallel_size
    output_size_coeff = 17
    output_size = output_size_coeff * model_parallel_size
    batch_size = 7

    # Network
    identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
    linear_layer = mpu.RowParallelLinear(
        input_size, output_size, keep_master_weight_for_test=True).cuda()
    loss_weight = torch.randn([batch_size, output_size]).cuda()
    # Forward
    input_ = identity_layer()
    output = linear_layer(input_)
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()

    # Values.
    dLdY = loss_weight
    X = identity_layer.weight
    A = linear_layer.master_weight.cuda()
    dLdA = torch.matmul(dLdY.t(), X)
    dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
    dLdX = torch.matmul(dLdY, A)

    rank = mpu.get_model_parallel_rank()
    my_dLdA = torch.split(dLdA, input_size_coeff,
                          dim=1)[rank].contiguous().clone()
    error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print('   error in dLdA on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    error = dLdb.sub(linear_layer.bias.grad).abs().max()
    torch.distributed.barrier()
    print('   error in dLdb on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    error = dLdX.sub(identity_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print('   error in dLdX on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')
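# Single-process check of the reference gradients used above for
# Y = X A^T + b with upstream gradient dL/dY:
#   dL/dA = (dL/dY)^T X,  dL/db = column sums of dL/dY,  dL/dX = (dL/dY) A.
import torch

batch, in_size, out_size = 7, 13, 17
X = torch.randn(batch, in_size, requires_grad=True)
linear = torch.nn.Linear(in_size, out_size)
dLdY = torch.randn(batch, out_size)
(linear(X) * dLdY).sum().backward()
assert torch.allclose(linear.weight.grad, dLdY.t() @ X, atol=1e-5)
assert torch.allclose(linear.bias.grad, dLdY.sum(0), atol=1e-5)
assert torch.allclose(X.grad, dLdY @ linear.weight, atol=1e-5)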