@classmethod
def from_pretrained(cls, model_name_or_path, seq_len=512):
    # Bring up a single-process distributed environment so the mpu
    # model-parallel utilities can be used for inference.
    init_method = 'tcp://' + os.getenv('MASTER_ADDR', 'localhost') + ':' + \
        os.getenv('MASTER_PORT', '6000')
    torch.distributed.init_process_group(backend='nccl',
                                         world_size=1,
                                         rank=0,
                                         init_method=init_method)
    mpu.initialize_model_parallel(1)

    # Seed every RNG source (Python, NumPy, Torch, and the model-parallel
    # CUDA tracker) so generation is reproducible.
    seed = 1234
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    mpu.model_parallel_cuda_manual_seed(seed)

    tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)

    logger.info("Check cached model files...")
    weights_path, deepspeed_config_path = download_model_files(
        model_name_or_path)

    model = setup_model(weights_path, deepspeed_config_path)
    model.cuda()
    model = model.eval()
    return cls(model,
               tokenizer=tokenizer,
               seq_len=seq_len,
               model_path=model_name_or_path)

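# A hedged usage sketch for the loader above. `GPT2Generator` is a
# hypothetical stand-in for the class that owns from_pretrained(), and the
# model name is likewise illustrative. Assumes a single CUDA device and the
# MASTER_ADDR / MASTER_PORT defaults baked into the method.
generator = GPT2Generator.from_pretrained('gpt2-large-deepspeed', seq_len=512)
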
def set_random_seed(seed):
    """Set random seed for reproducibility."""
    # Seeding is skipped entirely for None or non-positive seeds, letting
    # callers opt out of determinism.
    if seed is not None and seed > 0:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        mpu.model_parallel_cuda_manual_seed(seed)

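# A minimal reproducibility check for the torch portion of the seeding above.
# mpu.model_parallel_cuda_manual_seed additionally seeds the per-rank CUDA RNG
# tracker and needs an initialized model-parallel group, so it is omitted here.
import torch

torch.manual_seed(1234)
a = torch.randn(3)
torch.manual_seed(1234)
b = torch.randn(3)
assert torch.equal(a, b)  # same seed, same draws
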
def initialize_distributed_env(distributed_init_method, rank, local_rank,
                               world_size, model_parallel_size,
                               pipeline_parallel_size):
    # Bind this process to its GPU before creating the NCCL process group.
    torch.cuda.set_device(local_rank)
    dist.init_process_group(
        backend='nccl',
        init_method=distributed_init_method,
        world_size=world_size,
        rank=rank,
    )
    # A small all_reduce for warmup.
    dist.all_reduce(torch.zeros(1).cuda())
    mpu.initialize_model_parallel(model_parallel_size, pipeline_parallel_size)
    set_random_seed(0)
    mpu.model_parallel_cuda_manual_seed(0)

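# A sketch of how the helper above might be driven, assuming one process per
# GPU launched with torchrun (which exports RANK, LOCAL_RANK, WORLD_SIZE,
# MASTER_ADDR, and MASTER_PORT). The 2x2 model/pipeline split over four ranks
# is illustrative, not prescribed by this code.
import os

if __name__ == '__main__':
    init_method = 'tcp://{}:{}'.format(os.environ['MASTER_ADDR'],
                                       os.environ['MASTER_PORT'])
    initialize_distributed_env(init_method,
                               rank=int(os.environ['RANK']),
                               local_rank=int(os.environ['LOCAL_RANK']),
                               world_size=int(os.environ['WORLD_SIZE']),
                               model_parallel_size=2,
                               pipeline_parallel_size=2)
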
def test_model_parallel_cuda_manual_seed(tensor_model_parallel_size):
    if torch.distributed.get_rank() == 0:
        print('> testing model parallel cuda manual seed with size {} ...'.
              format(tensor_model_parallel_size))

    mpu.initialize_model_parallel(tensor_model_parallel_size)
    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()

    mpu.model_parallel_cuda_manual_seed(12345)
    assert torch.cuda.initial_seed() == 12345
    with mpu.get_cuda_rng_tracker().fork():
        assert torch.cuda.initial_seed() == (
            12345 + 2718 + mpu.get_tensor_model_parallel_rank())

    # Reset the tracker
    mpu.get_cuda_rng_tracker().reset()

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')

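# A runner sketch in the style of Megatron-LM's mpu test harness (assumed, not
# shown in this section): initialize the process group once, then sweep every
# power-of-two tensor-parallel size that fits in the world size.
import os

if __name__ == '__main__':
    torch.cuda.set_device(int(os.environ.get('LOCAL_RANK', 0)))
    torch.distributed.init_process_group(backend='nccl')
    world_size = torch.distributed.get_world_size()
    tensor_model_parallel_size = 1
    while tensor_model_parallel_size <= world_size:
        test_model_parallel_cuda_manual_seed(tensor_model_parallel_size)
        tensor_model_parallel_size *= 2
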
def __init__(self,
             config,
             batch_slices,
             seq_slices,
             distributed_init_method,
             world_size,
             data_parallel_size,
             model_parallel_size,
             pipeline_parallel_size,
             rank,
             local_rank,
             mixed_precision=False,
             use_mpi=False,
             init_process_group=False,
             checkpoint_gradients=False):
    self.config = config
    self.batch_slices = batch_slices
    self.seq_slices = seq_slices
    torch.cuda.set_device(local_rank)
    if init_process_group:
        dist.init_process_group(
            backend='nccl',
            init_method=distributed_init_method,
            world_size=world_size,
            rank=rank,
        )
        # A small all_reduce for warmup.
        dist.all_reduce(torch.zeros(1).cuda())
    mpu.initialize_model_parallel(model_parallel_size, pipeline_parallel_size)
    set_random_seed(0)
    mpu.model_parallel_cuda_manual_seed(0)
    self.rank = rank
    self.local_rank = local_rank
    self.world_size = world_size
    self.data_parallel_size = data_parallel_size
    self.model_parallel_size = model_parallel_size
    self.pipeline_parallel_size = pipeline_parallel_size

    # Cache the communication groups and the ranks of this worker's
    # neighbors along the pipeline.
    self.pipeline_parallel_group_rank = mpu.get_pipeline_parallel_group_rank()
    self.data_parallel_group = mpu.get_data_parallel_group()
    self.model_parallel_group = mpu.get_model_parallel_group()
    self.pipeline_parallel_pred_group = mpu.get_pipeline_parallel_pred_group()
    self.pipeline_parallel_succ_group = mpu.get_pipeline_parallel_succ_group()
    self.model_parallel_src_rank = mpu.get_model_parallel_src_rank()
    self.model_parallel_dst_rank = mpu.get_model_parallel_dst_rank()
    self.model_parallel_next_src_rank = (
        self.model_parallel_src_rank + self.model_parallel_size
        if self.pipeline_parallel_group_rank < self.pipeline_parallel_size - 1
        else None)
    self.model_parallel_prev_dst_rank = (
        self.model_parallel_dst_rank - self.model_parallel_size
        if self.pipeline_parallel_group_rank > 0 else None)

    # Distribute the transformer layers across pipeline stages; the first
    # (n_layers % pipeline_parallel_size) stages get one extra layer.
    self.n_layers = (config.n_layers // pipeline_parallel_size
                     + int(rank < config.n_layers % pipeline_parallel_size))
    self.mixed_precision = mixed_precision
    self.checkpoint_gradients = checkpoint_gradients

    self.layers = []
    for _ in range(self.n_layers):
        layer = ModelParallelTransformerLayer(
            self.config.embedding_dim,
            self.config.ffn_embedding_dim,
            self.config.num_attention_heads,
            device="cuda",
            checkpoint_gradients=self.checkpoint_gradients)
        self.layers.append(layer.half() if self.mixed_precision else layer)

    self.all_parameters = []
    for layer in self.layers:
        self.all_parameters.extend(layer.parameters())
    self.n_params = len(self.all_parameters)

    if self.mixed_precision:
        # Keep fp32 master copies of the fp16 parameters; the optimizer
        # steps on the masters, not on the model weights themselves.
        self.master_parameters = [
            p.clone().detach().float() for p in self.all_parameters
        ]
        for p in self.master_parameters:
            p.requires_grad_()
        self.optimizer = optimizers.FusedAdam(self.master_parameters,
                                              lr=1e-10)
    else:
        self.optimizer = torch.optim.Adam(self.all_parameters, lr=1e-10)

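# The fp32-master-weights idiom from the constructor above, in isolation:
# gradients computed on fp16 parameters are upcast, the optimizer steps on
# fp32 copies, and the updated values are copied back into the fp16 weights.
# Plain torch.optim.Adam stands in for apex's FusedAdam; requires a GPU.
import torch

param = torch.nn.Parameter(torch.randn(4, 4, device='cuda').half())
master = param.clone().detach().float()
master.requires_grad_()
opt = torch.optim.Adam([master], lr=1e-3)

param.grad = torch.randn_like(param)   # stand-in for a real backward pass
master.grad = param.grad.float()       # upcast fp16 grads to fp32
opt.step()                             # update the fp32 master copy
with torch.no_grad():
    param.copy_(master.half())         # sync fp16 weights from the master
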
def set_random_seed(seed):
    """Set random seed for reproducibility."""
    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    mpu.model_parallel_cuda_manual_seed(seed)