def initialize_megatron(extra_args_provider=None, args_defaults={},
                        ignore_unknown_args=False):
    """Set global variables, initialize distributed, and set
    autoresume and random seeds."""
    # Make sure cuda is available.
    assert torch.cuda.is_available(), 'Megatron requires CUDA.'

    # Parse args, build tokenizer, and set adlr-autoresume,
    # tensorboard-writer, and timers.
    set_global_variables(extra_args_provider=extra_args_provider,
                         args_defaults=args_defaults,
                         ignore_unknown_args=ignore_unknown_args)

    # Pytorch distributed.
    _initialize_distributed()

    # Autoresume.
    _init_autoresume()

    # Random seeds for reproducibility.
    args = get_args()
    if args.rank == 0:
        print('> setting random seeds to {} ...'.format(args.seed))
    _set_random_seed(args.seed)

    # Write arguments to tensorboard.
    _write_args_to_tensorboard()
def initialize_megatron(extra_args_provider=None, args_defaults={},
                        ignore_unknown_args=False, allow_no_cuda=False):
    """Set global variables, initialize distributed, and set
    autoresume and random seeds.
    `allow_no_cuda` should not be set unless using megatron for cpu only
    data processing. In general this arg should not be set unless you know
    what you are doing."""
    if not allow_no_cuda:
        # Make sure cuda is available.
        assert torch.cuda.is_available(), 'Megatron requires CUDA.'

    # Parse args, build tokenizer, and set adlr-autoresume,
    # tensorboard-writer, and timers.
    set_global_variables(extra_args_provider=extra_args_provider,
                         args_defaults=args_defaults,
                         ignore_unknown_args=ignore_unknown_args)

    # Pytorch distributed.
    _initialize_distributed()

    # Autoresume.
    _init_autoresume()

    # Random seeds for reproducibility.
    args = get_args()
    if args.rank == 0:
        print('> setting random seeds to {} ...'.format(args.seed))
    _set_random_seed(args.seed)

    # Write arguments to tensorboard.
    _write_args_to_tensorboard()
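# --- Usage sketch (assumed, not from the repo) --------------------------------
# The only intended reason to pass allow_no_cuda=True is a CPU-only data
# preprocessing job that still wants Megatron's global args/tokenizer but has
# no GPU. The helper name and args_defaults values below are illustrative.
def _example_cpu_only_init():  # hypothetical helper
    initialize_megatron(args_defaults={'micro_batch_size': 1},
                        allow_no_cuda=True)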
def initialize_megatron(extra_args_provider=None, args_defaults={},
                        ignore_unknown_args=False, allow_no_cuda=False):
    """Set global variables, initialize distributed, and set
    autoresume and random seeds.
    `allow_no_cuda` should not be set unless using megatron for cpu only
    data processing. In general this arg should not be set unless you know
    what you are doing.
    Returns a function to finalize distributed env initialization
    (optionally, only when args.lazy_mpu_init == True)
    """
    if not allow_no_cuda:
        # Make sure cuda is available.
        assert torch.cuda.is_available(), 'Megatron requires CUDA.'

    # Parse args, build tokenizer, and set adlr-autoresume,
    # tensorboard-writer, and timers.
    set_global_variables(extra_args_provider=extra_args_provider,
                         args_defaults=args_defaults,
                         ignore_unknown_args=ignore_unknown_args)

    # torch.distributed initialization
    def finish_mpu_init():
        args = get_args()
        # Pytorch distributed.
        _initialize_distributed()

        # Random seeds for reproducibility.
        if args.rank == 0:
            print('> setting random seeds to {} ...'.format(args.seed))
        _set_random_seed(args.seed)

    args = get_args()
    if args.lazy_mpu_init:
        args.use_cpu_initialization = True
        # delayed initialization of DDP-related stuff
        # We only set basic DDP globals
        set_tensor_model_parallel_world_size(args.tensor_model_parallel_size)
        # and return function for external DDP manager
        # to call when it has DDP initialized
        set_tensor_model_parallel_rank(args.rank)
        return finish_mpu_init
    else:
        # Megatron's MPU is the master. Complete initialization right away.
        finish_mpu_init()

        # Initialize memory buffers.
        _initialize_mem_buffs()

        # Autoresume.
        _init_autoresume()

        # Compile dependencies.
        _compile_dependencies()

        # No continuation function
        return None
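# --- Caller sketch (assumed, not from the repo) --------------------------------
# When args.lazy_mpu_init is true, initialize_megatron returns a continuation so
# an external DDP manager can finish Megatron's distributed + RNG setup only
# after it has initialized torch.distributed itself. The caller and framework
# names below are illustrative.
def _example_lazy_init(external_framework):  # hypothetical caller
    finish_mpu_init = initialize_megatron(
        args_defaults={'lazy_mpu_init': True, 'use_cpu_initialization': True})
    external_framework.setup_distributed()   # hypothetical: owns torch.distributed
    if finish_mpu_init is not None:
        finish_mpu_init()                     # completes Megatron's MPU/RNG setup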
def main():
    # Arguments do sanity checks on the world size, but we don't care,
    # so trick it into thinking we are plenty of processes
    os.environ["WORLD_SIZE"] = f'{2**31}'

    # Args
    set_global_variables(extra_args_provider=get_mp_merge_args,
                         args_defaults={'use_cpu_initialization': True,
                                        'micro_batch_size': 1,
                                        'no_load_optim': True,
                                        'no_load_rng': True,
                                        'no_save_optim': True,
                                        'no_save_rng': True,
                                        'save_interval': 1})
    args = get_args()

    if args.pipeline_model_parallel_size > 1:
        print("Checkpoints with pipeline model parallelism are not currently supported.")
        exit()

    model_type = args.model_type
    orig_tensor_model_parallel_size = args.tensor_model_parallel_size
    args.tensor_model_parallel_size = 1
    tokenizer = rebuild_tokenizer(args)

    print('\n merging model parallel partitions ...')
    print(' > number of partitions: {}'.format(orig_tensor_model_parallel_size))
    print(' > checkpoint path: {}'.format(args.load))
    print(' > model parameters:')
    print('    number of tokens ................ {} '.format(tokenizer.vocab_size))
    print('    number of layers ................ {}'.format(args.num_layers))
    print('    hidden size ..................... {}'.format(args.hidden_size))
    print('    number of attention heads ....... {}'.format(args.num_attention_heads))
    print('    maximum position embeddings ..... {}'.format(args.max_position_embeddings))

    # Full model.
    print('> building the full model ...')
    mpu.initialize.set_tensor_model_parallel_world_size(1)
    mpu.initialize.set_tensor_model_parallel_rank(0)
    mpu.initialize.set_pipeline_model_parallel_world_size(1)
    mpu.initialize.set_pipeline_model_parallel_rank(0)
    merged_model = get_model(model_type)

    # Build and load partitions.
    partitions = []
    iteration = 0
    args.tensor_model_parallel_size = orig_tensor_model_parallel_size
    tokenizer = rebuild_tokenizer(args)
    mpu.initialize.set_tensor_model_parallel_world_size(
        args.tensor_model_parallel_size)
    for rank in range(args.tensor_model_parallel_size):
        # Reset these since load_checkpoint asserts they are 0, but we are loading
        # multiple checkpoints in the same process and they get set each time
        args.consumed_train_samples = 0
        args.consumed_valid_samples = 0

        mpu.initialize.set_tensor_model_parallel_rank(rank)
        checkpoint_name, iteration = get_parallel_checkpoint_name(args.load)
        model_ = get_model(model_type)
        print(f'> loading {checkpoint_name} ...')
        load_checkpoint(model_, None, None)
        print(f'> checkpoint version {get_checkpoint_version()}')
        partitions.append(model_)

    # Parameter generators so we can loop through them simultaneously.
    merged_params_gen = merged_model.named_parameters()
    partitions_params_gen = [partition.named_parameters()
                             for partition in partitions]
    while True:
        try:
            # Get the params and check names.
            name, merged_param = next(merged_params_gen)
            print(' > working on {} ...'.format(name))
            print('     merged         type: {}, size: {}'.format(
                merged_param.dtype, list(merged_param.size())))
            partitions_param = []
            for rank, partition_params_gen in enumerate(partitions_params_gen):
                partition_name, partition_param = next(partition_params_gen)
                assert partition_name == name
                partitions_param.append(partition_param)
                print('     partition {}    type: {}, size: {}'.format(
                    rank, partition_param.dtype, list(partition_param.size())))

            # For the non-parallel parameters, simply copy the rank 0 values.
            if not hasattr(merged_param, 'tensor_model_parallel'):
                print('     non-parallel parameter, simple copy from rank 0')
                with torch.no_grad():
                    merged_param.data.copy_(partitions_param[0].data)
            # For parallel parameters, merge the values
            else:
                dim = merged_param.partition_dim
                stride = merged_param.partition_stride
                print(f'     parallel parameter merge with stride {stride} along '
                      f'dimension {dim}')
                merge_partitions(merged_param, partitions_param, dim, stride)

        except StopIteration:
            break

    partitions = []
    args.tensor_model_parallel_size = 1
    args.pipeline_model_parallel_size = args.target_pipeline_model_parallel_size

    assert args.num_layers % args.pipeline_model_parallel_size == 0, \
        'num_layers must be divisible by target pipeline model parallel size'
    layers_per_part = args.num_layers // args.pipeline_model_parallel_size

    tokenizer = rebuild_tokenizer(args)
    mpu.initialize.set_tensor_model_parallel_world_size(
        args.tensor_model_parallel_size)
    mpu.initialize.set_tensor_model_parallel_rank(0)
    mpu.initialize.set_pipeline_model_parallel_world_size(
        args.pipeline_model_parallel_size)

    # regex to parse out layer number from param name
    layer_re = re.compile(r'layers\.([0-9]+)')

    if args.pipeline_model_parallel_size > 1:
        merged_params = {}
        for name, merged_param in merged_model.named_parameters():
            merged_params[name] = merged_param

        for rank in range(args.pipeline_model_parallel_size):
            mpu.initialize.set_pipeline_model_parallel_rank(rank)
            model = get_model(model_type)

            def update_layer_num(m):
                # TODO! This assumes no interleaved pipeline execution
                layer = int(m.group(1))
                layer += rank * layers_per_part
                return f'layers.{layer}'

            for dst_name, partition_param in model.named_parameters():
                if dst_name == "word_embeddings.weight":
                    # See comment in MegatronModule.initialize_word_embeddings()
                    src_name = "language_model.embedding.word_embeddings.weight"
                else:
                    # Translate destination layer number (0-N for each partition)
                    # to source layer number (single-model layer number)
                    src_name = re.sub(layer_re, update_layer_num, dst_name)
                print(f" > copying {src_name} to {dst_name} in rank {rank}'s model")
                partition_param.data.copy_(merged_params[src_name].data)

            partitions.append(model)
    else:
        partitions = [merged_model]

    for rank, model in enumerate(partitions):
        mpu.initialize.set_pipeline_model_parallel_rank(rank)
        print(f"> saving rank {rank}'s model")
        save_checkpoint(iteration, model, None, None)

    print('done :-)')
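# --- Conceptual sketch (assumed, not the repo's merge_partitions) ---------------
# For stride == 1, merging tensor-parallel shards amounts to concatenating the
# per-rank slices back along the parameter's partition_dim. Function names and
# shapes below are made up for illustration.
import torch

def _merge_stride1(shards, dim):
    """Rebuild a full parameter from per-rank shards split along `dim`."""
    return torch.cat([s.detach() for s in shards], dim=dim)

def _example_merge():
    # e.g. a column-parallel weight of shape (4096, 1024) split over 2 ranks
    shards = [torch.randn(2048, 1024), torch.randn(2048, 1024)]
    full = _merge_stride1(shards, dim=0)
    assert full.shape == (4096, 1024)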