def parse_args(extra_args_provider=None, defaults={},
               ignore_unknown_args=False):
    """Parse all arguments."""
    parser = _get_parser(extra_args_provider)

    # Parse.
    if ignore_unknown_args:
        args, _ = parser.parse_known_args()
    else:
        args = parser.parse_args()

    # Tee logs to file ASAP.
    if args.log_dir:
        os.makedirs(args.log_dir, exist_ok=True)
        hostname = gethostname()
        file_prefix = os.path.join(args.log_dir, hostname)
        Tee(file_prefix + '_stdout.txt', err=False)
        Tee(file_prefix + '_stderr.txt', err=True)

    # Distributed args.
    configure_distributed_args(args)

    # Fp16 loss scaling.
    args.dynamic_loss_scale = False
    if args.loss_scale is None:
        args.dynamic_loss_scale = True

    # Parameters dtype.
    args.params_dtype = torch.float
    if args.fp16:
        args.params_dtype = torch.half
    if args.rank == 0:
        print('using {} for parameters ...'.format(args.params_dtype),
              flush=True)

    # Set input defaults.
    for key in defaults:
        # For default to be valid, it should not be provided in the
        # arguments that are passed to the program. We check this by
        # ensuring the arg is set to None.
        if getattr(args, key) is not None:
            if args.rank == 0:
                print('WARNING: overriding default arguments for {key}:{v} \
                      with {key}:{v2}'.format(key=key, v=defaults[key],
                                              v2=getattr(args, key)),
                      flush=True)
        else:
            setattr(args, key, defaults[key])

    # Check required arguments.
    required_args = ['num_layers', 'hidden_size', 'num_attention_heads',
                     'max_position_embeddings']
    for req_arg in required_args:
        _check_arg_is_not_none(args, req_arg)

    # Checks.
    assert args.hidden_size % args.num_attention_heads == 0
    if args.seq_length is not None:
        assert args.max_position_embeddings >= args.seq_length
    if args.lr is not None:
        assert args.min_lr <= args.lr
    if args.save is not None:
        assert args.save_interval is not None
    # Parameters sharing does not work with torch DDP.
    if (args.num_unique_layers is not None) and (args.num_layers is not None):
        assert args.num_unique_layers <= args.num_layers
        assert args.num_layers % args.num_unique_layers == 0, \
            'num-layers should be divisible by num-unique-layers.'

    # Mixed precision checks.
    if args.fp16_lm_cross_entropy:
        assert args.fp16, 'lm cross entropy in fp16 is only supported in fp16 mode.'

    # Activation checkpointing.
    if args.distribute_checkpointed_activations:
        assert args.checkpoint_activations, \
            'for distribute-checkpointed-activations to work you ' \
            'need to enable checkpoint-activations'

    # Load scaled_upper_triang_masked_softmax_fusion kernel.
    if args.scaled_upper_triang_masked_softmax_fusion:
        fused_kernels.load_scaled_upper_triang_masked_softmax_fusion_kernel()

    # Load scaled_masked_softmax_fusion kernel.
    if args.scaled_masked_softmax_fusion:
        fused_kernels.load_scaled_masked_softmax_fusion_kernel()

    _print_args(args)
    return args
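# The Tee helper called above is not included in this excerpt. The sketch
# below is only an illustration of what such a stdout/stderr tee typically
# does (mirror a stream into a log file); it is an assumption, not the
# repository's actual implementation, and the class name is reused here
# purely for readability.
import sys


class Tee:
    """Duplicate writes to stdout or stderr into a log file (illustrative)."""

    def __init__(self, filename, err=False):
        self.file = open(filename, 'w')
        if err:
            self.stream, sys.stderr = sys.stderr, self
        else:
            self.stream, sys.stdout = sys.stdout, self

    def write(self, message):
        # Write to both the original stream and the log file.
        self.stream.write(message)
        self.file.write(message)

    def flush(self):
        self.stream.flush()
        self.file.flush()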
def parse_args(extra_args_provider=None, defaults={},
               ignore_unknown_args=False):
    """Parse all arguments."""
    parser = argparse.ArgumentParser(description='Megatron-LM Arguments',
                                     allow_abbrev=False)

    # Standard arguments.
    parser = _add_network_size_args(parser)
    parser = _add_regularization_args(parser)
    parser = _add_training_args(parser)
    parser = _add_initialization_args(parser)
    parser = _add_learning_rate_args(parser)
    parser = _add_checkpointing_args(parser)
    parser = _add_mixed_precision_args(parser)
    parser = _add_distributed_args(parser)
    parser = _add_validation_args(parser)
    parser = _add_data_args(parser)
    parser = _add_autoresume_args(parser)
    parser = _add_realm_args(parser)
    parser = _add_zero_args(parser)
    parser = _add_activation_checkpoint_args(parser)

    # Custom arguments.
    if extra_args_provider is not None:
        parser = extra_args_provider(parser)

    # Include DeepSpeed configuration arguments.
    parser = deepspeed.add_config_arguments(parser)

    # Parse.
    if ignore_unknown_args:
        args, _ = parser.parse_known_args()
    else:
        args = parser.parse_args()

    # Distributed args.
    args.rank = int(os.getenv('RANK', '0'))
    args.world_size = int(os.getenv('WORLD_SIZE', '1'))
    args.model_parallel_size = min(args.model_parallel_size, args.world_size)
    if args.rank == 0:
        print('using world size: {} and model-parallel size: {} '.format(
            args.world_size, args.model_parallel_size))

    # Fp16 loss scaling.
    args.dynamic_loss_scale = False
    if args.loss_scale is None:
        args.dynamic_loss_scale = True

    # Parameters dtype.
    args.params_dtype = torch.float
    if args.fp16:
        args.params_dtype = torch.half
    if args.rank == 0:
        print('using {} for parameters ...'.format(args.params_dtype),
              flush=True)

    # Set input defaults.
    for key in defaults:
        # For default to be valid, it should not be provided in the
        # arguments that are passed to the program. We check this by
        # ensuring the arg is set to None.
        if getattr(args, key) is not None:
            if args.rank == 0:
                print('WARNING: overriding default arguments for {key}:{v} \
                      with {key}:{v2}'.format(key=key, v=defaults[key],
                                              v2=getattr(args, key)),
                      flush=True)
        else:
            setattr(args, key, defaults[key])

    # Check required arguments.
    required_args = ['num_layers', 'hidden_size', 'num_attention_heads',
                     'max_position_embeddings']
    for req_arg in required_args:
        _check_arg_is_not_none(args, req_arg)

    # Checks.
    assert args.hidden_size % args.num_attention_heads == 0
    if args.seq_length is not None:
        assert args.max_position_embeddings >= args.seq_length
    if args.lr is not None:
        assert args.min_lr <= args.lr
    if args.save is not None:
        assert args.save_interval is not None
    # Parameters sharing does not work with torch DDP.
    if (args.num_unique_layers is not None) and (args.num_layers is not None):
        assert args.num_unique_layers <= args.num_layers
        assert args.num_layers % args.num_unique_layers == 0, \
            'num-layers should be divisible by num-unique-layers.'
        if args.num_unique_layers < args.num_layers:
            assert args.DDP_impl == 'local', \
                'torch-DDP does not work with parameters sharing.'

    # Mixed precision checks.
    if args.fp16_lm_cross_entropy:
        assert args.fp16, 'lm cross entropy in fp16 is only supported in fp16 mode.'

    # Activation checkpointing.
    if args.distribute_checkpointed_activations:
        assert args.checkpoint_activations, \
            'for distribute-checkpointed-activations to work you ' \
            'need to enable checkpoint-activations'

    # Load scaled_upper_triang_masked_softmax_fusion kernel.
    if args.scaled_upper_triang_masked_softmax_fusion:
        fused_kernels.load_scaled_upper_triang_masked_softmax_fusion_kernel()

    # Load scaled_masked_softmax_fusion kernel.
    if args.scaled_masked_softmax_fusion:
        fused_kernels.load_scaled_masked_softmax_fusion_kernel()

    _print_args(args)
    return args
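# Hedged usage sketch (not part of the source): one way a training script
# might call parse_args with a task-specific extra_args_provider and a
# defaults override. The flag name --my-task-flag and the helper name
# _add_my_task_args are hypothetical, chosen only for illustration; the
# 'seq_length' key matches an attribute parse_args already checks, and the
# defaults loop above only applies it when the flag was left unset (None).
def _add_my_task_args(parser):
    group = parser.add_argument_group(title='my task (hypothetical)')
    group.add_argument('--my-task-flag', action='store_true',
                       help='example of a task-specific argument')
    return parser


# args = parse_args(extra_args_provider=_add_my_task_args,
#                   defaults={'seq_length': 512})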
def parse_args(extra_args_provider=None, defaults={},
               ignore_unknown_args=False):
    """Parse all arguments."""
    parser = argparse.ArgumentParser(description='Megatron-LM Arguments',
                                     allow_abbrev=False)

    # Standard arguments.
    parser = _add_network_size_args(parser)
    parser = _add_regularization_args(parser)
    parser = _add_training_args(parser)
    parser = _add_initialization_args(parser)
    parser = _add_learning_rate_args(parser)
    parser = _add_checkpointing_args(parser)
    parser = _add_mixed_precision_args(parser)
    parser = _add_distributed_args(parser)
    parser = _add_validation_args(parser)
    parser = _add_data_args(parser)
    parser = _add_autoresume_args(parser)
    parser = _add_realm_args(parser)

    # Custom arguments.
    if extra_args_provider is not None:
        parser = extra_args_provider(parser)

    # Parse.
    if ignore_unknown_args:
        args, _ = parser.parse_known_args()
    else:
        args = parser.parse_args()

    if args.use_set == 1:
        args = set_args(args)

    # Distributed args.
    args.rank = int(os.getenv('RANK', '0'))
    args.world_size = int(os.getenv('WORLD_SIZE', '1'))
    # Tensor model parallel size.
    args.tensor_model_parallel_size = min(
        args.tensor_model_parallel_size, args.world_size)
    assert args.world_size % args.tensor_model_parallel_size == 0, 'world size' \
        ' ({}) is not divisible by tensor model parallel size ({})'.format(
            args.world_size, args.tensor_model_parallel_size)
    # Pipeline model parallel size.
    args.pipeline_model_parallel_size = min(
        args.pipeline_model_parallel_size,
        (args.world_size // args.tensor_model_parallel_size))
    # Checks.
    model_parallel_size = args.pipeline_model_parallel_size * \
                          args.tensor_model_parallel_size
    assert args.world_size % model_parallel_size == 0, 'world size is not' \
        ' divisible by tensor model parallel size ({}) times pipeline model ' \
        'parallel size ({})'.format(args.world_size,
                                    args.tensor_model_parallel_size,
                                    args.pipeline_model_parallel_size)
    args.data_parallel_size = args.world_size // model_parallel_size
    if args.rank == 0:
        print('using world size: {}, data-parallel-size: {}, '
              'tensor-model-parallel size: {}, '
              'pipeline-model-parallel size: {} '.format(
                  args.world_size, args.data_parallel_size,
                  args.tensor_model_parallel_size,
                  args.pipeline_model_parallel_size), flush=True)

    # Deprecated arguments.
    assert args.batch_size is None, '--batch-size argument is no longer ' \
        'valid, use --micro-batch-size instead'
    del args.batch_size
    assert args.warmup is None, '--warmup argument is no longer valid, use ' \
        '--lr-warmup-fraction instead'
    del args.warmup
    assert args.model_parallel_size is None, '--model-parallel-size is no ' \
        'longer valid, use --tensor-model-parallel-size instead'
    del args.model_parallel_size

    # Batch size.
    assert args.micro_batch_size is not None
    assert args.micro_batch_size > 0
    if args.global_batch_size is None:
        args.global_batch_size = args.micro_batch_size * args.data_parallel_size
        if args.rank == 0:
            print('setting global batch size to {}'.format(
                args.global_batch_size), flush=True)
    assert args.global_batch_size > 0

    # Parameters dtype.
    args.params_dtype = torch.float
    if args.fp16:
        args.params_dtype = torch.half
    if args.rank == 0:
        print('using {} for parameters ...'.format(args.params_dtype),
              flush=True)

    # Consumed samples.
    args.consumed_train_samples = 0
    args.consumed_valid_samples = 0

    # Set input defaults.
    for key in defaults:
        # For default to be valid, it should not be provided in the
        # arguments that are passed to the program. We check this by
        # ensuring the arg is set to None.
        if getattr(args, key) is not None:
            if args.rank == 0:
                print('WARNING: overriding default arguments for {key}:{v} \
                      with {key}:{v2}'.format(key=key, v=defaults[key],
                                              v2=getattr(args, key)),
                      flush=True)
        else:
            setattr(args, key, defaults[key])

    # Iteration-based training.
    if args.train_iters:
        # If we use iteration-based training, make sure the
        # sample-based options are off.
        assert args.train_samples is None, \
            'expected iteration-based training'
        assert args.lr_decay_samples is None, \
            'expected iteration-based learning rate decay'
        assert args.lr_warmup_samples == 0, \
            'expected iteration-based learning rate warmup'
        assert args.rampup_batch_size is None, \
            'expected no batch-size rampup for iteration-based training'
        if args.lr_warmup_fraction is not None:
            assert args.lr_warmup_iters == 0, \
                'can only specify one of lr-warmup-fraction and lr-warmup-iters'

    # Sample-based training.
    if args.train_samples:
        # If we use sample-based training, make sure the
        # iteration-based options are off.
        assert args.train_iters is None, \
            'expected sample-based training'
        assert args.lr_decay_iters is None, \
            'expected sample-based learning rate decay'
        assert args.lr_warmup_iters == 0, \
            'expected sample-based learning rate warmup'
        if args.lr_warmup_fraction is not None:
            assert args.lr_warmup_samples == 0, \
                'can only specify one of lr-warmup-fraction and lr-warmup-samples'

    # Check required arguments.
    required_args = ['num_layers', 'hidden_size', 'num_attention_heads',
                     'max_position_embeddings']
    for req_arg in required_args:
        _check_arg_is_not_none(args, req_arg)

    # Checks.
    assert args.hidden_size % args.num_attention_heads == 0
    if args.seq_length is not None:
        assert args.max_position_embeddings >= args.seq_length
    if args.lr is not None:
        assert args.min_lr <= args.lr
    if args.save is not None:
        assert args.save_interval is not None

    # Mixed precision checks.
    if args.fp16_lm_cross_entropy:
        assert args.fp16, 'lm cross entropy in fp16 is only supported in fp16 mode.'
    if args.fp32_residual_connection:
        assert args.fp16, \
            'residual connection in fp32 only supported when using fp16.'

    # Activation checkpointing.
    if args.distribute_checkpointed_activations:
        assert args.checkpoint_activations, \
            'for distribute-checkpointed-activations to work you ' \
            'need to enable checkpoint-activations'

    # Fused softmax kernels.
    if args.scaled_masked_softmax_fusion:
        if args.scaled_upper_triang_masked_softmax_fusion:
            fused_kernels.load_scaled_upper_triang_masked_softmax_fusion_kernel()
        else:
            fused_kernels.load_scaled_masked_softmax_fusion_kernel()
    else:
        # This argument will eventually go away; for now make sure it is off
        # if scaled_masked_softmax_fusion is off.
        args.scaled_upper_triang_masked_softmax_fusion = False

    # Load mixed precision fused layer norm.
    if args.fp32_residual_connection:
        fused_kernels.load_fused_mix_prec_layer_norm_kernel()

    _print_args(args)
    return args
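# Illustrative arithmetic for the parallel-size bookkeeping above, using
# assumed example values (not from the source): with a world size of 16 ranks,
# --tensor-model-parallel-size 4 and --pipeline-model-parallel-size 2 form a
# model-parallel group of 8 ranks, so data_parallel_size = 16 // (4 * 2) = 2,
# and the default global batch size becomes micro_batch_size * 2.
world_size = 16
tensor_model_parallel_size = 4
pipeline_model_parallel_size = 2
model_parallel_size = tensor_model_parallel_size * pipeline_model_parallel_size
assert world_size % model_parallel_size == 0
data_parallel_size = world_size // model_parallel_size  # -> 2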