def setup_for_inference_or_eval(inference=True, get_key_value=True, overwrite_values=None):
    from megatron.neox_arguments import NeoXArgs
    from megatron.initialize import initialize_megatron
    from megatron.training import setup_model_and_optimizer

    _overwrite_values = {
        "checkpoint_activations": False,
        "partition_activations": False,
        "no_load_optim": True,
    }
    if overwrite_values:
        _overwrite_values.update(overwrite_values)
    neox_args = NeoXArgs.consume_neox_args(overwrite_values=_overwrite_values)
    neox_args.configure_distributed_args()
    neox_args.build_tokenizer()

    if neox_args.load is None:
        raise ValueError("`load` parameter must be supplied to load a model")

    # initialize megatron
    initialize_megatron(neox_args)

    # set up model and load checkpoint.
    model, _, _ = setup_model_and_optimizer(
        neox_args=neox_args, inference=inference, get_key_value=get_key_value
    )  # we use setup_model_and_optimizer instead of get_model in order to initialize deepspeed
    print_rank_0('Finished loading model')
    return model, neox_args
def __init__(self, context: DeepSpeedTrialContext) -> None:
    self.context = context
    self.exp_config = self.context.get_experiment_config()
    self.args = AttrMap(self.context.get_hparams())

    # Initialize and get arguments, timers, and TensorBoard writer.
    try:
        self.neox_args = get_neox_args(self.context)
    except Exception:
        traceback.print_exc()
        raise InvalidHP("Could not parse neox_args.")
    self.wrapped_writer = TorchWriter()
    self.neox_args.tensorboard_writer = self.wrapped_writer.writer
    self.neox_args.configure_distributed_args()
    # The tokenizer needs to be built before model initialization in order to set the
    # required padded_vocab_size argument.
    self.neox_args.build_tokenizer()
    megatron_train.initialize_megatron(neox_args=self.neox_args)
    self.timers = megatron_utils.Timers(
        use_wandb=False, tensorboard_writer=self.neox_args.tensorboard_writer
    )

    # Model, optimizer, and learning rate scheduler.
    self.timers("model and optimizer").start()
    (
        model,
        self.optimizer,
        self.lr_scheduler,
    ) = megatron_train.setup_model_and_optimizer(neox_args=self.neox_args)
    self.model = self.context.wrap_model_engine(model)
    self.timers("model and optimizer").stop()

    # Print setup timing.
    megatron_utils.print_rank_0("done with setups ...")
    self.timers.log(["model and optimizer"])
    megatron_utils.print_rank_0("training ...")

    # For tracking.
    if not self.args.search_world_size:
        self.reducer = self.context.wrap_reducer(
            LMReducers(self.neox_args), for_training=False, for_validation=True
        )
    self.report_memory_flag = True
    self.total_train_loss_dict = {}
    self.total_val_loss_dict = {}
    self.tflops = 0
    self.reported_flops = False
    self.overflow_monitor = megatron_utils.OverflowMonitor(self.optimizer)
    self.noise_scale_logger = megatron_utils.get_noise_scale_logger(self.neox_args)
    self.timers("interval time").start()
def setup_for_inference_or_eval(
    use_cache=True,
    overwrite_values=None,
):
    """
    Initializes the model for evaluation or inference (doesn't load optimizer states, etc.)
    from command line args.

    use_cache: bool
        Whether to use key value caching in inference.
    overwrite_values: dict
        Optional values to overwrite in the model config.
    """
    from megatron.neox_arguments import NeoXArgs
    from megatron.initialize import initialize_megatron
    from megatron.training import setup_model_and_optimizer

    _overwrite_values = {
        "checkpoint_activations": False,
        "partition_activations": False,
        "no_load_optim": True,
        # disable ZeRO optimization (it won't be used in inference, and loading
        # ZeRO optimizer state can cause errors)
        "zero_optimization": None,
    }
    if overwrite_values:
        _overwrite_values.update(overwrite_values)
    neox_args = NeoXArgs.consume_neox_args(overwrite_values=_overwrite_values)
    neox_args.configure_distributed_args()
    neox_args.build_tokenizer()

    if neox_args.load is None:
        raise ValueError("`load` parameter must be supplied to load a model")

    # initialize megatron
    initialize_megatron(neox_args)

    # set up model and load checkpoint.
    model, _, _ = setup_model_and_optimizer(
        neox_args=neox_args,
        use_cache=use_cache,
        iteration=neox_args.iteration,
    )  # we use setup_model_and_optimizer instead of get_model in order to initialize deepspeed
    print_rank_0("Finished loading model")

    model.module.inference_mode(use_cache=use_cache)
    return model, neox_args
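# Usage sketch (not part of the source above): how setup_for_inference_or_eval is
# typically driven from a generation script launched via deepy.py, so that
# NeoXArgs.consume_neox_args() can read the config files from the command line.
# generate_samples_from_prompt is assumed to be the GPT-NeoX text generation helper
# from megatron.text_generation_utils; its keyword arguments here are illustrative.
def _example_generate():
    from megatron.text_generation_utils import generate_samples_from_prompt

    model, neox_args = setup_for_inference_or_eval(use_cache=True)
    samples = generate_samples_from_prompt(
        neox_args=neox_args,
        model=model,
        text=["Hello, my name is"],
        maximum_tokens=64,
    )
    print_rank_0(str(samples))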
def model_setup(yaml_list=None, param_dict=None, clear_data=True, inference=False):
    from megatron.neox_arguments import NeoXArgs
    from megatron.mpu import destroy_model_parallel
    from megatron import initialize_megatron
    from megatron.training import setup_model_and_optimizer

    destroy_model_parallel()  # mpu model parallel contains remaining global vars
    if clear_data and (
        not torch.distributed.is_initialized()
        or torch.distributed.get_world_size() == 1
        or torch.distributed.get_rank() == 0
    ):
        clear_test_dirs()

    overwrite_values = {
        "user_script": str(get_root_directory() / "pretrain_gpt2.py"),
        "save": TEST_CHECKPOINT_DIR,
        "load": TEST_CHECKPOINT_DIR,
        "log_dir": TEST_LOG_DIR,
        "tensorboard_dir": TEST_TENSORBOARD_DIR,
    }

    # should not both be none
    assert yaml_list is not None or param_dict is not None

    # initially load config from files as would be the case in deepy.py
    if yaml_list is not None:
        args_loaded = NeoXArgs.from_ymls(yaml_list, overwrite_values=overwrite_values)
    else:
        p_dict = param_dict.copy()
        p_dict.update(overwrite_values)
        args_loaded = NeoXArgs.from_dict(p_dict)

    args_loaded.build_tokenizer()

    initialize_megatron(neox_args=args_loaded)

    model, optimizer, lr_scheduler = setup_model_and_optimizer(
        neox_args=args_loaded, inference=inference, get_key_value=True
    )
    return model, optimizer, lr_scheduler, args_loaded
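# Usage sketch (hypothetical test): build a small model from an in-memory param dict
# and check that all components come back. small_param_dict is a placeholder fixture
# holding a minimal NeoX config dict; the TEST_* paths and clear_test_dirs come from
# the same test helpers model_setup relies on.
def test_model_setup_from_dict(small_param_dict):
    model, optimizer, lr_scheduler, args_loaded = model_setup(
        yaml_list=None, param_dict=small_param_dict, clear_data=True
    )
    assert model is not None
    assert optimizer is not None
    assert lr_scheduler is not None
    assert args_loaded.padded_vocab_size > 0  # populated by build_tokenizer()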
def finetune(train_valid_datasets_provider, model_provider,
             forward_step=_cross_entropy_forward_step,
             end_of_epoch_callback_provider=None):
    """Main finetune function used across all tasks."""
    args = get_args()
    timers = get_timers()

    # Train and validation data loaders.
    timers('train/valid/test dataset/dataloader').start()
    if args.epochs > 0:
        train_dataset, valid_dataset = train_valid_datasets_provider()
        train_dataloader, valid_dataloader = _build_train_valid_dataloaders(
            train_dataset, valid_dataset)
    timers('train/valid/test dataset/dataloader').stop()

    # Build callback function.
    timers('callback function').start()
    end_of_epoch_callback = None
    if end_of_epoch_callback_provider is not None:
        end_of_epoch_callback = end_of_epoch_callback_provider()
    timers('callback function').stop()

    # Build model, optimizer and learning rate scheduler.
    timers('model and optimizer').start()
    model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider)
    timers('model and optimizer').stop()

    # If a pretrained checkpoint is provided and we have not trained for
    # any iteration (i.e., iteration is zero), then load the pretrained
    # checkpoint.
    timers('pretrained checkpoint').start()
    if args.iteration == 0 and args.pretrained_checkpoint is not None:
        original_load = args.load
        args.load = args.pretrained_checkpoint
        _ = load_checkpoint(model, None, None)
        args.load = original_load
        # This is critical when only the model is loaded. We should make sure
        # master parameters are also updated.
        if args.fp16:
            optimizer._model_params_to_master_params()
    timers('pretrained checkpoint').stop()

    # Print setup timing.
    print_rank_0('done with setups ...')
    timers.log([
        'train/valid/test dataset/dataloader', 'callback function',
        'model and optimizer', 'pretrained checkpoint'
    ])
    print_rank_0('training ...')

    # Finetune the model.
    if args.epochs > 0:
        _train(model, optimizer, lr_scheduler, forward_step,
               train_dataloader, valid_dataloader, end_of_epoch_callback)
    # Or just evaluate.
    else:
        if end_of_epoch_callback is not None:
            print_rank_0('evaluation only mode, setting epoch to -1')
            end_of_epoch_callback(model, epoch=-1, output_predictions=True)

    print_rank_0('done :-)')
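# Usage sketch (hypothetical task script): a downstream task only supplies the
# dataset and model providers; finetune() then builds dataloaders, the optimizer,
# and the LR scheduler, optionally loads args.pretrained_checkpoint, and runs the
# train/eval loop. All build_my_* names below are placeholders for task-specific code.
def main():
    def train_valid_datasets_provider():
        return build_my_train_dataset(), build_my_valid_dataset()

    def model_provider():
        return build_my_classification_model()

    def end_of_epoch_callback_provider():
        return build_my_accuracy_callback()

    finetune(train_valid_datasets_provider, model_provider,
             end_of_epoch_callback_provider=end_of_epoch_callback_provider)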