Example #1
def setup_for_inference_or_eval(inference=True,
                                get_key_value=True,
                                overwrite_values=None):

    from megatron.neox_arguments import NeoXArgs
    from megatron.initialize import initialize_megatron
    from megatron.training import setup_model_and_optimizer

    _overwrite_values = {
        "checkpoint_activations": False,
        "partition_activations": False,
        "no_load_optim": True,
    }
    if overwrite_values:
        _overwrite_values.update(overwrite_values)
    neox_args = NeoXArgs.consume_neox_args(overwrite_values=_overwrite_values)
    neox_args.configure_distributed_args()
    neox_args.build_tokenizer()

    if neox_args.load is None:
        raise ValueError("`load` parameter must be supplied to load a model")

    # initialize megatron
    initialize_megatron(neox_args)

    # set up model and load checkpoint.
    model, _, _ = setup_model_and_optimizer(
        neox_args=neox_args, inference=inference, get_key_value=get_key_value
    )  # we use setup_model_and_optimizer instead of get_model in order to initialize deepspeed
    print_rank_0('Finished loading model')
    return model, neox_args
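A minimal usage sketch for the function above, assuming a deepy.py-style launch so that NeoXArgs.consume_neox_args can read the config (including the `load` checkpoint path) from the command line; the body is illustrative, not part of the project.

import torch

if __name__ == "__main__":
    # Hypothetical usage: build the model from the command-line config, then run inference.
    model, neox_args = setup_for_inference_or_eval(inference=True, get_key_value=True)
    model.eval()  # disable dropout for inference
    with torch.no_grad():
        pass  # run generation / evaluation with `model` and `neox_args` here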
Example #2
    def __init__(self, context: DeepSpeedTrialContext) -> None:
        self.context = context
        self.exp_config = self.context.get_experiment_config()
        self.args = AttrMap(self.context.get_hparams())

        # Initialize and get arguments, timers, and the TensorBoard writer.
        try:
            self.neox_args = get_neox_args(self.context)
        except Exception:
            traceback.print_exc()
            raise InvalidHP("Could not parse neox_args.")
        self.wrapped_writer = TorchWriter()
        self.neox_args.tensorboard_writer = self.wrapped_writer.writer
        self.neox_args.configure_distributed_args()
        # The tokenizer needs to be built before model initialization in order to set the
        # required padded_vocab_size argument.
        self.neox_args.build_tokenizer()
        megatron_train.initialize_megatron(neox_args=self.neox_args)
        self.timers = megatron_utils.Timers(
            use_wandb=False,
            tensorboard_writer=self.neox_args.tensorboard_writer)

        # Model, optimizer, and learning rate.
        self.timers("model and optimizer").start()
        (
            model,
            self.optimizer,
            self.lr_scheduler,
        ) = megatron_train.setup_model_and_optimizer(neox_args=self.neox_args)
        self.model = self.context.wrap_model_engine(model)
        self.timers("model and optimizer").stop()

        # Print setup timing.
        megatron_utils.print_rank_0("done with setups ...")
        self.timers.log(["model and optimizer"])
        megatron_utils.print_rank_0("training ...")

        # For tracking.
        if not self.args.search_world_size:
            self.reducer = self.context.wrap_reducer(
                LMReducers(self.neox_args),
                for_training=False,
                for_validation=True,
            )
        self.report_memory_flag = True
        self.total_train_loss_dict = {}
        self.total_val_loss_dict = {}
        self.tflops = 0
        self.reported_flops = False
        self.overflow_monitor = megatron_utils.OverflowMonitor(self.optimizer)
        self.noise_scale_logger = megatron_utils.get_noise_scale_logger(
            self.neox_args)
        self.timers("interval time").start()
Example #3
File: utils.py  Project: neuroidss/gpt-neox
def setup_for_inference_or_eval(
    use_cache=True,
    overwrite_values=None,
):
    """
    Initializes the model for evaluation or inference (doesn't load optimizer states, etc.) from command line args.

    use_cache: bool
        Whether to use key value caching in inference.
    overwrite_values: dict
        Optional values to overwrite in the model config.
    """

    from megatron.neox_arguments import NeoXArgs
    from megatron.initialize import initialize_megatron
    from megatron.training import setup_model_and_optimizer

    _overwrite_values = {
        "checkpoint_activations": False,
        "partition_activations": False,
        "no_load_optim": True,
        "zero_optimization":
        None,  # disable zero optimization (won't be used in inference, and loading zero optimizer can cause errors)
    }
    if overwrite_values:
        _overwrite_values.update(overwrite_values)
    neox_args = NeoXArgs.consume_neox_args(overwrite_values=_overwrite_values)
    neox_args.configure_distributed_args()
    neox_args.build_tokenizer()

    if neox_args.load is None:
        raise ValueError("`load` parameter must be supplied to load a model")

    # initialize megatron
    initialize_megatron(neox_args)

    # set up model and load checkpoint.
    model, _, _ = setup_model_and_optimizer(
        neox_args=neox_args,
        use_cache=use_cache,
        iteration=neox_args.iteration,
    )  # we use setup_model_and_optimizer instead of get_model in order to initialize deepspeed
    print_rank_0("Finished loading model")

    model.module.inference_mode(use_cache=use_cache)
    return model, neox_args
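A short sketch of how user-supplied overwrite_values merge on top of the defaults above; the specific key used here (seed) is only an illustration.

# Hypothetical call: extra overrides are merged into _overwrite_values, so any
# NeoXArgs field can be pinned at load time without editing the YAML configs.
model, neox_args = setup_for_inference_or_eval(
    use_cache=True,
    overwrite_values={"seed": 1234},  # illustrative override
)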
Example #4
def model_setup(yaml_list=None,
                param_dict=None,
                clear_data=True,
                inference=False):
    from megatron.neox_arguments import NeoXArgs
    from megatron.mpu import destroy_model_parallel
    from megatron import initialize_megatron
    from megatron.training import setup_model_and_optimizer

    destroy_model_parallel()  # clear leftover mpu model-parallel global state from any previous run
    if clear_data and (not torch.distributed.is_initialized()
                       or torch.distributed.get_world_size() == 1
                       or torch.distributed.get_rank() == 0):
        clear_test_dirs()

    overwrite_values = {
        "user_script": str(get_root_directory() / "pretrain_gpt2.py"),
        "save": TEST_CHECKPOINT_DIR,
        "load": TEST_CHECKPOINT_DIR,
        "log_dir": TEST_LOG_DIR,
        "tensorboard_dir": TEST_TENSORBOARD_DIR,
    }

    # yaml_list and param_dict should not both be None
    assert yaml_list is not None or param_dict is not None

    # initially load config from files as would be the case in deepy.py
    if yaml_list is not None:
        args_loaded = NeoXArgs.from_ymls(yaml_list,
                                         overwrite_values=overwrite_values)
    else:
        p_dict = param_dict.copy()
        p_dict.update(overwrite_values)
        args_loaded = NeoXArgs.from_dict(p_dict)

    args_loaded.build_tokenizer()

    initialize_megatron(neox_args=args_loaded)
    model, optimizer, lr_scheduler = setup_model_and_optimizer(
        neox_args=args_loaded, inference=inference, get_key_value=True)
    return model, optimizer, lr_scheduler, args_loaded
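A hedged sketch of how a test might drive model_setup; the YAML path is a placeholder, not a config the repository is known to ship.

def test_model_setup_smoke():
    # Hypothetical smoke test: build the model from a small test config and
    # exercise it; model_setup() already resets model-parallel state on entry.
    model, optimizer, lr_scheduler, neox_args = model_setup(
        yaml_list=["configs/test_small.yml"],  # placeholder config path
        inference=False,
    )
    model.eval()
    # ... run a forward pass or a checkpoint save/load round-trip here ...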
Example #5
def finetune(train_valid_datasets_provider,
             model_provider,
             forward_step=_cross_entropy_forward_step,
             end_of_epoch_callback_provider=None):
    """Main finetune function used across all tasks."""
    args = get_args()
    timers = get_timers()

    # Train and validation data loaders.
    timers('train/valid/test dataset/dataloader').start()
    if args.epochs > 0:
        train_dataset, valid_dataset = train_valid_datasets_provider()
        train_dataloader, valid_dataloader = _build_train_valid_dataloaders(
            train_dataset, valid_dataset)
    timers('train/valid/test dataset/dataloader').stop()

    # Build callback function.
    timers('callback function').start()
    end_of_epoch_callback = None
    if end_of_epoch_callback_provider is not None:
        end_of_epoch_callback = end_of_epoch_callback_provider()
    timers('callback function').stop()

    # Build model, optimizer and learning rate scheduler.
    timers('model and optimizer').start()
    model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider)
    timers('model and optimizer').stop()

    # If pretrained checkpoint is provided and we have not trained for
    # any iteration (i.e., iteration is zero), then load the pretrained
    # checkpoint.
    timers('pretrained checkpoint').start()
    if args.iteration == 0 and args.pretrained_checkpoint is not None:
        original_load = args.load
        args.load = args.pretrained_checkpoint
        _ = load_checkpoint(model, None, None)
        args.load = original_load
        # This is critical when only model is loaded. We should make sure
        # master parameters are also updated.
        if args.fp16:
            optimizer._model_params_to_master_params()
    timers('pretrained checkpoint').stop()

    # Print setup timing.
    print_rank_0('done with setups ...')
    timers.log([
        'train/valid/test dataset/dataloader', 'callback function',
        'model and optimizer', 'pretrained checkpoint'
    ])
    print_rank_0('training ...')

    # Finetune the model.
    if args.epochs > 0:
        _train(model, optimizer, lr_scheduler, forward_step, train_dataloader,
               valid_dataloader, end_of_epoch_callback)
    # Or just evaluate.
    else:
        if end_of_epoch_callback is not None:
            print_rank_0('evaluation only mode, setting epoch to -1')
            end_of_epoch_callback(model, epoch=-1, output_predictions=True)

    print_rank_0('done :-)')
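A rough sketch of a task entry point that drives finetune, in the spirit of Megatron-LM's task runners; the provider functions are illustrative stubs, not the library's actual ones.

def train_valid_datasets_provider():
    # Return (train_dataset, valid_dataset) built from the paths in get_args().
    raise NotImplementedError

def model_provider():
    # Return the task model, e.g. a classification head on top of the pretrained LM.
    raise NotImplementedError

def main():
    finetune(
        train_valid_datasets_provider,
        model_provider,
        # forward_step defaults to _cross_entropy_forward_step
        end_of_epoch_callback_provider=None,
    )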