Example #1
    def __init__(self, cfg: DictConfig, trainer: Trainer = None):
        super().__init__(cfg, trainer)
        # handles model parallel save and restore logic
        self._save_restore_connector = NLPSaveRestoreConnector()
        self.set_world_size(trainer)
        if not HAVE_APEX:
            logging.warning("Apex was not found. Using model parallel or megatron models will error out.")
Example #2
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    trainer = None
    if cfg.trainer.precision == 16:
        trainer = Trainer(
            plugins=[
                NLPDDPPlugin(),
                NLPNativeMixedPrecisionPlugin(
                    init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
                    growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
                ),
            ],
            **cfg.trainer,
        )
    elif cfg.trainer.precision == 'bf16':
        trainer = Trainer(plugins=[NLPDDPPlugin(), NLPNativeBfloat16PrecisionPlugin(),], **cfg.trainer,)
    else:
        trainer = Trainer(plugins=[NLPDDPPlugin(), NLPPrecisionPlugin()], **cfg.trainer)

    app_state = AppState()
    app_state.model_parallel_size = cfg.model.tensor_model_parallel_size
    app_state.model_parallel_rank = compute_model_parallel_rank(trainer.local_rank, app_state.model_parallel_size)

    model = MegatronGPTModel.restore_from(
        cfg.restore_from_path, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector(),
    )

    # Note: most NeMo models must have the data paths configured before instantiating the model
    # MegatronGPTModel sets up the data in the PTL method .setup, which happens after DDP spawns.
    model.cfg.data.splits_string = cfg.model.data.splits_string

    trainer.test(model)
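
The main(cfg) entry points in these examples receive their configuration from NeMo's Hydra wrapper. A minimal sketch of how such a script is usually wired up is shown below; the config_path and config_name values are placeholder assumptions, not taken from the example above.

# Hypothetical wiring for a Hydra-driven entry point like the one above.
from nemo.core.config import hydra_runner
from omegaconf import DictConfig


@hydra_runner(config_path="conf", config_name="megatron_gpt_config")
def main(cfg: DictConfig) -> None:
    ...  # body as in Example #2


if __name__ == '__main__':
    main()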
Example #3
def convert(rank, world_size, args):

    app_state = AppState()
    app_state.data_parallel_rank = 0
    trainer = Trainer(gpus=args.tensor_model_parallel_size)
    # TODO: reach out to PTL for an API-safe local rank override
    trainer.accelerator.training_type_plugin._local_rank = rank

    if args.tensor_model_parallel_size is not None and args.tensor_model_parallel_size > 1:
        # inject model parallel rank
        checkpoint_path = os.path.join(args.checkpoint_folder,
                                       f'mp_rank_{rank:02d}',
                                       args.checkpoint_name)
    else:
        checkpoint_path = os.path.join(args.checkpoint_folder,
                                       args.checkpoint_name)

    if args.model_type == 'gpt':
        model = MegatronGPTModel.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 'bert':
        model = MegatronBertModel.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 't5':
        model = MegatronT5Model.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)

    model._save_restore_connector = NLPSaveRestoreConnector()

    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    model.save_to(args.nemo_file_path)

    logging.info(f'NeMo model saved to: {args.nemo_file_path}')
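
A convert function with the (rank, world_size, args) signature above is normally launched once per tensor-parallel rank. A minimal launcher sketch, assuming torch.multiprocessing.spawn and an args namespace like the one used in the example:

# Hypothetical launcher for the convert() function above; the exact CLI wiring is an assumption.
import torch.multiprocessing as mp

def run(args):
    world_size = args.tensor_model_parallel_size or 1
    if world_size > 1:
        # spawn passes the process index (rank) as the first positional argument
        mp.spawn(convert, args=(world_size, args), nprocs=world_size, join=True)
    else:
        convert(0, world_size, args)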
Example #4
def convert(local_rank, rank, world_size, args):

    app_state = AppState()
    app_state.data_parallel_rank = 0
    num_nodes = world_size // args.gpus_per_node
    if args.bcp:
        trainer = Trainer(devices=args.gpus_per_node,
                          num_nodes=num_nodes,
                          accelerator='gpu',
                          plugins=[TorchElasticEnvironment()])
    else:
        trainer = Trainer(devices=args.gpus_per_node,
                          num_nodes=num_nodes,
                          accelerator='gpu')

    app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size
    app_state.tensor_model_parallel_size = args.tensor_model_parallel_size
    app_state.model_parallel_size = app_state.tensor_model_parallel_size * app_state.pipeline_model_parallel_size

    parallel_state.initialize_model_parallel(
        tensor_model_parallel_size_=app_state.tensor_model_parallel_size,
        pipeline_model_parallel_size_=app_state.pipeline_model_parallel_size,
    )

    app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank()
    app_state.tensor_model_parallel_rank = parallel_state.get_tensor_model_parallel_rank()

    # inject model parallel rank
    checkpoint_path = inject_model_parallel_rank(
        os.path.join(args.checkpoint_folder, args.checkpoint_name))

    logging.info(
        f'rank: {rank}, local_rank: {local_rank}, is loading checkpoint: {checkpoint_path} for tp_rank: {app_state.tensor_model_parallel_rank} and pp_rank: {app_state.pipeline_model_parallel_rank}'
    )

    if args.model_type == 'gpt':
        model = MegatronGPTModel.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 'bert':
        model = MegatronBertModel.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 't5':
        model = MegatronT5Model.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 'nmt':
        model = MegatronNMTModel.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    model._save_restore_connector = NLPSaveRestoreConnector()

    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    model.save_to(args.nemo_file_path)

    logging.info(f'NeMo model saved to: {args.nemo_file_path}')
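
inject_model_parallel_rank rewrites a checkpoint path so that each rank loads its own shard. The sketch below illustrates the expected behavior based on the directory layouts used in these examples (mp_rank_XX for tensor parallelism only, tp_rank_XX_pp_rank_XXX when pipeline parallelism is used); it is an illustration, not the actual NeMo implementation.

# Illustration only: mimics the per-rank checkpoint layout expected by the examples above.
import os
from nemo.utils import AppState

def inject_model_parallel_rank_sketch(filepath: str) -> str:
    app_state = AppState()
    if app_state.model_parallel_size is not None and app_state.model_parallel_size > 1:
        dirname, basename = os.path.split(filepath)
        if app_state.pipeline_model_parallel_size == 1:
            rank_dir = f'mp_rank_{app_state.tensor_model_parallel_rank:02d}'
        else:
            rank_dir = (f'tp_rank_{app_state.tensor_model_parallel_rank:02d}'
                        f'_pp_rank_{app_state.pipeline_model_parallel_rank:03d}')
        return os.path.join(dirname, rank_dir, basename)
    return filepath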
Example #5
def main(cfg) -> None:

    # trainer required for restoring model parallel models
    trainer = Trainer(plugins=NLPDDPPlugin(), **cfg.trainer)
    assert (
        cfg.trainer.devices * cfg.trainer.num_nodes
        == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
    ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size"

    app_state = AppState()
    app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
    (
        app_state.tensor_model_parallel_rank,
        app_state.pipeline_model_parallel_rank,
        app_state.model_parallel_size,
        app_state.data_parallel_size,
        app_state.pipeline_model_parallel_split_rank,
    ) = fake_initialize_model_parallel(
        world_size=app_state.model_parallel_size,
        rank=trainer.global_rank,
        tensor_model_parallel_size_=cfg.tensor_model_parallel_size,
        pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size,
        pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank,
    )

    if cfg.model_file is not None:
        if not os.path.exists(cfg.model_file):
            raise ValueError(f"Model file {cfg.model_file} does not exist")
        model = MegatronNMTModel.restore_from(
            restore_path=cfg.model_file, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector(),
        )
    elif cfg.checkpoint_dir is not None:
        checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name))
        model = MegatronNMTModel.load_from_checkpoint(checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer)
    else:
        raise ValueError("need at least a nemo file or checkpoint dir")

    model.freeze()

    logging.info(f"Translating: {cfg.srctext}")
    src_text = []
    translations = []
    with open(cfg.srctext, 'r') as src_f, open(cfg.tgtout, 'w') as tgt_f:
        for line in src_f:
            src_text.append(line.strip())
            if len(src_text) == cfg.batch_size:
                translations = model.translate(
                    text=src_text, source_lang=cfg.source_lang, target_lang=cfg.target_lang,
                )
                for translation in translations:
                    tgt_f.write(translation + "\n")
                src_text = []
        if len(src_text) > 0:
            translations = model.translate(text=src_text, source_lang=cfg.source_lang, target_lang=cfg.target_lang,)
            for translation in translations:
                tgt_f.write(translation + "\n")
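
For a quick sanity check, the same translate call can be used directly on a small in-memory batch; the sentences and language codes below are placeholders.

# Hypothetical one-off call mirroring the batched loop above.
sample = ["This is a test sentence.", "Machine translation with NeMo."]
outputs = model.translate(text=sample, source_lang="en", target_lang="de")
for src, tgt in zip(sample, outputs):
    print(src, '->', tgt)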
Example #6
def convert(rank, world_size, args):

    app_state = AppState()
    app_state.data_parallel_rank = 0
    trainer = Trainer(gpus=args.tensor_model_parallel_size)
    # TODO: reach out to PTL for an API-safe local rank override
    trainer.accelerator.training_type_plugin._local_rank = rank

    if args.tensor_model_parallel_size is not None and args.tensor_model_parallel_size > 1:
        # inject model parallel rank
        checkpoint_path = os.path.join(args.checkpoint_folder,
                                       f'mp_rank_{rank:02d}',
                                       args.checkpoint_name)
    else:
        checkpoint_path = os.path.join(args.checkpoint_folder,
                                       args.checkpoint_name)

    if args.model_type == 'gpt':
        ## this dictionary is used to rename the model parameters
        name_translate = {}
        name_translate['transformer'] = 'encoder'
        name_translate['.attention.'] = '.self_attention.'
        model = load_from_checkpoint(
            MegatronGPTModel,
            checkpoint_path,
            hparams_file=args.hparams_file,
            trainer=trainer,
            translator=name_translate,
            strict=False,
        )
    elif args.model_type == 'bert':
        ## this dictionary is used to rename the model parameters
        name_translate = {}
        name_translate['transformer'] = 'encoder'
        name_translate['.attention.'] = '.self_attention.'
        model = load_from_checkpoint(
            MegatronBertModel,
            checkpoint_path,
            hparams_file=args.hparams_file,
            trainer=trainer,
            translator=name_translate,
            strict=False,
        )
    else:
        raise NotImplementedError("{} is not supported".format(args.model_type))

    model._save_restore_connector = NLPSaveRestoreConnector()

    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    model.save_to(args.nemo_file_path)

    logging.info(f'NeMo model saved to: {args.nemo_file_path}')
Example #7
def convert(rank, world_size, args):

    app_state = AppState()
    app_state.data_parallel_rank = 0
    trainer = Trainer(gpus=args.tensor_model_parallel_size)
    # TODO: reach out to PTL for an API-safe local rank override
    trainer.accelerator.training_type_plugin._local_rank = rank
    checkpoint_path = os.path.join(args.checkpoint_folder, f'mp_rank_{rank:02d}', args.checkpoint_name)
    model = MegatronGPTModel.load_from_checkpoint(checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    model._save_restore_connector = NLPSaveRestoreConnector()
    model.save_to(args.nemo_file_path)
    logging.info(f'NeMo model saved to: {args.nemo_file_path}')
Example #8
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    plugins = [
        NLPDDPPlugin(
            no_ddp_communication_hook=True,
            find_unused_parameters=False,
        )
    ]
    if cfg.trainer.precision == 16:
        scaler = GradScaler(
            init_scale=cfg.model.get('native_amp_init_scale', 2**32),
            growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
            hysteresis=cfg.model.get('hysteresis', 2),
        )
        plugins.append(
            PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision,
                                         device='cuda',
                                         scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)
    exp_manager(trainer, cfg.exp_manager)

    # Override timer callback to a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time, )

    # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    # load existing or init new soft prompt GPT model
    if cfg.model.get("restore_path", None):
        model = MegatronGPTPromptLearningModel.restore_from(
            cfg.model.restore_path,
            cfg.model,
            trainer=trainer,
            save_restore_connector=NLPSaveRestoreConnector())
    else:
        model = MegatronGPTPromptLearningModel(cfg.model, trainer=trainer)

    trainer.fit(model)
Example #9
def main():
    parser = ArgumentParser()
    parser.add_argument("--model_file",
                        type=str,
                        required=True,
                        help="Path to source .nemo file")
    parser.add_argument("--target_file",
                        type=str,
                        required=True,
                        help="Path to write target .nemo file")
    parser.add_argument("--tensor_model_parallel_size",
                        type=int,
                        required=True,
                        help="TP size of source model")
    parser.add_argument("--target_tensor_model_parallel_size",
                        type=int,
                        required=True,
                        help="TP size of target model")
    parser.add_argument(
        "--model_class",
        type=str,
        default="nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel",
        help="NeMo model class. This script should support all NeMo megatron models that use Tensor Parallel",
    )
    parser.add_argument("--precision",
                        default=16,
                        help="PyTorch Lightning Trainer precision flag")

    args = parser.parse_args()

    precision = args.precision
    if args.precision in ["32", "16"]:
        precision = int(float(args.precision))
    tp_size = args.tensor_model_parallel_size
    tgt_tp_size = args.target_tensor_model_parallel_size
    cls = model_utils.import_class_by_path(args.model_class)

    trainer = Trainer(devices=1,
                      plugins=NLPDDPPlugin(),
                      accelerator="cpu",
                      precision=precision)
    app_state = AppState()
    app_state.data_parallel_rank = 0
    app_state.pipeline_model_parallel_size = 1  # not supported yet in this script
    app_state.tensor_model_parallel_size = tp_size
    app_state.model_parallel_size = app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size

    if tp_size > 1:
        partitions = []
        for i in range(tp_size):
            app_state.tensor_model_parallel_rank = i
            model = cls.restore_from(restore_path=args.model_file,
                                     trainer=trainer,
                                     map_location=torch.device("cpu"))
            params = [p for _, p in model.named_parameters()]
            partitions.append(params)
            # app_state is being updated incorrectly during restore
            app_state.data_parallel_rank = 0
            app_state.pipeline_model_parallel_size = 1  # not supported yet in this script
            app_state.tensor_model_parallel_size = tp_size
            app_state.model_parallel_size = (
                app_state.pipeline_model_parallel_size *
                app_state.tensor_model_parallel_size)

        model.cfg.tensor_model_parallel_size = 1
        app_state.model_parallel_size = 1
        trainer = Trainer(devices=1,
                          plugins=NLPDDPPlugin(),
                          accelerator="cpu",
                          precision=precision)
        model = cls(model.cfg, trainer).to('cpu')
        model._save_restore_connector = NLPSaveRestoreConnector()

        if tgt_tp_size > 1:
            merge_partition(model, partitions)
        else:
            merge_partition(model, partitions, args.target_file)
    else:
        app_state.model_parallel_size = 1
        model = cls.restore_from(restore_path=args.model_file, trainer=trainer)

    if tgt_tp_size > 1:
        partitions = []
        params = [p for _, p in model.named_parameters()]
        partitions.append(params)

        model.cfg.tensor_model_parallel_size = tgt_tp_size
        app_state.model_parallel_size = tgt_tp_size
        trainer = Trainer(devices=1,
                          plugins=NLPDDPPlugin(),
                          accelerator="cpu",
                          precision=precision)
        model = cls(model.cfg, trainer).to('cpu')
        model._save_restore_connector = NLPSaveRestoreConnector()

        split_partition(model, partitions, tgt_tp_size, args.target_file)

    logging.info("Successfully finished changing partitions!")
Example #10
def convert(local_rank, rank, world_size, args):

    app_state = AppState()
    app_state.data_parallel_rank = 0
    tensor_model_parallel_size = args.tensor_model_parallel_size
    num_nodes = world_size // args.gpus_per_node
    pipeline_model_parallel_size = world_size // args.tensor_model_parallel_size
    assert args.pipeline_model_parallel_size == pipeline_model_parallel_size

    trainer = Trainer(devices=args.gpus_per_node,
                      accelerator='gpu',
                      num_nodes=num_nodes)

    app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size
    app_state.tensor_model_parallel_size = args.tensor_model_parallel_size
    app_state.model_parallel_size = app_state.tensor_model_parallel_size * app_state.pipeline_model_parallel_size

    parallel_state.initialize_model_parallel(
        tensor_model_parallel_size_=app_state.tensor_model_parallel_size,
        pipeline_model_parallel_size_=app_state.pipeline_model_parallel_size,
    )

    app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank()
    app_state.tensor_model_parallel_rank = parallel_state.get_tensor_model_parallel_rank()

    pipeline_rank = rank // tensor_model_parallel_size
    tensor_rank = app_state.tensor_model_parallel_rank
    assert pipeline_rank == app_state.pipeline_model_parallel_rank

    if tensor_model_parallel_size is not None and tensor_model_parallel_size > 1 and pipeline_model_parallel_size == 1:
        # inject model parallel rank
        checkpoint_path = os.path.join(args.checkpoint_folder,
                                       f'mp_rank_{tensor_rank:02d}',
                                       args.checkpoint_name)
    elif tensor_model_parallel_size is not None and pipeline_model_parallel_size > 1:
        checkpoint_path = os.path.join(
            args.checkpoint_folder,
            f'mp_rank_{tensor_rank:02d}_{pipeline_rank:03d}',
            args.checkpoint_name)
    else:
        checkpoint_path = os.path.join(args.checkpoint_folder,
                                       args.checkpoint_name)
    logging.info(f"loading checkpoint {checkpoint_path}")

    if args.model_type == 'gpt':
        ## this dictionary is used to rename the model parameters
        name_translate = {}
        name_translate['transformer'] = 'encoder'
        name_translate['.attention.'] = '.self_attention.'
        # nemo megatron doesn't have _for_head key
        name_translate['word_embeddings_for_head'] = 'word_embeddings'
        checkpoint, consumed, steps, version = load_from_checkpoint(
            MegatronGPTModel,
            checkpoint_path,
            hparams_file=args.hparams_file,
            trainer=trainer,
            translator=name_translate,
            strict=False,
        )
    elif args.model_type == 'bert':
        ## this dictionary is used to rename the model parameters
        name_translate = {}
        name_translate['transformer'] = 'encoder'
        name_translate['.attention.'] = '.self_attention.'
        # nemo megatron doesn't have _for_head key
        name_translate['word_embeddings_for_head'] = 'word_embeddings'
        checkpoint, consumed, steps, version = load_from_checkpoint(
            MegatronBertModel,
            checkpoint_path,
            hparams_file=args.hparams_file,
            trainer=trainer,
            translator=name_translate,
            strict=False,
        )
    else:
        raise NotImplementedError("{} is not supported".format(args.model_type))

    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    if args.output_ckpt_file_path:
        filepath = args.output_ckpt_file_path
        base_dir = pathlib.Path(filepath).parent
        filename_str = pathlib.Path(filepath).name
        suffix = '.ckpt'
        content = {}
        if consumed is not None:
            content['consumed'] = consumed
        else:
            content['consumed'] = 0
        if steps is not None:
            content['steps'] = steps
        else:
            content['steps'] = 0
        filename = filename_str.format(**content) + suffix
        checkpoint_path_output = inject_model_parallel_rank(
            os.path.join(base_dir, filename))
        trainer.accelerator.training_type_plugin.checkpoint_io.save_checkpoint(
            checkpoint, checkpoint_path_output)
        logging.info(
            f'NeMo model checkpoint files saved to: {args.output_ckpt_file_path}'
        )

    if args.nemo_file_path:
        if args.model_type == 'gpt':
            model = load_model(MegatronGPTModel,
                               checkpoint,
                               strict=False,
                               trainer=trainer)
        elif args.model_type == 'bert':
            model = load_model(MegatronBertModel,
                               checkpoint,
                               strict=False,
                               trainer=trainer)
        else:
            raise NotImplementedError("{} is not supported".format(args.model_type))

        # verify tensor parallel rank id and pipeline parallel rank id matches
        assert app_state.data_parallel_size == 1
        assert app_state.tensor_model_parallel_size == tensor_model_parallel_size
        assert app_state.tensor_model_parallel_rank == tensor_rank
        assert app_state.pipeline_model_parallel_size == pipeline_model_parallel_size
        assert app_state.pipeline_model_parallel_rank == pipeline_rank
        model._save_restore_connector = NLPSaveRestoreConnector()
        model.save_to(args.nemo_file_path)
        logging.info(f'NeMo model saved to: {args.nemo_file_path}')
Example #11
def nemo_convert(argv):
    args = get_args(argv)
    loglevel = logging.INFO
    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    if args.verbose is not None:
        numeric_level = getattr(logging, args.verbose.upper(), None)
        if not isinstance(numeric_level, int):
            raise ValueError('Invalid log level: %s' % args.verbose)
        loglevel = numeric_level

    logger = logging.getLogger(__name__)
    if logger.handlers:
        for handler in logger.handlers:
            logger.removeHandler(handler)
    logging.basicConfig(level=loglevel,
                        format='%(asctime)s [%(levelname)s] %(message)s')
    logging.info("Logging level set to {}".format(loglevel))
    """Convert a .nemo saved model trained on previous versions of nemo into a nemo fie with current version."""
    nemo_in = args.source
    out = args.out

    # Create a PL trainer object which is required for restoring Megatron models
    cfg_trainer = TrainerConfig(
        gpus=1,
        accelerator="ddp",
        num_nodes=1,
        # Need to set the following two to False as ExpManager will take care of them differently.
        logger=False,
        checkpoint_callback=False,
    )
    trainer = pl.Trainer(cfg_trainer)

    logging.info("Restoring NeMo model from '{}'".format(nemo_in))
    try:
        # If the megatron based NLP model was trained on NeMo < 1.5, then we need to update the lm_checkpoint on the model config
        if args.megatron_legacy:
            if args.megatron_checkpoint:
                connector = NLPSaveRestoreConnector()
                model_cfg = ModelPT.restore_from(
                    restore_path=nemo_in,
                    save_restore_connector=connector,
                    trainer=trainer,
                    return_config=True)
                OmegaConf.set_struct(model_cfg, True)
                with open_dict(model_cfg):
                    model_cfg.language_model.lm_checkpoint = args.megatron_checkpoint
                    model_cfg['megatron_legacy'] = True
                    model_cfg['masked_softmax_fusion'] = False
                    model_cfg['bias_gelu_fusion'] = False
                model = ModelPT.restore_from(
                    restore_path=nemo_in,
                    save_restore_connector=connector,
                    trainer=trainer,
                    override_config_path=model_cfg,
                )
            else:
                logging.error(
                    "Megatron Checkpoint must be provided if Megatron legacy is chosen"
                )
        else:
            model = ModelPT.restore_from(restore_path=nemo_in, trainer=trainer)
        logging.info("Model {} restored from '{}'".format(
            model.cfg.target, nemo_in))

        # Save the model
        model.save_to(out)
        logging.info("Successfully converted to {}".format(out))

        del model
    except Exception as e:
        logging.error(
            "Failed to restore model from NeMo file : {}. Please make sure you have the latest NeMo package installed with [all] dependencies."
            .format(nemo_in))
        raise e
Example #12
    def __init__(self, cfg: DictConfig, trainer: Trainer = None, no_lm_init=False):

        self.hidden_size = None
        self.bert_model = None
        vocab_file = None
        nemo_file = None
        config_dict = None
        config_file = None

        # tokenizer needs to get initialized before the super.__init__()
        # as dataloaders and datasets need it to process the data
        pretrain_model_name = ''
        if cfg.get('language_model') and cfg.language_model.get('pretrained_model_name', ''):
            pretrain_model_name = cfg.language_model.get('pretrained_model_name', '')
        all_pretrained_megatron_bert_models = get_megatron_pretrained_bert_models()

        if cfg.get('tokenizer'):
            # Some models have their own tokenizer setup
            if (
                not hasattr(self, 'tokenizer')
                and cfg.tokenizer.get('tokenizer_name')
                and pretrain_model_name not in all_pretrained_megatron_bert_models
            ):
                self.setup_tokenizer(cfg.tokenizer)
            elif pretrain_model_name in all_pretrained_megatron_bert_models:
                copy_cfg = copy.deepcopy(cfg)
                bert_model = get_lm_model(
                    config_file=config_file,
                    config_dict=config_dict,
                    vocab_file=vocab_file,
                    trainer=trainer,
                    cfg=copy_cfg,
                )
                # set the tokenizer if it is not initialized explicitly
                if (
                    (hasattr(self, 'tokenizer') and self.tokenizer is None) or not hasattr(self, 'tokenizer')
                ) and hasattr(bert_model, 'tokenizer'):
                    self.tokenizer = bert_model.tokenizer
            if (
                cfg.get('tokenizer')
                and hasattr(cfg.get('tokenizer'), 'vocab_file')
                and cfg.get('tokenizer').get('vocab_file')
            ):
                vocab_file = self.register_artifact('tokenizer.vocab_file', cfg.tokenizer.vocab_file)
        super().__init__(cfg, trainer)

        # handles model parallel save and restore logic
        self._save_restore_connector = NLPSaveRestoreConnector()

        if cfg.get('language_model') and not no_lm_init:
            if cfg.get('language_model').get('nemo_file'):
                nemo_file = self.register_artifact('language_model.nemo_file', cfg.language_model.nemo_file)
            if cfg.get('language_model').get('config'):
                config_dict = OmegaConf.to_container(cfg.language_model.config)
            if cfg.get('language_model').get('config_file'):
                config_file = self.register_artifact('language_model.config_file', cfg.language_model.config_file)
            bert_model = get_lm_model(
                config_file=config_file, config_dict=config_dict, vocab_file=vocab_file, trainer=trainer, cfg=cfg,
            )
            # set the tokenizer if it is not initialized explicitly
            if ((hasattr(self, 'tokenizer') and self.tokenizer is None) or not hasattr(self, 'tokenizer')) and hasattr(
                bert_model, 'tokenizer'
            ):
                self.tokenizer = bert_model.tokenizer

            # Required to pull up the config for MegatronBert models
            self.pretrained_model_name = cfg.language_model.pretrained_model_name

            # register encoder config
            self.register_bert_model()

            if (
                cfg.tokenizer is not None
                and cfg.tokenizer.get("tokenizer_name", "") is not None
                and "megatron" in cfg.tokenizer.get("tokenizer_name", "")
            ) or pretrain_model_name in all_pretrained_megatron_bert_models:
                self.hidden_size = bert_model.cfg.hidden_size
            else:
                self.hidden_size = bert_model.config.hidden_size

        if cfg.get('language_model') and not no_lm_init:
            self.bert_model = bert_model
Example #13
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        super().__init__(cfg, trainer)

        self.cfg = cfg

        # Load pretrained GPT model and tokenizer
        if cfg.get('language_model_path', None):
            self.frozen_model = MegatronGPTModel.restore_from(
                cfg.get('language_model_path'),
                trainer=trainer,
                save_restore_connector=NLPSaveRestoreConnector(),
            )

        # Freeze all GPT model weights for prompt-tuning/p-tuning
        self.frozen_model.freeze()
        self.tokenizer = self.frozen_model.tokenizer
        self.float_type = self.frozen_model.model.language_model.encoder.layers[0].dtype
        self.hidden_size = self.frozen_model.cfg.hidden_size
        self.word_embeddings = self.frozen_model.model.language_model.embedding.word_embeddings
        self.existing_tasks = list(self.cfg.get('existing_tasks', []))
        self.new_tasks = list(self.cfg.get('new_tasks', []))

        # Load templates for assigning virtual prompt token positions
        self.load_task_templates(self.cfg.task_templates)

        # Prompt table stores all task embeddings, p-tuning virtual prompts get added to the table after training
        self.prompt_table = PromptTable(
            existing_tasks=self.existing_tasks,
            task_templates=self.task_templates,
            task_id_num_to_name=self.task_id_num_to_name,
            hidden_size=self.hidden_size,
        )
        self._prompt_table_key = VirtualPromptSource.PROMPT_TABLE.value
        self._prompt_encoder_key = VirtualPromptSource.PROMPT_ENCODER.value

        # Prepare pseudo token ids for virtual/virtual prompt tokens
        self.pseudo_tokens = get_pseudo_tokens(self.max_virtual_tokens)
        self.tokenizer.add_special_tokens(
            {'additional_special_tokens': self.pseudo_tokens})
        self.pseudo_token_ids = self.tokenizer.tokens_to_ids(
            self.pseudo_tokens)
        self.pseudo_token_ids_start = self.pseudo_token_ids[0]
        self.pad_token_id = self.tokenizer.pad_id if self.tokenizer.pad_id is not None else self.tokenizer.unk_id
        self.virtual_prompt_style = VirtualPromptStyle(
            cfg.virtual_prompt_style)

        # Prompt tuning stores virtual prompts in the prompt table and tunes their weight directly
        if self.virtual_prompt_style in [
                VirtualPromptStyle.PROMPT_TUNING, VirtualPromptStyle.INFERENCE
        ]:
            self.virtual_prompt_source = VirtualPromptSource.PROMPT_TABLE

        # P-Tuning uses an LSTM Encoder to produce virtual token embeddings
        elif self.virtual_prompt_style == VirtualPromptStyle.P_TUNING:
            self.virtual_prompt_source = VirtualPromptSource.PROMPT_ENCODER
        else:
            raise ValueError(
                f"\nvirtual prompt style '{cfg.virtual_prompt_style}' not recognized, please use one of 'prompt-tuning' or 'p-tuning'"
            )

        self._reduced_loss_buffer = []
        self._inference_config = None

        if self.trainer.precision == 32:
            self.autocast_dtype = torch.float
        elif self.trainer.precision == 16:
            self.autocast_dtype = torch.half
        elif self.trainer.precision == 'bf16':
            self.autocast_dtype = torch.bfloat16
        else:
            raise ValueError('precision must be in [32, 16, "bf16"]')
        # rely on the default PyTorch Lightning gradient clipping in the base model
        self.grad_clip_pl_default = True
        # AMP O2 is not supported here
        self.megatron_amp_o2 = False
Example #14
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        super().__init__(cfg, trainer)

        self.cfg = cfg

        # Load pretrained GPT model and tokenizer
        self.model = MegatronGPTModel.restore_from(
            self.register_artifact('language_model_path',
                                   cfg.get('language_model_path', None)),
            trainer=trainer,
            save_restore_connector=NLPSaveRestoreConnector(),
        )

        # Freeze all GPT model weights for prompt-tuning/p-tuning
        if not cfg.lm_finetune:
            self.model.freeze()

        self.tokenizer = self.model.tokenizer
        self.float_type = self.model.model.language_model.encoder.layers[0].dtype
        self.hidden_size = self.model.cfg.hidden_size
        self.word_embeddings = self.model.model.language_model.embedding.word_embeddings
        self.existing_tasks = list(self.cfg.get('existing_tasks', []))
        self.new_tasks = list(self.cfg.get('new_tasks', []))

        # Load templates for assigning virtual prompt token positions
        self.load_task_templates(self.cfg.task_templates)

        # Prompt table stores all task embeddings, p-tuning virtual prompts get added to the table after training
        self.prompt_table = PromptTable(
            existing_tasks=self.existing_tasks,
            task_templates=self.task_templates,
            task_id_num_to_name=self.task_id_num_to_name,
            hidden_size=self.hidden_size,
        )

        # Prepare pseudo token ids for virtual/virtual prompt tokens
        self.pseudo_token_base = cfg.pseudo_token_base
        self.pseudo_tokens = [
            self.pseudo_token_base + str(i)
            for i in range(self.max_virtual_tokens)
        ]
        self.tokenizer.add_special_tokens(
            {'additional_special_tokens': self.pseudo_tokens})
        self.pseudo_token_ids = self.tokenizer.tokens_to_ids(
            self.pseudo_tokens)
        self.pseudo_token_ids_start = self.pseudo_token_ids[0]
        self.pad_token_id = self.tokenizer.pad_id if self.tokenizer.pad_id is not None else self.tokenizer.unk_id
        self.virtual_prompt_style = cfg.virtual_prompt_style.lower()

        # Prompt tuning stores virtual prompts in the prompt table and tunes their weight directly
        if self.virtual_prompt_style in ['prompt-tuning', 'inference']:
            self.virtual_prompt_source = 'prompt-table'

        # P-Tuning uses an LSTM Encoder to produce virtual token embeddings
        elif self.virtual_prompt_style == 'p-tuning':
            self.virtual_prompt_source = 'prompt-encoder'
        else:
            raise ValueError(
                f"\nvirtual prompt style '{cfg.virtual_prompt_type}' not recognized, please use one of 'prompt-tuning' or 'p-tuning'"
            )

        self._reduced_loss_buffer = []
        self._inference_config = None

        if self.trainer.precision == 32:
            self.autocast_dtype = torch.float
        elif self.trainer.precision == 16:
            self.autocast_dtype = torch.half
        elif self.trainer.precision == 'bf16':
            self.autocast_dtype = torch.bfloat16
        else:
            raise ValueError('precision must be in [32, 16, "bf16"]')
Example #15
def main():
    parser = ArgumentParser()
    parser.add_argument("--model_file",
                        type=str,
                        default="",
                        required=True,
                        help="Pass path to model's .nemo file")
    parser.add_argument("--prompt",
                        type=str,
                        default="",
                        required=True,
                        help="Prompt for the model (a text to complete)")
    parser.add_argument("--tokens_to_generate",
                        type=int,
                        default="16",
                        required=False,
                        help="How many tokens to add to prompt")
    parser.add_argument(
        "--tensor_model_parallel_size",
        type=int,
        default=1,
        required=False,
    )
    parser.add_argument(
        "--pipeline_model_parallel_size",
        type=int,
        default=1,
        required=False,
    )
    parser.add_argument(
        "--pipeline_model_parallel_split_rank",
        type=int,
        default=0,
        required=False,
    )
    parser.add_argument("--precision",
                        default="16",
                        type=str,
                        help="PyTorch Lightning Trainer precision flag")
    args = parser.parse_args()

    # cast precision to int if 32 or 16
    if args.precision in ["32", "16"]:
        args.precision = int(float(args.precision))

    # trainer required for restoring model parallel models
    trainer = Trainer(
        plugins=NLPDDPPlugin(),
        devices=args.tensor_model_parallel_size *
        args.pipeline_model_parallel_size,
        accelerator='gpu',
        precision=args.precision,
    )

    app_state = AppState()
    if args.tensor_model_parallel_size > 1 or args.pipeline_model_parallel_size > 1:
        app_state.model_parallel_size = args.tensor_model_parallel_size * args.pipeline_model_parallel_size
        (
            app_state.tensor_model_parallel_rank,
            app_state.pipeline_model_parallel_rank,
            app_state.model_parallel_size,
            app_state.data_parallel_size,
            app_state.pipeline_model_parallel_split_rank,
        ) = fake_initialize_model_parallel(
            world_size=app_state.model_parallel_size,
            rank=trainer.global_rank,
            tensor_model_parallel_size_=args.tensor_model_parallel_size,
            pipeline_model_parallel_size_=args.pipeline_model_parallel_size,
            pipeline_model_parallel_split_rank_=args.pipeline_model_parallel_split_rank,
        )

    model = MegatronT5Model.restore_from(
        restore_path=args.model_file,
        trainer=trainer,
        save_restore_connector=NLPSaveRestoreConnector(),
    )
    model.freeze()

    request = {
        "prompt": args.prompt,
        "tokens_to_generate": args.tokens_to_generate,
    }

    dataset = T5RequestDataset(request, model.tokenizer)

    request_dl = DataLoader(dataset)

    response = trainer.predict(model, request_dl)

    print("***************************")
    print(response)
    print("***************************")
Example #16
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
    plugins = [
        NLPDDPPlugin(
            no_ddp_communication_hook=True,
            gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
            find_unused_parameters=False,
        )
    ]
    if cfg.trainer.precision in [16, 'bf16']:
        scaler = None
        if cfg.trainer.precision == 16:
            scaler = GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
                growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
        if megatron_amp_o2:
            plugins.append(MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))
        else:
            plugins.append(PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)

    exp_manager(trainer, cfg.exp_manager)

    # update resume from checkpoint found by exp_manager
    if cfg.model.resume_from_checkpoint is not None:
        resume_from_checkpoint = cfg.model.resume_from_checkpoint
    else:
        resume_from_checkpoint = trainer._checkpoint_connector.resume_from_checkpoint_fit_path
    logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}')

    trainer._checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint)
    # Override timer callback to a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,)

    # Get the T5 Base configuration.
    t5_cfg = MegatronT5FinetuneModel.restore_from(
        restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True
    )

    # Override the T5 configuration with the one from the config file.
    OmegaConf.set_struct(t5_cfg, True)
    with open_dict(t5_cfg):
        t5_cfg.masked_softmax_fusion = False
        t5_cfg.megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False)
        t5_cfg.hidden_dropout = cfg.model.get('hidden_dropout', 0.1)
        t5_cfg.attention_dropout = cfg.model.get('attention_dropout', 0.1)
        t5_cfg.data = cfg.model.data
        t5_cfg.precision = cfg.trainer.precision
        t5_cfg.optim = cfg.model.optim
        t5_cfg.micro_batch_size = cfg.model.data.train_ds.micro_batch_size
        t5_cfg.global_batch_size = cfg.model.data.train_ds.global_batch_size
        # XNLI has eval languages in the yaml config.
        if hasattr(cfg.model, 'eval_languages'):
            t5_cfg.eval_languages = cfg.model.eval_languages

    if hasattr(cfg.model.data.train_ds, 'task_name'):
        model = MegatronT5GLUEModel.restore_from(
            restore_path=cfg.model.restore_from_path,
            trainer=trainer,
            override_config_path=t5_cfg,
            save_restore_connector=NLPSaveRestoreConnector(),
        )
    else:
        model = MegatronT5FinetuneModel.restore_from(
            restore_path=cfg.model.restore_from_path,
            trainer=trainer,
            override_config_path=t5_cfg,
            save_restore_connector=NLPSaveRestoreConnector(),
        )

    trainer.fit(model)
    trainer.validate(model)
    if hasattr(cfg.model.data, 'test_ds'):
        trainer.test(model)
Example #17
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
    plugins = [
        NLPDDPPlugin(
            no_ddp_communication_hook=True,
            gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
            find_unused_parameters=False,
        )
    ]
    if cfg.trainer.precision in [16, 'bf16']:
        scaler = None
        if cfg.trainer.precision == 16:
            scaler = GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2**32),
                growth_interval=cfg.model.get('native_amp_growth_interval',
                                              1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
        if megatron_amp_o2:
            plugins.append(
                MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision,
                                            device='cuda',
                                            scaler=scaler))
        else:
            plugins.append(
                PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision,
                                             device='cuda',
                                             scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins,
                      **cfg.trainer,
                      callbacks=[ModelSummary(max_depth=3)])

    # tokenizers will be trained and tarred training data will be created if needed
    # model config is then updated
    if cfg.model.preproc_out_dir is not None:
        MTDataPreproc(cfg=cfg.model, trainer=trainer)

    exp_manager(trainer, cfg.exp_manager)

    # update resume from checkpoint found by exp_manager
    if cfg.model.resume_from_checkpoint is not None:
        resume_from_checkpoint = cfg.model.resume_from_checkpoint
    else:
        resume_from_checkpoint = trainer._checkpoint_connector.resume_from_checkpoint_fit_path
    logging.info(
        f'Resuming training from checkpoint: {resume_from_checkpoint}')

    trainer._checkpoint_connector = CheckpointConnector(
        trainer, resume_from_checkpoint=resume_from_checkpoint)
    # Override timer callback to a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time, )

    # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    if hasattr(cfg.model, 'pretrained_model_path') and cfg.model.pretrained_model_path is not None:
        if not hasattr(cfg.model, 'pretrained_model_type'):
            raise ValueError("Pretrained model type must be in [T5, BART].")

        assert cfg.model.pretrained_model_type in ['T5', 'BART']
        if cfg.model.pretrained_model_type == 'T5':
            pretrained_cfg = MegatronT5Model.restore_from(
                cfg.model.pretrained_model_path,
                trainer=trainer,
                return_config=True)
        else:
            pretrained_cfg = MegatronBARTModel.restore_from(
                cfg.model.pretrained_model_path,
                trainer=trainer,
                return_config=True)
        OmegaConf.set_struct(pretrained_cfg, True)
        with open_dict(pretrained_cfg):
            pretrained_cfg.masked_softmax_fusion = False
            # Set source and target language/multilingual
            pretrained_cfg.src_language = cfg.model.src_language
            pretrained_cfg.tgt_language = cfg.model.tgt_language
            pretrained_cfg.multilingual = cfg.model.multilingual
            pretrained_cfg.shared_tokenizer = True

            # Max generation delta
            pretrained_cfg.max_generation_delta = cfg.model.max_generation_delta

            # Set label smoothing
            pretrained_cfg.label_smoothing = cfg.model.label_smoothing

            # Set tokenizer paths:
            pretrained_cfg.encoder_tokenizer = pretrained_cfg.tokenizer
            pretrained_cfg.decoder_tokenizer = pretrained_cfg.tokenizer

            # Pre-trained models should use the legacy sentencepiece tokenizer ex: mT5
            pretrained_cfg.encoder_tokenizer.sentencepiece_legacy = True
            pretrained_cfg.decoder_tokenizer.sentencepiece_legacy = True

            # Override dropout
            pretrained_cfg.hidden_dropout = cfg.model.hidden_dropout
            pretrained_cfg.attention_dropout = cfg.model.attention_dropout

            # Override precision
            pretrained_cfg.precision = cfg.model.precision  # Set above from trainer.precision

            # Override data and global/micro batch size.
            pretrained_cfg.train_ds = cfg.model.train_ds
            pretrained_cfg.validation_ds = cfg.model.validation_ds
            pretrained_cfg.test_ds = cfg.model.test_ds

            pretrained_cfg.micro_batch_size = cfg.model.micro_batch_size
            pretrained_cfg.global_batch_size = cfg.model.global_batch_size

            # Class target for the new class being restored.
            pretrained_cfg.target = (
                "nemo.collections.nlp.models.machine_translation.megatron_nmt_model.MegatronNMTModel"
            )

            # Optimizer overrides.
            pretrained_cfg.optim = cfg.model.optim

        model = MegatronNMTModel.restore_from(
            cfg.model.pretrained_model_path,
            trainer=trainer,
            override_config_path=pretrained_cfg,
            save_restore_connector=NLPSaveRestoreConnector(),
        )
    else:
        model = MegatronNMTModel(cfg.model, trainer)
    if cfg.do_training:
        trainer.fit(model)

    if cfg.do_testing:
        trainer.test(model)
Example #18
    def __init__(self, cfg: DictConfig, trainer: Trainer = None):
        super().__init__(cfg, trainer)
        # handles model parallel save and restore logic
        self._save_restore_connector = NLPSaveRestoreConnector()
        self.set_world_size(trainer)