Example #1
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    trainer = None
    if cfg.trainer.precision == 16:
        trainer = Trainer(
            plugins=[
                NLPDDPPlugin(),
                NLPNativeMixedPrecisionPlugin(
                    init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
                    growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
                ),
            ],
            **cfg.trainer,
        )
    elif cfg.trainer.precision == 'bf16':
        trainer = Trainer(plugins=[NLPDDPPlugin(), NLPNativeBfloat16PrecisionPlugin(),], **cfg.trainer,)
    else:
        trainer = Trainer(plugins=[NLPDDPPlugin(), NLPPrecisionPlugin()], **cfg.trainer)

    app_state = AppState()
    app_state.model_parallel_size = cfg.model.tensor_model_parallel_size
    app_state.model_parallel_rank = compute_model_parallel_rank(trainer.local_rank, app_state.model_parallel_size)

    model = MegatronGPTModel.restore_from(
        cfg.restore_from_path, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector(),
    )

    # Note: most NeMo models must have the data paths configured before instantiating the model.
    # MegatronGPTModel sets up the data in the PTL method .setup, which happens after DDP spawns.
    model.cfg.data.splits_string = cfg.model.data.splits_string

    trainer.test(model)
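
A note on wiring (a minimal sketch, not part of the example): main(cfg) entry points like the ones in this listing are normally decorated with NeMo's hydra_runner so that Hydra builds and passes the cfg object; the config directory and name below are illustrative assumptions.

# Sketch of the usual entry-point wiring; "conf" and "megatron_gpt_config" are assumptions.
from nemo.core.config import hydra_runner
from nemo.utils import logging
from omegaconf import OmegaConf


@hydra_runner(config_path="conf", config_name="megatron_gpt_config")
def main(cfg) -> None:
    # Hydra passes the merged config; the script body shown above would go here.
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')


if __name__ == '__main__':
    main()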
Example #2
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    plugins = [NLPDDPPlugin(num_nodes=cfg.trainer.num_nodes)]

    trainer = Trainer(plugins=plugins, **cfg.trainer)
    exp_manager(trainer, cfg.exp_manager)
    model = MegatronGPTModel.restore_from(cfg.restore_from_path,
                                          cfg.model,
                                          trainer=trainer)

    # Init all new prompts
    for idx, tag in enumerate(cfg.model.new_prompt_tags):
        init_method = cfg.model.new_prompt_init_methods[idx]

        if init_method == "text":
            init_text = cfg.model.new_prompt_init_text[idx]
            model.init_prompt_from_text(tag, init_text)

        elif init_method == 'random':
            model.init_prompt_from_random(tag)

        else:
            logging.info(
                f'\n Soft prompt init method {init_method} is not recognized, please use text or random'
            )

    logging.info(f'\nCurrent soft prompts include {model.get_prompt_table()}')
    trainer.fit(model)
Example #3
def main(cfg: MTBottleneckConfig) -> None:
    # merge default config with user specified config
    default_cfg = MTBottleneckConfig()
    cfg = update_model_config(default_cfg, cfg)
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')

    # training is managed by PyTorch Lightning
    trainer_cfg = OmegaConf.to_container(cfg.trainer)
    trainer_cfg.pop('plugins', None)
    trainer = Trainer(plugins=[NLPDDPPlugin(num_nodes=cfg.trainer.num_nodes)], **trainer_cfg)

    # tokenizers will be trained and tarred training data will be created if needed
    # model config is then updated
    if cfg.model.preproc_out_dir is not None:
        MTDataPreproc(cfg=cfg.model, trainer=trainer)

    # experiment logs, checkpoints, and auto-resume are managed by exp_manager and PyTorch Lightning
    exp_manager(trainer, cfg.exp_manager)

    # everything needed to train translation models is encapsulated in the NeMo MTEncdDecModel
    mt_model = MTBottleneckModel(cfg.model, trainer=trainer)

    logging.info("\n\n************** Model parameters and their sizes ***********")
    for name, param in mt_model.named_parameters():
        print(name, param.size())
    logging.info("***********************************************************\n\n")

    if cfg.do_training:
        trainer.fit(mt_model)

    if cfg.do_testing:
        trainer.test(mt_model)
Example #4
    def setup_class(cls):
        if not torch.cuda.is_available():
            return
        GPUS = 1
        plugins = [NLPDDPPlugin()]
        TP_SIZE = GPUS
        PP_SIZE = 1
        MB_SIZE = 4
        GB_SIZE = 8
        SEED = 1234
        trainer = Trainer(
            plugins=plugins, devices=GPUS, accelerator='gpu', num_nodes=1, logger=None, log_gpu_memory=None
        )

        initialize_model_parallel_for_nemo(
            world_size=trainer.world_size,
            global_rank=trainer.global_rank,
            local_rank=trainer.local_rank,
            tensor_model_parallel_size=TP_SIZE,
            pipeline_model_parallel_size=PP_SIZE,
            micro_batch_size=MB_SIZE,
            global_batch_size=GB_SIZE,
            seed=SEED,
            apex_transformer_log_level=30,
        )

        def dummy():
            return

        if trainer.strategy.launcher is not None:
            trainer.strategy.launcher.launch(dummy, trainer=trainer)
        trainer.strategy.setup_environment()
        torch.distributed.barrier()
Example #5
    def setup_method(self, test_method):
        trainer_config = {
            "devices": 1,
            "num_nodes": 1,
            "accelerator": "gpu",
            "logger": False,
            "precision": 16,
        }
        tensor_model_parallel_size = 1
        pipeline_model_parallel_size = 1
        model_file = '/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo'

        # trainer required for restoring model parallel models
        trainer = Trainer(plugins=NLPDDPPlugin(), **trainer_config)
        assert (
            trainer_config["devices"] *
            trainer_config['num_nodes'] == tensor_model_parallel_size *
            pipeline_model_parallel_size
        ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size"

        model = MegatronGPTModel.restore_from(restore_path=model_file,
                                              trainer=trainer)
        model.freeze()

        # has to turn off activations_checkpoint_method for inference
        try:
            model.model.language_model.encoder.activations_checkpoint_method = None
        except AttributeError:
            pass

        self.model = model
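
    # A test that could follow the setup above (a sketch for illustration only).
    # The generate() call mirrors the usage shown in Example #20; the prompt and
    # parameter values are assumptions, not taken from the original snippet.
    def test_greedy_generation(self):
        if not torch.cuda.is_available():
            return
        length_params = {"max_length": 16, "min_length": 0}
        sampling_params = {
            "use_greedy": True,
            "temperature": 1.0,
            "top_k": 0,
            "top_p": 0.9,
            "repetition_penalty": 1.0,
            "add_BOS": True,
            "all_probs": False,
            "compute_logprob": False,
        }
        response = self.model.generate(
            inputs=["Deep learning is"],
            length_params=length_params,
            sampling_params=sampling_params,
        )
        assert response is not None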
Example #6
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
    plugins = [
        NLPDDPPlugin(
            no_ddp_communication_hook=(megatron_amp_o2
                                       and cfg.trainer.precision == 'bf16'
                                       ),  # Only bf16 uses fp32_grad_accum.
            gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
            find_unused_parameters=False,
        )
    ]
    if cfg.trainer.precision in [16, 'bf16']:
        scaler = None
        if cfg.trainer.precision == 16:
            scaler = GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2**32),
                growth_interval=cfg.model.get('native_amp_growth_interval',
                                              1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
        if megatron_amp_o2:
            plugins.append(
                MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision,
                                            device='cuda',
                                            scaler=scaler))
        else:
            plugins.append(
                NativeMixedPrecisionPlugin(precision=cfg.trainer.precision,
                                           device='cuda',
                                           scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)

    exp_manager(trainer, cfg.exp_manager)

    # update resume from checkpoint found by exp_manager
    resume_from_checkpoint = trainer.checkpoint_connector.resume_from_checkpoint_fit_path
    logging.info(
        f'Resuming training from checkpoint: {resume_from_checkpoint}')

    trainer.checkpoint_connector = CheckpointConnector(
        trainer, resume_from_checkpoint=resume_from_checkpoint)
    # Override timer callback to a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time, )

    # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    model = MegatronXNlIModel(cfg.model, trainer)
    trainer.fit(model)
    trainer.test(model)
Example #7
def main(cfg: DictConfig) -> None:
    logging.info(f'\nConfig Params:\n{OmegaConf.to_yaml(cfg)}')
    trainer = pl.Trainer(plugins=[NLPDDPPlugin()], **cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    # TODO: can we drop strict=False
    model = TextClassificationModel.restore_from(cfg.model.nemo_path, trainer=trainer, strict=False)
    model.setup_test_data(test_data_config=cfg.model.test_ds)

    trainer.test(model=model, ckpt_path=None)
Example #8
def main():
    parser = ArgumentParser()
    parser.add_argument("--model_file",
                        type=str,
                        default="",
                        required=True,
                        help="Pass path to model's .nemo file")
    parser.add_argument("--prompt",
                        type=str,
                        default="",
                        required=True,
                        help="Prompt for the model (a text to complete)")
    parser.add_argument("--tokens_to_generate",
                        type=int,
                        default="16",
                        required=False,
                        help="How many tokens to add to prompt")
    parser.add_argument(
        "--tensor_model_parallel_size",
        type=int,
        default=1,
        required=True,
    )

    args = parser.parse_args()

    torch.set_grad_enabled(False)

    # trainer required for restoring model parallel models
    trainer = Trainer(plugins=NLPDDPPlugin(),
                      devices=args.tensor_model_parallel_size,
                      precision=16,
                      accelerator='gpu')

    app_state = AppState()
    if args.tensor_model_parallel_size > 1:
        app_state.model_parallel_size = args.tensor_model_parallel_size
        app_state.model_parallel_rank = compute_model_parallel_rank(
            trainer.local_rank, app_state.model_parallel_size)

    model = MegatronT5Model.restore_from(restore_path=args.model_file,
                                         trainer=trainer)
    model.freeze()
    request = {
        "prompt": args.prompt,
        "tokens_to_generate": args.tokens_to_generate,
    }

    dataset = T5RequestDataset(request, model.tokenizer)

    request_dl = DataLoader(dataset)

    response = trainer.predict(model, request_dl)

    print("***************************")
    print(response)
    print("***************************")
Example #9
def main(cfg) -> None:

    # trainer required for restoring model parallel models
    trainer = Trainer(plugins=NLPDDPPlugin(), **cfg.trainer)
    assert (
        cfg.trainer.devices * cfg.trainer.num_nodes
        == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
    ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size"

    app_state = AppState()
    app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
    (
        app_state.tensor_model_parallel_rank,
        app_state.pipeline_model_parallel_rank,
        app_state.model_parallel_size,
        app_state.data_parallel_size,
        app_state.pipeline_model_parallel_split_rank,
    ) = fake_initialize_model_parallel(
        world_size=app_state.model_parallel_size,
        rank=trainer.global_rank,
        tensor_model_parallel_size_=cfg.tensor_model_parallel_size,
        pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size,
        pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank,
    )

    if cfg.model_file is not None:
        if not os.path.exists(cfg.model_file):
            raise ValueError(f"Model file {cfg.model_file} does not exist")
        model = MegatronNMTModel.restore_from(
            restore_path=cfg.model_file, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector(),
        )
    elif cfg.checkpoint_dir is not None:
        checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name))
        model = MegatronNMTModel.load_from_checkpoint(checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer)
    else:
        raise ValueError("need at least a nemo file or checkpoint dir")

    model.freeze()

    logging.info(f"Translating: {cfg.srctext}")
    src_text = []
    translations = []
    with open(cfg.srctext, 'r') as src_f, open(cfg.tgtout, 'w') as tgt_f:
        for line in src_f:
            src_text.append(line.strip())
            if len(src_text) == cfg.batch_size:
                translations = model.translate(
                    text=src_text, source_lang=cfg.source_lang, target_lang=cfg.target_lang,
                )
                for translation in translations:
                    tgt_f.write(translation + "\n")
                src_text = []
        if len(src_text) > 0:
            translations = model.translate(text=src_text, source_lang=cfg.source_lang, target_lang=cfg.target_lang,)
            for translation in translations:
                tgt_f.write(translation + "\n")
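
The read-a-batch, translate, flush pattern in the loop above can also be factored into a small generator; a sketch (the helper name is an illustrative choice, not from the original script):

def batch_lines(path, batch_size):
    """Yield lists of stripped lines from `path`, at most `batch_size` lines per list."""
    batch = []
    with open(path, 'r') as f:
        for line in f:
            batch.append(line.strip())
            if len(batch) == batch_size:
                yield batch
                batch = []
    if batch:
        yield batch

# Usage mirroring the example above:
# with open(cfg.tgtout, 'w') as tgt_f:
#     for src_batch in batch_lines(cfg.srctext, cfg.batch_size):
#         for translation in model.translate(text=src_batch, source_lang=cfg.source_lang, target_lang=cfg.target_lang):
#             tgt_f.write(translation + "\n")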
Example #10
def main(cfg: DictConfig) -> None:
    logging.info(f'\nConfig Params:\n{OmegaConf.to_yaml(cfg)}')
    trainer = pl.Trainer(plugins=[NLPDDPPlugin()], **cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))

    model = TextClassificationModel.restore_from(cfg.model.nemo_path,
                                                 trainer=trainer)
    model.setup_test_data(test_data_config=cfg.model.test_ds)

    trainer.test(model=model, ckpt_path=None)
Example #11
def main(args):
    config_path = args.config
    config = OmegaConf.load(config_path)

    config.trainer.gpus = args.gpus
    print(OmegaConf.to_yaml(config))

    trainer = pl.Trainer(plugins=[NLPDDPPlugin()], **config.trainer)
    exp_dir = exp_manager(trainer, config.get("exp_manager", None))

    model = GLUEModel(cfg=config.model, trainer=trainer)

    trainer.fit(model)
Example #12
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
    plugins = [
        NLPDDPPlugin(
            num_nodes=cfg.trainer.num_nodes,
            no_ddp_communication_hook=(megatron_amp_o2
                                       and cfg.trainer.precision == 'bf16'
                                       ),  # Only bf16 uses fp32_grad_accum.
            gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
            find_unused_parameters=False,
        )
    ]
    if cfg.trainer.precision in [16, 'bf16']:
        scaler = None
        if cfg.trainer.precision == 16:
            scaler = GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2**32),
                growth_interval=cfg.model.get('native_amp_growth_interval',
                                              1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
        if megatron_amp_o2:
            plugins.append(
                MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision,
                                            device='cuda',
                                            scaler=scaler))
        else:
            plugins.append(
                NativeMixedPrecisionPlugin(precision=cfg.trainer.precision,
                                           device='cuda',
                                           scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)
    exp_manager(trainer, cfg.exp_manager)

    # Override timer callback to a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time, )

    model = MegatronT5GLUEModel.restore_from(
        restore_path=cfg.model.restore_from_finetuned_path, trainer=trainer)
    model.freeze()

    trainer.validate(model)
Example #13
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    plugins = [NLPDDPPlugin(num_nodes=cfg.trainer.num_nodes)]
    if cfg.trainer.precision == 16:
        scaler = GradScaler(
            init_scale=cfg.model.get('native_amp_init_scale', 2**32),
            growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
        )
        plugins.append(
            NativeMixedPrecisionPlugin(precision=16,
                                       device='cuda',
                                       scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)

    exp_manager(trainer, cfg.exp_manager)

    # update resume from checkpoint found by exp_manager
    resume_from_checkpoint = trainer.checkpoint_connector.resume_from_checkpoint_fit_path
    if resume_from_checkpoint is not None:
        # inject mp_rank into resume_from_checkpoint
        if cfg.model.tensor_model_parallel_size is not None and cfg.model.tensor_model_parallel_size > 1:
            mp_rank = compute_model_parallel_rank(
                trainer.local_rank, cfg.model.tensor_model_parallel_size)
            resume_from_checkpoint = Path(resume_from_checkpoint)
            resume_from_checkpoint = resume_from_checkpoint.parent.parent.joinpath(
                f'mp_rank_{mp_rank:02d}').joinpath(resume_from_checkpoint.name)
            resume_from_checkpoint = str(resume_from_checkpoint)
        logging.info(
            f'Resuming training from checkpoint: {resume_from_checkpoint}')

    trainer.checkpoint_connector = CheckpointConnector(
        trainer, resume_from_checkpoint=resume_from_checkpoint)
    # Override timer callback to a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time, )

    # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    model = MegatronT5Model(cfg.model, trainer)

    trainer.fit(model)
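
The checkpoint-path rewrite used above (and again in Example #17), which inserts an mp_rank_XX directory two levels up from the checkpoint file, can be isolated into a small helper; a sketch with an illustrative name:

from pathlib import Path


def inject_mp_rank_into_path(checkpoint_path: str, mp_rank: int) -> str:
    # e.g. '/exp/checkpoints/last.ckpt' with mp_rank=1 -> '/exp/mp_rank_01/last.ckpt'
    p = Path(checkpoint_path)
    return str(p.parent.parent / f'mp_rank_{mp_rank:02d}' / p.name)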
Example #14
def main(cfg: DictConfig) -> None:
    try:
        plugin = NLPDDPPlugin()
    except (ImportError, ModuleNotFoundError):
        plugin = None

    trainer = pl.Trainer(plugins=plugin, **cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))

    if not cfg.pretrained_model:
        logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
        model = TokenClassificationModel(cfg.model, trainer=trainer)
    else:
        if os.path.exists(cfg.pretrained_model):
            # TODO: can we drop strict=False?
            model = TokenClassificationModel.restore_from(cfg.pretrained_model,
                                                          trainer=trainer,
                                                          strict=False)
        elif cfg.pretrained_model in TokenClassificationModel.get_available_model_names():
            model = TokenClassificationModel.from_pretrained(
                cfg.pretrained_model)
        else:
            raise ValueError(
                f'Provide path to the pre-trained .nemo file or choose from {TokenClassificationModel.list_available_models()}'
            )

        data_dir = cfg.model.dataset.get('data_dir', None)
        if data_dir:
            if not os.path.exists(data_dir):
                raise ValueError(f'{data_dir} is not found')

            # we can also do finetuning of the pretrained model but it will require
            # setup the data dir to get class weights statistics
            model.update_data_dir(data_dir=data_dir)
            # finally, setup train and validation Pytorch DataLoaders
            model.setup_training_data()
            model.setup_validation_data()
            # then we're setting up loss, use model.dataset.class_balancing,
            # if you want to add class weights to the CrossEntropyLoss
            model.setup_loss(class_balancing=cfg.model.dataset.class_balancing)
            logging.info(f'Using config file of the pretrained model')
        else:
            raise ValueError(
                'Specify a valid dataset directory that contains test_ds.text_file and test_ds.labels_file \
                with "model.dataset.data_dir" argument')

    trainer.fit(model)
Example #15
def main(args):
    config_path = args.qa_config
    config = OmegaConf.load(config_path)

    config.trainer.gpus = args.gpus
    print(OmegaConf.to_yaml(config))

    trainer = pl.Trainer(plugins=[NLPDDPPlugin()], **config.trainer)
    exp_dir = exp_manager(trainer, config.get("exp_manager", None))

    model_qa = nemo_nlp.models.QAModel(cfg=config.model, trainer=trainer)

    trainer.fit(model_qa)

    # model_qa.setup_test_data(test_data_config=config.model.test_ds)
    trainer.test(model_qa)
Example #16
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    plugins = [
        NLPDDPPlugin(
            no_ddp_communication_hook=True,
            find_unused_parameters=False,
        )
    ]
    if cfg.trainer.precision == 16:
        scaler = GradScaler(
            init_scale=cfg.model.get('native_amp_init_scale', 2**32),
            growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
            hysteresis=cfg.model.get('hysteresis', 2),
        )
        plugins.append(
            PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision,
                                         device='cuda',
                                         scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)
    exp_manager(trainer, cfg.exp_manager)

    # Override timer callback to a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time, )

    # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    # load existing or init new soft prompt GPT model
    if cfg.model.get("restore_path", None):
        model = MegatronGPTPromptLearningModel.restore_from(
            cfg.model.restore_path,
            cfg.model,
            trainer=trainer,
            save_restore_connector=NLPSaveRestoreConnector())
    else:
        model = MegatronGPTPromptLearningModel(cfg.model, trainer=trainer)

    trainer.fit(model)
Example #17
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    plugins = [NLPDDPPlugin(num_nodes=cfg.trainer.num_nodes)]
    if cfg.trainer.precision == 16:
        plugins.append(
            NLPNativeMixedPrecisionPlugin(
                init_scale=cfg.model.get('native_amp_init_scale', 2**32),
                growth_interval=cfg.model.get('native_amp_growth_interval',
                                              1000),
            ))
    elif cfg.trainer.precision == 'bf16':
        plugins.append(NLPNativeBfloat16PrecisionPlugin())
    else:
        plugins.append(NLPPrecisionPlugin())

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)

    exp_manager(trainer, cfg.exp_manager)

    # update resume from checkpoint found by exp_manager
    resume_from_checkpoint = trainer.resume_from_checkpoint
    if resume_from_checkpoint is not None:
        mp_rank = compute_model_parallel_rank(
            trainer.local_rank, cfg.model.tensor_model_parallel_size)
        resume_from_checkpoint = Path(resume_from_checkpoint)
        resume_from_checkpoint = resume_from_checkpoint.parent.parent.joinpath(
            f'mp_rank_{mp_rank:02d}').joinpath(resume_from_checkpoint.name)
        resume_from_checkpoint = str(resume_from_checkpoint)
        logging.info(
            f'Resuming training from checkpoint: {resume_from_checkpoint}')

    trainer.checkpoint_connector = CheckpointConnector(
        trainer, resume_from_checkpoint=resume_from_checkpoint)
    # Override timer callback to a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time, )

    model = MegatronGPTModel(cfg.model, trainer)

    trainer.fit(model)
Example #18
def main(args):
    config_path = args.re_config
    config = OmegaConf.load(config_path)

    config.trainer.gpus = args.gpus
    print(OmegaConf.to_yaml(config))

    trainer = pl.Trainer(plugins=[NLPDDPPlugin()], **config.trainer)
    exp_dir = exp_manager(trainer, config.get("exp_manager", None))

    model = nemo_nlp.models.TextClassificationModel(cfg=config.model,
                                                    trainer=trainer)

    trainer.fit(model)

    model.save_to(config.model.nemo_path)

    all_preds = []
    mode = model.training
    with torch.no_grad():
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        model.eval()
        model.to(device)
        # infer_datalayer = model._setup_infer_dataloader(queries, batch_size)
        ts_dl = model._setup_dataloader_from_config(cfg=config.model.test_ds)
        for i, batch in enumerate(ts_dl):
            if i == 0:
                print(batch)
            input_ids, input_type_ids, input_mask, subtokens_mask = batch

            logits = model.forward(
                input_ids=input_ids.to(device),
                token_type_ids=input_type_ids.to(device),
                attention_mask=input_mask.to(device),
            )
            preds = tensor2list(torch.argmax(logits, axis=-1))
            all_preds.extend(preds)

    output_path = Path(args.pred_output)
    output_path.mkdir(parents=True, exist_ok=True)
    with open(output_path / "predict_labels.txt", "w") as f:
        f.write("\n".join([str(e) for e in all_preds]))
Example #19
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    plugins = [NLPDDPPlugin(find_unused_parameters=False)]
    if cfg.trainer.precision == 16:
        scaler = GradScaler(
            init_scale=cfg.model.get('native_amp_init_scale', 2**32),
            growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
        )
        plugins.append(
            NativeMixedPrecisionPlugin(precision=16,
                                       device='cuda',
                                       scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)

    exp_manager(trainer, cfg.exp_manager)

    # update resume from checkpoint found by exp_manager
    resume_from_checkpoint = trainer.checkpoint_connector.resume_from_checkpoint_fit_path
    logging.info(
        f'Resuming training from checkpoint: {resume_from_checkpoint}')

    trainer.checkpoint_connector = CheckpointConnector(
        trainer, resume_from_checkpoint=resume_from_checkpoint)
    # Override timer callback to a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time, )

    # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    model = MegatronBertModel(cfg.model, trainer)

    trainer.fit(model)
Example #20
def main(cfg) -> None:

    # trainer required for restoring model parallel models
    trainer = Trainer(plugins=NLPDDPPlugin(), **cfg.trainer)
    assert (
        cfg.trainer.devices *
        cfg.trainer.num_nodes == cfg.tensor_model_parallel_size *
        cfg.pipeline_model_parallel_size
    ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size"

    # Load prompt tuned model, virtual_prompt_model_file must be provided in config
    if cfg.get('virtual_prompt_model_file', None) is not None:

        # Update frozen GPT model path in case it has changed
        prompt_learning_cfg = MegatronGPTPromptLearningModel.restore_from(
            cfg.virtual_prompt_model_file, trainer=trainer, return_config=True)
        with open_dict(prompt_learning_cfg):
            prompt_learning_cfg.language_model_path = cfg.gpt_model_file

        # Now load prompt learning model with frozen gpt model base
        model = MegatronGPTPromptLearningModel.restore_from(
            restore_path=cfg.virtual_prompt_model_file,
            trainer=trainer,
            override_config_path=prompt_learning_cfg)

    # Or load regular GPT model
    elif cfg.gpt_model_file:
        model = MegatronGPTModel.restore_from(restore_path=cfg.gpt_model_file,
                                              trainer=trainer)
    elif cfg.checkpoint_dir:
        app_state = AppState()
        if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1:
            app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
            (
                app_state.tensor_model_parallel_rank,
                app_state.pipeline_model_parallel_rank,
                app_state.model_parallel_size,
                app_state.data_parallel_size,
                app_state.pipeline_model_parallel_split_rank,
            ) = fake_initialize_model_parallel(
                world_size=app_state.model_parallel_size,
                rank=trainer.global_rank,
                tensor_model_parallel_size_=cfg.tensor_model_parallel_size,
                pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size,
                pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank,
            )
        checkpoint_path = inject_model_parallel_rank(
            os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name))
        model = MegatronGPTModel.load_from_checkpoint(
            checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer)
    else:
        raise ValueError("need at least a nemo file or checkpoint dir")

    model.freeze()

    # Have to turn off activations_checkpoint_method for inference
    try:
        model.model.language_model.encoder.activations_checkpoint_method = None
    except AttributeError:
        pass

    try:
        model.frozen_model.language_model.encoder.activations_checkpoint_method = None
    except AttributeError:
        pass

    length_params: LengthParam = {
        "max_length": cfg.inference.tokens_to_generate,
        "min_length": cfg.inference.min_tokens_to_generate,
    }

    sampling_params: SamplingParam = {
        "use_greedy": cfg.inference.greedy,
        "temperature": cfg.inference.temperature,
        "top_k": cfg.inference.top_k,
        "top_p": cfg.inference.top_p,
        "repetition_penalty": cfg.inference.repetition_penalty,
        "add_BOS": cfg.inference.add_BOS,
        "all_probs": cfg.inference.all_probs,
        "compute_logprob": cfg.inference.compute_logprob,
    }

    # First method of running text generation, call model.generate method
    response = model.generate(inputs=OmegaConf.to_container(cfg.prompts),
                              length_params=length_params,
                              sampling_params=sampling_params)

    print("***************************")
    print(response)
    print("***************************")

    # Second method of running text generation, call trainer.predict
    collate_fn = None
    if cfg.get('virtual_prompt_model', False):
        collate_fn = lambda x: list(x)

    ds = RequestDataSet(OmegaConf.to_container(cfg.prompts))
    request_dl = DataLoader(dataset=ds, collate_fn=collate_fn, batch_size=2)

    config = OmegaConf.to_container(cfg.inference)
    model.set_inference_config(config)
    response = trainer.predict(model, request_dl)

    print("***************************")
    print(response)
    print("***************************")

    # Third method of running text generation, use inference server
    if cfg.server:
        if parallel_state.is_pipeline_first_stage() and parallel_state.get_tensor_model_parallel_rank() == 0:
            server = MegatronServer(model.cuda())
            server.run("0.0.0.0", port=cfg.port)

        while True:
            choice = torch.cuda.LongTensor(1)
            torch.distributed.broadcast(choice, 0)
            if choice[0].item() == 0:
                generate(model.cuda())
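
RequestDataSet above wraps the list of prompt strings so that trainer.predict can iterate over them; a minimal stand-in with the same behaviour might look like this (a sketch, not the class the original script imports):

from torch.utils.data import Dataset


class SimpleRequestDataSet(Dataset):
    """Serve raw prompt strings one at a time for trainer.predict."""

    def __init__(self, prompts):
        self.prompts = list(prompts)

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, idx):
        return self.prompts[idx]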
Example #21
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    # setup the data processor
    for processor_config in cfg.model.task_processors:
        processor = TemplateProcessor(
            template=processor_config.template, limit_length_field=processor_config.limit_length_field
        )
        register_taskdata_processor(processor_config.taskname, processor)

    plugins = [NLPDDPPlugin()]
    if cfg.trainer.precision == 16:
        scaler = GradScaler(
            init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
            growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
        )
        plugins.append(NativeMixedPrecisionPlugin(precision=16, device='cuda', scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)

    exp_manager(trainer, cfg.exp_manager)

    # update resume from checkpoint found by exp_manager
    resume_from_checkpoint = trainer._checkpoint_connector.resume_from_checkpoint_fit_path
    logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}')

    trainer._checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint)
    # Override timer callback to a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,)

    # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision
    model = MegatronT5PTuneModel(cfg.model, trainer)
    trainer.fit(model)

    if cfg.model.data.test_ds.file_path:
        logging.info("===========================================================================================")
        logging.info("Starting the testing of the trained model on test set...")
        trainer.test(model)
        logging.info("Testing finished!")
        logging.info("===========================================================================================")
        # extract the path of the best checkpoint from the training, you may update it to any checkpoint
        checkpoint_path = trainer.checkpoint_callback.best_model_path
        tensor_parallel_size = cfg.model.tensor_model_parallel_size
        pathobj = Path(checkpoint_path)
        checkpoint_folder = str(pathobj.parent)
        checkpoint_name = str(pathobj.name)

        rank = trainer.accelerator.training_type_plugin.local_rank
        if tensor_parallel_size > 1:
            # inject model parallel rank
            checkpoint_path = os.path.join(checkpoint_folder, f'mp_rank_{rank:02d}', checkpoint_name)
        else:
            checkpoint_path = os.path.join(checkpoint_folder, checkpoint_name)

        # Load the checkpoint
        best_eval_model = MegatronT5PTuneModel.load_from_checkpoint(
            checkpoint_path=checkpoint_path, strict=False, trainer=trainer
        )
        logging.info(f'Best checkpoint path: {checkpoint_path}')
        logging.info("Running Test with best EVAL checkpoint!")
        # setup the test dataset
        #  best_eval_model.setup_test_data(test_data_config=cfg.model.data.test_ds)
        if torch.distributed.is_initialized():
            torch.distributed.barrier()
        trainer.test(model=best_eval_model, ckpt_path=None, verbose=False)
        logging.info("Beset EVAL Testing finished!")
        logging.info("===========================================================================================")

    if cfg.model.nemo_path:
        # '.nemo' file contains the last checkpoint and the params to initialize the model
        best_eval_model.save_to(cfg.model.nemo_path)
        logging.info(f'Model is saved into `.nemo` file: {cfg.model.nemo_path}')

    # perform inference on a list of queries.
    if "infer_samples" in cfg.model and cfg.model.infer_samples:
        logging.info("===========================================================================================")
        logging.info("Starting the inference on some sample queries...")

        # max_seq_length=512 is the maximum length BERT supports.
        results = best_eval_model.cuda().ptune_inference(
            queries=cfg.model.infer_samples, batch_size=1, decode_token_len=5
        )
        logging.info('The prediction results of some sample queries with the trained model:')
        for query, result in zip(cfg.model.infer_samples, results):
            logging.info(f'Query : {query}')
            logging.info(f'Predicted label: {result}')

        logging.info("Inference finished!")
        logging.info("===========================================================================================")
Example #22
def main(args):
    config_path = args.ner_config
    config = OmegaConf.load(config_path)

    config.trainer.gpus = args.gpus
    print(OmegaConf.to_yaml(config))

    trainer = pl.Trainer(plugins=[NLPDDPPlugin()], **config.trainer)
    exp_dir = exp_manager(trainer, config.get("exp_manager", None))
    model_ner = None

    if args.do_train:
        if not config.pretrained_model:
            model_ner = nemo_nlp.models.TokenClassificationModel(
                cfg=config.model, trainer=trainer)
        else:
            model_ner = nemo_nlp.models.TokenClassificationModel.restore_from(
                config.pretrained_model)

        trainer.fit(model_ner)

        try:
            if config.model.nemo_path:
                model_ner.save_to(config.model.nemo_path)
        except:
            traceback.print_exc()
            config.model.nemo_path = Path(
                str(exp_dir)) / "checkpoints/default.nemo"

        dev_text = Path(
            config.model.dataset.data_dir) / config.model.test_ds.text_file
        dev_label = Path(
            config.model.dataset.data_dir) / config.model.test_ds.labels_file
        dev_preds = predict_in_sent(dev_text,
                                    model_ner,
                                    bz=config.model.test_ds.batch_size,
                                    combine=False)

        try:
            dev_gs = load_gs_labels(dev_label)
            bio_eval = BioEval()
            bio_eval.eval_mem(dev_gs, dev_preds, do_flat=False)
            bio_eval.show_evaluation()
        except:
            pass

        dev_preds = predict_in_sent(dev_text,
                                    model_ner,
                                    bz=config.model.test_ds.batch_size,
                                    combine=True)
        pout = Path(args.pred_output)
        pout.mkdir(exist_ok=True)
        ofn = pout / "labels_test.txt"
        with open(ofn, "w") as f:
            f.write("\n".join(dev_preds))

    if args.do_pred and args.pred_output:
        trainer = pl.Trainer(plugins=[NLPDDPPlugin()], **config.trainer)
        exp_dir = exp_manager(trainer, config.get("exp_manager", None))

        try:
            model_ner = nemo_nlp.models.TokenClassificationModel.restore_from(
                config.model.nemo_path, trainer=trainer, strict=False)
        except:
            traceback.print_exc()
            config.model.nemo_path = str(exp_dir) + "/checkpoints/default.nemo"
            model_ner = nemo_nlp.models.TokenClassificationModel.restore_from(
                config.model.nemo_path, trainer=trainer, strict=False)

        text_file = Path(
            config.model.dataset.data_dir) / config.model.test_ds.text_file

        res = predict_in_sent(text_file, model_ner, combine=True)

        test_label = Path(
            config.model.dataset.data_dir) / config.model.test_ds.labels_file
        test_gs = load_gs_labels(test_label)
        bio_eval = BioEval()
        bio_eval.eval_mem(test_gs, res, do_flat=False)
        bio_eval.show_evaluation()

        pout = Path(args.pred_output)
        pout.mkdir(exist_ok=True)
        ofn = pout / "labels_test1.txt"
        with open(ofn, "w") as f:
            f.write("\n".join(res))
Example #23
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
    plugins = [
        NLPDDPPlugin(
            no_ddp_communication_hook=True,
            gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
            find_unused_parameters=False,
        )
    ]
    if cfg.trainer.precision in [16, 'bf16']:
        scaler = None
        if cfg.trainer.precision == 16:
            scaler = GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2**32),
                growth_interval=cfg.model.get('native_amp_growth_interval',
                                              1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
        if megatron_amp_o2:
            plugins.append(
                MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision,
                                            device='cuda',
                                            scaler=scaler))
        else:
            plugins.append(
                PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision,
                                             device='cuda',
                                             scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins,
                      **cfg.trainer,
                      callbacks=[ModelSummary(max_depth=3)])

    # tokenizers will be trained and tarred training data will be created if needed
    # model config is then updated
    if cfg.model.preproc_out_dir is not None:
        MTDataPreproc(cfg=cfg.model, trainer=trainer)

    exp_manager(trainer, cfg.exp_manager)

    # update resume from checkpoint found by exp_manager
    if cfg.model.resume_from_checkpoint is not None:
        resume_from_checkpoint = cfg.model.resume_from_checkpoint
    else:
        resume_from_checkpoint = trainer._checkpoint_connector.resume_from_checkpoint_fit_path
    logging.info(
        f'Resuming training from checkpoint: {resume_from_checkpoint}')

    trainer._checkpoint_connector = CheckpointConnector(
        trainer, resume_from_checkpoint=resume_from_checkpoint)
    # Override timer callback to a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time, )

    # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    if hasattr(cfg.model, 'pretrained_model_path'
               ) and cfg.model.pretrained_model_path is not None:
        if not hasattr(cfg.model, 'pretrained_model_type'):
            raise ValueError(f"Pretrained model type must be in [T5, BART].")

        assert cfg.model.pretrained_model_type in ['T5', 'BART']
        if cfg.model.pretrained_model_type == 'T5':
            pretrained_cfg = MegatronT5Model.restore_from(
                cfg.model.pretrained_model_path,
                trainer=trainer,
                return_config=True)
        else:
            pretrained_cfg = MegatronBARTModel.restore_from(
                cfg.model.pretrained_model_path,
                trainer=trainer,
                return_config=True)
        OmegaConf.set_struct(pretrained_cfg, True)
        with open_dict(pretrained_cfg):
            pretrained_cfg.masked_softmax_fusion = False
            # Set source and target language/multilingual
            pretrained_cfg.src_language = cfg.model.src_language
            pretrained_cfg.tgt_language = cfg.model.tgt_language
            pretrained_cfg.multilingual = cfg.model.multilingual
            pretrained_cfg.shared_tokenizer = True

            # Max generation delta
            pretrained_cfg.max_generation_delta = cfg.model.max_generation_delta

            # Set label smoothing
            pretrained_cfg.label_smoothing = cfg.model.label_smoothing

            # Set tokenizer paths:
            pretrained_cfg.encoder_tokenizer = pretrained_cfg.tokenizer
            pretrained_cfg.decoder_tokenizer = pretrained_cfg.tokenizer

            # Pre-trained models should use the legacy sentencepiece tokenizer ex: mT5
            pretrained_cfg.encoder_tokenizer.sentencepiece_legacy = True
            pretrained_cfg.decoder_tokenizer.sentencepiece_legacy = True

            # Override dropout
            pretrained_cfg.hidden_dropout = cfg.model.hidden_dropout
            pretrained_cfg.attention_dropout = cfg.model.attention_dropout

            # Override precision
            pretrained_cfg.precision = cfg.model.precision  # Set above from trainer.precision

            # Override data and global/micro batch size.
            pretrained_cfg.train_ds = cfg.model.train_ds
            pretrained_cfg.validation_ds = cfg.model.validation_ds
            pretrained_cfg.test_ds = cfg.model.test_ds

            pretrained_cfg.micro_batch_size = cfg.model.micro_batch_size
            pretrained_cfg.global_batch_size = cfg.model.global_batch_size

            # Class target for the new class being restored.
            pretrained_cfg.target = (
                "nemo.collections.nlp.models.machine_translation.megatron_nmt_model.MegatronNMTModel"
            )

            # Optimizer overrides.
            pretrained_cfg.optim = cfg.model.optim

        model = MegatronNMTModel.restore_from(
            cfg.model.pretrained_model_path,
            trainer=trainer,
            override_config_path=pretrained_cfg,
            save_restore_connector=NLPSaveRestoreConnector(),
        )
    else:
        model = MegatronNMTModel(cfg.model, trainer)
    if cfg.do_training:
        trainer.fit(model)

    if cfg.do_testing:
        trainer.test(model)
Example #24
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
    plugins = [
        NLPDDPPlugin(
            no_ddp_communication_hook=True,
            gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
            find_unused_parameters=False,
        )
    ]
    if cfg.trainer.precision in [16, 'bf16']:
        scaler = None
        if cfg.trainer.precision == 16:
            scaler = GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2**32),
                growth_interval=cfg.model.get('native_amp_growth_interval',
                                              1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
        if megatron_amp_o2:
            plugins.append(
                MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision,
                                            device='cuda',
                                            scaler=scaler))
        else:
            plugins.append(
                PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision,
                                             device='cuda',
                                             scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)
    exp_manager(trainer, cfg.exp_manager)

    app_state = AppState()
    if cfg.model.tensor_model_parallel_size > 1 or cfg.model.pipeline_model_parallel_size > 1:
        app_state.model_parallel_size = cfg.model.tensor_model_parallel_size * cfg.model.pipeline_model_parallel_size
        (
            app_state.tensor_model_parallel_rank,
            app_state.pipeline_model_parallel_rank,
            app_state.model_parallel_size,
            _,
        ) = fake_initialize_model_parallel(
            world_size=app_state.model_parallel_size,
            rank=trainer.global_rank,
            tensor_model_parallel_size_=cfg.model.tensor_model_parallel_size,
            pipeline_model_parallel_size_=cfg.model.pipeline_model_parallel_size,
        )

    # Override timer callback to a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time, )

    # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    model = MegatronGPTModel.restore_from(cfg.restore_from_path,
                                          cfg.model,
                                          trainer=trainer)
    trainer.fit(model)
Example #25
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
    plugins = [
        NLPDDPPlugin(
            no_ddp_communication_hook=True,
            gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
            find_unused_parameters=False,
        )
    ]
    if cfg.trainer.precision in [16, 'bf16']:
        scaler = None
        if cfg.trainer.precision == 16:
            scaler = GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
                growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
        if megatron_amp_o2:
            plugins.append(MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))
        else:
            plugins.append(PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)

    exp_manager(trainer, cfg.exp_manager)

    # update resume from checkpoint found by exp_manager
    if cfg.model.resume_from_checkpoint is not None:
        resume_from_checkpoint = cfg.model.resume_from_checkpoint
    else:
        resume_from_checkpoint = trainer._checkpoint_connector.resume_from_checkpoint_fit_path
    logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}')

    trainer._checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint)
    # Override timer callback to a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,)

    # Get the T5 Base configuration.
    t5_cfg = MegatronT5FinetuneModel.restore_from(
        restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True
    )

    # Override the T5 configuration with the one from the config file.
    OmegaConf.set_struct(t5_cfg, True)
    with open_dict(t5_cfg):
        t5_cfg.masked_softmax_fusion = False
        t5_cfg.megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False)
        t5_cfg.hidden_dropout = cfg.model.get('hidden_dropout', 0.1)
        t5_cfg.attention_dropout = cfg.model.get('attention_dropout', 0.1)
        t5_cfg.data = cfg.model.data
        t5_cfg.precision = cfg.trainer.precision
        t5_cfg.optim = cfg.model.optim
        t5_cfg.micro_batch_size = cfg.model.data.train_ds.micro_batch_size
        t5_cfg.global_batch_size = cfg.model.data.train_ds.global_batch_size
        # XNLI has eval languages in the yaml config.
        if hasattr(cfg.model, 'eval_languages'):
            t5_cfg.eval_languages = cfg.model.eval_languages

    if hasattr(cfg.model.data.train_ds, 'task_name'):
        model = MegatronT5GLUEModel.restore_from(
            restore_path=cfg.model.restore_from_path,
            trainer=trainer,
            override_config_path=t5_cfg,
            save_restore_connector=NLPSaveRestoreConnector(),
        )
    else:
        model = MegatronT5FinetuneModel.restore_from(
            restore_path=cfg.model.restore_from_path,
            trainer=trainer,
            override_config_path=t5_cfg,
            save_restore_connector=NLPSaveRestoreConnector(),
        )

    trainer.fit(model)
    trainer.validate(model)
    if hasattr(cfg.model.data, 'test_ds'):
        trainer.test(model)
Example #26
def main(cfg: DictConfig) -> None:
    pl.seed_everything(42)
    logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')

    plugin = NLPDDPPlugin()
    trainer = pl.Trainer(**cfg.trainer, plugins=plugin)

    exp_manager(trainer, cfg.get("exp_manager", None))

    app_state = AppState()
    if cfg.model.tensor_model_parallel_size > 1:
        app_state.model_parallel_size = cfg.model.tensor_model_parallel_size
        app_state.model_parallel_rank = compute_model_parallel_rank(
            trainer.local_rank, app_state.model_parallel_size)

    if 'bert' in cfg.model.language_model.pretrained_model_name:
        if cfg.model.dataset.task == 'sgd':
            model_class = SGDQAModel
        else:
            model_class = IntentSlotClassificationModel
    elif 'gpt' in cfg.model.language_model.pretrained_model_name.lower():
        model_class = DialogueGPTModel

    if cfg.pretrained_model or (cfg.model.nemo_path
                                and os.path.exists(cfg.model.nemo_path)):
        if cfg.pretrained_model:
            logging.info(f'Loading pretrained model {cfg.pretrained_model}')
            model = model_class.from_pretrained(cfg.pretrained_model)
        else:
            logging.info(f'Restoring model from {cfg.model.nemo_path}')
            model = model_class.restore_from(cfg.model.nemo_path)
        if cfg.do_training:
            model.setup_training_data(train_data_config=cfg.model.train_ds)
            model.setup_multiple_validation_data(
                val_data_config=cfg.model.validation_ds)
    else:
        logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
        model = model_class(cfg.model, trainer=trainer)

    if cfg.do_training:
        trainer.fit(model)
        if cfg.model.nemo_path:
            model.save_to(cfg.model.nemo_path)
    else:
        data_dir = cfg.model.dataset.get('data_dir', None)
        dialogues_example_dir = cfg.model.dataset.get('dialogues_example_dir',
                                                      None)

        if data_dir is None or dialogues_example_dir is None:
            raise ValueError(
                'No dataset directory provided; cannot run evaluation.')
        elif not os.path.exists(data_dir):
            raise ValueError(
                f'{data_dir} was not found; cannot run evaluation on the test set.'
            )
        else:
            model.update_data_dirs(data_dir=data_dir,
                                   dialogues_example_dir=dialogues_example_dir)
            model._cfg.dataset = cfg.model.dataset

    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.ds_item is not None:
        trainer = pl.Trainer(devices=1,
                             accelerator=cfg.trainer.accelerator,
                             plugins=plugin,
                             precision=16)
        model.setup_multiple_test_data(test_data_config=cfg.model.test_ds)
        if model.prepare_test(trainer):
            trainer.test(model)
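A minimal, illustrative sketch of the top-level keys the dialogue script above branches on; all values are placeholders rather than defaults from any shipped NeMo config:

# Illustrative only: placeholder values for the keys used in the branching logic above.
from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "pretrained_model": None,                 # set to a registered model name to use from_pretrained()
    "do_training": True,                      # False switches to the evaluation-only path
    "model": {
        "nemo_path": "dialogue_model.nemo",   # restored with restore_from() if the file exists
        "tensor_model_parallel_size": 1,
        "language_model": {"pretrained_model_name": "gpt2"},  # contains 'gpt' -> DialogueGPTModel
        "dataset": {"task": "assistant",
                    "data_dir": "/data",
                    "dialogues_example_dir": "/data/examples"},
    },
})
print("gpt" in cfg.model.language_model.pretrained_model_name.lower())  # True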
Example no. 27
def main():
    parser = ArgumentParser()
    parser.add_argument("--model_file",
                        type=str,
                        default="",
                        required=True,
                        help="Pass path to model's .nemo file")
    parser.add_argument("--prompt",
                        type=str,
                        default="",
                        required=True,
                        help="Prompt for the model (a text to complete)")
    parser.add_argument("--tokens_to_generate",
                        type=int,
                        default="64",
                        required=False,
                        help="How many tokens to add to prompt")
    # Note: argparse's type=bool treats any non-empty command-line string as True.
    parser.add_argument(
        "--stop_after_sentence",
        type=bool,
        default=True,
        required=False,
        help="True/False: whether to stop after a full sentence has been generated.",
    )
    parser.add_argument(
        "--tensor_model_parallel_size",
        type=int,
        default=1,
        required=True,
    )
    parser.add_argument("--precision",
                        default=32,
                        help="PyTorch Lightning Trainer precision flag")

    args = parser.parse_args()

    # cast precision to int if 32 or 16
    if args.precision in ["32", "16"]:
        args.precision = int(float(args.precision))

    # trainer required for restoring model parallel models
    trainer = Trainer(plugins=NLPDDPPlugin(),
                      gpus=args.tensor_model_parallel_size,
                      precision=args.precision)

    app_state = AppState()
    if args.tensor_model_parallel_size is not None and args.tensor_model_parallel_size > 1:
        app_state.model_parallel_size = args.tensor_model_parallel_size
        app_state.model_parallel_rank = compute_model_parallel_rank(
            trainer.local_rank, app_state.model_parallel_size)

    model = MegatronGPTModel.restore_from(restore_path=args.model_file,
                                          trainer=trainer)

    model.freeze()

    request = {
        "prompt": args.prompt,
        "tokens_to_generate": args.tokens_to_generate,
        "stop_after_sentence": args.stop_after_sentence,
    }

    dataset = GPTRequestDataset(request, model.tokenizer)

    request_dl = DataLoader(dataset)

    response = trainer.predict(model, request_dl)

    print("***************************")
    print(response[0]['completion']['text'])
    print("***************************")
    logging.info(
        f"Generation stopped because: {response[0]['completion']['stop reason']}"
    )
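A minimal sketch of how the completion script above might be invoked; the script name and the .nemo path are placeholders, while the flags match the ArgumentParser defined above:

# Illustrative invocation only: "megatron_gpt_eval.py" and the .nemo path are placeholders.
import subprocess

subprocess.run([
    "python", "megatron_gpt_eval.py",
    "--model_file", "/path/to/megatron_gpt.nemo",
    "--prompt", "Deep learning is",
    "--tokens_to_generate", "32",
    "--tensor_model_parallel_size", "1",
    "--precision", "16",
], check=True)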
Example no. 28
def main():
    parser = ArgumentParser()
    parser.add_argument("--use_soft_prompts", action="store_true", help="Use model's existing soft prompts")
    parser.add_argument("--model_file", type=str, default="", required=True, help="Pass path to model's .nemo file")
    parser.add_argument(
        "--path_to_file", type=str, default="", required=False, help="Path to file with prompts (a text to complete)"
    )
    parser.add_argument(
        "--prompt", type=str, default="", required=False, help="Prompt for the model (a text to complete)"
    )
    parser.add_argument(
        "--prompt_tag", type=str, default="", required=False, help="Prompt tag string for task specific soft prompt"
    )
    parser.add_argument(
        "--tokens_to_generate", type=int, default=1, required=False, help="How many tokens to add to prompt"
    )
    # Note: argparse's type=bool treats any non-empty command-line string as True.
    parser.add_argument(
        "--stop_after_sentence",
        type=bool,
        default=True,
        required=False,
        help="True/False: whether to stop after a full sentence has been generated.",
    )
    parser.add_argument(
        "--tensor_model_parallel_size", type=int, default=1, required=False,
    )
    parser.add_argument("--precision", default=16, help="PyTorch Lightning Trainer precision flag")
    parser.add_argument("--batch_size", default=1, required=False, help="Evaluation batch_size")
    parser.add_argument(
        "--compute_logprobs", type=bool, default=False, required=False, help="Method for logprobs computation"
    )

    args = parser.parse_args()

    # cast precision to int if 32 or 16
    if args.precision in ["32", "16"]:
        args.precision = int(float(args.precision))

    # trainer required for restoring model parallel models
    trainer = Trainer(plugins=NLPDDPPlugin(), gpus=args.tensor_model_parallel_size, precision=args.precision)

    app_state = AppState()
    if args.tensor_model_parallel_size is not None and args.tensor_model_parallel_size > 1:
        app_state.model_parallel_size = args.tensor_model_parallel_size
        app_state.model_parallel_rank = compute_model_parallel_rank(trainer.local_rank, app_state.model_parallel_size)

    model = MegatronGPTModel.restore_from(restore_path=args.model_file, trainer=trainer)
    model.freeze()

    def pad_collate(batch):
        tokens, tokens_to_generate = batch[0]['data'], batch[0]['tokens_to_generate']
        compute_logprobs = batch[0]['compute_logprobs']
        lens = [len(token) for token in tokens]

        tokens_pad = pad_sequence(tokens, batch_first=False, padding_value=50256)
        data = []

        if 'prompt_tags' in batch[0]:
            # Keep track of soft prompt tags
            prompt_tags = batch[0]['prompt_tags']

            for token, lenn, prompt_tag in zip(tokens_pad.T, lens, prompt_tags):
                data.append((token, lenn, tokens_to_generate, compute_logprobs, prompt_tag))
        else:
            for token, lenn in zip(tokens_pad.T, lens):
                data.append((token, lenn, tokens_to_generate, compute_logprobs))

        return data

    # defining type of request
    if args.path_to_file != "":
        request = []
        # Read one prompt per line; the context manager closes the file when done.
        with open(args.path_to_file, 'r') as prompts:
            for prompt in prompts:
                prompt = prompt.rstrip('\n')

                if args.use_soft_prompts and model.use_soft_prompts:
                    prompt = json.loads(prompt)

                request.append(prompt)

        dataset = GPTRequestDataset(request, model.tokenizer, args.tokens_to_generate, args.compute_logprobs)
        request_dl = DataLoader(dataset=pad_collate(dataset), batch_size=int(args.batch_size))

    else:
        if args.use_soft_prompts and model.use_soft_prompts:
            request = [{'prompt_tag': args.prompt_tag, 'text': args.prompt}]
        else:
            request = [args.prompt]

        dataset = GPTRequestDataset(request, model.tokenizer, args.tokens_to_generate, args.compute_logprobs)
        request_dl = DataLoader(dataset=pad_collate(dataset), batch_size=1)

    # For GPT models that have had soft prompt tuning but you don't want to use any soft prompts
    if not args.use_soft_prompts and model.use_soft_prompts:
        model.use_soft_prompts = False

    response = trainer.predict(model, request_dl)

    print("***************************")
    print(response)
    print("***************************")
Example no. 29
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
    plugins = [
        NLPDDPPlugin(
            no_ddp_communication_hook=True,
            gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
            find_unused_parameters=False,
        )
    ]
    if cfg.trainer.precision in [16, 'bf16']:
        scaler = None
        if cfg.trainer.precision == 16:
            scaler = GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2**32),
                growth_interval=cfg.model.get('native_amp_growth_interval',
                                              1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
        if megatron_amp_o2:
            plugins.append(
                MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision,
                                            device='cuda',
                                            scaler=scaler))
        else:
            plugins.append(
                NativeMixedPrecisionPlugin(precision=cfg.trainer.precision,
                                           device='cuda',
                                           scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)

    exp_manager(trainer, cfg.exp_manager)

    # Override timer callback to a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time)

    # Get the T5 Base configuration.
    t5_cfg = MegatronT5GLUEModel.restore_from(
        restore_path=cfg.model.restore_from_path,
        trainer=trainer,
        return_config=True)

    # Override the T5 configuration with the one from the config file.
    # NOTE: Only data-related fields can be overridden here, since the checkpoint being restored should already correspond to a GLUE/XNLI finetuned model.
    OmegaConf.set_struct(t5_cfg, True)
    with open_dict(t5_cfg):
        t5_cfg.masked_softmax_fusion = False
        t5_cfg.precision = cfg.trainer.precision
        # Overwrite data configs
        t5_cfg.data = cfg.model.data
        # XNLI has eval languages in the yaml config.
        if hasattr(cfg.model, 'eval_languages'):
            t5_cfg.eval_languages = cfg.model.eval_languages

    if hasattr(t5_cfg.data.validation_ds, 'task_name'):
        model = MegatronT5GLUEModel.restore_from(
            restore_path=cfg.model.restore_from_path,
            trainer=trainer,
            override_config_path=t5_cfg)
    else:
        model = MegatronT5FinetuneModel.restore_from(
            restore_path=cfg.model.restore_from_path,
            trainer=trainer,
            override_config_path=t5_cfg)
    model.freeze()
    trainer.validate(model)
    if hasattr(cfg.model.data, 'test_ds'):
        trainer.test(model)
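The override block above uses OmegaConf struct mode together with open_dict: struct mode rejects unknown keys, and open_dict temporarily lifts that restriction so new fields can be written. A minimal sketch of the pattern:

# Minimal sketch of the set_struct / open_dict pattern used above; values are placeholders.
from omegaconf import OmegaConf, open_dict

cfg = OmegaConf.create({"precision": 32})
OmegaConf.set_struct(cfg, True)        # adding unknown keys now raises an error

with open_dict(cfg):                   # temporarily allow new keys
    cfg.masked_softmax_fusion = False
    cfg.eval_languages = ["en", "de"]  # placeholder

print(OmegaConf.to_yaml(cfg))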
Example no. 30
def main():
    parser = ArgumentParser()
    parser.add_argument("--model_file",
                        type=str,
                        required=True,
                        help="Path to source .nemo file")
    parser.add_argument("--target_file",
                        type=str,
                        required=True,
                        help="Path to write target .nemo file")
    parser.add_argument("--tensor_model_parallel_size",
                        type=int,
                        required=True,
                        help="TP size of source model")
    parser.add_argument("--target_tensor_model_parallel_size",
                        type=int,
                        required=True,
                        help="TP size of target model")
    parser.add_argument(
        "--model_class",
        type=str,
        default=
        "nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel",
        help=
        "NeMo model class. This script should support all NeMo megatron models that use Tensor Parallel",
    )
    parser.add_argument("--precision",
                        default=16,
                        help="PyTorch Lightning Trainer precision flag")

    args = parser.parse_args()

    precision = args.precision
    if args.precision in ["32", "16"]:
        precision = int(float(args.precision))
    tp_size = args.tensor_model_parallel_size
    tgt_tp_size = args.target_tensor_model_parallel_size
    cls = model_utils.import_class_by_path(args.model_class)

    trainer = Trainer(devices=1,
                      plugins=NLPDDPPlugin(),
                      accelerator="cpu",
                      precision=precision)
    app_state = AppState()
    app_state.data_parallel_rank = 0
    app_state.pipeline_model_parallel_size = 1  # not supported yet in this script
    app_state.tensor_model_parallel_size = tp_size
    app_state.model_parallel_size = app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size

    if tp_size > 1:
        partitions = []
        for i in range(tp_size):
            app_state.tensor_model_parallel_rank = i
            model = cls.restore_from(restore_path=args.model_file,
                                     trainer=trainer,
                                     map_location=torch.device("cpu"))
            params = [p for _, p in model.named_parameters()]
            partitions.append(params)
            # app_state is being updated incorrectly during restore
            app_state.data_parallel_rank = 0
            app_state.pipeline_model_parallel_size = 1  # not supported yet in this script
            app_state.tensor_model_parallel_size = tp_size
            app_state.model_parallel_size = (
                app_state.pipeline_model_parallel_size *
                app_state.tensor_model_parallel_size)

        model.cfg.tensor_model_parallel_size = 1
        app_state.model_parallel_size = 1
        trainer = Trainer(devices=1,
                          plugins=NLPDDPPlugin(),
                          accelerator="cpu",
                          precision=precision)
        model = cls(model.cfg, trainer).to('cpu')
        model._save_restore_connector = NLPSaveRestoreConnector()

        if tgt_tp_size > 1:
            merge_partition(model, partitions)
        else:
            merge_partition(model, partitions, args.target_file)
    else:
        app_state.model_parallel_size = 1
        model = cls.restore_from(restore_path=args.model_file, trainer=trainer)

    if tgt_tp_size > 1:
        partitions = []
        params = [p for _, p in model.named_parameters()]
        partitions.append(params)

        model.cfg.tensor_model_parallel_size = tgt_tp_size
        app_state.model_parallel_size = tgt_tp_size
        trainer = Trainer(devices=1,
                          plugins=NLPDDPPlugin(),
                          accelerator="cpu",
                          precision=precision)
        model = cls(model.cfg, trainer).to('cpu')
        model._save_restore_connector = NLPSaveRestoreConnector()

        split_partition(model, partitions, tgt_tp_size, args.target_file)

    logging.info("Successfully finished changing partitions!")