def main(cfg: MTBottleneckConfig) -> None:
    # merge default config with user specified config
    default_cfg = MTBottleneckConfig()
    cfg = update_model_config(default_cfg, cfg)
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')

    # training is managed by PyTorch Lightning
    trainer_cfg = OmegaConf.to_container(cfg.trainer)
    trainer_cfg.pop('plugins', None)
    trainer = Trainer(plugins=[NLPDDPPlugin(num_nodes=cfg.trainer.num_nodes)], **trainer_cfg)

    # tokenizers will be trained and tarred training data will be created if needed
    # model config is then updated
    if cfg.model.preproc_out_dir is not None:
        MTDataPreproc(cfg=cfg.model, trainer=trainer)

    # experiment logs, checkpoints, and auto-resume are managed by exp_manager and PyTorch Lightning
    exp_manager(trainer, cfg.exp_manager)

    # everything needed to train bottleneck translation models is encapsulated in the NeMo MTBottleneckModel
    mt_model = MTBottleneckModel(cfg.model, trainer=trainer)

    logging.info("\n\n************** Model parameters and their sizes ***********")
    for name, param in mt_model.named_parameters():
        print(name, param.size())
    logging.info("***********************************************************\n\n")

    if cfg.do_training:
        trainer.fit(mt_model)

    if cfg.do_testing:
        trainer.test(mt_model)
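# Hedged sketch of how a NeMo example script built around the main() above is typically launched:
# the hydra_runner decorator parses the YAML config plus any command-line overrides into `cfg`
# before calling main(). The config_path / config_name values below are assumptions, not taken
# from this excerpt.
from nemo.core.config import hydra_runner


@hydra_runner(config_path="conf", config_name="aayn_bottleneck")
def main(cfg):
    ...  # body as defined above


if __name__ == '__main__':
    main()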
def main(cfg: MTEncDecConfig) -> None:
    # merge default config with user specified config
    default_cfg = MTEncDecConfig()
    cfg = update_model_config(default_cfg, cfg)
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')

    # training is managed by PyTorch Lightning
    trainer = Trainer(**cfg.trainer)

    # tokenizers will be trained and tarred training data will be created if needed
    # model config is then updated
    MTDataPreproc(cfg=cfg.model, trainer=trainer)

    if cfg.do_training:
        # experiment logs, checkpoints, and auto-resume are managed by exp_manager and PyTorch Lightning
        exp_manager(trainer, cfg.exp_manager)

        # everything needed to train translation models is encapsulated in the NeMo MTEncDecModel
        mt_model = MTEncDecModel(cfg.model, trainer=trainer)

        logging.info("\n\n************** Model parameters and their sizes ***********")
        for name, param in mt_model.named_parameters():
            print(name, param.size())
        logging.info("***********************************************************\n\n")

        trainer.fit(mt_model)
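# Conceptual sketch (not the actual NeMo helper): update_model_config() used above merges the
# dataclass defaults with the user-provided config so unspecified fields fall back to defaults.
# The ExampleDefaults dataclass below is a hypothetical stand-in for MTEncDecConfig.
from dataclasses import dataclass

from omegaconf import OmegaConf


@dataclass
class ExampleDefaults:
    do_training: bool = True
    num_layers: int = 6


default_cfg = OmegaConf.structured(ExampleDefaults())
user_cfg = OmegaConf.create({'num_layers': 12})
merged = OmegaConf.merge(default_cfg, user_cfg)  # user values override the defaults
print(OmegaConf.to_yaml(merged))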
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
    plugins = [
        NLPDDPPlugin(
            no_ddp_communication_hook=(
                megatron_amp_o2 and cfg.trainer.precision == 'bf16'
            ),  # Only bf16 uses fp32_grad_accum.
            gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
            find_unused_parameters=False,
        )
    ]
    if cfg.trainer.precision in [16, 'bf16']:
        scaler = None
        if cfg.trainer.precision == 16:
            scaler = GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
                growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
        if megatron_amp_o2:
            plugins.append(MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))
        else:
            plugins.append(NativeMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)])

    # tokenizers will be trained and tarred training data will be created if needed
    # model config is then updated
    if cfg.model.preproc_out_dir is not None:
        MTDataPreproc(cfg=cfg.model, trainer=trainer)

    exp_manager(trainer, cfg.exp_manager)

    # update resume from checkpoint found by exp_manager
    resume_from_checkpoint = trainer._checkpoint_connector.resume_from_checkpoint_fit_path
    logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}')
    trainer._checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint)

    # Override timer callback to a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time)

    # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    model = MegatronNMTModel(cfg.model, trainer)

    if cfg.do_training:
        trainer.fit(model)

    if cfg.do_testing:
        trainer.test(model)
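# Standalone illustration of the open_dict pattern used at the end of main() above: a structured
# OmegaConf config rejects unknown keys, so the struct flag is temporarily relaxed in order to
# copy trainer.precision into the model config. Keys and values here are illustrative only.
from omegaconf import OmegaConf, open_dict

cfg = OmegaConf.create({'model': {'hidden_size': 512}})
OmegaConf.set_struct(cfg, True)
with open_dict(cfg):
    cfg.model.precision = 16  # adding a new key is allowed inside open_dict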
parser.add_argument(
    '--num_batches_per_tarfile',
    type=int,
    default=1000,
    help='Number of batches (pickle files) within each tarfile',
)

args = parser.parse_args()
if not os.path.exists(args.out_dir):
    os.mkdir(args.out_dir)
if not os.path.exists(args.tokenizer_model):
    raise FileNotFoundError("Could not find tokenizer model %s" % (args.tokenizer_model))

tokenizer_model = MTDataPreproc.get_monolingual_tokenizer(
    tokenizer_name=args.tokenizer_name, tokenizer_model=args.tokenizer_model, bpe_dropout=args.bpe_dropout
)

MTDataPreproc.preprocess_monolingual_dataset(
    clean=args.clean,
    fname=args.fname,
    out_dir=args.out_dir,
    tokenizer=tokenizer_model,
    max_seq_length=args.max_seq_length,
    min_seq_length=args.min_seq_length,
    tokens_in_batch=args.tokens_in_batch,
    lines_per_dataset_fragment=args.lines_per_dataset_fragment,
    num_batches_per_tarfile=args.num_batches_per_tarfile,
    pkl_file_prefix=args.pkl_file_prefix,
    global_rank=0,
    world_size=1,
)
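# Hypothetical sketch of the argument definitions that precede this excerpt (elided from it).
# The flag names come from the args.* references in the code above; the types, defaults, and
# help strings here are assumptions and may differ from the real script.
import argparse

parser = argparse.ArgumentParser(description='Create a tarred monolingual dataset')
parser.add_argument('--fname', type=str, required=True, help='Monolingual text file to preprocess')
parser.add_argument('--out_dir', type=str, required=True, help='Directory for the tar/pickle shards')
parser.add_argument('--tokenizer_name', type=str, default='yttm', help='Tokenizer library to use')
parser.add_argument('--tokenizer_model', type=str, required=True, help='Path to a trained tokenizer model')
parser.add_argument('--bpe_dropout', type=float, default=0.0, help='BPE dropout probability')
parser.add_argument('--clean', action='store_true', help='Clean and filter lines before tokenization')
parser.add_argument('--max_seq_length', type=int, default=512, help='Maximum sequence length')
parser.add_argument('--min_seq_length', type=int, default=1, help='Minimum sequence length')
parser.add_argument('--tokens_in_batch', type=int, default=8192, help='Tokens per batch')
parser.add_argument('--lines_per_dataset_fragment', type=int, default=1000000, help='Lines per fragment')
parser.add_argument('--pkl_file_prefix', type=str, default='parallel', help='Prefix for pickle files')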
        If using a pre-trained shared tokenizer, both encoder and decoder tokenizers must be the same
        '''
    )
else:
    raise ValueError('Both encoder and decoder pre-trained tokenizer models must be specified')

if args.encoder_tokenizer_model == 'None' and args.decoder_tokenizer_model == 'None':
    encoder_tokenizer_model, decoder_tokenizer_model = MTDataPreproc.train_tokenizers(
        out_dir=args.out_dir,
        src_fname=args.src_fname,
        tgt_fname=args.tgt_fname,
        shared_tokenizer=args.shared_tokenizer,
        encoder_tokenizer_name=args.encoder_tokenizer_name,
        encoder_tokenizer_vocab_size=args.encoder_tokenizer_vocab_size,
        encoder_tokenizer_coverage=args.encoder_tokenizer_coverage,
        decoder_tokenizer_name=args.decoder_tokenizer_name,
        decoder_tokenizer_vocab_size=args.decoder_tokenizer_vocab_size,
        decoder_tokenizer_coverage=args.decoder_tokenizer_coverage,
        global_rank=0,
    )
else:
    encoder_tokenizer_model, decoder_tokenizer_model = (
        args.encoder_tokenizer_model,
        args.decoder_tokenizer_model,
    )

encoder_tokenizer, decoder_tokenizer = MTDataPreproc.get_enc_dec_tokenizers(
    encoder_tokenizer_name=args.encoder_tokenizer_name,
    encoder_tokenizer_model=encoder_tokenizer_model,
    encoder_bpe_dropout=args.encoder_tokenizer_bpe_dropout,
    encoder_r2l=args.encoder_tokenizer_r2l,
    decoder_tokenizer_name=args.decoder_tokenizer_name,
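# Conceptual, standalone illustration (not the NeMo helper): the vocab_size and coverage
# arguments passed to train_tokenizers() above map onto the usual SentencePiece training knobs.
# The file names and parameter values below are placeholders.
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input='train.src.txt',             # placeholder path to training text
    model_prefix='encoder_tokenizer',  # writes encoder_tokenizer.model / encoder_tokenizer.vocab
    vocab_size=32000,
    character_coverage=0.999,
    model_type='bpe',
)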
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
    plugins = [
        NLPDDPPlugin(
            no_ddp_communication_hook=True,
            gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
            find_unused_parameters=False,
        )
    ]
    if cfg.trainer.precision in [16, 'bf16']:
        scaler = None
        if cfg.trainer.precision == 16:
            scaler = GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
                growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
        if megatron_amp_o2:
            plugins.append(MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))
        else:
            plugins.append(PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)])

    # tokenizers will be trained and tarred training data will be created if needed
    # model config is then updated
    if cfg.model.preproc_out_dir is not None:
        MTDataPreproc(cfg=cfg.model, trainer=trainer)

    exp_manager(trainer, cfg.exp_manager)

    # update resume from checkpoint found by exp_manager
    if cfg.model.resume_from_checkpoint is not None:
        resume_from_checkpoint = cfg.model.resume_from_checkpoint
    else:
        resume_from_checkpoint = trainer._checkpoint_connector.resume_from_checkpoint_fit_path
    logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}')
    trainer._checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint)

    # Override timer callback to a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time)

    # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    if hasattr(cfg.model, 'pretrained_model_path') and cfg.model.pretrained_model_path is not None:
        if not hasattr(cfg.model, 'pretrained_model_type'):
            raise ValueError("Pretrained model type must be in [T5, BART].")

        assert cfg.model.pretrained_model_type in ['T5', 'BART']
        if cfg.model.pretrained_model_type == 'T5':
            pretrained_cfg = MegatronT5Model.restore_from(
                cfg.model.pretrained_model_path, trainer=trainer, return_config=True
            )
        else:
            pretrained_cfg = MegatronBARTModel.restore_from(
                cfg.model.pretrained_model_path, trainer=trainer, return_config=True
            )

        OmegaConf.set_struct(pretrained_cfg, True)
        with open_dict(pretrained_cfg):
            pretrained_cfg.masked_softmax_fusion = False
            # Set source and target language/multilingual
            pretrained_cfg.src_language = cfg.model.src_language
            pretrained_cfg.tgt_language = cfg.model.tgt_language
            pretrained_cfg.multilingual = cfg.model.multilingual
            pretrained_cfg.shared_tokenizer = True
            # Max generation delta
            pretrained_cfg.max_generation_delta = cfg.model.max_generation_delta
            # Set label smoothing
            pretrained_cfg.label_smoothing = cfg.model.label_smoothing
            # Set tokenizer paths:
            pretrained_cfg.encoder_tokenizer = pretrained_cfg.tokenizer
            pretrained_cfg.decoder_tokenizer = pretrained_cfg.tokenizer
            # Pre-trained models should use the legacy sentencepiece tokenizer, e.g. mT5
            pretrained_cfg.encoder_tokenizer.sentencepiece_legacy = True
            pretrained_cfg.decoder_tokenizer.sentencepiece_legacy = True
            # Override dropout
            pretrained_cfg.hidden_dropout = cfg.model.hidden_dropout
            pretrained_cfg.attention_dropout = cfg.model.attention_dropout
            # Override precision
            pretrained_cfg.precision = cfg.model.precision  # Set above from trainer.precision
            # Override data and global/micro batch size.
            pretrained_cfg.train_ds = cfg.model.train_ds
            pretrained_cfg.validation_ds = cfg.model.validation_ds
            pretrained_cfg.test_ds = cfg.model.test_ds
            pretrained_cfg.micro_batch_size = cfg.model.micro_batch_size
            pretrained_cfg.global_batch_size = cfg.model.global_batch_size
            # Class target for the new class being restored.
            pretrained_cfg.target = (
                "nemo.collections.nlp.models.machine_translation.megatron_nmt_model.MegatronNMTModel"
            )
            # Optimizer overrides.
            pretrained_cfg.optim = cfg.model.optim

        model = MegatronNMTModel.restore_from(
            cfg.model.pretrained_model_path,
            trainer=trainer,
            override_config_path=pretrained_cfg,
            save_restore_connector=NLPSaveRestoreConnector(),
        )
    else:
        model = MegatronNMTModel(cfg.model, trainer)

    if cfg.do_training:
        trainer.fit(model)

    if cfg.do_testing:
        trainer.test(model)
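# Example (illustrative values only) of the extra config keys the pretrained-model branch above
# reads when warm-starting NMT from a Megatron T5/BART checkpoint. The .nemo path is a placeholder.
from omegaconf import OmegaConf

finetune_overrides = OmegaConf.create(
    {
        'model': {
            'pretrained_model_path': '/path/to/pretrained_t5.nemo',  # placeholder path
            'pretrained_model_type': 'T5',  # must be 'T5' or 'BART' per the assertion above
            'src_language': 'en',
            'tgt_language': 'de',
            'multilingual': False,
        }
    }
)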