def main(cfg: MTEncDecConfig) -> None:
    # merge default config with user specified config
    default_cfg = MTEncDecConfig()
    cfg = update_model_config(default_cfg, cfg)
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'Config: {cfg.pretty()}')

    # training is managed by PyTorch Lightning
    trainer = Trainer(**cfg.trainer)

    # tokenizers will be trained and tarred training data will be created if needed
    # model config is then updated
    MTDataPreproc(cfg=cfg.model, trainer=trainer)

    if cfg.do_training:
        # experiment logs, checkpoints, and auto-resume are managed by exp_manager and PyTorch Lightning
        exp_manager(trainer, cfg.exp_manager)

        # everything needed to train translation models is encapsulated in the NeMo MTEncDecModel
        mt_model = MTEncDecModel(cfg.model, trainer=trainer)

        logging.info("\n\n************** Model parameters and their sizes ***********")
        for name, param in mt_model.named_parameters():
            print(name, param.size())
        logging.info("***********************************************************\n\n")

        trainer.fit(mt_model)
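# A minimal launch sketch (not part of the original source): NeMo training entry points
# like main() above are typically wrapped with the hydra_runner decorator so the YAML
# config is parsed into the MTEncDecConfig dataclass. The config_path and config_name
# values below are illustrative assumptions, not taken from this script.
from nemo.core.config import hydra_runner


@hydra_runner(config_path="conf", config_name="aayn_base")  # assumed config location/name
def launch(cfg: MTEncDecConfig) -> None:
    main(cfg)


if __name__ == '__main__':
    launch()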
def build_train_valid_test_datasets(self):
    self._train_ds = MTEncDecModel._setup_dataset_from_config(
        cfg=self._cfg.train_ds,
        encoder_tokenizer=self.encoder_tokenizer,
        decoder_tokenizer=self.decoder_tokenizer,
        global_rank=parallel_state.get_data_parallel_rank(),
        world_size=parallel_state.get_data_parallel_world_size(),
        multilingual=self.multilingual,
        multilingual_ids=self.multilingual_ids,
    )
    self._validation_ds = MTEncDecModel._setup_eval_dataset_from_config(
        cfg=self._cfg.validation_ds,
        multilingual=self.multilingual,
        multilingual_ids=self.multilingual_ids,
        encoder_tokenizer=self.encoder_tokenizer,
        decoder_tokenizer=self.decoder_tokenizer,
    )
    # Test data config is optional.
    if hasattr(self._cfg, 'test_ds'):
        # Build the test set from the test config, not the validation config.
        self._test_ds = MTEncDecModel._setup_eval_dataset_from_config(
            cfg=self._cfg.test_ds,
            multilingual=self.multilingual,
            multilingual_ids=self.multilingual_ids,
            encoder_tokenizer=self.encoder_tokenizer,
            decoder_tokenizer=self.decoder_tokenizer,
        )
def build_train_valid_test_datasets(self): """Builds the train, validation, and test datasets.""" # Builds datasets if the type is tarred or from raw text without memmap. if self._cfg.train_ds.dataset_type in ['tarred', 'text']: self._train_ds = self.build_tarred_train_dataset() elif self._cfg.train_ds.dataset_type in ['bin_memmap', 'text_memmap']: self._train_ds = self.build_memmap_dataset_from_config( self._cfg.train_ds) if self._cfg.validation_ds.get("dataset_type", "text") != "text": raise ValueError( f"Validation dataset type must be 'text', found {self._cfg.validation_ds.dataset_type}" ) self._validation_ds = MTEncDecModel._setup_eval_dataset_from_config( cfg=self._cfg.validation_ds, multilingual=self.multilingual, multilingual_ids=self.multilingual_ids, encoder_tokenizer=self.encoder_tokenizer, decoder_tokenizer=self.decoder_tokenizer, ) # Test data config is optional. if hasattr(self._cfg, 'test_ds'): if self._cfg.validation_ds.get("dataset_type", "text") != "text": raise ValueError( f"Test dataset type must be 'text', found {self._cfg.test_ds.dataset_type}" ) self._test_ds = MTEncDecModel._setup_eval_dataset_from_config( cfg=self._cfg.validation_ds, multilingual=self.multilingual, multilingual_ids=self.multilingual_ids, encoder_tokenizer=self.encoder_tokenizer, decoder_tokenizer=self.decoder_tokenizer, )
def translate(rank, world_size, args):
    if args.model.endswith(".nemo"):
        logging.info("Attempting to initialize from .nemo file")
        model = MTEncDecModel.restore_from(restore_path=args.model)
    elif args.model.endswith(".ckpt"):
        logging.info("Attempting to initialize from .ckpt file")
        model = MTEncDecModel.load_from_checkpoint(checkpoint_path=args.model)
    else:
        # Fail early instead of hitting an UnboundLocalError below.
        raise ValueError(f"Expected a .nemo or .ckpt model file, got {args.model}")
    model.replace_beam_with_sampling(topk=args.topk)
    model.eval()
    model.to(rank)
    if args.twoside:
        dataset = TarredTranslationDataset(
            text_tar_filepaths=args.text2translate,
            metadata_path=args.metadata_path,
            encoder_tokenizer=model.encoder_tokenizer,
            decoder_tokenizer=model.decoder_tokenizer,
            shuffle_n=100,
            shard_strategy="scatter",
            world_size=world_size,
            global_rank=rank,
            reverse_lang_direction=args.reverse_lang_direction,
        )
    else:
        dataset = TarredOneSideTranslationDataset(
            text_tar_filepaths=args.text2translate,
            metadata_path=args.metadata_path,
            tokenizer=model.encoder_tokenizer,
            shuffle_n=100,
            shard_strategy="scatter",
            world_size=world_size,
            global_rank=rank,
        )
    loader = DataLoader(dataset, batch_size=1)

    result_dir = os.path.join(args.result_dir, f'rank{rank}')
    os.makedirs(result_dir, exist_ok=True)
    originals_file_name = os.path.join(result_dir, 'originals.txt')
    translations_file_name = os.path.join(result_dir, 'translations.txt')
    num_translated_sentences = 0

    with open(originals_file_name, 'w') as of, open(translations_file_name, 'w') as tf:
        for batch_idx, batch in enumerate(loader):
            for i in range(len(batch)):
                if batch[i].ndim == 3:
                    batch[i] = batch[i].squeeze(dim=0)
                batch[i] = batch[i].to(rank)
            if args.twoside:
                src_ids, src_mask, _, _, _ = batch
            else:
                src_ids, src_mask = batch
            if batch_idx % 100 == 0:
                logging.info(
                    f"{batch_idx} batches ({num_translated_sentences} sentences) were translated by process with "
                    f"rank {rank}"
                )
            num_translated_sentences += len(src_ids)
            inputs, translations = model.batch_translate(src=src_ids, src_mask=src_mask)
            for src, translation in zip(inputs, translations):
                of.write(src + '\n')
                tf.write(translation + '\n')
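# A hedged driver sketch (an assumption, not from the original script) for translate()
# above: one worker process per GPU via torch.multiprocessing, where spawn() passes the
# rank as the first argument, so each call is translate(rank, world_size, args). The
# `args` namespace would come from the script's argparse setup.
import torch
import torch.multiprocessing as mp


def run_distributed_translate(args):
    # One worker per visible GPU; each writes results to its own rank-specific directory.
    world_size = torch.cuda.device_count()
    mp.spawn(translate, args=(world_size, args), nprocs=world_size, join=True)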
def _build_tokenizer(self):
    # Instantiates tokenizers and registers them to be saved with the NeMo model archive.
    # After this call, self.encoder_tokenizer and self.decoder_tokenizer can convert
    # between tokens and token_ids for the SRC and TGT languages respectively.
    encoder_tokenizer_model = self.register_artifact(
        "encoder_tokenizer.tokenizer_model", self._cfg.encoder_tokenizer.get('tokenizer_model')
    )
    decoder_tokenizer_model = self.register_artifact(
        "decoder_tokenizer.tokenizer_model", self._cfg.decoder_tokenizer.get('tokenizer_model')
    )
    self.encoder_tokenizer, self.decoder_tokenizer = MTEncDecModel.setup_enc_dec_tokenizers(
        encoder_tokenizer_library=self.encoder_tokenizer_library,
        encoder_tokenizer_model=encoder_tokenizer_model,
        # Treat an explicit null bpe_dropout in the config the same as a missing one.
        encoder_bpe_dropout=self._cfg.encoder_tokenizer.get('bpe_dropout', 0.0) or 0.0,
        encoder_model_name=None,
        encoder_r2l=self._cfg.encoder_tokenizer.get('r2l', False),
        decoder_tokenizer_library=self.decoder_tokenizer_library,
        encoder_tokenizer_vocab_file=self._cfg.encoder_tokenizer.get('vocab_file', None),
        decoder_tokenizer_model=decoder_tokenizer_model,
        decoder_bpe_dropout=self._cfg.decoder_tokenizer.get('bpe_dropout', 0.0) or 0.0,
        decoder_model_name=None,
        decoder_r2l=self._cfg.decoder_tokenizer.get('r2l', False),
        special_tokens=self.special_tokens,
        encoder_sentencepiece_legacy=self._cfg.encoder_tokenizer.get('sentencepiece_legacy', False),
        decoder_sentencepiece_legacy=self._cfg.decoder_tokenizer.get('sentencepiece_legacy', False),
    )

    # Set up pre- and post-processors as well.
    if self.multilingual:
        (
            self.source_processor_list,
            self.target_processor_list,
            self.multilingual_ids,
        ) = MTEncDecModel.setup_multilingual_ids_and_processors(
            src_language=self.src_language,
            tgt_language=self.tgt_language,
            tokenizer=self.encoder_tokenizer,  # Multilingual training requires shared tokenizers.
            tokenizer_library=self.encoder_tokenizer_library,
        )
    else:
        # After this call, the model will have self.source_processor and self.target_processor objects.
        self.source_processor, self.target_processor = MTEncDecModel.setup_pre_and_post_processing_utils(
            self.src_language,
            self.tgt_language,
            self.encoder_tokenizer_library,
            self.decoder_tokenizer_library,
        )
        self.multilingual_ids = [None]
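# A hedged illustration (the helper name is hypothetical): once _build_tokenizer() has run,
# the NeMo tokenizers it creates expose text_to_ids/ids_to_text, so a round trip on the
# encoder side would look roughly like this.
def _tokenizer_round_trip_example(model):
    # Encode a source-side sentence into token ids, then decode back to text.
    token_ids = model.encoder_tokenizer.text_to_ids("A sample source sentence.")
    restored = model.encoder_tokenizer.ids_to_text(token_ids)
    return token_ids, restored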
def main(cfg: MTEncDecConfig) -> None:
    # merge default config with user specified config
    default_cfg = MTEncDecConfig()
    cfg = update_model_config(default_cfg, cfg)
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'Config: {cfg.pretty()}')

    trainer = Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.exp_manager)

    mt_model = MTEncDecModel(cfg.model, trainer=trainer)

    logging.info("\n\n************** Model parameters and their sizes ***********")
    for name, param in mt_model.named_parameters():
        print(name, param.size())
    logging.info("***********************************************************\n\n")

    trainer.fit(mt_model)
def build_tarred_train_dataset(self):
    return MTEncDecModel._setup_dataset_from_config(
        cfg=self._cfg.train_ds,
        encoder_tokenizer=self.encoder_tokenizer,
        decoder_tokenizer=self.decoder_tokenizer,
        global_rank=parallel_state.get_data_parallel_rank(),
        world_size=parallel_state.get_data_parallel_world_size(),
        multilingual=self.multilingual,
        multilingual_ids=self.multilingual_ids,
    )
def setup_training_data(self, train_data_config: Optional[DictConfig]):
    # TODO: Figure out how to set global rank and world size for model parallel.
    if hasattr(self, '_train_ds'):
        if train_data_config.dataset_type in ['tarred', 'text']:
            self._train_dl = MTEncDecModel._setup_dataloader_from_config(
                cfg=train_data_config, dataset=self._train_ds
            )
        elif train_data_config.dataset_type in ['bin_memmap', 'text_memmap']:
            consumed_samples = self.compute_consumed_samples(0)
            self._train_dl = self._setup_megatron_dataloader_from_config(
                cfg=train_data_config, dataset=self._train_ds, consumed_samples=consumed_samples
            )
def main(cfg: MTEncDecConfig) -> None:
    # merge default config with user specified config
    default_cfg = MTEncDecConfig()
    cfg = update_model_config(default_cfg, cfg)
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')

    # training is managed by PyTorch Lightning
    trainer_cfg = OmegaConf.to_container(cfg.trainer)
    trainer_cfg.pop('plugins', None)
    trainer = Trainer(plugins=[NLPDDPPlugin(num_nodes=cfg.trainer.num_nodes)], **trainer_cfg)

    # tokenizers will be trained and tarred training data will be created if needed
    # model config is then updated
    if cfg.model.preproc_out_dir is not None:
        MTDataPreproc(cfg=cfg.model, trainer=trainer)

    # experiment logs, checkpoints, and auto-resume are managed by exp_manager and PyTorch Lightning
    exp_manager(trainer, cfg.exp_manager)

    # everything needed to train translation models is encapsulated in the NeMo MTEncDecModel
    mt_model = MTEncDecModel(cfg.model, trainer=trainer)

    logging.info("\n\n************** Model parameters and their sizes ***********")
    for name, param in mt_model.named_parameters():
        print(name, param.size())
    logging.info("***********************************************************\n\n")

    if cfg.do_training:
        trainer.fit(mt_model)

    if cfg.do_testing:
        trainer.test(mt_model)
def setup_training_data(self, train_data_config: Optional[DictConfig]):
    # TODO: Figure out how to set global rank and world size for model parallel.
    if hasattr(self, '_train_ds'):
        self._train_dl = MTEncDecModel._setup_dataloader_from_config(
            cfg=train_data_config, dataset=self._train_ds
        )
def translate(
    self,
    text: List[str],
    source_lang: str = None,
    target_lang: str = None,
    return_beam_scores: bool = False,
    log_timing: bool = False,
) -> List[str]:
    """
    Translates a list of sentences from the source language to the target language.
    The input should be regular text; this method performs its own tokenization/de-tokenization.

    Args:
        text: list of strings to translate
        source_lang: if not "ignore", the corresponding MosesTokenizer and MosesPunctNormalizer will be run
        target_lang: if not "ignore", the corresponding MosesDetokenizer will be run
        return_beam_scores: if True, returns a list of translations and their corresponding beam scores
        log_timing: if True, prints timing information

    Returns:
        list of translated strings
    """
    # __TODO__: This will reset both source and target processors even if you want to reset just one.
    # NOTE: For multilingual models, this also sets up the appropriate source and target processors
    # for the given src/tgt language pair instead of creating a list of them.
    if source_lang is not None or target_lang is not None:
        self.source_processor, self.target_processor = MTEncDecModel.setup_pre_and_post_processing_utils(
            source_lang, target_lang, self.encoder_tokenizer_library, self.decoder_tokenizer_library
        )

    mode = self.training
    prepend_ids = []
    if self.multilingual:
        if source_lang is None or target_lang is None:
            raise ValueError("Expect source_lang and target_lang to run inference for multilingual model.")
        src_symbol = self.encoder_tokenizer.token_to_id('<' + source_lang + '>')
        tgt_symbol = self.encoder_tokenizer.token_to_id('<' + target_lang + '>')
        if src_symbol in self.multilingual_ids:
            prepend_ids = [src_symbol]
        elif tgt_symbol in self.multilingual_ids:
            prepend_ids = [tgt_symbol]

    if log_timing:
        timer = timers.NamedTimer()
    else:
        timer = None

    cache = {
        "timer": timer,
    }

    try:
        self.eval()
        src, src_mask = MTEncDecModel.prepare_inference_batch(
            text=text,
            prepend_ids=prepend_ids,
            target=False,
            source_processor=self.source_processor,
            target_processor=self.target_processor,
            encoder_tokenizer=self.encoder_tokenizer,
            decoder_tokenizer=self.decoder_tokenizer,
            device=self.device,
        )
        predicted_tokens_ids, _ = self.decode(
            src,
            src_mask,
            # Generate up to src length + max generation delta.
            # TODO: Implement better stopping when everything hits <EOS>.
            src.size(1) + self._cfg.max_generation_delta,
            tokenizer=self.decoder_tokenizer,
        )
        best_translations = self.postprocess_outputs(
            outputs=predicted_tokens_ids, tokenizer=self.decoder_tokenizer, processor=self.target_processor
        )
        return_val = best_translations
    finally:
        self.train(mode=mode)

    if log_timing:
        timing = timer.export()
        timing["mean_src_length"] = src_mask.sum().cpu().item() / src_mask.shape[0]
        tgt, tgt_mask = self.prepare_inference_batch(
            text=best_translations,
            prepend_ids=prepend_ids,
            target=True,
            source_processor=self.source_processor,
            target_processor=self.target_processor,
            encoder_tokenizer=self.encoder_tokenizer,
            decoder_tokenizer=self.decoder_tokenizer,
            device=self.device,
        )
        timing["mean_tgt_length"] = tgt_mask.sum().cpu().item() / tgt_mask.shape[0]

        if type(return_val) is tuple:
            return_val = return_val + (timing,)
        else:
            return_val = (return_val, timing)

    return return_val
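# A minimal usage sketch (not from the original source) for translate() above. The model
# path and language pair are hypothetical; restore_from() is the same entry point used by
# the distributed inference script earlier in this section.
if __name__ == '__main__':
    model = MTEncDecModel.restore_from(restore_path="en_de_model.nemo")  # hypothetical path
    model.eval()
    translations = model.translate(
        ["NeMo makes training translation models straightforward."],
        source_lang="en",
        target_lang="de",
    )
    print(translations)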