class MTEncDecConfig(NemoConfig):
    name: Optional[str] = 'MTEncDec'
    do_training: bool = True
    do_testing: bool = False
    model: MTEncDecModelConfig = MTEncDecModelConfig()
    trainer: Optional[TrainerConfig] = TrainerConfig()
    exp_manager: Optional[ExpManagerConfig] = ExpManagerConfig(name='MTEncDec', files_to_copy=[])
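# Illustrative sketch (not part of the original file): one way a structured config such as
# MTEncDecConfig can be materialized and overridden with OmegaConf. The override values below
# are hypothetical examples; the field names (do_testing, model.beam_size, model.label_smoothing)
# come from the dataclass above and the constructors later in this listing.
from omegaconf import OmegaConf


def example_build_config():
    # Start from the dataclass defaults registered above.
    default_cfg = OmegaConf.structured(MTEncDecConfig())

    # Merge user overrides (e.g. from a YAML file or the command line) on top of the defaults;
    # OmegaConf.merge validates them against the dataclass schema.
    overrides = OmegaConf.create(
        {
            "do_testing": True,
            "model": {"beam_size": 4, "label_smoothing": 0.1},
        }
    )
    return OmegaConf.merge(default_cfg, overrides)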
def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None) -> None:
    self._cfg = cfg
    self.global_rank = 0
    self.world_size = 1
    if trainer is not None:
        self.global_rank = (trainer.node_rank * trainer.num_gpus) + trainer.local_rank
        self.world_size = trainer.num_nodes * trainer.num_gpus

    if hasattr(cfg, 'train_ds'):
        supported_tokenizers = ['yttm', 'huggingface', 'sentencepiece', 'megatron']
        supported_train_tokenizers = ['yttm', 'sentencepiece']

        if (
            cfg.encoder_tokenizer.get('library') not in supported_tokenizers
            or cfg.decoder_tokenizer.get('library') not in supported_tokenizers
        ):
            raise NotImplementedError(f"Currently we only support {supported_tokenizers}.")

        if cfg.get('shared_tokenizer') and cfg.encoder_tokenizer.get('library') != cfg.decoder_tokenizer.get(
            'library'
        ):
            raise ValueError("Shared tokenizers cannot be from different libraries.")

        # Prepare tokenizers
        if (
            cfg.encoder_tokenizer.get('library') in supported_train_tokenizers
            or cfg.decoder_tokenizer.get('library') in supported_train_tokenizers
        ):

            # Train tokenizer models if using yttm or sentencepiece and they don't exist
            if (
                cfg.encoder_tokenizer.get('library') in supported_train_tokenizers
                and cfg.encoder_tokenizer.get('tokenizer_model') is None
            ) or (
                cfg.decoder_tokenizer.get('library') in supported_train_tokenizers
                and cfg.decoder_tokenizer.get('tokenizer_model') is None
            ):
                if cfg.get('preproc_out_dir') is None:
                    raise ValueError('Tokenizer model training required but cfg.preproc_out_dir is None.')
                if cfg.train_ds.get('src_file_name') is None or cfg.train_ds.get('tgt_file_name') is None:
                    raise ValueError(
                        'src_file_name and tgt_file_name needed to train tokenizers but could not be found.'
                    )

                src_fname = cfg.train_ds.get('src_file_name')
                tgt_fname = cfg.train_ds.get('tgt_file_name')
                src_language = cfg.get('src_language')
                tgt_language = cfg.get('tgt_language')
                spt_symbols = None
                tempdir = tempfile.TemporaryDirectory()

                if cfg.get('multilingual'):
                    spt_symbols = []
                    if isinstance(src_fname, ListConfig):
                        fnames = (" ").join(src_fname)
                        src_fname = os.path.join(tempdir.name, 'src.txt')
                        os.system('cat %s > %s' % (fnames, src_fname))

                    if isinstance(tgt_fname, ListConfig):
                        fnames = (" ").join(tgt_fname)
                        tgt_fname = os.path.join(tempdir.name, 'tgt.txt')
                        os.system('cat %s > %s' % (fnames, tgt_fname))

                    if isinstance(src_language, ListConfig):
                        for lng in src_language:
                            spt_symbols.append("<" + lng + ">")

                    if isinstance(tgt_language, ListConfig):
                        for lng in tgt_language:
                            spt_symbols.append("<" + lng + ">")

                # train tokenizer model on training data
                self.encoder_tokenizer_model, self.decoder_tokenizer_model = MTDataPreproc.train_tokenizers(
                    out_dir=cfg.get('preproc_out_dir'),
                    src_fname=src_fname,
                    tgt_fname=tgt_fname,
                    shared_tokenizer=cfg.get('shared_tokenizer'),
                    encoder_tokenizer_vocab_size=cfg.encoder_tokenizer.get('vocab_size'),
                    decoder_tokenizer_vocab_size=cfg.decoder_tokenizer.get('vocab_size'),
                    encoder_tokenizer_name=cfg.encoder_tokenizer.get('library'),
                    decoder_tokenizer_name=cfg.decoder_tokenizer.get('library'),
                    encoder_tokenizer_coverage=cfg.encoder_tokenizer.get('coverage', 0.999),
                    decoder_tokenizer_coverage=cfg.decoder_tokenizer.get('coverage', 0.999),
                    global_rank=self.global_rank,
                    encoder_training_sample_size=cfg.encoder_tokenizer.get('training_sample_size', -1),
                    decoder_training_sample_size=cfg.decoder_tokenizer.get('training_sample_size', -1),
                    encoder_special_tokens=OmegaConf.to_container(cfg.encoder_tokenizer.special_tokens)
                    if cfg.encoder_tokenizer.special_tokens
                    else None,
                    decoder_special_tokens=OmegaConf.to_container(cfg.decoder_tokenizer.special_tokens)
                    if cfg.decoder_tokenizer.special_tokens
                    else None,
                    spt_symbols=spt_symbols,
                    multilingual=cfg.get('multilingual', False),
                )
                # update config
                self._cfg.encoder_tokenizer.tokenizer_model = self.encoder_tokenizer_model
                self._cfg.decoder_tokenizer.tokenizer_model = self.decoder_tokenizer_model
                tempdir.cleanup()
            else:
                self.encoder_tokenizer_model = cfg.encoder_tokenizer.get('tokenizer_model')
                self.decoder_tokenizer_model = cfg.decoder_tokenizer.get('tokenizer_model')

        self.encoder_tokenizer, self.decoder_tokenizer = self.get_enc_dec_tokenizers(
            encoder_tokenizer_name=cfg.encoder_tokenizer.get('library'),
            encoder_model_name=cfg.encoder.get('model_name'),
            encoder_tokenizer_model=getattr(self, "encoder_tokenizer_model", None),
            encoder_bpe_dropout=cfg.encoder_tokenizer.get('bpe_dropout', 0.0),
            encoder_r2l=cfg.encoder_tokenizer.get('r2l', False),
            decoder_tokenizer_name=cfg.decoder_tokenizer.get('library'),
            decoder_model_name=cfg.decoder.get('model_name'),
            decoder_tokenizer_model=getattr(self, "decoder_tokenizer_model", None),
            decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0),
            decoder_r2l=cfg.decoder_tokenizer.get('r2l', False),
        )

        # If using a tarred dataset for training, automatically create it if needed
        if cfg.train_ds.get('use_tarred_dataset'):
            if cfg.train_ds.get('tar_files') is None and cfg.train_ds.get('metadata_file') is None:
                if cfg.get('preproc_out_dir') is None:
                    raise ValueError('Data preprocessing required but cfg.preproc_out_dir is None.')
                if cfg.train_ds.get('src_file_name') is None or cfg.train_ds.get('tgt_file_name') is None:
                    raise ValueError(
                        'src_file_name and tgt_file_name needed to create tarred dataset but could not be found.'
                    )
                # Preprocess data and cache for use during training
                if self.global_rank == 0:
                    logging.info(
                        f"Creating tarred dataset for src: {cfg.train_ds.get('src_file_name')} and tgt: {cfg.train_ds.get('tgt_file_name')}"
                    )

                if isinstance(cfg.train_ds.get('src_file_name'), str):
                    src_file_list = [cfg.train_ds.get('src_file_name')]
                    tgt_file_list = [cfg.train_ds.get('tgt_file_name')]
                    outdir_list = [cfg.get('preproc_out_dir')]
                else:
                    src_file_list = cfg.train_ds.get('src_file_name')
                    tgt_file_list = cfg.train_ds.get('tgt_file_name')
                    if isinstance(cfg.get('src_language'), ListConfig):
                        langs = cfg.get('src_language')
                    elif isinstance(cfg.get('tgt_language'), ListConfig):
                        langs = cfg.get('tgt_language')
                    outdir_list = []
                    for lang in langs:
                        outdir_list.append(os.path.join(cfg.get('preproc_out_dir'), lang))

                if len(src_file_list) != len(tgt_file_list) or len(src_file_list) != len(outdir_list):
                    raise ValueError(
                        "Number of source files, target files, and multilingual language pairs must be the same."
                    )

                # TODO: have to get tokenizers inside .preprocess_parallel because they can't be pickled
                metadata_file_list = []
                for idx, src_file in enumerate(src_file_list):
                    self.train_tar_files, self.train_metadata_file = MTDataPreproc.preprocess_parallel_dataset(
                        clean=cfg.train_ds.clean,
                        src_fname=src_file,
                        tgt_fname=tgt_file_list[idx],
                        out_dir=outdir_list[idx],
                        encoder_tokenizer_name=cfg.encoder_tokenizer.get('library'),
                        encoder_model_name=cfg.encoder.get('model_name'),
                        encoder_tokenizer_model=self.encoder_tokenizer_model,
                        encoder_bpe_dropout=cfg.encoder_tokenizer.get('bpe_dropout', 0.0),
                        encoder_tokenizer_r2l=cfg.encoder_tokenizer.get('r2l', False),
                        decoder_tokenizer_name=cfg.decoder_tokenizer.get('library'),
                        decoder_model_name=cfg.decoder.get('model_name'),
                        decoder_tokenizer_model=self.decoder_tokenizer_model,
                        decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0),
                        decoder_tokenizer_r2l=cfg.decoder_tokenizer.get('r2l', False),
                        max_seq_length=cfg.train_ds.get('max_seq_length', 512),
                        tokens_in_batch=cfg.train_ds.get('tokens_in_batch', 8192),
                        lines_per_dataset_fragment=cfg.train_ds.get('lines_per_dataset_fragment', 1000000),
                        num_batches_per_tarfile=cfg.train_ds.get('num_batches_per_tarfile', 1000),
                        min_seq_length=1,
                        global_rank=self.global_rank,
                        world_size=self.world_size,
                        n_jobs=cfg.train_ds.get('n_preproc_jobs', -2),
                        tar_file_prefix=cfg.train_ds.get('tar_file_prefix', 'parallel'),
                    )
                    metadata_file_list.append(self.train_metadata_file)
                # update config
                # self._cfg.train_ds.tar_files = self.tar_files_to_string(self.train_tar_files)
                # self._cfg.train_ds.tar_files = self.train_tar_files
                if isinstance(cfg.train_ds.get('metadata_file'), str):
                    self._cfg.train_ds.metadata_file = metadata_file_list[0]
                else:
                    self._cfg.train_ds.metadata_file = metadata_file_list

                logging.info(
                    f"Using tarred dataset created in folder(s) {outdir_list} and metadata created at {self._cfg.train_ds.metadata_file}"
                )
            elif cfg.train_ds.get('tar_files') is not None and cfg.train_ds.get('metadata_file') is None:
                raise ValueError('A metadata file is required for tarred dataset but cfg.metadata_file is None.')
            elif cfg.train_ds.get('tar_files') is None and cfg.train_ds.get('metadata_file') is not None:
                if isinstance(cfg.train_ds.get('metadata_file'), str):
                    metadata_file_list = [cfg.train_ds.get('metadata_file')]
                else:
                    metadata_file_list = cfg.train_ds.get('metadata_file')

                for metadata_file in metadata_file_list:
                    with open(metadata_file) as metadata_reader:
                        metadata = json.load(metadata_reader)
                    if metadata['tar_files']:
                        logging.info(f"Using tarred dataset: {metadata['tar_files']}")
                    else:
                        raise ValueError('tar_files not provided and metadata does not have tar files')
            else:
                self.train_tar_files = cfg.train_ds.get('tar_files')
                self.train_metadata_file = cfg.train_ds.get('metadata_file')
                logging.info(
                    f"Using tarred dataset from config at {self.train_tar_files} and metadata from {self.train_metadata_file}"
                )
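# Illustrative sketch (an assumption about the typical training-script wiring, not code from this
# file): MTDataPreproc is constructed before the model so it can train tokenizers and build the
# tarred dataset, rewriting cfg.model.*_tokenizer.tokenizer_model and cfg.model.train_ds.metadata_file
# in place; the model is then built from the updated config. The names `cfg` and `trainer` follow the
# Hydra / PyTorch Lightning pattern used in NeMo example scripts.
def example_training_entrypoint(cfg, trainer):
    # Rank handling happens inside MTDataPreproc; all workers end up reading the cached artifacts.
    MTDataPreproc(cfg=cfg.model, trainer=trainer)

    # The model constructor now finds ready-to-use tokenizer models and tarred-dataset metadata.
    mt_model = MTEncDecModel(cfg.model, trainer=trainer)
    trainer.fit(mt_model)
    return mt_model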
def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
    cfg = model_utils.convert_model_config_to_dict_config(cfg)

    # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
    # global_rank and local_rank are set by LightningModule in Lightning 1.2.0
    self.world_size = 1
    if trainer is not None:
        self.world_size = trainer.num_nodes * trainer.num_gpus

    cfg = model_utils.maybe_update_config_version(cfg)

    self.src_language: str = cfg.get("src_language", None)
    self.tgt_language: str = cfg.get("tgt_language", None)

    # Instantiate tokenizers and register them to be saved with the NeMo Model archive.
    # After this call, there will be self.encoder_tokenizer and self.decoder_tokenizer,
    # which can convert between tokens and token_ids for SRC and TGT languages correspondingly.
    self.setup_enc_dec_tokenizers(
        encoder_tokenizer_name=cfg.encoder_tokenizer.tokenizer_name,
        encoder_tokenizer_model=cfg.encoder_tokenizer.tokenizer_model,
        encoder_bpe_dropout=cfg.encoder_tokenizer.get('bpe_dropout', 0.0),
        decoder_tokenizer_name=cfg.decoder_tokenizer.tokenizer_name,
        decoder_tokenizer_model=cfg.decoder_tokenizer.tokenizer_model,
        decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0),
    )

    # After this call, the model will have self.source_processor and self.target_processor objects
    self.setup_pre_and_post_processing_utils(source_lang=self.src_language, target_lang=self.tgt_language)

    # TODO: Why is this base constructor call so late in the game?
    super().__init__(cfg=cfg, trainer=trainer)

    # TODO: use get_encoder function with support for HF and Megatron
    self.encoder = TransformerEncoderNM(
        vocab_size=self.encoder_vocab_size,
        hidden_size=cfg.encoder.hidden_size,
        num_layers=cfg.encoder.num_layers,
        inner_size=cfg.encoder.inner_size,
        max_sequence_length=cfg.encoder.max_sequence_length
        if hasattr(cfg.encoder, 'max_sequence_length')
        else 512,
        embedding_dropout=cfg.encoder.embedding_dropout if hasattr(cfg.encoder, 'embedding_dropout') else 0.0,
        learn_positional_encodings=cfg.encoder.learn_positional_encodings
        if hasattr(cfg.encoder, 'learn_positional_encodings')
        else False,
        num_attention_heads=cfg.encoder.num_attention_heads,
        ffn_dropout=cfg.encoder.ffn_dropout,
        attn_score_dropout=cfg.encoder.attn_score_dropout,
        attn_layer_dropout=cfg.encoder.attn_layer_dropout,
        hidden_act=cfg.encoder.hidden_act,
        mask_future=cfg.encoder.mask_future,
        pre_ln=cfg.encoder.pre_ln,
    )

    # TODO: use get_decoder function with support for HF and Megatron
    self.decoder = TransformerDecoderNM(
        vocab_size=self.decoder_vocab_size,
        hidden_size=cfg.decoder.hidden_size,
        num_layers=cfg.decoder.num_layers,
        inner_size=cfg.decoder.inner_size,
        max_sequence_length=cfg.decoder.max_sequence_length
        if hasattr(cfg.decoder, 'max_sequence_length')
        else 512,
        embedding_dropout=cfg.decoder.embedding_dropout if hasattr(cfg.decoder, 'embedding_dropout') else 0.0,
        learn_positional_encodings=cfg.decoder.learn_positional_encodings
        if hasattr(cfg.decoder, 'learn_positional_encodings')
        else False,
        num_attention_heads=cfg.decoder.num_attention_heads,
        ffn_dropout=cfg.decoder.ffn_dropout,
        attn_score_dropout=cfg.decoder.attn_score_dropout,
        attn_layer_dropout=cfg.decoder.attn_layer_dropout,
        hidden_act=cfg.decoder.hidden_act,
        pre_ln=cfg.decoder.pre_ln,
    )

    self.log_softmax = TokenClassifier(
        hidden_size=self.decoder.hidden_size,
        num_classes=self.decoder_vocab_size,
        activation=cfg.head.activation,
        log_softmax=cfg.head.log_softmax,
        dropout=cfg.head.dropout,
        use_transformer_init=cfg.head.use_transformer_init,
    )

    self.beam_search = BeamSearchSequenceGenerator(
        embedding=self.decoder.embedding,
        decoder=self.decoder.decoder,
        log_softmax=self.log_softmax,
        max_sequence_length=self.decoder.max_sequence_length,
        beam_size=cfg.beam_size,
        bos=self.decoder_tokenizer.bos_id,
        pad=self.decoder_tokenizer.pad_id,
        eos=self.decoder_tokenizer.eos_id,
        len_pen=cfg.len_pen,
        max_delta_length=cfg.max_generation_delta,
    )

    # tie weights of embedding and softmax matrices
    self.log_softmax.mlp.layer0.weight = self.decoder.embedding.token_embedding.weight

    # TODO: encoder and decoder with different hidden size?
    std_init_range = 1 / self.encoder.hidden_size ** 0.5
    self.apply(lambda module: transformer_weights_init(module, std_init_range))

    self.loss_fn = SmoothedCrossEntropyLoss(
        pad_id=self.decoder_tokenizer.pad_id, label_smoothing=cfg.label_smoothing
    )

    self.eval_loss = GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True)
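# Illustrative sketch of the weight tying performed above: assigning the decoder's token-embedding
# matrix to the output projection requires both tensors to have shape (vocab_size, hidden_size).
# The modules below are plain torch modules standing in for the NeMo classes.
import torch.nn as nn

vocab_size, hidden_size = 32000, 512
token_embedding = nn.Embedding(vocab_size, hidden_size)            # weight: (vocab_size, hidden_size)
output_projection = nn.Linear(hidden_size, vocab_size, bias=True)  # weight: (vocab_size, hidden_size)

# Tie the parameters: both modules now update the same tensor during training, which is what
# `self.log_softmax.mlp.layer0.weight = self.decoder.embedding.token_embedding.weight` does above.
output_projection.weight = token_embedding.weight
assert output_projection.weight.shape == (vocab_size, hidden_size)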
def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
    cfg = model_utils.convert_model_config_to_dict_config(cfg)

    # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
    # global_rank and local_rank are set by LightningModule in Lightning 1.2.0
    self.world_size = 1
    if trainer is not None:
        self.world_size = trainer.num_nodes * trainer.num_gpus

    cfg = model_utils.maybe_update_config_version(cfg)

    self.src_language = cfg.get("src_language", None)
    self.tgt_language = cfg.get("tgt_language", None)

    self.multilingual = cfg.get("multilingual", False)
    self.multilingual_ids = []

    self.encoder_tokenizer_library = cfg.encoder_tokenizer.get('library', 'yttm')
    self.decoder_tokenizer_library = cfg.decoder_tokenizer.get('library', 'yttm')

    # Instantiate tokenizers and register them to be saved with the NeMo Model archive.
    # After this call, there will be self.encoder_tokenizer and self.decoder_tokenizer,
    # which can convert between tokens and token_ids for SRC and TGT languages correspondingly.
    self.setup_enc_dec_tokenizers(
        encoder_tokenizer_library=self.encoder_tokenizer_library,
        encoder_tokenizer_model=cfg.encoder_tokenizer.get('tokenizer_model'),
        encoder_bpe_dropout=cfg.encoder_tokenizer.get('bpe_dropout', 0.0)
        if cfg.encoder_tokenizer.get('bpe_dropout', 0.0) is not None
        else 0.0,
        encoder_model_name=cfg.encoder.get('model_name') if hasattr(cfg.encoder, 'model_name') else None,
        encoder_r2l=cfg.encoder_tokenizer.get('r2l', False),
        decoder_tokenizer_library=self.decoder_tokenizer_library,
        encoder_tokenizer_vocab_file=cfg.encoder_tokenizer.get('vocab_file', None),
        decoder_tokenizer_model=cfg.decoder_tokenizer.tokenizer_model,
        decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0)
        if cfg.decoder_tokenizer.get('bpe_dropout', 0.0) is not None
        else 0.0,
        decoder_model_name=cfg.decoder.get('model_name') if hasattr(cfg.decoder, 'model_name') else None,
        decoder_r2l=cfg.decoder_tokenizer.get('r2l', False),
    )

    if self.multilingual:
        if isinstance(self.src_language, ListConfig) and isinstance(self.tgt_language, ListConfig):
            raise ValueError(
                "cfg.src_language and cfg.tgt_language cannot both be lists. We only support many-to-one or one-to-many multilingual models."
            )
        elif isinstance(self.src_language, ListConfig):
            for lng in self.src_language:
                self.multilingual_ids.append(self.encoder_tokenizer.token_to_id("<" + lng + ">"))
        elif isinstance(self.tgt_language, ListConfig):
            for lng in self.tgt_language:
                self.multilingual_ids.append(self.encoder_tokenizer.token_to_id("<" + lng + ">"))
        else:
            raise ValueError(
                "Expect either cfg.src_language or cfg.tgt_language to be a list when multilingual=True."
            )

        if isinstance(self.src_language, ListConfig):
            self.tgt_language = [self.tgt_language] * len(self.src_language)
        else:
            self.src_language = [self.src_language] * len(self.tgt_language)

        self.source_processor_list = []
        self.target_processor_list = []
        for src_lng, tgt_lng in zip(self.src_language, self.tgt_language):
            src_prcsr, tgt_prscr = self.setup_pre_and_post_processing_utils(src_lng, tgt_lng)
            self.source_processor_list.append(src_prcsr)
            self.target_processor_list.append(tgt_prscr)
    else:
        # After this call, the model will have self.source_processor and self.target_processor objects
        self.setup_pre_and_post_processing_utils(self.src_language, self.tgt_language)
        self.multilingual_ids = [None]

    # TODO: Why is this base constructor call so late in the game?
    super().__init__(cfg=cfg, trainer=trainer)

    # encoder from NeMo, Megatron-LM, or HuggingFace
    encoder_cfg_dict = OmegaConf.to_container(cfg.get('encoder'))
    encoder_cfg_dict['vocab_size'] = self.encoder_vocab_size
    library = encoder_cfg_dict.pop('library', 'nemo')
    model_name = encoder_cfg_dict.pop('model_name', None)
    pretrained = encoder_cfg_dict.pop('pretrained', False)
    checkpoint_file = encoder_cfg_dict.pop('checkpoint_file', None)
    self.encoder = get_transformer(
        library=library,
        model_name=model_name,
        pretrained=pretrained,
        config_dict=encoder_cfg_dict,
        encoder=True,
        pre_ln_final_layer_norm=encoder_cfg_dict.get('pre_ln_final_layer_norm', False),
        checkpoint_file=checkpoint_file,
    )

    # decoder from NeMo, Megatron-LM, or HuggingFace
    decoder_cfg_dict = OmegaConf.to_container(cfg.get('decoder'))
    decoder_cfg_dict['vocab_size'] = self.decoder_vocab_size
    library = decoder_cfg_dict.pop('library', 'nemo')
    model_name = decoder_cfg_dict.pop('model_name', None)
    pretrained = decoder_cfg_dict.pop('pretrained', False)
    decoder_cfg_dict['hidden_size'] = self.encoder.hidden_size
    self.decoder = get_transformer(
        library=library,
        model_name=model_name,
        pretrained=pretrained,
        config_dict=decoder_cfg_dict,
        encoder=False,
        pre_ln_final_layer_norm=decoder_cfg_dict.get('pre_ln_final_layer_norm', False),
    )

    self.log_softmax = TokenClassifier(
        hidden_size=self.decoder.hidden_size,
        num_classes=self.decoder_vocab_size,
        activation=cfg.head.activation,
        log_softmax=cfg.head.log_softmax,
        dropout=cfg.head.dropout,
        use_transformer_init=cfg.head.use_transformer_init,
    )

    self.beam_search = BeamSearchSequenceGenerator(
        embedding=self.decoder.embedding,
        decoder=self.decoder.decoder,
        log_softmax=self.log_softmax,
        max_sequence_length=self.decoder.max_sequence_length,
        beam_size=cfg.beam_size,
        bos=self.decoder_tokenizer.bos_id,
        pad=self.decoder_tokenizer.pad_id,
        eos=self.decoder_tokenizer.eos_id,
        len_pen=cfg.len_pen,
        max_delta_length=cfg.max_generation_delta,
    )

    # tie weights of embedding and softmax matrices
    self.log_softmax.mlp.layer0.weight = self.decoder.embedding.token_embedding.weight

    # TODO: encoder and decoder with different hidden size?
    std_init_range = 1 / self.encoder.hidden_size ** 0.5

    # initialize weights if not using pretrained encoder/decoder
    if not self._cfg.encoder.get('pretrained', False):
        self.encoder.apply(lambda module: transformer_weights_init(module, std_init_range))
    if not self._cfg.decoder.get('pretrained', False):
        self.decoder.apply(lambda module: transformer_weights_init(module, std_init_range))
    self.log_softmax.apply(lambda module: transformer_weights_init(module, std_init_range))

    self.loss_fn = SmoothedCrossEntropyLoss(
        pad_id=self.decoder_tokenizer.pad_id, label_smoothing=cfg.label_smoothing
    )

    self.eval_loss_fn = NLLLoss(ignore_index=self.decoder_tokenizer.pad_id)
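# Illustrative sketch of the encoder config consumed above. The constructor pops `library`,
# `model_name`, `pretrained`, and `checkpoint_file`, then passes the remaining keys to
# `get_transformer` as `config_dict`. The concrete values below are hypothetical examples;
# the key names mirror what the constructors in this listing read.
example_encoder_cfg_dict = {
    "library": "nemo",        # popped before the call; "huggingface" or "megatron" would select other backends
    "model_name": None,       # only meaningful for pretrained HuggingFace / Megatron checkpoints
    "pretrained": False,      # when True, the weight re-initialization above is skipped
    # Keys below stay in config_dict; names follow the TransformerEncoderNM arguments shown earlier.
    "hidden_size": 512,
    "num_layers": 6,
    "inner_size": 2048,
    "num_attention_heads": 8,
    "ffn_dropout": 0.1,
}
# `vocab_size` is overwritten from the tokenizer before the call, exactly as done above:
# encoder_cfg_dict['vocab_size'] = self.encoder_vocab_size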
def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None) -> None:
    self._cfg = cfg
    self.global_rank = 0
    self.world_size = 1
    if trainer is not None:
        self.global_rank = (trainer.node_rank * trainer.num_gpus) + trainer.local_rank
        self.world_size = trainer.num_nodes * trainer.num_gpus

    if hasattr(cfg, 'train_ds'):
        supported_tokenizers = ['yttm', 'huggingface', 'sentencepiece']
        supported_train_tokenizers = ['yttm', 'sentencepiece']

        if (
            cfg.encoder_tokenizer.get('library') not in supported_tokenizers
            or cfg.decoder_tokenizer.get('library') not in supported_tokenizers
        ):
            raise NotImplementedError(f"Currently we only support {supported_tokenizers}.")

        if cfg.get('shared_tokenizer') and cfg.encoder_tokenizer.get('library') != cfg.decoder_tokenizer.get(
            'library'
        ):
            raise ValueError("Shared tokenizers cannot be from different libraries.")

        # Prepare tokenizers
        if (
            cfg.encoder_tokenizer.get('library') in supported_train_tokenizers
            or cfg.decoder_tokenizer.get('library') in supported_train_tokenizers
        ):

            # Train tokenizer models if using yttm or sentencepiece and they don't exist
            if (
                cfg.encoder_tokenizer.get('library') in supported_train_tokenizers
                and cfg.encoder_tokenizer.get('tokenizer_model') is None
            ) or (
                cfg.decoder_tokenizer.get('library') in supported_train_tokenizers
                and cfg.decoder_tokenizer.get('tokenizer_model') is None
            ):
                if cfg.get('preproc_out_dir') is None:
                    raise ValueError('Tokenizer model training required but cfg.preproc_out_dir is None.')
                if cfg.train_ds.get('src_file_name') is None or cfg.train_ds.get('tgt_file_name') is None:
                    raise ValueError(
                        'src_file_name and tgt_file_name needed to train tokenizers but could not be found.'
                    )
                # train tokenizer model on training data
                self.encoder_tokenizer_model, self.decoder_tokenizer_model = MTDataPreproc.train_tokenizers(
                    out_dir=cfg.get('preproc_out_dir'),
                    src_fname=cfg.train_ds.get('src_file_name'),
                    tgt_fname=cfg.train_ds.get('tgt_file_name'),
                    shared_tokenizer=cfg.get('shared_tokenizer'),
                    encoder_tokenizer_vocab_size=cfg.encoder_tokenizer.get('vocab_size'),
                    decoder_tokenizer_vocab_size=cfg.decoder_tokenizer.get('vocab_size'),
                    encoder_tokenizer_name=cfg.encoder_tokenizer.get('library'),
                    decoder_tokenizer_name=cfg.decoder_tokenizer.get('library'),
                    encoder_tokenizer_coverage=cfg.encoder_tokenizer.get('coverage', 0.999),
                    decoder_tokenizer_coverage=cfg.decoder_tokenizer.get('coverage', 0.999),
                    global_rank=self.global_rank,
                    encoder_training_sample_size=cfg.encoder_tokenizer.get('training_sample_size', -1),
                    decoder_training_sample_size=cfg.decoder_tokenizer.get('training_sample_size', -1),
                    encoder_special_tokens=OmegaConf.to_container(cfg.encoder_tokenizer.special_tokens)
                    if cfg.encoder_tokenizer.special_tokens
                    else None,
                    decoder_special_tokens=OmegaConf.to_container(cfg.decoder_tokenizer.special_tokens)
                    if cfg.decoder_tokenizer.special_tokens
                    else None,
                )
                # update config
                self._cfg.encoder_tokenizer.tokenizer_model = self.encoder_tokenizer_model
                self._cfg.decoder_tokenizer.tokenizer_model = self.decoder_tokenizer_model
            else:
                self.encoder_tokenizer_model = cfg.encoder_tokenizer.get('tokenizer_model')
                self.decoder_tokenizer_model = cfg.decoder_tokenizer.get('tokenizer_model')

        self.encoder_tokenizer, self.decoder_tokenizer = self.get_enc_dec_tokenizers(
            encoder_tokenizer_name=cfg.encoder_tokenizer.get('library'),
            encoder_model_name=cfg.encoder.get('model_name'),
            encoder_tokenizer_model=self.encoder_tokenizer_model,
            encoder_bpe_dropout=cfg.encoder_tokenizer.get('bpe_dropout', 0.0),
            decoder_tokenizer_name=cfg.decoder_tokenizer.get('library'),
            decoder_model_name=cfg.decoder.get('model_name'),
            decoder_tokenizer_model=self.decoder_tokenizer_model,
            decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0),
        )

        # If using a tarred dataset for training, automatically create it if needed
        if cfg.train_ds.get('use_tarred_dataset'):
            if cfg.train_ds.get('tar_files') is None and cfg.train_ds.get('metadata_file') is None:
                if cfg.get('preproc_out_dir') is None:
                    raise ValueError('Data preprocessing required but cfg.preproc_out_dir is None.')
                if cfg.train_ds.get('src_file_name') is None or cfg.train_ds.get('tgt_file_name') is None:
                    raise ValueError(
                        'src_file_name and tgt_file_name needed to create tarred dataset but could not be found.'
                    )
                # Preprocess data and cache for use during training
                if self.global_rank == 0:
                    logging.info(
                        f"Using tarred dataset for src: {cfg.train_ds.get('src_file_name')} and tgt: {cfg.train_ds.get('tgt_file_name')}"
                    )
                # TODO: have to get tokenizers inside .preprocess_parallel because they can't be pickled
                self.train_tar_files, self.train_metadata_file = MTDataPreproc.preprocess_parallel_dataset(
                    clean=cfg.train_ds.clean,
                    src_fname=cfg.train_ds.get('src_file_name'),
                    tgt_fname=cfg.train_ds.get('tgt_file_name'),
                    out_dir=cfg.get('preproc_out_dir'),
                    encoder_tokenizer_name=cfg.encoder_tokenizer.get('library'),
                    encoder_model_name=cfg.encoder.get('model_name'),
                    encoder_tokenizer_model=self.encoder_tokenizer_model,
                    encoder_bpe_dropout=cfg.encoder_tokenizer.get('bpe_dropout', 0.0),
                    decoder_tokenizer_name=cfg.decoder_tokenizer.get('library'),
                    decoder_model_name=cfg.decoder.get('model_name'),
                    decoder_tokenizer_model=self.decoder_tokenizer_model,
                    decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0),
                    max_seq_length=cfg.train_ds.get('max_seq_length', 512),
                    tokens_in_batch=cfg.train_ds.get('tokens_in_batch', 8192),
                    lines_per_dataset_fragment=cfg.train_ds.get('lines_per_dataset_fragment', 1000000),
                    num_batches_per_tarfile=cfg.train_ds.get('num_batches_per_tarfile', 1000),
                    min_seq_length=1,
                    global_rank=self.global_rank,
                    world_size=self.world_size,
                    n_jobs=cfg.train_ds.get('n_preproc_jobs', -2),
                    tar_file_prefix=cfg.train_ds.get('tar_file_prefix', 'parallel'),
                )
                # update config
                # self._cfg.train_ds.tar_files = self.tar_files_to_string(self.train_tar_files)
                # self._cfg.train_ds.tar_files = self.train_tar_files
                self._cfg.train_ds.metadata_file = self.train_metadata_file
                logging.info(
                    f"Using tarred dataset created at {self.train_tar_files} and metadata created at {self._cfg.train_ds.metadata_file}"
                )
            elif cfg.train_ds.get('tar_files') is not None and cfg.train_ds.get('metadata_file') is None:
                raise ValueError('A metadata file is required for tarred dataset but cfg.metadata_file is None.')
            elif cfg.train_ds.get('tar_files') is None and cfg.train_ds.get('metadata_file') is not None:
                with open(cfg.train_ds.get('metadata_file')) as metadata_reader:
                    metadata = json.load(metadata_reader)
                if metadata['train_tar_files']:
                    logging.info(f"Using tarred dataset: {metadata['train_tar_files']}")
                else:
                    raise ValueError('tar_files not provided and metadata does not have tar files')
            else:
                self.train_tar_files = cfg.train_ds.get('tar_files')
                self.train_metadata_file = cfg.train_ds.get('metadata_file')
                logging.info(
                    f"Using tarred dataset from config at {self.train_tar_files} and metadata from {self.train_metadata_file}"
                )
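# Illustrative sketch: the branch above that receives only `metadata_file` expects a JSON object
# whose 'train_tar_files' entry lists the tarred shards. Only that key (the one actually read
# above) is assumed here; the rest of the metadata schema is not shown.
import json


def example_read_metadata(metadata_path: str):
    with open(metadata_path) as metadata_reader:
        metadata = json.load(metadata_reader)
    tar_files = metadata.get('train_tar_files')
    if not tar_files:
        raise ValueError('tar_files not provided and metadata does not have tar files')
    return tar_files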
def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
    cfg = model_utils.convert_model_config_to_dict_config(cfg)

    # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
    self.global_rank = 0
    self.world_size = 1
    if trainer is not None:
        self.global_rank = (trainer.node_rank * trainer.num_gpus) + trainer.local_rank
        self.world_size = trainer.num_nodes * trainer.num_gpus

    cfg = model_utils.maybe_update_config_version(cfg)

    self.setup_enc_dec_tokenizers(cfg)

    super().__init__(cfg=cfg, trainer=trainer)

    self.src_language: str = cfg.get("src_language", None)
    self.tgt_language: str = cfg.get("tgt_language", None)

    # TODO: use get_encoder function with support for HF and Megatron
    self.encoder = TransformerEncoderNM(
        vocab_size=self.encoder_vocab_size,
        hidden_size=cfg.encoder.hidden_size,
        num_layers=cfg.encoder.num_layers,
        inner_size=cfg.encoder.inner_size,
        max_sequence_length=cfg.encoder.max_sequence_length
        if hasattr(cfg.encoder, 'max_sequence_length')
        else 512,
        embedding_dropout=cfg.encoder.embedding_dropout if hasattr(cfg.encoder, 'embedding_dropout') else 0.0,
        learn_positional_encodings=cfg.encoder.learn_positional_encodings
        if hasattr(cfg.encoder, 'learn_positional_encodings')
        else False,
        num_attention_heads=cfg.encoder.num_attention_heads,
        ffn_dropout=cfg.encoder.ffn_dropout,
        attn_score_dropout=cfg.encoder.attn_score_dropout,
        attn_layer_dropout=cfg.encoder.attn_layer_dropout,
        hidden_act=cfg.encoder.hidden_act,
        mask_future=cfg.encoder.mask_future,
        pre_ln=cfg.encoder.pre_ln,
    )

    # TODO: use get_decoder function with support for HF and Megatron
    self.decoder = TransformerDecoderNM(
        vocab_size=self.decoder_vocab_size,
        hidden_size=cfg.decoder.hidden_size,
        num_layers=cfg.decoder.num_layers,
        inner_size=cfg.decoder.inner_size,
        max_sequence_length=cfg.decoder.max_sequence_length
        if hasattr(cfg.decoder, 'max_sequence_length')
        else 512,
        embedding_dropout=cfg.decoder.embedding_dropout if hasattr(cfg.decoder, 'embedding_dropout') else 0.0,
        learn_positional_encodings=cfg.decoder.learn_positional_encodings
        if hasattr(cfg.decoder, 'learn_positional_encodings')
        else False,
        num_attention_heads=cfg.decoder.num_attention_heads,
        ffn_dropout=cfg.decoder.ffn_dropout,
        attn_score_dropout=cfg.decoder.attn_score_dropout,
        attn_layer_dropout=cfg.decoder.attn_layer_dropout,
        hidden_act=cfg.decoder.hidden_act,
        pre_ln=cfg.decoder.pre_ln,
    )

    self.log_softmax = TokenClassifier(
        hidden_size=self.decoder.hidden_size,
        num_classes=self.decoder_vocab_size,
        activation=cfg.head.activation,
        log_softmax=cfg.head.log_softmax,
        dropout=cfg.head.dropout,
        use_transformer_init=cfg.head.use_transformer_init,
    )

    self.beam_search = BeamSearchSequenceGenerator(
        embedding=self.decoder.embedding,
        decoder=self.decoder.decoder,
        log_softmax=self.log_softmax,
        max_sequence_length=self.decoder.max_sequence_length,
        beam_size=cfg.beam_size,
        bos=self.decoder_tokenizer.bos_id,
        pad=self.decoder_tokenizer.pad_id,
        eos=self.decoder_tokenizer.eos_id,
        len_pen=cfg.len_pen,
        max_delta_length=cfg.max_generation_delta,
    )

    # tie weights of embedding and softmax matrices
    self.log_softmax.mlp.layer0.weight = self.decoder.embedding.token_embedding.weight

    # TODO: encoder and decoder with different hidden size?
    std_init_range = 1 / self.encoder.hidden_size ** 0.5
    self.apply(lambda module: transformer_weights_init(module, std_init_range))

    self.loss_fn = SmoothedCrossEntropyLoss(
        pad_id=self.decoder_tokenizer.pad_id, label_smoothing=cfg.label_smoothing
    )

    self.eval_loss = GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True)
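# Illustrative sketch (an assumption about what a smoothed cross-entropy such as the one above
# computes, not NeMo's implementation): mix the one-hot target distribution with a uniform
# distribution controlled by `label_smoothing`, and mask out padding positions via `pad_id`.
import torch


def smoothed_cross_entropy(log_probs, labels, pad_id, label_smoothing=0.1):
    # log_probs: (batch, time, vocab) log-probabilities; labels: (batch, time) token ids.
    nll = -log_probs.gather(dim=-1, index=labels.unsqueeze(-1)).squeeze(-1)  # per-token NLL
    smooth = -log_probs.mean(dim=-1)                                         # uniform-target term
    loss = (1.0 - label_smoothing) * nll + label_smoothing * smooth
    mask = labels.ne(pad_id).float()                                         # drop padding tokens
    return (loss * mask).sum() / mask.sum()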
def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
    cfg = model_utils.convert_model_config_to_dict_config(cfg)

    # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
    # global_rank and local_rank are set by LightningModule in Lightning 1.2.0
    self.world_size = 1
    if trainer is not None:
        self.world_size = trainer.num_nodes * trainer.num_gpus

    cfg = model_utils.maybe_update_config_version(cfg)

    self.src_language: str = cfg.get("src_language", None)
    self.tgt_language: str = cfg.get("tgt_language", None)

    # Instantiate tokenizers and register them to be saved with the NeMo Model archive.
    # After this call, there will be self.encoder_tokenizer and self.decoder_tokenizer,
    # which can convert between tokens and token_ids for SRC and TGT languages correspondingly.
    self.setup_enc_dec_tokenizers(
        encoder_tokenizer_library=cfg.encoder_tokenizer.get('library', 'yttm'),
        encoder_tokenizer_model=cfg.encoder_tokenizer.get('tokenizer_model'),
        encoder_bpe_dropout=cfg.encoder_tokenizer.get('bpe_dropout', 0.0),
        encoder_model_name=cfg.encoder.get('model_name') if hasattr(cfg.encoder, 'model_name') else None,
        decoder_tokenizer_library=cfg.decoder_tokenizer.get('library', 'yttm'),
        decoder_tokenizer_model=cfg.decoder_tokenizer.tokenizer_model,
        decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0),
        decoder_model_name=cfg.decoder.get('model_name') if hasattr(cfg.decoder, 'model_name') else None,
    )

    # After this call, the model will have self.source_processor and self.target_processor objects
    self.setup_pre_and_post_processing_utils(source_lang=self.src_language, target_lang=self.tgt_language)

    # TODO: Why is this base constructor call so late in the game?
    super().__init__(cfg=cfg, trainer=trainer)

    # encoder from NeMo, Megatron-LM, or HuggingFace
    encoder_cfg_dict = OmegaConf.to_container(cfg.get('encoder'))
    encoder_cfg_dict['vocab_size'] = self.encoder_vocab_size
    library = encoder_cfg_dict.pop('library', 'nemo')
    model_name = encoder_cfg_dict.pop('model_name', None)
    pretrained = encoder_cfg_dict.pop('pretrained', False)
    self.encoder = get_transformer(
        library=library,
        model_name=model_name,
        pretrained=pretrained,
        config_dict=encoder_cfg_dict,
        encoder=True,
    )

    # decoder from NeMo, Megatron-LM, or HuggingFace
    decoder_cfg_dict = OmegaConf.to_container(cfg.get('decoder'))
    decoder_cfg_dict['vocab_size'] = self.decoder_vocab_size
    library = decoder_cfg_dict.pop('library', 'nemo')
    model_name = decoder_cfg_dict.pop('model_name', None)
    pretrained = decoder_cfg_dict.pop('pretrained', False)
    decoder_cfg_dict['hidden_size'] = self.encoder.hidden_size
    self.decoder = get_transformer(
        library=library,
        model_name=model_name,
        pretrained=pretrained,
        config_dict=decoder_cfg_dict,
        encoder=False,
    )

    self.log_softmax = TokenClassifier(
        hidden_size=self.decoder.hidden_size,
        num_classes=self.decoder_vocab_size,
        activation=cfg.head.activation,
        log_softmax=cfg.head.log_softmax,
        dropout=cfg.head.dropout,
        use_transformer_init=cfg.head.use_transformer_init,
    )

    self.beam_search = BeamSearchSequenceGenerator(
        embedding=self.decoder.embedding,
        decoder=self.decoder.decoder,
        log_softmax=self.log_softmax,
        max_sequence_length=self.decoder.max_sequence_length,
        beam_size=cfg.beam_size,
        bos=self.decoder_tokenizer.bos_id,
        pad=self.decoder_tokenizer.pad_id,
        eos=self.decoder_tokenizer.eos_id,
        len_pen=cfg.len_pen,
        max_delta_length=cfg.max_generation_delta,
    )

    # tie weights of embedding and softmax matrices
    self.log_softmax.mlp.layer0.weight = self.decoder.embedding.token_embedding.weight

    # TODO: encoder and decoder with different hidden size?
    std_init_range = 1 / self.encoder.hidden_size ** 0.5

    # initialize weights if not using pretrained encoder/decoder
    if not self._cfg.encoder.get('pretrained', False):
        self.encoder.apply(lambda module: transformer_weights_init(module, std_init_range))
    if not self._cfg.decoder.get('pretrained', False):
        self.decoder.apply(lambda module: transformer_weights_init(module, std_init_range))
    self.log_softmax.apply(lambda module: transformer_weights_init(module, std_init_range))

    self.loss_fn = SmoothedCrossEntropyLoss(
        pad_id=self.decoder_tokenizer.pad_id, label_smoothing=cfg.label_smoothing
    )

    self.eval_loss = GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True)
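# Illustrative sketch of the initialization scale used above: parameters are drawn from a normal
# distribution whose standard deviation is 1/sqrt(hidden_size), shown here on a plain nn.Linear.
# This is a generic re-implementation for illustration, not NeMo's transformer_weights_init.
import torch.nn as nn

hidden_size = 512
std_init_range = 1 / hidden_size ** 0.5  # 1/sqrt(512) ~= 0.044

layer = nn.Linear(hidden_size, hidden_size)
nn.init.normal_(layer.weight, mean=0.0, std=std_init_range)
nn.init.zeros_(layer.bias)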
def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None) -> None:
    self._cfg = cfg
    self.global_rank = 0
    self.world_size = 1
    if trainer is not None:
        self.global_rank = (trainer.node_rank * trainer.num_gpus) + trainer.local_rank
        self.world_size = trainer.num_nodes * trainer.num_gpus

    if hasattr(cfg, 'train_ds'):
        if (
            cfg.encoder_tokenizer.get('tokenizer_name') != 'yttm'
            or cfg.decoder_tokenizer.get('tokenizer_name') != 'yttm'
        ):
            raise NotImplementedError("Currently we only support yttm tokenizer.")

        # Train tokenizer models if they don't exist
        if (
            cfg.encoder_tokenizer.get('tokenizer_model') is None
            or cfg.decoder_tokenizer.get('tokenizer_model') is None
        ):
            if cfg.get('preproc_out_dir') is None:
                raise ValueError('Tokenizer model training required but cfg.preproc_out_dir is None.')
            if cfg.train_ds.get('src_file_name') is None or cfg.train_ds.get('tgt_file_name') is None:
                raise ValueError(
                    'src_file_name and tgt_file_name needed to train tokenizers but could not be found.'
                )
            # train tokenizer model on training data
            self.encoder_tokenizer_model, self.decoder_tokenizer_model = MTDataPreproc.train_tokenizers(
                out_dir=cfg.get('preproc_out_dir'),
                src_fname=cfg.train_ds.get('src_file_name'),
                tgt_fname=cfg.train_ds.get('tgt_file_name'),
                shared_tokenizer=cfg.get('shared_tokenizer'),
                encoder_tokenizer_vocab_size=cfg.encoder_tokenizer.get('vocab_size'),
                decoder_tokenizer_vocab_size=cfg.decoder_tokenizer.get('vocab_size'),
                encoder_tokenizer_name=cfg.encoder_tokenizer.get('tokenizer_name'),
                decoder_tokenizer_name=cfg.decoder_tokenizer.get('tokenizer_name'),
                encoder_tokenizer_coverage=cfg.encoder_tokenizer.get('coverage', 0.999),
                decoder_tokenizer_coverage=cfg.decoder_tokenizer.get('coverage', 0.999),
                global_rank=self.global_rank,
            )
            # update config
            self._cfg.encoder_tokenizer.tokenizer_model = self.encoder_tokenizer_model
            self._cfg.decoder_tokenizer.tokenizer_model = self.decoder_tokenizer_model
        else:
            self.encoder_tokenizer_model = cfg.encoder_tokenizer.get('tokenizer_model')
            self.decoder_tokenizer_model = cfg.decoder_tokenizer.get('tokenizer_model')

        self.encoder_tokenizer, self.decoder_tokenizer = self.get_enc_dec_tokenizers(
            encoder_tokenizer_name=cfg.encoder_tokenizer.get('tokenizer_name'),
            encoder_tokenizer_model=self.encoder_tokenizer_model,
            encoder_bpe_dropout=cfg.encoder_tokenizer.get('bpe_dropout', 0.0),
            decoder_tokenizer_name=cfg.decoder_tokenizer.get('tokenizer_name'),
            decoder_tokenizer_model=self.decoder_tokenizer_model,
            decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0),
        )

        # If using a tarred dataset for training, automatically create it if needed
        if cfg.train_ds.get('use_tarred_dataset'):
            if cfg.train_ds.get('tar_files') is None or cfg.train_ds.get('metadata_file') is None:
                if cfg.get('preproc_out_dir') is None:
                    raise ValueError('Data preprocessing required but cfg.preproc_out_dir is None.')
                if cfg.train_ds.get('src_file_name') is None or cfg.train_ds.get('tgt_file_name') is None:
                    raise ValueError(
                        'src_file_name and tgt_file_name needed to create tarred dataset but could not be found.'
                    )
                # Preprocess data and cache for use during training
                if self.global_rank == 0:
                    logging.info(
                        f"Using tarred dataset for src {cfg.train_ds.get('src_file_name')} and tgt {cfg.train_ds.get('tgt_file_name')}"
                    )
                self.train_tar_files, self.train_metadata_file = MTDataPreproc.preprocess_parallel_dataset(
                    clean=cfg.train_ds.clean,
                    src_fname=cfg.train_ds.get('src_file_name'),
                    tgt_fname=cfg.train_ds.get('tgt_file_name'),
                    out_dir=cfg.get('preproc_out_dir'),
                    encoder_tokenizer=self.encoder_tokenizer,
                    decoder_tokenizer=self.decoder_tokenizer,
                    max_seq_length=cfg.train_ds.get('max_seq_length', 512),
                    tokens_in_batch=cfg.train_ds.get('tokens_in_batch', 8192),
                    lines_per_dataset_fragment=cfg.train_ds.get('lines_per_dataset_fragment', 1000000),
                    num_batches_per_tarfile=cfg.train_ds.get('num_batches_per_tarfile', 1000),
                    min_seq_length=1,
                    pkl_file_prefix=cfg.train_ds.get('pkl_file_preifx', 'parallel'),
                    global_rank=self.global_rank,
                    world_size=self.world_size,
                )
                # update config
                self._cfg.train_ds.tar_files = self.tar_files_to_string(self.train_tar_files)
                self._cfg.train_ds.metadata_file = self.train_metadata_file
                logging.info(
                    f"Using tarred dataset created at {self._cfg.train_ds.tar_files} and metadata created at {self._cfg.train_ds.metadata_file}"
                )
            else:
                self.train_tar_files = cfg.train_ds.get('tar_files')
                self.train_metadata_file = cfg.train_ds.get('metadata_file')
                logging.info(
                    f"Using tarred dataset from config at {self.train_tar_files} and metadata from {self.train_metadata_file}"
                )
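# Illustrative sketch of the rank bookkeeping used by the constructors above: with `num_nodes`
# machines and `num_gpus` GPUs per machine, a worker's global rank is
# node_rank * num_gpus + local_rank and the world size is num_nodes * num_gpus.
# The concrete numbers below are hypothetical.
num_nodes, num_gpus = 2, 4
world_size = num_nodes * num_gpus            # 8 workers in total

node_rank, local_rank = 1, 3                 # last GPU on the second node
global_rank = node_rank * num_gpus + local_rank
assert global_rank == 7
assert 0 <= global_rank < world_size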