def main():
    parser = ArgumentParser()
    parser.add_argument("--model", type=str, required=True, help="")
    parser.add_argument("--srctext", type=str, required=True, help="")
    parser.add_argument("--tgtout", type=str, required=True, help="")
    parser.add_argument("--batch_size", type=int, default=256, help="")
    parser.add_argument("--beam_size", type=int, default=4, help="")
    parser.add_argument("--len_pen", type=float, default=0.6, help="")
    parser.add_argument("--max_delta_length", type=int, default=5, help="")
    parser.add_argument("--target_lang", type=str, default=None, help="")
    parser.add_argument("--source_lang", type=str, default=None, help="")
    # shallow fusion specific parameters
    parser.add_argument("--lm_model", type=str, default=None, help="")
    parser.add_argument("--fusion_coef", type=float, default=0.0, help="")

    args = parser.parse_args()
    torch.set_grad_enabled(False)

    if args.model.endswith(".nemo"):
        logging.info("Attempting to initialize from .nemo file")
        model = nemo_nlp.models.machine_translation.MTEncDecModel.restore_from(restore_path=args.model)
        src_text = []
        tgt_text = []
    else:
        raise NotImplementedError(f"Only support .nemo files, but got: {args.model}")

    if torch.cuda.is_available():
        model = model.cuda()

    if args.lm_model is not None:
        # shallow fusion: interpolate NMT scores with an external language model during beam search
        lm_model = nemo_nlp.models.language_modeling.TransformerLMModel.restore_from(restore_path=args.lm_model).eval()
        model.beam_search = BeamSearchSequenceGeneratorWithLanguageModel(
            embedding=model.decoder.embedding,
            decoder=model.decoder.decoder,
            log_softmax=model.log_softmax,
            bos=model.decoder_tokenizer.bos_id,
            pad=model.decoder_tokenizer.pad_id,
            eos=model.decoder_tokenizer.eos_id,
            language_model=lm_model,
            fusion_coef=args.fusion_coef,
            max_sequence_length=model.decoder.max_sequence_length,
            beam_size=args.beam_size,
            len_pen=args.len_pen,
            max_delta_length=args.max_delta_length,
        )
    else:
        model.beam_search = BeamSearchSequenceGenerator(
            embedding=model.decoder.embedding,
            decoder=model.decoder.decoder,
            log_softmax=model.log_softmax,
            bos=model.decoder_tokenizer.bos_id,
            pad=model.decoder_tokenizer.pad_id,
            eos=model.decoder_tokenizer.eos_id,
            max_sequence_length=model.decoder.max_sequence_length,
            beam_size=args.beam_size,
            len_pen=args.len_pen,
            max_delta_length=args.max_delta_length,
        )

    logging.info(f"Translating: {args.srctext}")
    count = 0
    with open(args.srctext, 'r') as src_f:
        for line in src_f:
            src_text.append(line.strip())
            if len(src_text) == args.batch_size:
                res = model.translate(text=src_text, source_lang=args.source_lang, target_lang=args.target_lang)
                if len(res) != len(src_text):
                    # translate() should return one hypothesis per input sentence
                    print(len(res))
                    print(len(src_text))
                    print(res)
                    print(src_text)
                tgt_text += res
                src_text = []
            count += 1
            # if count % 300 == 0:
            #     print(f"Translated {count} sentences")
        if len(src_text) > 0:
            tgt_text += model.translate(text=src_text, source_lang=args.source_lang, target_lang=args.target_lang)

    with open(args.tgtout, 'w') as tgt_f:
        for line in tgt_text:
            tgt_f.write(line + "\n")
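The same restore_from / translate calls used in the script above can also be driven directly from Python. A minimal sketch, assuming a local checkpoint named "model.nemo" (a placeholder) and the usual nemo.collections.nlp import alias:

import torch
import nemo.collections.nlp as nemo_nlp

torch.set_grad_enabled(False)

# "model.nemo" is a placeholder path, not a file shipped with the script
model = nemo_nlp.models.machine_translation.MTEncDecModel.restore_from(restore_path="model.nemo")
if torch.cuda.is_available():
    model = model.cuda()

# translate() takes a list of raw sentences and returns one translation per sentence
translations = model.translate(text=["Hello world."], source_lang="en", target_lang="de")
print(translations)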
def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
    cfg = model_utils.convert_model_config_to_dict_config(cfg)

    # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
    # global_rank and local_rank are set by LightningModule in Lightning 1.2.0
    self.world_size = 1
    if trainer is not None:
        self.world_size = trainer.num_nodes * trainer.num_gpus

    cfg = model_utils.maybe_update_config_version(cfg)

    self.src_language: str = cfg.get("src_language", None)
    self.tgt_language: str = cfg.get("tgt_language", None)

    # Instantiates tokenizers and registers them to be saved with the NeMo Model archive.
    # After this call, there will be self.encoder_tokenizer and self.decoder_tokenizer,
    # which convert between tokens and token_ids for the SRC and TGT languages respectively.
    self.setup_enc_dec_tokenizers(
        encoder_tokenizer_name=cfg.encoder_tokenizer.tokenizer_name,
        encoder_tokenizer_model=cfg.encoder_tokenizer.tokenizer_model,
        encoder_bpe_dropout=cfg.encoder_tokenizer.get('bpe_dropout', 0.0),
        decoder_tokenizer_name=cfg.decoder_tokenizer.tokenizer_name,
        decoder_tokenizer_model=cfg.decoder_tokenizer.tokenizer_model,
        decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0),
    )

    # After this call, the model will have self.source_processor and self.target_processor objects
    self.setup_pre_and_post_processing_utils(source_lang=self.src_language, target_lang=self.tgt_language)

    # TODO: Why is this base constructor call so late in the game?
    super().__init__(cfg=cfg, trainer=trainer)

    # TODO: use get_encoder function with support for HF and Megatron
    self.encoder = TransformerEncoderNM(
        vocab_size=self.encoder_vocab_size,
        hidden_size=cfg.encoder.hidden_size,
        num_layers=cfg.encoder.num_layers,
        inner_size=cfg.encoder.inner_size,
        max_sequence_length=cfg.encoder.max_sequence_length if hasattr(cfg.encoder, 'max_sequence_length') else 512,
        embedding_dropout=cfg.encoder.embedding_dropout if hasattr(cfg.encoder, 'embedding_dropout') else 0.0,
        learn_positional_encodings=cfg.encoder.learn_positional_encodings
        if hasattr(cfg.encoder, 'learn_positional_encodings')
        else False,
        num_attention_heads=cfg.encoder.num_attention_heads,
        ffn_dropout=cfg.encoder.ffn_dropout,
        attn_score_dropout=cfg.encoder.attn_score_dropout,
        attn_layer_dropout=cfg.encoder.attn_layer_dropout,
        hidden_act=cfg.encoder.hidden_act,
        mask_future=cfg.encoder.mask_future,
        pre_ln=cfg.encoder.pre_ln,
    )

    # TODO: use get_decoder function with support for HF and Megatron
    self.decoder = TransformerDecoderNM(
        vocab_size=self.decoder_vocab_size,
        hidden_size=cfg.decoder.hidden_size,
        num_layers=cfg.decoder.num_layers,
        inner_size=cfg.decoder.inner_size,
        max_sequence_length=cfg.decoder.max_sequence_length if hasattr(cfg.decoder, 'max_sequence_length') else 512,
        embedding_dropout=cfg.decoder.embedding_dropout if hasattr(cfg.decoder, 'embedding_dropout') else 0.0,
        learn_positional_encodings=cfg.decoder.learn_positional_encodings
        if hasattr(cfg.decoder, 'learn_positional_encodings')
        else False,
        num_attention_heads=cfg.decoder.num_attention_heads,
        ffn_dropout=cfg.decoder.ffn_dropout,
        attn_score_dropout=cfg.decoder.attn_score_dropout,
        attn_layer_dropout=cfg.decoder.attn_layer_dropout,
        hidden_act=cfg.decoder.hidden_act,
        pre_ln=cfg.decoder.pre_ln,
    )

    self.log_softmax = TokenClassifier(
        hidden_size=self.decoder.hidden_size,
        num_classes=self.decoder_vocab_size,
        activation=cfg.head.activation,
        log_softmax=cfg.head.log_softmax,
        dropout=cfg.head.dropout,
        use_transformer_init=cfg.head.use_transformer_init,
    )

    self.beam_search = BeamSearchSequenceGenerator(
        embedding=self.decoder.embedding,
        decoder=self.decoder.decoder,
        log_softmax=self.log_softmax,
        max_sequence_length=self.decoder.max_sequence_length,
        beam_size=cfg.beam_size,
        bos=self.decoder_tokenizer.bos_id,
        pad=self.decoder_tokenizer.pad_id,
        eos=self.decoder_tokenizer.eos_id,
        len_pen=cfg.len_pen,
        max_delta_length=cfg.max_generation_delta,
    )

    # tie weights of embedding and softmax matrices
    self.log_softmax.mlp.layer0.weight = self.decoder.embedding.token_embedding.weight

    # TODO: encoder and decoder with different hidden size?
    std_init_range = 1 / self.encoder.hidden_size ** 0.5
    self.apply(lambda module: transformer_weights_init(module, std_init_range))

    self.loss_fn = SmoothedCrossEntropyLoss(
        pad_id=self.decoder_tokenizer.pad_id, label_smoothing=cfg.label_smoothing
    )
    self.eval_loss = GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True)
def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
    cfg = model_utils.convert_model_config_to_dict_config(cfg)

    # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
    # global_rank and local_rank are set by LightningModule in Lightning 1.2.0
    self.world_size = 1
    if trainer is not None:
        self.world_size = trainer.num_nodes * trainer.num_gpus

    cfg = model_utils.maybe_update_config_version(cfg)

    self.src_language = cfg.get("src_language", None)
    self.tgt_language = cfg.get("tgt_language", None)

    self.multilingual = cfg.get("multilingual", False)
    self.multilingual_ids = []

    self.encoder_tokenizer_library = cfg.encoder_tokenizer.get('library', 'yttm')
    self.decoder_tokenizer_library = cfg.decoder_tokenizer.get('library', 'yttm')

    # Instantiates tokenizers and registers them to be saved with the NeMo Model archive.
    # After this call, there will be self.encoder_tokenizer and self.decoder_tokenizer,
    # which convert between tokens and token_ids for the SRC and TGT languages respectively.
    self.setup_enc_dec_tokenizers(
        encoder_tokenizer_library=self.encoder_tokenizer_library,
        encoder_tokenizer_model=cfg.encoder_tokenizer.get('tokenizer_model'),
        encoder_bpe_dropout=cfg.encoder_tokenizer.get('bpe_dropout', 0.0)
        if cfg.encoder_tokenizer.get('bpe_dropout', 0.0) is not None
        else 0.0,
        encoder_model_name=cfg.encoder.get('model_name') if hasattr(cfg.encoder, 'model_name') else None,
        encoder_r2l=cfg.encoder_tokenizer.get('r2l', False),
        decoder_tokenizer_library=self.decoder_tokenizer_library,
        encoder_tokenizer_vocab_file=cfg.encoder_tokenizer.get('vocab_file', None),
        decoder_tokenizer_model=cfg.decoder_tokenizer.tokenizer_model,
        decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0)
        if cfg.decoder_tokenizer.get('bpe_dropout', 0.0) is not None
        else 0.0,
        decoder_model_name=cfg.decoder.get('model_name') if hasattr(cfg.decoder, 'model_name') else None,
        decoder_r2l=cfg.decoder_tokenizer.get('r2l', False),
    )

    if self.multilingual:
        if isinstance(self.src_language, ListConfig) and isinstance(self.tgt_language, ListConfig):
            raise ValueError(
                "cfg.src_language and cfg.tgt_language cannot both be lists. We only support many-to-one or one-to-many multilingual models."
            )
        elif isinstance(self.src_language, ListConfig):
            for lng in self.src_language:
                self.multilingual_ids.append(self.encoder_tokenizer.token_to_id("<" + lng + ">"))
        elif isinstance(self.tgt_language, ListConfig):
            for lng in self.tgt_language:
                self.multilingual_ids.append(self.encoder_tokenizer.token_to_id("<" + lng + ">"))
        else:
            raise ValueError(
                "Expect either cfg.src_language or cfg.tgt_language to be a list when multilingual=True."
            )

        if isinstance(self.src_language, ListConfig):
            self.tgt_language = [self.tgt_language] * len(self.src_language)
        else:
            self.src_language = [self.src_language] * len(self.tgt_language)

        self.source_processor_list = []
        self.target_processor_list = []
        for src_lng, tgt_lng in zip(self.src_language, self.tgt_language):
            src_prcsr, tgt_prscr = self.setup_pre_and_post_processing_utils(src_lng, tgt_lng)
            self.source_processor_list.append(src_prcsr)
            self.target_processor_list.append(tgt_prscr)
    else:
        # After this call, the model will have self.source_processor and self.target_processor objects
        self.setup_pre_and_post_processing_utils(self.src_language, self.tgt_language)
        self.multilingual_ids = [None]

    # TODO: Why is this base constructor call so late in the game?
    super().__init__(cfg=cfg, trainer=trainer)

    # encoder from NeMo, Megatron-LM, or HuggingFace
    encoder_cfg_dict = OmegaConf.to_container(cfg.get('encoder'))
    encoder_cfg_dict['vocab_size'] = self.encoder_vocab_size
    library = encoder_cfg_dict.pop('library', 'nemo')
    model_name = encoder_cfg_dict.pop('model_name', None)
    pretrained = encoder_cfg_dict.pop('pretrained', False)
    checkpoint_file = encoder_cfg_dict.pop('checkpoint_file', None)
    self.encoder = get_transformer(
        library=library,
        model_name=model_name,
        pretrained=pretrained,
        config_dict=encoder_cfg_dict,
        encoder=True,
        pre_ln_final_layer_norm=encoder_cfg_dict.get('pre_ln_final_layer_norm', False),
        checkpoint_file=checkpoint_file,
    )

    # decoder from NeMo, Megatron-LM, or HuggingFace
    decoder_cfg_dict = OmegaConf.to_container(cfg.get('decoder'))
    decoder_cfg_dict['vocab_size'] = self.decoder_vocab_size
    library = decoder_cfg_dict.pop('library', 'nemo')
    model_name = decoder_cfg_dict.pop('model_name', None)
    pretrained = decoder_cfg_dict.pop('pretrained', False)
    decoder_cfg_dict['hidden_size'] = self.encoder.hidden_size
    self.decoder = get_transformer(
        library=library,
        model_name=model_name,
        pretrained=pretrained,
        config_dict=decoder_cfg_dict,
        encoder=False,
        pre_ln_final_layer_norm=decoder_cfg_dict.get('pre_ln_final_layer_norm', False),
    )

    self.log_softmax = TokenClassifier(
        hidden_size=self.decoder.hidden_size,
        num_classes=self.decoder_vocab_size,
        activation=cfg.head.activation,
        log_softmax=cfg.head.log_softmax,
        dropout=cfg.head.dropout,
        use_transformer_init=cfg.head.use_transformer_init,
    )

    self.beam_search = BeamSearchSequenceGenerator(
        embedding=self.decoder.embedding,
        decoder=self.decoder.decoder,
        log_softmax=self.log_softmax,
        max_sequence_length=self.decoder.max_sequence_length,
        beam_size=cfg.beam_size,
        bos=self.decoder_tokenizer.bos_id,
        pad=self.decoder_tokenizer.pad_id,
        eos=self.decoder_tokenizer.eos_id,
        len_pen=cfg.len_pen,
        max_delta_length=cfg.max_generation_delta,
    )

    # tie weights of embedding and softmax matrices
    self.log_softmax.mlp.layer0.weight = self.decoder.embedding.token_embedding.weight

    # TODO: encoder and decoder with different hidden size?
    std_init_range = 1 / self.encoder.hidden_size ** 0.5

    # initialize weights if not using pretrained encoder/decoder
    if not self._cfg.encoder.get('pretrained', False):
        self.encoder.apply(lambda module: transformer_weights_init(module, std_init_range))
    if not self._cfg.decoder.get('pretrained', False):
        self.decoder.apply(lambda module: transformer_weights_init(module, std_init_range))
    self.log_softmax.apply(lambda module: transformer_weights_init(module, std_init_range))

    self.loss_fn = SmoothedCrossEntropyLoss(
        pad_id=self.decoder_tokenizer.pad_id, label_smoothing=cfg.label_smoothing
    )
    self.eval_loss_fn = NLLLoss(ignore_index=self.decoder_tokenizer.pad_id)
def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
    cfg = model_utils.convert_model_config_to_dict_config(cfg)

    # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
    self.global_rank = 0
    self.world_size = 1
    if trainer is not None:
        self.global_rank = (trainer.node_rank * trainer.num_gpus) + trainer.local_rank
        self.world_size = trainer.num_nodes * trainer.num_gpus

    cfg = model_utils.maybe_update_config_version(cfg)

    self.setup_enc_dec_tokenizers(cfg)

    super().__init__(cfg=cfg, trainer=trainer)

    # TODO: use get_encoder function with support for HF and Megatron
    self.encoder = TransformerEncoderNM(
        vocab_size=self.encoder_vocab_size,
        hidden_size=cfg.encoder.hidden_size,
        num_layers=cfg.encoder.num_layers,
        inner_size=cfg.encoder.inner_size,
        max_sequence_length=cfg.encoder.max_sequence_length if hasattr(cfg.encoder, 'max_sequence_length') else 512,
        embedding_dropout=cfg.encoder.embedding_dropout if hasattr(cfg.encoder, 'embedding_dropout') else 0.0,
        learn_positional_encodings=cfg.encoder.learn_positional_encodings
        if hasattr(cfg.encoder, 'learn_positional_encodings')
        else False,
        num_attention_heads=cfg.encoder.num_attention_heads,
        ffn_dropout=cfg.encoder.ffn_dropout,
        attn_score_dropout=cfg.encoder.attn_score_dropout,
        attn_layer_dropout=cfg.encoder.attn_layer_dropout,
        hidden_act=cfg.encoder.hidden_act,
        mask_future=cfg.encoder.mask_future,
        pre_ln=cfg.encoder.pre_ln,
    )

    # TODO: use get_decoder function with support for HF and Megatron
    self.decoder = TransformerDecoderNM(
        vocab_size=self.decoder_vocab_size,
        hidden_size=cfg.decoder.hidden_size,
        num_layers=cfg.decoder.num_layers,
        inner_size=cfg.decoder.inner_size,
        max_sequence_length=cfg.decoder.max_sequence_length if hasattr(cfg.decoder, 'max_sequence_length') else 512,
        embedding_dropout=cfg.decoder.embedding_dropout if hasattr(cfg.decoder, 'embedding_dropout') else 0.0,
        learn_positional_encodings=cfg.decoder.learn_positional_encodings
        if hasattr(cfg.decoder, 'learn_positional_encodings')
        else False,
        num_attention_heads=cfg.decoder.num_attention_heads,
        ffn_dropout=cfg.decoder.ffn_dropout,
        attn_score_dropout=cfg.decoder.attn_score_dropout,
        attn_layer_dropout=cfg.decoder.attn_layer_dropout,
        hidden_act=cfg.decoder.hidden_act,
        pre_ln=cfg.decoder.pre_ln,
    )

    self.log_softmax = TokenClassifier(
        hidden_size=self.decoder.hidden_size,
        num_classes=self.decoder_vocab_size,
        activation=cfg.head.activation,
        log_softmax=cfg.head.log_softmax,
        dropout=cfg.head.dropout,
        use_transformer_init=cfg.head.use_transformer_init,
    )

    self.beam_search = BeamSearchSequenceGenerator(
        embedding=self.decoder.embedding,
        decoder=self.decoder.decoder,
        log_softmax=self.log_softmax,
        max_sequence_length=self.decoder.max_sequence_length,
        beam_size=cfg.beam_size,
        bos=self.decoder_tokenizer.bos_id,
        pad=self.decoder_tokenizer.pad_id,
        eos=self.decoder_tokenizer.eos_id,
        len_pen=cfg.len_pen,
        max_delta_length=cfg.max_generation_delta,
    )

    # tie weights of embedding and softmax matrices
    self.log_softmax.mlp.layer0.weight = self.decoder.embedding.token_embedding.weight

    # TODO: encoder and decoder with different hidden size?
    std_init_range = 1 / self.encoder.hidden_size ** 0.5
    self.apply(lambda module: transformer_weights_init(module, std_init_range))

    self.loss_fn = SmoothedCrossEntropyLoss(
        pad_id=self.decoder_tokenizer.pad_id, label_smoothing=cfg.label_smoothing
    )
    self.eval_loss = GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True)
def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="Path to .nemo model file(s). If ensembling, provide comma separated paths to multiple models.",
    )
    parser.add_argument("--srctext", type=str, required=True, help="Path to the file to translate.")
    parser.add_argument(
        "--tgtout", type=str, required=True, help="Path to the file where translations are to be written."
    )
    parser.add_argument(
        "--batch_size", type=int, default=256, help="Number of sentences to batch together while translating."
    )
    parser.add_argument("--beam_size", type=int, default=4, help="Beam size.")
    parser.add_argument(
        "--len_pen", type=float, default=0.6, help="Length Penalty. Ref: https://arxiv.org/abs/1609.08144"
    )
    parser.add_argument(
        "--max_delta_length",
        type=int,
        default=5,
        help="Stop generating if target sequence length exceeds source length by this number.",
    )
    parser.add_argument(
        "--target_lang", type=str, default=None, help="Target language identifier ex: en,de,fr,es etc."
    )
    parser.add_argument(
        "--source_lang", type=str, default=None, help="Source language identifier ex: en,de,fr,es etc."
    )
    parser.add_argument(
        "--write_scores",
        action="store_true",
        help="Whether to write a separate file with scores not including length penalties corresponding to each beam hypothesis (.score suffix)",
    )
    # shallow fusion specific parameters
    parser.add_argument(
        "--lm_model",
        type=str,
        default=None,
        help="Optional path to an LM model that has the same tokenizer as NMT models for shallow fusion. Note: If using --write_scores, it will add LM scores as well.",
    )
    parser.add_argument(
        "--fusion_coef", type=float, default=0.07, help="Weight assigned to LM scores during shallow fusion."
    )

    args = parser.parse_args()
    torch.set_grad_enabled(False)

    logging.info("Attempting to initialize from .nemo file")
    models = []
    for model_path in args.model.split(','):
        if not model_path.endswith('.nemo'):
            raise NotImplementedError(f"Only support .nemo files, but got: {model_path}")
        model = nemo_nlp.models.machine_translation.MTEncDecModel.restore_from(restore_path=model_path).eval()
        models.append(model)

    src_text = []
    tgt_text = []
    tgt_text_all = []
    src_texts = []
    all_scores = []

    if torch.cuda.is_available():
        models = [model.cuda() for model in models]

    if args.lm_model is not None:
        lm_model = nemo_nlp.models.language_modeling.TransformerLMModel.restore_from(restore_path=args.lm_model).eval()
    else:
        lm_model = None

    if len(models) > 1:
        # ensemble decoding: combine scores from several models inside a single beam search
        ensemble_generator = EnsembleBeamSearchSequenceGenerator(
            encoders=[model.encoder for model in models],
            embeddings=[model.decoder.embedding for model in models],
            decoders=[model.decoder.decoder for model in models],
            log_softmaxes=[model.log_softmax for model in models],
            max_sequence_length=512,
            beam_size=args.beam_size,
            bos=models[0].decoder_tokenizer.bos_id,
            pad=models[0].decoder_tokenizer.pad_id,
            eos=models[0].decoder_tokenizer.eos_id,
            len_pen=args.len_pen,
            max_delta_length=args.max_delta_length,
            language_model=lm_model,
            fusion_coef=args.fusion_coef,
        )
    else:
        model = models[0]
        if lm_model is not None:
            model.beam_search = BeamSearchSequenceGeneratorWithLanguageModel(
                embedding=model.decoder.embedding,
                decoder=model.decoder.decoder,
                log_softmax=model.log_softmax,
                bos=model.decoder_tokenizer.bos_id,
                pad=model.decoder_tokenizer.pad_id,
                eos=model.decoder_tokenizer.eos_id,
                language_model=lm_model,
                fusion_coef=args.fusion_coef,
                max_sequence_length=model.decoder.max_sequence_length,
                beam_size=args.beam_size,
                len_pen=args.len_pen,
                max_delta_length=args.max_delta_length,
            )
        else:
            model.beam_search = BeamSearchSequenceGenerator(
                embedding=model.decoder.embedding,
                decoder=model.decoder.decoder,
                log_softmax=model.log_softmax,
                bos=model.decoder_tokenizer.bos_id,
                pad=model.decoder_tokenizer.pad_id,
                eos=model.decoder_tokenizer.eos_id,
                max_sequence_length=model.decoder.max_sequence_length,
                beam_size=args.beam_size,
                len_pen=args.len_pen,
                max_delta_length=args.max_delta_length,
            )

    logging.info(f"Translating: {args.srctext}")
    count = 0
    with open(args.srctext, 'r') as src_f:
        for line in src_f:
            src_text.append(line.strip())
            if len(src_text) == args.batch_size:
                if len(models) > 1:
                    src_ids, src_mask = models[0].prepare_inference_batch(src_text)
                    best_translations = ensemble_generator(src_ids, src_mask, return_beam_scores=args.write_scores)
                    if args.write_scores:
                        all_results, scores, best_translations = (
                            best_translations[0],
                            best_translations[1],
                            best_translations[2],
                        )
                        scores = scores.view(-1).data.cpu().numpy().tolist()
                        all_scores += scores
                        src_texts += [item for item in src_text for i in range(args.beam_size)]
                        all_results = models[0].ids_to_postprocessed_text(
                            all_results, models[0].decoder_tokenizer, models[0].target_processor
                        )
                        tgt_text_all += all_results
                    best_translations = models[0].ids_to_postprocessed_text(
                        best_translations, models[0].decoder_tokenizer, models[0].target_processor
                    )
                    tgt_text += best_translations
                else:
                    best_translations = model.translate(
                        text=src_text,
                        source_lang=args.source_lang,
                        target_lang=args.target_lang,
                        return_beam_scores=args.write_scores,
                    )
                    if args.write_scores:
                        all_results, scores, best_translations = (
                            best_translations[0],
                            best_translations[1],
                            best_translations[2],
                        )
                        all_scores += scores
                        src_texts += [item for item in src_text for i in range(args.beam_size)]
                        tgt_text_all += all_results
                    tgt_text += best_translations
                src_text = []
                print(f"Translated {count + 1} sentences")
            count += 1
        if len(src_text) > 0:
            if len(models) > 1:
                src_ids, src_mask = models[0].prepare_inference_batch(src_text)
                best_translations = ensemble_generator(src_ids, src_mask, return_beam_scores=args.write_scores)
                if args.write_scores:
                    all_results, scores, best_translations = (
                        best_translations[0],
                        best_translations[1],
                        best_translations[2],
                    )
                    scores = scores.view(-1).data.cpu().numpy().tolist()
                    all_scores += scores
                    src_texts += [item for item in src_text for i in range(args.beam_size)]
                    all_results = models[0].ids_to_postprocessed_text(
                        all_results, models[0].decoder_tokenizer, models[0].target_processor
                    )
                    tgt_text_all += all_results
                best_translations = models[0].ids_to_postprocessed_text(
                    best_translations, models[0].decoder_tokenizer, models[0].target_processor
                )
                tgt_text += best_translations
            else:
                best_translations = model.translate(
                    text=src_text,
                    source_lang=args.source_lang,
                    target_lang=args.target_lang,
                    return_beam_scores=args.write_scores,
                )
                if args.write_scores:
                    all_results, scores, best_translations = (
                        best_translations[0],
                        best_translations[1],
                        best_translations[2],
                    )
                    all_scores += scores
                    src_texts += [item for item in src_text for i in range(args.beam_size)]
                    tgt_text_all += all_results
                tgt_text += best_translations
            src_text = []
            print(f"Translated {count} sentences")

    with open(args.tgtout, 'w') as tgt_f:
        for line in tgt_text:
            tgt_f.write(line + "\n")

    if args.write_scores:
        with open(args.tgtout + '.score', 'w') as tgt_f_scores:
            for line, score, inp in zip(tgt_text_all, all_scores, src_texts):
                tgt_f_scores.write(inp + "\t" + line + "\t" + str(score) + "\n")
def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
    cfg = model_utils.convert_model_config_to_dict_config(cfg)

    # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
    # global_rank and local_rank are set by LightningModule in Lightning 1.2.0
    self.world_size = 1
    if trainer is not None:
        self.world_size = trainer.num_nodes * trainer.num_gpus

    cfg = model_utils.maybe_update_config_version(cfg)

    self.src_language: str = cfg.get("src_language", None)
    self.tgt_language: str = cfg.get("tgt_language", None)

    # Instantiates tokenizers and registers them to be saved with the NeMo Model archive.
    # After this call, there will be self.encoder_tokenizer and self.decoder_tokenizer,
    # which convert between tokens and token_ids for the SRC and TGT languages respectively.
    self.setup_enc_dec_tokenizers(
        encoder_tokenizer_library=cfg.encoder_tokenizer.get('library', 'yttm'),
        encoder_tokenizer_model=cfg.encoder_tokenizer.get('tokenizer_model'),
        encoder_bpe_dropout=cfg.encoder_tokenizer.get('bpe_dropout', 0.0),
        encoder_model_name=cfg.encoder.get('model_name') if hasattr(cfg.encoder, 'model_name') else None,
        decoder_tokenizer_library=cfg.decoder_tokenizer.get('library', 'yttm'),
        decoder_tokenizer_model=cfg.decoder_tokenizer.tokenizer_model,
        decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0),
        decoder_model_name=cfg.decoder.get('model_name') if hasattr(cfg.decoder, 'model_name') else None,
    )

    # After this call, the model will have self.source_processor and self.target_processor objects
    self.setup_pre_and_post_processing_utils(source_lang=self.src_language, target_lang=self.tgt_language)

    # TODO: Why is this base constructor call so late in the game?
    super().__init__(cfg=cfg, trainer=trainer)

    # encoder from NeMo, Megatron-LM, or HuggingFace
    encoder_cfg_dict = OmegaConf.to_container(cfg.get('encoder'))
    encoder_cfg_dict['vocab_size'] = self.encoder_vocab_size
    library = encoder_cfg_dict.pop('library', 'nemo')
    model_name = encoder_cfg_dict.pop('model_name', None)
    pretrained = encoder_cfg_dict.pop('pretrained', False)
    self.encoder = get_transformer(
        library=library,
        model_name=model_name,
        pretrained=pretrained,
        config_dict=encoder_cfg_dict,
        encoder=True,
    )

    # decoder from NeMo, Megatron-LM, or HuggingFace
    decoder_cfg_dict = OmegaConf.to_container(cfg.get('decoder'))
    decoder_cfg_dict['vocab_size'] = self.decoder_vocab_size
    library = decoder_cfg_dict.pop('library', 'nemo')
    model_name = decoder_cfg_dict.pop('model_name', None)
    pretrained = decoder_cfg_dict.pop('pretrained', False)
    decoder_cfg_dict['hidden_size'] = self.encoder.hidden_size
    self.decoder = get_transformer(
        library=library,
        model_name=model_name,
        pretrained=pretrained,
        config_dict=decoder_cfg_dict,
        encoder=False,
    )

    self.log_softmax = TokenClassifier(
        hidden_size=self.decoder.hidden_size,
        num_classes=self.decoder_vocab_size,
        activation=cfg.head.activation,
        log_softmax=cfg.head.log_softmax,
        dropout=cfg.head.dropout,
        use_transformer_init=cfg.head.use_transformer_init,
    )

    self.beam_search = BeamSearchSequenceGenerator(
        embedding=self.decoder.embedding,
        decoder=self.decoder.decoder,
        log_softmax=self.log_softmax,
        max_sequence_length=self.decoder.max_sequence_length,
        beam_size=cfg.beam_size,
        bos=self.decoder_tokenizer.bos_id,
        pad=self.decoder_tokenizer.pad_id,
        eos=self.decoder_tokenizer.eos_id,
        len_pen=cfg.len_pen,
        max_delta_length=cfg.max_generation_delta,
    )

    # tie weights of embedding and softmax matrices
    self.log_softmax.mlp.layer0.weight = self.decoder.embedding.token_embedding.weight

    # TODO: encoder and decoder with different hidden size?
    std_init_range = 1 / self.encoder.hidden_size ** 0.5

    # initialize weights if not using pretrained encoder/decoder
    if not self._cfg.encoder.get('pretrained', False):
        self.encoder.apply(lambda module: transformer_weights_init(module, std_init_range))
    if not self._cfg.decoder.get('pretrained', False):
        self.decoder.apply(lambda module: transformer_weights_init(module, std_init_range))
    self.log_softmax.apply(lambda module: transformer_weights_init(module, std_init_range))

    self.loss_fn = SmoothedCrossEntropyLoss(
        pad_id=self.decoder_tokenizer.pad_id, label_smoothing=cfg.label_smoothing
    )
    self.eval_loss = GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True)
def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="Path to .nemo model file(s). If ensembling, provide comma separated paths to multiple models.",
    )
    parser.add_argument("--srctext", type=str, required=True, help="Path to the file to translate.")
    parser.add_argument(
        "--tgtout", type=str, required=True, help="Path to the file where translations are to be written."
    )
    parser.add_argument(
        "--batch_size", type=int, default=256, help="Number of sentences to batch together while translating."
    )
    parser.add_argument("--beam_size", type=int, default=4, help="Beam size.")
    parser.add_argument(
        "--len_pen", type=float, default=0.6, help="Length Penalty. Ref: https://arxiv.org/abs/1609.08144"
    )
    parser.add_argument(
        "--max_delta_length",
        type=int,
        default=5,
        help="Stop generating if target sequence length exceeds source length by this number.",
    )
    parser.add_argument(
        "--target_lang", type=str, default=None, help="Target language identifier ex: en,de,fr,es etc."
    )
    parser.add_argument(
        "--source_lang", type=str, default=None, help="Source language identifier ex: en,de,fr,es etc."
    )
    parser.add_argument(
        "--write_scores",
        action="store_true",
        help="Whether to write a separate file with scores not including length penalties corresponding to each beam hypothesis (.score suffix)",
    )
    parser.add_argument(
        "--write_timing",
        action="store_true",
        help="Whether to write a separate file with detailed timing info (.timing.json suffix)",
    )
    # shallow fusion specific parameters
    parser.add_argument(
        "--lm_model",
        type=str,
        default=None,
        help="Optional path to an LM model that has the same tokenizer as NMT models for shallow fusion. Note: If using --write_scores, it will add LM scores as well.",
    )
    parser.add_argument(
        "--fusion_coef", type=float, default=0.07, help="Weight assigned to LM scores during shallow fusion."
    )

    args = parser.parse_args()
    torch.set_grad_enabled(False)

    logging.info("Attempting to initialize from .nemo file")
    models = []
    for model_path in args.model.split(','):
        if not model_path.endswith('.nemo'):
            raise NotImplementedError(f"Only support .nemo files, but got: {model_path}")
        model = nemo_nlp.models.machine_translation.MTEncDecModel.restore_from(restore_path=model_path).eval()
        models.append(model)

    if (len(models) > 1) and (args.write_timing):
        raise RuntimeError("Cannot measure timing when more than 1 model is used")

    src_text = []
    tgt_text = []
    tgt_text_all = []
    src_texts = []
    all_scores = []
    all_timing = []

    if torch.cuda.is_available():
        models = [model.cuda() for model in models]

    if args.lm_model is not None:
        lm_model = nemo_nlp.models.language_modeling.TransformerLMModel.restore_from(restore_path=args.lm_model).eval()
    else:
        lm_model = None

    if len(models) > 1:
        ensemble_generator = EnsembleBeamSearchSequenceGenerator(
            encoders=[model.encoder for model in models],
            embeddings=[model.decoder.embedding for model in models],
            decoders=[model.decoder.decoder for model in models],
            log_softmaxes=[model.log_softmax for model in models],
            max_sequence_length=512,
            beam_size=args.beam_size,
            bos=models[0].decoder_tokenizer.bos_id,
            pad=models[0].decoder_tokenizer.pad_id,
            eos=models[0].decoder_tokenizer.eos_id,
            len_pen=args.len_pen,
            max_delta_length=args.max_delta_length,
            language_model=lm_model,
            fusion_coef=args.fusion_coef,
        )
    else:
        model = models[0]
        ensemble_generator = None
        if lm_model is not None:
            model.beam_search = BeamSearchSequenceGeneratorWithLanguageModel(
                embedding=model.decoder.embedding,
                decoder=model.decoder.decoder,
                log_softmax=model.log_softmax,
                bos=model.decoder_tokenizer.bos_id,
                pad=model.decoder_tokenizer.pad_id,
                eos=model.decoder_tokenizer.eos_id,
                language_model=lm_model,
                fusion_coef=args.fusion_coef,
                max_sequence_length=model.decoder.max_sequence_length,
                beam_size=args.beam_size,
                len_pen=args.len_pen,
                max_delta_length=args.max_delta_length,
            )
        else:
            model.beam_search = BeamSearchSequenceGenerator(
                embedding=model.decoder.embedding,
                decoder=model.decoder.decoder,
                log_softmax=model.log_softmax,
                bos=model.decoder_tokenizer.bos_id,
                pad=model.decoder_tokenizer.pad_id,
                eos=model.decoder_tokenizer.eos_id,
                max_sequence_length=model.decoder.max_sequence_length,
                beam_size=args.beam_size,
                len_pen=args.len_pen,
                max_delta_length=args.max_delta_length,
            )

    logging.info(f"Translating: {args.srctext}")
    with open(args.srctext, 'r') as src_f:
        for line in src_f:
            src_text.append(line.strip())
            if len(src_text) == args.batch_size:
                # warmup when measuring timing
                # translate_text is defined elsewhere in this script; it translates one batch
                # and appends results to the lists passed in.
                if not all_timing:
                    print("running a warmup batch")
                    translate_text(
                        models=models,
                        args=args,
                        src_text=src_text,
                        tgt_text=[],
                        tgt_text_all=[],
                        src_texts=[],
                        all_scores=[],
                        all_timing=[],
                        ensemble_generator=ensemble_generator,
                    )
                translate_text(
                    models=models,
                    args=args,
                    src_text=src_text,
                    tgt_text=tgt_text,
                    tgt_text_all=tgt_text_all,
                    src_texts=src_texts,
                    all_scores=all_scores,
                    all_timing=all_timing,
                    ensemble_generator=ensemble_generator,
                )
                src_text = []
        if len(src_text) > 0:
            translate_text(
                models=models,
                args=args,
                src_text=src_text,
                tgt_text=tgt_text,
                tgt_text_all=tgt_text_all,
                src_texts=src_texts,
                all_scores=all_scores,
                all_timing=all_timing,
                ensemble_generator=ensemble_generator,
            )

    with open(args.tgtout, 'w') as tgt_f:
        for line in tgt_text:
            tgt_f.write(line + "\n")

    if args.write_scores:
        with open(args.tgtout + '.score', 'w') as tgt_f_scores:
            for line, score, inp in zip(tgt_text_all, all_scores, src_texts):
                tgt_f_scores.write(inp + "\t" + line + "\t" + str(score) + "\n")

    if args.write_timing:
        # collect list of dicts to a dict of lists
        timing_dict = {}
        if len(all_timing):
            for k in all_timing[0].keys():
                timing_dict[k] = [t[k] for t in all_timing]
        with open(args.tgtout + '.timing.json', 'w') as timing_fh:
            json.dump(timing_dict, timing_fh)