Example #1
# Imports assumed to follow NeMo's standard package layout for this inference script.
from argparse import ArgumentParser

import torch

import nemo.collections.nlp as nemo_nlp
from nemo.collections.nlp.modules.common.transformer import (
    BeamSearchSequenceGenerator,
    BeamSearchSequenceGeneratorWithLanguageModel,
)
from nemo.utils import logging


def main():
    parser = ArgumentParser()
    parser.add_argument("--model", type=str, required=True, help="")
    parser.add_argument("--srctext", type=str, required=True, help="")
    parser.add_argument("--tgtout", type=str, required=True, help="")
    parser.add_argument("--batch_size", type=int, default=256, help="")
    parser.add_argument("--beam_size", type=int, default=4, help="")
    parser.add_argument("--len_pen", type=float, default=0.6, help="")
    parser.add_argument("--max_delta_length", type=int, default=5, help="")
    parser.add_argument("--target_lang", type=str, default=None, help="")
    parser.add_argument("--source_lang", type=str, default=None, help="")
    # shallow fusion specific parameters
    parser.add_argument("--lm_model", type=str, default=None, help="")
    parser.add_argument("--fusion_coef", type=float, default=0.0, help="")

    args = parser.parse_args()
    torch.set_grad_enabled(False)
    if args.model.endswith(".nemo"):
        logging.info("Attempting to initialize from .nemo file")
        model = nemo_nlp.models.machine_translation.MTEncDecModel.restore_from(restore_path=args.model)
        src_text = []
        tgt_text = []
    else:
        raise NotImplemented(f"Only support .nemo files, but got: {args.model}")

    if torch.cuda.is_available():
        model = model.cuda()

    if args.lm_model is not None:
        lm_model = nemo_nlp.models.language_modeling.TransformerLMModel.restore_from(restore_path=args.lm_model).eval()
        model.beam_search = BeamSearchSequenceGeneratorWithLanguageModel(
            embedding=model.decoder.embedding,
            decoder=model.decoder.decoder,
            log_softmax=model.log_softmax,
            bos=model.decoder_tokenizer.bos_id,
            pad=model.decoder_tokenizer.pad_id,
            eos=model.decoder_tokenizer.eos_id,
            language_model=lm_model,
            fusion_coef=args.fusion_coef,
            max_sequence_length=model.decoder.max_sequence_length,
            beam_size=args.beam_size,
            len_pen=args.len_pen,
            max_delta_length=args.max_delta_length,
        )
    else:
        model.beam_search = BeamSearchSequenceGenerator(
            embedding=model.decoder.embedding,
            decoder=model.decoder.decoder,
            log_softmax=model.log_softmax,
            bos=model.decoder_tokenizer.bos_id,
            pad=model.decoder_tokenizer.pad_id,
            eos=model.decoder_tokenizer.eos_id,
            max_sequence_length=model.decoder.max_sequence_length,
            beam_size=args.beam_size,
            len_pen=args.len_pen,
            max_delta_length=args.max_delta_length,
        )

    logging.info(f"Translating: {args.srctext}")

    count = 0
    with open(args.srctext, 'r') as src_f:
        for line in src_f:
            src_text.append(line.strip())
            if len(src_text) == args.batch_size:
                res = model.translate(text=src_text, source_lang=args.source_lang, target_lang=args.target_lang)
                if len(res) != len(src_text):
                    print(len(res))
                    print(len(src_text))
                    print(res)
                    print(src_text)
                tgt_text += res
                src_text = []
            count += 1
            # if count % 300 == 0:
            #    print(f"Translated {count} sentences")
        if len(src_text) > 0:
            tgt_text += model.translate(text=src_text, source_lang=args.source_lang, target_lang=args.target_lang)

    with open(args.tgtout, 'w') as tgt_f:
        for line in tgt_text:
            tgt_f.write(line + "\n")
Example #2
    def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
        cfg = model_utils.convert_model_config_to_dict_config(cfg)
        # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
        # global_rank and local_rank are set by LightningModule in Lightning 1.2.0

        self.world_size = 1
        if trainer is not None:
            self.world_size = trainer.num_nodes * trainer.num_gpus

        cfg = model_utils.maybe_update_config_version(cfg)

        self.src_language: str = cfg.get("src_language", None)
        self.tgt_language: str = cfg.get("tgt_language", None)

        # Instantiate tokenizers and register them to be saved with the NeMo model archive.
        # After this call, there will be self.encoder_tokenizer and self.decoder_tokenizer,
        # which can convert between tokens and token_ids for the SRC and TGT languages respectively.
        self.setup_enc_dec_tokenizers(
            encoder_tokenizer_name=cfg.encoder_tokenizer.tokenizer_name,
            encoder_tokenizer_model=cfg.encoder_tokenizer.tokenizer_model,
            encoder_bpe_dropout=cfg.encoder_tokenizer.get('bpe_dropout', 0.0),
            decoder_tokenizer_name=cfg.decoder_tokenizer.tokenizer_name,
            decoder_tokenizer_model=cfg.decoder_tokenizer.tokenizer_model,
            decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0),
        )

        # After this call, the model will have self.source_processor and self.target_processor objects
        self.setup_pre_and_post_processing_utils(source_lang=self.src_language,
                                                 target_lang=self.tgt_language)

        # TODO: Why is this base constructor call so late in the game?
        super().__init__(cfg=cfg, trainer=trainer)

        # TODO: use get_encoder function with support for HF and Megatron
        self.encoder = TransformerEncoderNM(
            vocab_size=self.encoder_vocab_size,
            hidden_size=cfg.encoder.hidden_size,
            num_layers=cfg.encoder.num_layers,
            inner_size=cfg.encoder.inner_size,
            max_sequence_length=cfg.encoder.max_sequence_length if hasattr(
                cfg.encoder, 'max_sequence_length') else 512,
            embedding_dropout=cfg.encoder.embedding_dropout if hasattr(
                cfg.encoder, 'embedding_dropout') else 0.0,
            learn_positional_encodings=cfg.encoder.learn_positional_encodings
            if hasattr(cfg.encoder, 'learn_positional_encodings') else False,
            num_attention_heads=cfg.encoder.num_attention_heads,
            ffn_dropout=cfg.encoder.ffn_dropout,
            attn_score_dropout=cfg.encoder.attn_score_dropout,
            attn_layer_dropout=cfg.encoder.attn_layer_dropout,
            hidden_act=cfg.encoder.hidden_act,
            mask_future=cfg.encoder.mask_future,
            pre_ln=cfg.encoder.pre_ln,
        )

        # TODO: use get_decoder function with support for HF and Megatron
        self.decoder = TransformerDecoderNM(
            vocab_size=self.decoder_vocab_size,
            hidden_size=cfg.decoder.hidden_size,
            num_layers=cfg.decoder.num_layers,
            inner_size=cfg.decoder.inner_size,
            max_sequence_length=cfg.decoder.max_sequence_length if hasattr(
                cfg.decoder, 'max_sequence_length') else 512,
            embedding_dropout=cfg.decoder.embedding_dropout if hasattr(
                cfg.decoder, 'embedding_dropout') else 0.0,
            learn_positional_encodings=cfg.decoder.learn_positional_encodings
            if hasattr(cfg.decoder, 'learn_positional_encodings') else False,
            num_attention_heads=cfg.decoder.num_attention_heads,
            ffn_dropout=cfg.decoder.ffn_dropout,
            attn_score_dropout=cfg.decoder.attn_score_dropout,
            attn_layer_dropout=cfg.decoder.attn_layer_dropout,
            hidden_act=cfg.decoder.hidden_act,
            pre_ln=cfg.decoder.pre_ln,
        )

        self.log_softmax = TokenClassifier(
            hidden_size=self.decoder.hidden_size,
            num_classes=self.decoder_vocab_size,
            activation=cfg.head.activation,
            log_softmax=cfg.head.log_softmax,
            dropout=cfg.head.dropout,
            use_transformer_init=cfg.head.use_transformer_init,
        )

        self.beam_search = BeamSearchSequenceGenerator(
            embedding=self.decoder.embedding,
            decoder=self.decoder.decoder,
            log_softmax=self.log_softmax,
            max_sequence_length=self.decoder.max_sequence_length,
            beam_size=cfg.beam_size,
            bos=self.decoder_tokenizer.bos_id,
            pad=self.decoder_tokenizer.pad_id,
            eos=self.decoder_tokenizer.eos_id,
            len_pen=cfg.len_pen,
            max_delta_length=cfg.max_generation_delta,
        )

        # tie weights of embedding and softmax matrices
        self.log_softmax.mlp.layer0.weight = self.decoder.embedding.token_embedding.weight

        # TODO: encoder and decoder with different hidden size?
        std_init_range = 1 / self.encoder.hidden_size**0.5
        self.apply(
            lambda module: transformer_weights_init(module, std_init_range))

        self.loss_fn = SmoothedCrossEntropyLoss(
            pad_id=self.decoder_tokenizer.pad_id,
            label_smoothing=cfg.label_smoothing)
        self.eval_loss = GlobalAverageLossMetric(dist_sync_on_step=False,
                                                 take_avg_loss=True)
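
The constructor above ties the output projection to the decoder token embedding (the `self.log_softmax.mlp.layer0.weight = ...` line). A minimal, self-contained PyTorch sketch of the same weight-tying idea, independent of the NeMo classes:

import torch.nn as nn

vocab_size, hidden_size = 32000, 512
embedding = nn.Embedding(vocab_size, hidden_size)             # decoder token embedding
projection = nn.Linear(hidden_size, vocab_size, bias=False)   # output (softmax) projection

# Point both modules at the same parameter tensor; gradients from either path
# update the single shared matrix, as in the model above.
projection.weight = embedding.weight
assert projection.weight.data_ptr() == embedding.weight.data_ptr()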
Example #3
    def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
        cfg = model_utils.convert_model_config_to_dict_config(cfg)
        # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
        # global_rank and local_rank are set by LightningModule in Lightning 1.2.0

        self.world_size = 1
        if trainer is not None:
            self.world_size = trainer.num_nodes * trainer.num_gpus

        cfg = model_utils.maybe_update_config_version(cfg)

        self.src_language = cfg.get("src_language", None)
        self.tgt_language = cfg.get("tgt_language", None)

        self.multilingual = cfg.get("multilingual", False)
        self.multilingual_ids = []

        self.encoder_tokenizer_library = cfg.encoder_tokenizer.get(
            'library', 'yttm')
        self.decoder_tokenizer_library = cfg.decoder_tokenizer.get(
            'library', 'yttm')

        # Instantiate tokenizers and register them to be saved with the NeMo model archive.
        # After this call, there will be self.encoder_tokenizer and self.decoder_tokenizer,
        # which can convert between tokens and token_ids for the SRC and TGT languages respectively.
        self.setup_enc_dec_tokenizers(
            encoder_tokenizer_library=self.encoder_tokenizer_library,
            encoder_tokenizer_model=cfg.encoder_tokenizer.get(
                'tokenizer_model'),
            encoder_bpe_dropout=cfg.encoder_tokenizer.get(
                'bpe_dropout', 0.0) if cfg.encoder_tokenizer.get(
                    'bpe_dropout', 0.0) is not None else 0.0,
            encoder_model_name=cfg.encoder.get('model_name') if hasattr(
                cfg.encoder, 'model_name') else None,
            encoder_r2l=cfg.encoder_tokenizer.get('r2l', False),
            decoder_tokenizer_library=self.decoder_tokenizer_library,
            encoder_tokenizer_vocab_file=cfg.encoder_tokenizer.get(
                'vocab_file', None),
            decoder_tokenizer_model=cfg.decoder_tokenizer.tokenizer_model,
            decoder_bpe_dropout=cfg.decoder_tokenizer.get(
                'bpe_dropout', 0.0) if cfg.decoder_tokenizer.get(
                    'bpe_dropout', 0.0) is not None else 0.0,
            decoder_model_name=cfg.decoder.get('model_name') if hasattr(
                cfg.decoder, 'model_name') else None,
            decoder_r2l=cfg.decoder_tokenizer.get('r2l', False),
        )

        if self.multilingual:
            if isinstance(self.src_language, ListConfig) and isinstance(
                    self.tgt_language, ListConfig):
                raise ValueError(
                    "cfg.src_language and cfg.tgt_language cannot both be lists. We only support many-to-one or one-to-many multilingual models."
                )
            elif isinstance(self.src_language, ListConfig):
                for lng in self.src_language:
                    self.multilingual_ids.append(
                        self.encoder_tokenizer.token_to_id("<" + lng + ">"))
            elif isinstance(self.tgt_language, ListConfig):
                for lng in self.tgt_language:
                    self.multilingual_ids.append(
                        self.encoder_tokenizer.token_to_id("<" + lng + ">"))
            else:
                raise ValueError(
                    "Expect either cfg.src_language or cfg.tgt_language to be a list when multilingual=True."
                )

            if isinstance(self.src_language, ListConfig):
                self.tgt_language = [self.tgt_language] * len(
                    self.src_language)
            else:
                self.src_language = [self.src_language] * len(
                    self.tgt_language)

            self.source_processor_list = []
            self.target_processor_list = []
            for src_lng, tgt_lng in zip(self.src_language, self.tgt_language):
                src_prcsr, tgt_prscr = self.setup_pre_and_post_processing_utils(
                    src_lng, tgt_lng)
                self.source_processor_list.append(src_prcsr)
                self.target_processor_list.append(tgt_prscr)

        else:
            # After this call, the model will have self.source_processor and self.target_processor objects
            self.setup_pre_and_post_processing_utils(self.src_language,
                                                     self.tgt_language)
            self.multilingual_ids = [None]

        # TODO: Why is this base constructor call so late in the game?
        super().__init__(cfg=cfg, trainer=trainer)

        # encoder from NeMo, Megatron-LM, or HuggingFace
        encoder_cfg_dict = OmegaConf.to_container(cfg.get('encoder'))
        encoder_cfg_dict['vocab_size'] = self.encoder_vocab_size
        library = encoder_cfg_dict.pop('library', 'nemo')
        model_name = encoder_cfg_dict.pop('model_name', None)
        pretrained = encoder_cfg_dict.pop('pretrained', False)
        checkpoint_file = encoder_cfg_dict.pop('checkpoint_file', None)
        self.encoder = get_transformer(
            library=library,
            model_name=model_name,
            pretrained=pretrained,
            config_dict=encoder_cfg_dict,
            encoder=True,
            pre_ln_final_layer_norm=encoder_cfg_dict.get(
                'pre_ln_final_layer_norm', False),
            checkpoint_file=checkpoint_file,
        )

        # decoder from NeMo, Megatron-LM, or HuggingFace
        decoder_cfg_dict = OmegaConf.to_container(cfg.get('decoder'))
        decoder_cfg_dict['vocab_size'] = self.decoder_vocab_size
        library = decoder_cfg_dict.pop('library', 'nemo')
        model_name = decoder_cfg_dict.pop('model_name', None)
        pretrained = decoder_cfg_dict.pop('pretrained', False)
        decoder_cfg_dict['hidden_size'] = self.encoder.hidden_size
        self.decoder = get_transformer(
            library=library,
            model_name=model_name,
            pretrained=pretrained,
            config_dict=decoder_cfg_dict,
            encoder=False,
            pre_ln_final_layer_norm=decoder_cfg_dict.get(
                'pre_ln_final_layer_norm', False),
        )

        self.log_softmax = TokenClassifier(
            hidden_size=self.decoder.hidden_size,
            num_classes=self.decoder_vocab_size,
            activation=cfg.head.activation,
            log_softmax=cfg.head.log_softmax,
            dropout=cfg.head.dropout,
            use_transformer_init=cfg.head.use_transformer_init,
        )

        self.beam_search = BeamSearchSequenceGenerator(
            embedding=self.decoder.embedding,
            decoder=self.decoder.decoder,
            log_softmax=self.log_softmax,
            max_sequence_length=self.decoder.max_sequence_length,
            beam_size=cfg.beam_size,
            bos=self.decoder_tokenizer.bos_id,
            pad=self.decoder_tokenizer.pad_id,
            eos=self.decoder_tokenizer.eos_id,
            len_pen=cfg.len_pen,
            max_delta_length=cfg.max_generation_delta,
        )

        # tie weights of embedding and softmax matrices
        self.log_softmax.mlp.layer0.weight = self.decoder.embedding.token_embedding.weight

        # TODO: encoder and decoder with different hidden size?
        std_init_range = 1 / self.encoder.hidden_size**0.5

        # initialize weights if not using pretrained encoder/decoder
        if not self._cfg.encoder.get('pretrained', False):
            self.encoder.apply(lambda module: transformer_weights_init(
                module, std_init_range))

        if not self._cfg.decoder.get('pretrained', False):
            self.decoder.apply(lambda module: transformer_weights_init(
                module, std_init_range))

        self.log_softmax.apply(
            lambda module: transformer_weights_init(module, std_init_range))

        self.loss_fn = SmoothedCrossEntropyLoss(
            pad_id=self.decoder_tokenizer.pad_id,
            label_smoothing=cfg.label_smoothing)
        self.eval_loss_fn = NLLLoss(ignore_index=self.decoder_tokenizer.pad_id)
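
In the multilingual branch above, exactly one of `cfg.src_language` / `cfg.tgt_language` is a list, and each listed language is looked up as a `<lang>` token in the encoder tokenizer. A hedged sketch of what a many-to-one config fragment could look like with OmegaConf; the field names follow the code above, but the language values are purely illustrative:

from omegaconf import OmegaConf

# Illustrative many-to-one setup: several source languages into one target language.
cfg_fragment = OmegaConf.create(
    {
        "multilingual": True,
        "src_language": ["de", "fr", "es"],  # looked up as tokens <de>, <fr>, <es>
        "tgt_language": "en",
    }
)
print(OmegaConf.to_yaml(cfg_fragment))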
Example #4
    def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
        cfg = model_utils.convert_model_config_to_dict_config(cfg)
        # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
        self.global_rank = 0
        self.world_size = 1
        if trainer is not None:
            self.global_rank = (trainer.node_rank *
                                trainer.num_gpus) + trainer.local_rank
            self.world_size = trainer.num_nodes * trainer.num_gpus

        cfg = model_utils.maybe_update_config_version(cfg)
        self.setup_enc_dec_tokenizers(cfg)

        super().__init__(cfg=cfg, trainer=trainer)

        # TODO: use get_encoder function with support for HF and Megatron
        self.encoder = TransformerEncoderNM(
            vocab_size=self.encoder_vocab_size,
            hidden_size=cfg.encoder.hidden_size,
            num_layers=cfg.encoder.num_layers,
            inner_size=cfg.encoder.inner_size,
            max_sequence_length=cfg.encoder.max_sequence_length if hasattr(
                cfg.encoder, 'max_sequence_length') else 512,
            embedding_dropout=cfg.encoder.embedding_dropout if hasattr(
                cfg.encoder, 'embedding_dropout') else 0.0,
            learn_positional_encodings=cfg.encoder.learn_positional_encodings
            if hasattr(cfg.encoder, 'learn_positional_encodings') else False,
            num_attention_heads=cfg.encoder.num_attention_heads,
            ffn_dropout=cfg.encoder.ffn_dropout,
            attn_score_dropout=cfg.encoder.attn_score_dropout,
            attn_layer_dropout=cfg.encoder.attn_layer_dropout,
            hidden_act=cfg.encoder.hidden_act,
            mask_future=cfg.encoder.mask_future,
            pre_ln=cfg.encoder.pre_ln,
        )

        # TODO: use get_decoder function with support for HF and Megatron
        self.decoder = TransformerDecoderNM(
            vocab_size=self.decoder_vocab_size,
            hidden_size=cfg.decoder.hidden_size,
            num_layers=cfg.decoder.num_layers,
            inner_size=cfg.decoder.inner_size,
            max_sequence_length=cfg.decoder.max_sequence_length if hasattr(
                cfg.decoder, 'max_sequence_length') else 512,
            embedding_dropout=cfg.decoder.embedding_dropout if hasattr(
                cfg.decoder, 'embedding_dropout') else 0.0,
            learn_positional_encodings=cfg.decoder.learn_positional_encodings
            if hasattr(cfg.decoder, 'learn_positional_encodings') else False,
            num_attention_heads=cfg.decoder.num_attention_heads,
            ffn_dropout=cfg.decoder.ffn_dropout,
            attn_score_dropout=cfg.decoder.attn_score_dropout,
            attn_layer_dropout=cfg.decoder.attn_layer_dropout,
            hidden_act=cfg.decoder.hidden_act,
            pre_ln=cfg.decoder.pre_ln,
        )

        self.log_softmax = TokenClassifier(
            hidden_size=self.decoder.hidden_size,
            num_classes=self.decoder_vocab_size,
            activation=cfg.head.activation,
            log_softmax=cfg.head.log_softmax,
            dropout=cfg.head.dropout,
            use_transformer_init=cfg.head.use_transformer_init,
        )

        self.beam_search = BeamSearchSequenceGenerator(
            embedding=self.decoder.embedding,
            decoder=self.decoder.decoder,
            log_softmax=self.log_softmax,
            max_sequence_length=self.decoder.max_sequence_length,
            beam_size=cfg.beam_size,
            bos=self.decoder_tokenizer.bos_id,
            pad=self.decoder_tokenizer.pad_id,
            eos=self.decoder_tokenizer.eos_id,
            len_pen=cfg.len_pen,
            max_delta_length=cfg.max_generation_delta,
        )

        # tie weights of embedding and softmax matrices
        self.log_softmax.mlp.layer0.weight = self.decoder.embedding.token_embedding.weight

        # TODO: encoder and decoder with different hidden size?
        std_init_range = 1 / self.encoder.hidden_size**0.5
        self.apply(
            lambda module: transformer_weights_init(module, std_init_range))

        self.loss_fn = SmoothedCrossEntropyLoss(
            pad_id=self.decoder_tokenizer.pad_id,
            label_smoothing=cfg.label_smoothing)
        self.eval_loss = GlobalAverageLossMetric(dist_sync_on_step=False,
                                                 take_avg_loss=True)
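
Example #4 derives the global rank as `node_rank * num_gpus + local_rank`. A small standalone illustration of that indexing (plain Python, no Lightning objects):

def global_rank(node_rank: int, num_gpus: int, local_rank: int) -> int:
    # With 8 GPUs per node, GPU 3 on node 1 gets global rank 1 * 8 + 3 = 11.
    return node_rank * num_gpus + local_rank

assert global_rank(node_rank=1, num_gpus=8, local_rank=3) == 11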
Example #5
def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="Path to .nemo model file(s). If ensembling, provide comma separated paths to multiple models.",
    )
    parser.add_argument("--srctext", type=str, required=True, help="Path to the file to translate.")
    parser.add_argument(
        "--tgtout", type=str, required=True, help="Path to the file where translations are to be written."
    )
    parser.add_argument(
        "--batch_size", type=int, default=256, help="Number of sentences to batch together while translatiing."
    )
    parser.add_argument("--beam_size", type=int, default=4, help="Beam size.")
    parser.add_argument(
        "--len_pen", type=float, default=0.6, help="Length Penalty. Ref: https://arxiv.org/abs/1609.08144"
    )
    parser.add_argument(
        "--max_delta_length",
        type=int,
        default=5,
        help="Stop generating if target sequence length exceeds source length by this number.",
    )
    parser.add_argument(
        "--target_lang", type=str, default=None, help="Target language identifier ex: en,de,fr,es etc."
    )
    parser.add_argument(
        "--source_lang", type=str, default=None, help="Source language identifier ex: en,de,fr,es etc."
    )
    parser.add_argument(
        "--write_scores",
        action="store_true",
        help="Whether to write a separate file with scores not including length penalties corresponding to each beam hypothesis (.score suffix)",
    )
    # shallow fusion specific parameters
    parser.add_argument(
        "--lm_model",
        type=str,
        default=None,
        help="Optional path to an LM model that has the same tokenizer as NMT models for shallow fuison. Note: If using --write_scores, it will add LM scores as well.",
    )
    parser.add_argument(
        "--fusion_coef", type=float, default=0.07, help="Weight assigned to LM scores during shallow fusion."
    )

    args = parser.parse_args()
    torch.set_grad_enabled(False)
    logging.info("Attempting to initialize from .nemo file")
    models = []
    for model_path in args.model.split(','):
        if not model_path.endswith('.nemo'):
            raise NotImplementedError(f"Only support .nemo files, but got: {model_path}")
        model = nemo_nlp.models.machine_translation.MTEncDecModel.restore_from(restore_path=model_path).eval()
        models.append(model)

    src_text = []
    tgt_text = []
    tgt_text_all = []
    src_texts = []
    all_scores = []

    if torch.cuda.is_available():
        models = [model.cuda() for model in models]

    if args.lm_model is not None:
        lm_model = nemo_nlp.models.language_modeling.TransformerLMModel.restore_from(restore_path=args.lm_model).eval()
    else:
        lm_model = None

    if len(models) > 1:
        ensemble_generator = EnsembleBeamSearchSequenceGenerator(
            encoders=[model.encoder for model in models],
            embeddings=[model.decoder.embedding for model in models],
            decoders=[model.decoder.decoder for model in models],
            log_softmaxes=[model.log_softmax for model in models],
            max_sequence_length=512,
            beam_size=args.beam_size,
            bos=models[0].decoder_tokenizer.bos_id,
            pad=models[0].decoder_tokenizer.pad_id,
            eos=models[0].decoder_tokenizer.eos_id,
            len_pen=args.len_pen,
            max_delta_length=args.max_delta_length,
            language_model=lm_model,
            fusion_coef=args.fusion_coef,
        )
    else:
        model = models[0]
        if lm_model is not None:
            model.beam_search = BeamSearchSequenceGeneratorWithLanguageModel(
                embedding=model.decoder.embedding,
                decoder=model.decoder.decoder,
                log_softmax=model.log_softmax,
                bos=model.decoder_tokenizer.bos_id,
                pad=model.decoder_tokenizer.pad_id,
                eos=model.decoder_tokenizer.eos_id,
                language_model=lm_model,
                fusion_coef=args.fusion_coef,
                max_sequence_length=model.decoder.max_sequence_length,
                beam_size=args.beam_size,
                len_pen=args.len_pen,
                max_delta_length=args.max_delta_length,
            )
        else:
            model.beam_search = BeamSearchSequenceGenerator(
                embedding=model.decoder.embedding,
                decoder=model.decoder.decoder,
                log_softmax=model.log_softmax,
                bos=model.decoder_tokenizer.bos_id,
                pad=model.decoder_tokenizer.pad_id,
                eos=model.decoder_tokenizer.eos_id,
                max_sequence_length=model.decoder.max_sequence_length,
                beam_size=args.beam_size,
                len_pen=args.len_pen,
                max_delta_length=args.max_delta_length,
            )

    logging.info(f"Translating: {args.srctext}")

    count = 0
    with open(args.srctext, 'r') as src_f:
        for line in src_f:
            src_text.append(line.strip())
            if len(src_text) == args.batch_size:
                if len(models) > 1:
                    src_ids, src_mask = models[0].prepare_inference_batch(src_text)
                    best_translations = ensemble_generator(src_ids, src_mask, return_beam_scores=args.write_scores)
                    if args.write_scores:
                        all_results, scores, best_translations = (
                            best_translations[0],
                            best_translations[1],
                            best_translations[2],
                        )
                        scores = scores.view(-1).data.cpu().numpy().tolist()
                        all_scores += scores
                        src_texts += [item for item in src_text for i in range(args.beam_size)]
                        all_results = models[0].ids_to_postprocessed_text(
                            all_results, models[0].decoder_tokenizer, models[0].target_processor
                        )
                        tgt_text_all += all_results
                    best_translations = models[0].ids_to_postprocessed_text(
                        best_translations, models[0].decoder_tokenizer, models[0].target_processor
                    )
                    tgt_text += best_translations
                else:
                    best_translations = model.translate(
                        text=src_text,
                        source_lang=args.source_lang,
                        target_lang=args.target_lang,
                        return_beam_scores=args.write_scores,
                    )
                    if args.write_scores:
                        all_results, scores, best_translations = (
                            best_translations[0],
                            best_translations[1],
                            best_translations[2],
                        )
                        all_scores += scores
                        src_texts += [item for item in src_text for i in range(args.beam_size)]
                        tgt_text_all += all_results
                    tgt_text += best_translations
                src_text = []
                print(f"Translated {count + 1} sentences")
            count += 1
        if len(src_text) > 0:
            if len(models) > 1:
                src_ids, src_mask = models[0].prepare_inference_batch(src_text)
                best_translations = ensemble_generator(src_ids, src_mask, return_beam_scores=args.write_scores)
                if args.write_scores:
                    all_results, scores, best_translations = (
                        best_translations[0],
                        best_translations[1],
                        best_translations[2],
                    )
                    scores = scores.view(-1).data.cpu().numpy().tolist()
                    all_scores += scores
                    src_texts += [item for item in src_text for i in range(args.beam_size)]
                    all_results = models[0].ids_to_postprocessed_text(
                        all_results, models[0].decoder_tokenizer, models[0].target_processor
                    )
                    tgt_text_all += all_results
                best_translations = models[0].ids_to_postprocessed_text(
                    best_translations, models[0].decoder_tokenizer, models[0].target_processor
                )
                tgt_text += best_translations
            else:
                best_translations = model.translate(
                    text=src_text,
                    source_lang=args.source_lang,
                    target_lang=args.target_lang,
                    return_beam_scores=args.write_scores,
                )
                if args.write_scores:
                    all_results, scores, best_translations = (
                        best_translations[0],
                        best_translations[1],
                        best_translations[2],
                    )
                    all_scores += scores
                    src_texts += [item for item in src_text for i in range(args.beam_size)]
                    tgt_text_all += all_results
                tgt_text += best_translations
            src_text = []
            print(f"Translated {count} sentences")

    with open(args.tgtout, 'w') as tgt_f:
        for line in tgt_text:
            tgt_f.write(line + "\n")

    if args.write_scores:
        with open(args.tgtout + '.score', 'w') as tgt_f_scores:
            for line, score, inp in zip(tgt_text_all, all_scores, src_texts):
                tgt_f_scores.write(inp + "\t" + line + "\t" + str(score) + "\n")
Example #6
    def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
        cfg = model_utils.convert_model_config_to_dict_config(cfg)
        # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
        # global_rank and local_rank are set by LightningModule in Lightning 1.2.0

        self.world_size = 1
        if trainer is not None:
            self.world_size = trainer.num_nodes * trainer.num_gpus

        cfg = model_utils.maybe_update_config_version(cfg)

        self.src_language: str = cfg.get("src_language", None)
        self.tgt_language: str = cfg.get("tgt_language", None)

        # Instantiate tokenizers and register them to be saved with the NeMo model archive.
        # After this call, there will be self.encoder_tokenizer and self.decoder_tokenizer,
        # which can convert between tokens and token_ids for the SRC and TGT languages respectively.
        self.setup_enc_dec_tokenizers(
            encoder_tokenizer_library=cfg.encoder_tokenizer.get('library', 'yttm'),
            encoder_tokenizer_model=cfg.encoder_tokenizer.get('tokenizer_model'),
            encoder_bpe_dropout=cfg.encoder_tokenizer.get('bpe_dropout', 0.0),
            encoder_model_name=cfg.encoder.get('model_name') if hasattr(cfg.encoder, 'model_name') else None,
            decoder_tokenizer_library=cfg.decoder_tokenizer.get('library', 'yttm'),
            decoder_tokenizer_model=cfg.decoder_tokenizer.tokenizer_model,
            decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0),
            decoder_model_name=cfg.decoder.get('model_name') if hasattr(cfg.decoder, 'model_name') else None,
        )

        # After this call, the model will have self.source_processor and self.target_processor objects
        self.setup_pre_and_post_processing_utils(source_lang=self.src_language, target_lang=self.tgt_language)

        # TODO: Why is this base constructor call so late in the game?
        super().__init__(cfg=cfg, trainer=trainer)

        # encoder from NeMo, Megatron-LM, or HuggingFace
        encoder_cfg_dict = OmegaConf.to_container(cfg.get('encoder'))
        encoder_cfg_dict['vocab_size'] = self.encoder_vocab_size
        library = encoder_cfg_dict.pop('library', 'nemo')
        model_name = encoder_cfg_dict.pop('model_name', None)
        pretrained = encoder_cfg_dict.pop('pretrained', False)
        self.encoder = get_transformer(
            library=library, model_name=model_name, pretrained=pretrained, config_dict=encoder_cfg_dict, encoder=True,
        )

        # decoder from NeMo, Megatron-LM, or HuggingFace
        decoder_cfg_dict = OmegaConf.to_container(cfg.get('decoder'))
        decoder_cfg_dict['vocab_size'] = self.decoder_vocab_size
        library = decoder_cfg_dict.pop('library', 'nemo')
        model_name = decoder_cfg_dict.pop('model_name', None)
        pretrained = decoder_cfg_dict.pop('pretrained', False)
        decoder_cfg_dict['hidden_size'] = self.encoder.hidden_size
        self.decoder = get_transformer(
            library=library, model_name=model_name, pretrained=pretrained, config_dict=decoder_cfg_dict, encoder=False,
        )

        self.log_softmax = TokenClassifier(
            hidden_size=self.decoder.hidden_size,
            num_classes=self.decoder_vocab_size,
            activation=cfg.head.activation,
            log_softmax=cfg.head.log_softmax,
            dropout=cfg.head.dropout,
            use_transformer_init=cfg.head.use_transformer_init,
        )

        self.beam_search = BeamSearchSequenceGenerator(
            embedding=self.decoder.embedding,
            decoder=self.decoder.decoder,
            log_softmax=self.log_softmax,
            max_sequence_length=self.decoder.max_sequence_length,
            beam_size=cfg.beam_size,
            bos=self.decoder_tokenizer.bos_id,
            pad=self.decoder_tokenizer.pad_id,
            eos=self.decoder_tokenizer.eos_id,
            len_pen=cfg.len_pen,
            max_delta_length=cfg.max_generation_delta,
        )

        # tie weights of embedding and softmax matrices
        self.log_softmax.mlp.layer0.weight = self.decoder.embedding.token_embedding.weight

        # TODO: encoder and decoder with different hidden size?
        std_init_range = 1 / self.encoder.hidden_size ** 0.5

        # initialize weights if not using pretrained encoder/decoder
        if not self._cfg.encoder.get('pretrained', False):
            self.encoder.apply(lambda module: transformer_weights_init(module, std_init_range))

        if not self._cfg.decoder.get('pretrained', False):
            self.decoder.apply(lambda module: transformer_weights_init(module, std_init_range))

        self.log_softmax.apply(lambda module: transformer_weights_init(module, std_init_range))

        self.loss_fn = SmoothedCrossEntropyLoss(
            pad_id=self.decoder_tokenizer.pad_id, label_smoothing=cfg.label_smoothing
        )
        self.eval_loss = GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True)
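
The `get_transformer` calls above select the encoder/decoder implementation through `library`, `model_name`, and `pretrained` keys popped from the config (the surrounding comments mention NeMo, Megatron-LM, and HuggingFace backends). A hedged, illustrative encoder config fragment limited to fields this constructor actually reads; concrete values are placeholders:

from omegaconf import OmegaConf

encoder_cfg = OmegaConf.create(
    {
        "library": "nemo",    # default backend when the key is absent
        "model_name": None,   # name of a pretrained architecture, if any
        "pretrained": False,  # when True, the weight re-initialization above is skipped
        "hidden_size": 512,   # also reused as the decoder hidden size above
    }
)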
Example #7
def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="Path to .nemo model file(s). If ensembling, provide comma separated paths to multiple models.",
    )
    parser.add_argument("--srctext", type=str, required=True, help="Path to the file to translate.")
    parser.add_argument(
        "--tgtout", type=str, required=True, help="Path to the file where translations are to be written."
    )
    parser.add_argument(
        "--batch_size", type=int, default=256, help="Number of sentences to batch together while translatiing."
    )
    parser.add_argument("--beam_size", type=int, default=4, help="Beam size.")
    parser.add_argument(
        "--len_pen", type=float, default=0.6, help="Length Penalty. Ref: https://arxiv.org/abs/1609.08144"
    )
    parser.add_argument(
        "--max_delta_length",
        type=int,
        default=5,
        help="Stop generating if target sequence length exceeds source length by this number.",
    )
    parser.add_argument(
        "--target_lang", type=str, default=None, help="Target language identifier ex: en,de,fr,es etc."
    )
    parser.add_argument(
        "--source_lang", type=str, default=None, help="Source language identifier ex: en,de,fr,es etc."
    )
    parser.add_argument(
        "--write_scores",
        action="store_true",
        help="Whether to write a separate file with scores not including length penalties corresponding to each beam hypothesis (.score suffix)",
    )
    parser.add_argument(
        "--write_timing",
        action="store_true",
        help="Whether to write a separate file with detailed timing info (.timing.json suffix)",
    )
    # shallow fusion specific parameters
    parser.add_argument(
        "--lm_model",
        type=str,
        default=None,
        help="Optional path to an LM model that has the same tokenizer as NMT models for shallow fuison. Note: If using --write_scores, it will add LM scores as well.",
    )
    parser.add_argument(
        "--fusion_coef", type=float, default=0.07, help="Weight assigned to LM scores during shallow fusion."
    )

    args = parser.parse_args()
    torch.set_grad_enabled(False)
    logging.info("Attempting to initialize from .nemo file")
    models = []
    for model_path in args.model.split(','):
        if not model_path.endswith('.nemo'):
            raise NotImplementedError(f"Only support .nemo files, but got: {model_path}")
        model = nemo_nlp.models.machine_translation.MTEncDecModel.restore_from(restore_path=model_path).eval()
        models.append(model)

    if (len(models) > 1) and (args.write_timing):
        raise RuntimeError("Cannot measure timing when more than 1 model is used")

    src_text = []
    tgt_text = []
    tgt_text_all = []
    src_texts = []
    all_scores = []
    all_timing = []

    if torch.cuda.is_available():
        models = [model.cuda() for model in models]

    if args.lm_model is not None:
        lm_model = nemo_nlp.models.language_modeling.TransformerLMModel.restore_from(restore_path=args.lm_model).eval()
    else:
        lm_model = None

    if len(models) > 1:
        ensemble_generator = EnsembleBeamSearchSequenceGenerator(
            encoders=[model.encoder for model in models],
            embeddings=[model.decoder.embedding for model in models],
            decoders=[model.decoder.decoder for model in models],
            log_softmaxes=[model.log_softmax for model in models],
            max_sequence_length=512,
            beam_size=args.beam_size,
            bos=models[0].decoder_tokenizer.bos_id,
            pad=models[0].decoder_tokenizer.pad_id,
            eos=models[0].decoder_tokenizer.eos_id,
            len_pen=args.len_pen,
            max_delta_length=args.max_delta_length,
            language_model=lm_model,
            fusion_coef=args.fusion_coef,
        )
    else:
        model = models[0]
        ensemble_generator = None
        if lm_model is not None:
            model.beam_search = BeamSearchSequenceGeneratorWithLanguageModel(
                embedding=model.decoder.embedding,
                decoder=model.decoder.decoder,
                log_softmax=model.log_softmax,
                bos=model.decoder_tokenizer.bos_id,
                pad=model.decoder_tokenizer.pad_id,
                eos=model.decoder_tokenizer.eos_id,
                language_model=lm_model,
                fusion_coef=args.fusion_coef,
                max_sequence_length=model.decoder.max_sequence_length,
                beam_size=args.beam_size,
                len_pen=args.len_pen,
                max_delta_length=args.max_delta_length,
            )
        else:
            model.beam_search = BeamSearchSequenceGenerator(
                embedding=model.decoder.embedding,
                decoder=model.decoder.decoder,
                log_softmax=model.log_softmax,
                bos=model.decoder_tokenizer.bos_id,
                pad=model.decoder_tokenizer.pad_id,
                eos=model.decoder_tokenizer.eos_id,
                max_sequence_length=model.decoder.max_sequence_length,
                beam_size=args.beam_size,
                len_pen=args.len_pen,
                max_delta_length=args.max_delta_length,
            )

    logging.info(f"Translating: {args.srctext}")

    with open(args.srctext, 'r') as src_f:
        for line in src_f:
            src_text.append(line.strip())
            if len(src_text) == args.batch_size:
                # warmup when measuring timing
                if args.write_timing and not all_timing:
                    print("running a warmup batch")
                    translate_text(
                        models=models,
                        args=args,
                        src_text=src_text,
                        tgt_text=[],
                        tgt_text_all=[],
                        src_texts=[],
                        all_scores=[],
                        all_timing=[],
                        ensemble_generator=ensemble_generator,
                    )
                translate_text(
                    models=models,
                    args=args,
                    src_text=src_text,
                    tgt_text=tgt_text,
                    tgt_text_all=tgt_text_all,
                    src_texts=src_texts,
                    all_scores=all_scores,
                    all_timing=all_timing,
                    ensemble_generator=ensemble_generator,
                )
                src_text = []

        if len(src_text) > 0:
            translate_text(
                models=models,
                args=args,
                src_text=src_text,
                tgt_text=tgt_text,
                tgt_text_all=tgt_text_all,
                src_texts=src_texts,
                all_scores=all_scores,
                all_timing=all_timing,
                ensemble_generator=ensemble_generator,
            )

    with open(args.tgtout, 'w') as tgt_f:
        for line in tgt_text:
            tgt_f.write(line + "\n")

    if args.write_scores:
        with open(args.tgtout + '.score', 'w') as tgt_f_scores:
            for line, score, inp in zip(tgt_text_all, all_scores, src_texts):
                tgt_f_scores.write(inp + "\t" + line + "\t" + str(score) + "\n")

    if args.write_timing:
        # collect list of dicts to a dict of lists
        timing_dict = {}
        if len(all_timing):
            for k in all_timing[0].keys():
                timing_dict[k] = [t[k] for t in all_timing]

        with open(args.tgtout + '.timing.json', 'w') as timing_fh:
            json.dump(timing_dict, timing_fh)
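
With `--write_timing`, the per-batch timing dicts are transposed into a dict of lists and dumped as JSON. A hedged sketch for loading that file and averaging each measurement; the path is the `--tgtout` value plus the `.timing.json` suffix (shown here as a placeholder):

import json

with open("output.tgt.timing.json") as timing_f:  # placeholder path
    timing = json.load(timing_f)

# One list per timing key, one entry per translated batch.
averages = {key: sum(values) / len(values) for key, values in timing.items() if values}
print(averages)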