Example #1
    def __init__(self,
                 max_length: int = 256,
                 max_out_len: int = 256,
                 beam_size: int = 5):
        super(MTTransformer, self).__init__()
        bpe_codes_file = os.path.join(MODULE_HOME, 'transformer_zh_en',
                                      'assets', '2M.zh2en.dict4bpe.zh')
        src_vocab_file = os.path.join(MODULE_HOME, 'transformer_zh_en',
                                      'assets', 'vocab.zh')
        trg_vocab_file = os.path.join(MODULE_HOME, 'transformer_zh_en',
                                      'assets', 'vocab.en')
        checkpoint = os.path.join(MODULE_HOME, 'transformer_zh_en', 'assets',
                                  'transformer.pdparams')

        self.max_length = max_length
        self.beam_size = beam_size
        self.tokenizer = MTTokenizer(bpe_codes_file=bpe_codes_file,
                                     lang_src=self.lang_config['source'],
                                     lang_trg=self.lang_config['target'])
        self.src_vocab = Vocab.load_vocabulary(
            filepath=src_vocab_file,
            unk_token=self.vocab_config['unk_token'],
            bos_token=self.vocab_config['bos_token'],
            eos_token=self.vocab_config['eos_token'])
        self.trg_vocab = Vocab.load_vocabulary(
            filepath=trg_vocab_file,
            unk_token=self.vocab_config['unk_token'],
            bos_token=self.vocab_config['bos_token'],
            eos_token=self.vocab_config['eos_token'])
        self.src_vocab_size = (len(self.src_vocab) + self.vocab_config['pad_factor'] - 1) \
            // self.vocab_config['pad_factor'] * self.vocab_config['pad_factor']
        self.trg_vocab_size = (len(self.trg_vocab) + self.vocab_config['pad_factor'] - 1) \
            // self.vocab_config['pad_factor'] * self.vocab_config['pad_factor']
        self.transformer = InferTransformerModel(
            src_vocab_size=self.src_vocab_size,
            trg_vocab_size=self.trg_vocab_size,
            bos_id=self.vocab_config['bos_id'],
            eos_id=self.vocab_config['eos_id'],
            max_length=self.max_length + 1,
            max_out_len=max_out_len,
            beam_size=self.beam_size,
            **self.model_config)

        state_dict = paddle.load(checkpoint)

        # To avoid a longer length than in training, reset the size of the
        # position encoding to max_length
        state_dict["encoder.pos_encoder.weight"] = position_encoding_init(
            self.max_length + 1, self.model_config['d_model'])
        state_dict["decoder.pos_encoder.weight"] = position_encoding_init(
            self.max_length + 1, self.model_config['d_model'])

        self.transformer.set_state_dict(state_dict)
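
The vocabulary sizes above are rounded up to the nearest multiple of pad_factor via (len + pad_factor - 1) // pad_factor * pad_factor. A minimal sketch of that arithmetic, using a hypothetical vocabulary length (the real sizes come from vocab.zh and vocab.en):

pad_factor = 8
vocab_len = 30001  # hypothetical length of a loaded Vocab
padded_size = (vocab_len + pad_factor - 1) // pad_factor * pad_factor
print(padded_size)  # 30008; sizes already divisible by 8 (e.g. 30000) are unchanged
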
Example #2
def do_predict(args):
    if args.use_gpu:
        place = "gpu:0"
    else:
        place = "cpu"

    paddle.set_device(place)

    # Define data loader
    test_loader, to_tokens = create_infer_loader(args)

    # Define model
    transformer = InferTransformerModel(src_vocab_size=args.src_vocab_size,
                                        trg_vocab_size=args.trg_vocab_size,
                                        max_length=args.max_length + 1,
                                        n_layer=args.n_layer,
                                        n_head=args.n_head,
                                        d_model=args.d_model,
                                        d_inner_hid=args.d_inner_hid,
                                        dropout=args.dropout,
                                        weight_sharing=args.weight_sharing,
                                        bos_id=args.bos_idx,
                                        eos_id=args.eos_idx,
                                        beam_size=args.beam_size,
                                        max_out_len=args.max_out_len)

    # Load the trained model
    # assert args.init_from_params, (
    #     "Please set init_from_params to load the infer model.")
    init_from_params = 'trained_models/step_final'
    model_dict = paddle.load(
        os.path.join(init_from_params, "transformer.pdparams"))

    # To avoid a longer length than in training, reset the size of the
    # position encoding to max_length
    model_dict["encoder.pos_encoder.weight"] = position_encoding_init(
        args.max_length + 1, args.d_model)
    model_dict["decoder.pos_encoder.weight"] = position_encoding_init(
        args.max_length + 1, args.d_model)
    transformer.load_dict(model_dict)

    # Set evaluate mode
    transformer.eval()

    f = open(args.output_file, "w")
    with paddle.no_grad():
        for (src_word, ) in test_loader:
            finished_seq = transformer(src_word=src_word)
            finished_seq = finished_seq.numpy().transpose([0, 2, 1])
            for ins in finished_seq:
                for beam_idx, beam in enumerate(ins):
                    if beam_idx >= args.n_best:
                        break
                    id_list = post_process_seq(beam, args.bos_idx,
                                               args.eos_idx)
                    word_list = to_tokens(id_list)
                    sequence = " ".join(word_list) + "\n"
                    f.write(sequence)
    f.close()
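
The loop above relies on post_process_seq to strip decoder special tokens before converting ids back to tokens. Its implementation is not shown here; a plausible sketch, assuming the usual behavior of truncating at the first eos and dropping bos/eos ids:

def post_process_seq(seq, bos_idx, eos_idx):
    # Truncate at the first <eos> (if present), then drop <bos>/<eos> ids.
    eos_pos = len(seq) - 1
    for i, idx in enumerate(seq):
        if idx == eos_idx:
            eos_pos = i
            break
    return [idx for idx in seq[:eos_pos + 1] if idx not in (bos_idx, eos_idx)]
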
Example #3
def do_export(args):
    # Adapt vocabulary size
    reader.adapt_vocab_size(args)
    # Define model
    transformer = InferTransformerModel(
        src_vocab_size=args.src_vocab_size,
        trg_vocab_size=args.trg_vocab_size,
        max_length=args.max_length + 1,
        num_encoder_layers=args.n_layer,
        num_decoder_layers=args.n_layer,
        n_head=args.n_head,
        d_model=args.d_model,
        d_inner_hid=args.d_inner_hid,
        dropout=args.dropout,
        weight_sharing=args.weight_sharing,
        bos_id=args.bos_idx,
        eos_id=args.eos_idx,
        beam_size=args.beam_size,
        max_out_len=args.max_out_len)

    # Load the trained model
    assert args.init_from_params, (
        "Please set init_from_params to load the infer model.")

    model_dict = paddle.load(
        os.path.join(args.init_from_params, "transformer.pdparams"))

    # To avoid a longer length than in training, reset the size of the
    # position encoding to max_length
    model_dict["encoder.pos_encoder.weight"] = position_encoding_init(
        args.max_length + 1, args.d_model)
    model_dict["decoder.pos_encoder.weight"] = position_encoding_init(
        args.max_length + 1, args.d_model)
    transformer.load_dict(model_dict)
    # Set evaluate mode
    transformer.eval()

    # Convert the dygraph model to a static graph model
    transformer = paddle.jit.to_static(
        transformer,
        input_spec=[
            # src_word
            paddle.static.InputSpec(
                shape=[None, None], dtype="int64")
        ])

    # Save converted static graph model
    paddle.jit.save(transformer,
                    os.path.join(args.inference_model_dir, "transformer"))
    logger.info("Transformer has been saved to {}".format(
        args.inference_model_dir))
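
After paddle.jit.save, the inference_model_dir typically contains transformer.pdmodel and transformer.pdiparams. A hedged sketch of loading the exported model back with the paddle.inference API (the directory name, file names, and the token-id range are assumptions, not values from the example above):

import os
import numpy as np
import paddle.inference as paddle_infer

# Assumed paths: paddle.jit.save(..., ".../transformer") usually produces these two files.
model_dir = "inference_model"  # hypothetical value of args.inference_model_dir
config = paddle_infer.Config(
    os.path.join(model_dir, "transformer.pdmodel"),
    os.path.join(model_dir, "transformer.pdiparams"))
predictor = paddle_infer.create_predictor(config)

# Feed a [batch_size, seq_len] int64 batch of source token ids.
input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
src_word = np.random.randint(3, 30000, size=(2, 10)).astype("int64")  # hypothetical ids
input_handle.copy_from_cpu(src_word)
predictor.run()

output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
finished_seq = output_handle.copy_to_cpu()  # expected layout: [batch_size, seq_len, beam_size]
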
Example #4
class MTTransformer(nn.Layer):
    """
    Transformer model for machine translation.
    """
    # Language config
    lang_config = {'source': 'zh', 'target': 'en'}

    # Model config
    model_config = {
        # Number of heads used in multi-head attention.
        "n_head": 8,
        # The dimension for word embeddings, which is also the last dimension of
        # the input and output of multi-head attention, position-wise feed-forward
        # networks, encoder and decoder.
        "d_model": 512,
        # Size of the hidden layer in position-wise feed-forward networks.
        "d_inner_hid": 2048,
        # The flag indicating whether to share embedding and softmax weights.
        # Source and target vocabularies should be the same for weight sharing.
        "weight_sharing": False,
        # Dropout rate
        'dropout': 0
    }

    # Number of sub-layers to be stacked in the encoder and decoder.
    if Version(paddlenlp.__version__) <= Version('2.0.5'):
        model_config.update({"n_layer": 6})
    else:
        model_config.update({"num_encoder_layers": 6, "num_decoder_layers": 6})

    # Vocab config
    vocab_config = {
        # Used to pad the vocab size to a multiple of pad_factor.
        "pad_factor": 8,
        # Index for <bos> token
        "bos_id": 0,
        "bos_token": "<s>",
        # Index for <eos> token
        "eos_id": 1,
        "eos_token": "<e>",
        # Index for <unk> token
        "unk_id": 2,
        "unk_token": "<unk>",
    }

    def __init__(self,
                 max_length: int = 256,
                 max_out_len: int = 256,
                 beam_size: int = 5):
        super(MTTransformer, self).__init__()
        bpe_codes_file = os.path.join(MODULE_HOME, 'transformer_zh_en',
                                      'assets', '2M.zh2en.dict4bpe.zh')
        src_vocab_file = os.path.join(MODULE_HOME, 'transformer_zh_en',
                                      'assets', 'vocab.zh')
        trg_vocab_file = os.path.join(MODULE_HOME, 'transformer_zh_en',
                                      'assets', 'vocab.en')
        checkpoint = os.path.join(MODULE_HOME, 'transformer_zh_en', 'assets',
                                  'transformer.pdparams')

        self.max_length = max_length
        self.beam_size = beam_size
        self.tokenizer = MTTokenizer(bpe_codes_file=bpe_codes_file,
                                     lang_src=self.lang_config['source'],
                                     lang_trg=self.lang_config['target'])
        self.src_vocab = Vocab.load_vocabulary(
            filepath=src_vocab_file,
            unk_token=self.vocab_config['unk_token'],
            bos_token=self.vocab_config['bos_token'],
            eos_token=self.vocab_config['eos_token'])
        self.trg_vocab = Vocab.load_vocabulary(
            filepath=trg_vocab_file,
            unk_token=self.vocab_config['unk_token'],
            bos_token=self.vocab_config['bos_token'],
            eos_token=self.vocab_config['eos_token'])
        self.src_vocab_size = (len(self.src_vocab) + self.vocab_config['pad_factor'] - 1) \
            // self.vocab_config['pad_factor'] * self.vocab_config['pad_factor']
        self.trg_vocab_size = (len(self.trg_vocab) + self.vocab_config['pad_factor'] - 1) \
            // self.vocab_config['pad_factor'] * self.vocab_config['pad_factor']
        self.transformer = InferTransformerModel(
            src_vocab_size=self.src_vocab_size,
            trg_vocab_size=self.trg_vocab_size,
            bos_id=self.vocab_config['bos_id'],
            eos_id=self.vocab_config['eos_id'],
            max_length=self.max_length + 1,
            max_out_len=max_out_len,
            beam_size=self.beam_size,
            **self.model_config)

        state_dict = paddle.load(checkpoint)

        # To avoid a longer length than in training, reset the size of the
        # position encoding to max_length
        state_dict["encoder.pos_encoder.weight"] = position_encoding_init(
            self.max_length + 1, self.model_config['d_model'])
        state_dict["decoder.pos_encoder.weight"] = position_encoding_init(
            self.max_length + 1, self.model_config['d_model'])

        self.transformer.set_state_dict(state_dict)

    def forward(self, src_words: paddle.Tensor):
        return self.transformer(src_words)

    def _convert_text_to_input(self, text: str):
        """
        Convert input string to ids.
        """
        bpe_tokens = self.tokenizer.tokenize(text)
        if len(bpe_tokens) > self.max_length:
            bpe_tokens = bpe_tokens[:self.max_length]
        return self.src_vocab.to_indices(bpe_tokens)

    def _batchify(self, data: List[str], batch_size: int):
        """
        Generate input batches.
        """
        pad_func = Pad(self.vocab_config['eos_id'])

        def _parse_batch(batch_ids):
            return pad_func(
                [ids + [self.vocab_config['eos_id']] for ids in batch_ids])

        examples = []
        for text in data:
            examples.append(self._convert_text_to_input(text))

        # Separate the data into batches.
        one_batch = []
        for example in examples:
            one_batch.append(example)
            if len(one_batch) == batch_size:
                yield _parse_batch(one_batch)
                one_batch = []
        if one_batch:
            yield _parse_batch(one_batch)

    @serving
    def predict(self,
                data: List[str],
                batch_size: int = 1,
                n_best: int = 1,
                use_gpu: bool = False):

        if n_best > self.beam_size:
            raise ValueError(
                f'Predict arg "n_best" must be smaller than or equal to self.beam_size, '
                f'but got {n_best} > {self.beam_size}')

        paddle.set_device('gpu' if use_gpu else 'cpu')

        batches = self._batchify(data, batch_size)

        results = []
        self.eval()
        for batch in batches:
            src_batch_ids = paddle.to_tensor(batch)
            trg_batch_beams = self(src_batch_ids).numpy().transpose([0, 2, 1])

            for trg_sample_beams in trg_batch_beams:
                for beam_idx, beam in enumerate(trg_sample_beams):
                    if beam_idx >= n_best:
                        break
                    trg_sample_ids = post_process_seq(
                        beam, self.vocab_config['bos_id'],
                        self.vocab_config['eos_id'])
                    trg_sample_words = self.trg_vocab.to_tokens(trg_sample_ids)
                    trg_sample_text = self.tokenizer.detokenize(
                        trg_sample_words)
                    results.append(trg_sample_text)

        return results
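
A minimal usage sketch for the module above, assuming its assets are already in place and the class is constructed directly (in practice it is usually obtained through PaddleHub); the input sentence is purely illustrative:

model = MTTransformer(max_length=256, max_out_len=256, beam_size=5)
results = model.predict(
    data=["今天天气怎么样？"],  # hypothetical Chinese source sentence
    batch_size=1,
    n_best=1,          # must not exceed beam_size
    use_gpu=False)
print(results)         # n_best English translations per source sentence
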
Example #5
def do_predict(args):
    paddle.enable_static()
    if args.use_gpu:
        place = paddle.set_device("gpu:0")
    else:
        place = paddle.set_device("cpu")

    # Define data loader
    test_loader, to_tokens = reader.create_infer_loader(args)

    test_program = paddle.static.Program()
    startup_program = paddle.static.Program()
    with paddle.static.program_guard(test_program, startup_program):
        src_word = paddle.static.data(
            name="src_word", shape=[None, None], dtype="int64")

        # Define model
        transformer = InferTransformerModel(
            src_vocab_size=args.src_vocab_size,
            trg_vocab_size=args.trg_vocab_size,
            max_length=args.max_length + 1,
            n_layer=args.n_layer,
            n_head=args.n_head,
            d_model=args.d_model,
            d_inner_hid=args.d_inner_hid,
            dropout=args.dropout,
            weight_sharing=args.weight_sharing,
            bos_id=args.bos_idx,
            eos_id=args.eos_idx,
            beam_size=args.beam_size,
            max_out_len=args.max_out_len)

        finished_seq = transformer(src_word=src_word)

    test_program = test_program.clone(for_test=True)

    exe = paddle.static.Executor(place)
    exe.run(startup_program)

    assert (
        args.init_from_params), "must set init_from_params to load parameters"
    paddle.static.load(test_program,
                       os.path.join(args.init_from_params, "transformer"), exe)
    print("finish initing model from params from %s" % (args.init_from_params))

    # cast weights from fp16 to fp32 after loading
    if args.use_pure_fp16:
        cast_parameters_to_fp32(place, test_program)

    f = open(args.output_file, "w")
    for data in test_loader:
        finished_sequence, = exe.run(test_program,
                                     feed={'src_word': data[0]},
                                     fetch_list=finished_seq.name)
        finished_sequence = finished_sequence.transpose([0, 2, 1])
        for ins in finished_sequence:
            for beam_idx, beam in enumerate(ins):
                if beam_idx >= args.n_best:
                    break
                id_list = post_process_seq(beam, args.bos_idx, args.eos_idx)
                word_list = to_tokens(id_list)
                sequence = " ".join(word_list) + "\n"
                f.write(sequence)

    paddle.disable_static()
Example #6
    def __init__(self,
                 src_vocab_size,
                 trg_vocab_size,
                 max_length,
                 num_encoder_layers,
                 num_decoder_layers,
                 n_head,
                 d_model,
                 d_inner_hid,
                 dropout,
                 weight_sharing,
                 bos_id=0,
                 eos_id=1,
                 beam_size=4,
                 max_out_len=256,
                 **kwargs):
        logger.warning(
            "TransformerGenerator is an experimental API and subject to change."
        )
        # `kwargs` can include output_time_major, use_fp16_decoding, topk, topp.
        # The latter three arguments only work when using FasterTransformer;
        # topk and topp will be exposed later.
        super(TransformerGenerator, self).__init__()
        self.d_model = d_model
        self.max_length = max_length
        self.output_time_major = kwargs.pop("output_time_major", True)
        use_fp16_decoding = kwargs.pop("use_fp16_decoding", False)
        use_ft = kwargs.pop("use_ft", True)
        beam_search_version = kwargs.pop("beam_search_version", "v1")
        rel_len = kwargs.pop("rel_len", False)
        alpha = kwargs.pop("alpha", 0.6)

        if use_ft:
            try:
                load("FasterTransformer", verbose=True)
                decoding_strategy = ("beam_search_v2" if beam_search_version
                                     == "v2" else "beam_search")
                self.transformer = FasterTransformer(
                    src_vocab_size=src_vocab_size,
                    trg_vocab_size=trg_vocab_size,
                    max_length=max_length,
                    num_encoder_layers=num_encoder_layers,
                    num_decoder_layers=num_decoder_layers,
                    n_head=n_head,
                    d_model=d_model,
                    d_inner_hid=d_inner_hid,
                    dropout=dropout,
                    weight_sharing=weight_sharing,
                    bos_id=bos_id,
                    eos_id=eos_id,
                    beam_size=beam_size,
                    max_out_len=max_out_len,
                    decoding_strategy=decoding_strategy,
                    use_fp16_decoding=use_fp16_decoding,
                    rel_len=rel_len,
                    alpha=alpha)
            except Exception:
                logger.warning(
                    "Exception occurs when using Faster Transformer. " \
                    "The original forward will be involved. ")
                self.transformer = InferTransformerModel(
                    src_vocab_size=src_vocab_size,
                    trg_vocab_size=trg_vocab_size,
                    max_length=max_length,
                    num_encoder_layers=num_encoder_layers,
                    num_decoder_layers=num_decoder_layers,
                    n_head=n_head,
                    d_model=d_model,
                    d_inner_hid=d_inner_hid,
                    dropout=dropout,
                    weight_sharing=weight_sharing,
                    bos_id=bos_id,
                    eos_id=eos_id,
                    beam_size=beam_size,
                    max_out_len=max_out_len,
                    output_time_major=self.output_time_major,
                    beam_search_version=beam_search_version,
                    rel_len=rel_len,
                    alpha=alpha)
        else:
            self.transformer = InferTransformerModel(
                src_vocab_size=src_vocab_size,
                trg_vocab_size=trg_vocab_size,
                max_length=max_length,
                num_encoder_layers=num_encoder_layers,
                num_decoder_layers=num_decoder_layers,
                n_head=n_head,
                d_model=d_model,
                d_inner_hid=d_inner_hid,
                dropout=dropout,
                weight_sharing=weight_sharing,
                bos_id=bos_id,
                eos_id=eos_id,
                beam_size=beam_size,
                max_out_len=max_out_len,
                output_time_major=self.output_time_major,
                beam_search_version=beam_search_version,
                rel_len=rel_len,
                alpha=alpha)
Example #7
class TransformerGenerator(paddle.nn.Layer):
    """
    The Transformer model for auto-regressive generation with beam search. It wraps
    `FasterTransformer` and `InferTransformerModel`, and automatically chooses to use
    `FasterTransformer` (with JIT building) or the slower version `InferTransformerModel`.

    Args:
        src_vocab_size (int):
            The size of source vocabulary.
        trg_vocab_size (int):
            The size of target vocabulary.
        max_length (int):
            The maximum length of input sequences.
        num_encoder_layers (int):
            The number of sub-layers to be stacked in the encoder.
        num_decoder_layers (int):
            The number of sub-layers to be stacked in the decoder.
        n_head (int):
            The number of heads used in multi-head attention.
        d_model (int):
            The dimension for word embeddings, which is also the last dimension of
            the input and output of multi-head attention, position-wise feed-forward
            networks, encoder and decoder.
        d_inner_hid (int):
            Size of the hidden layer in position-wise feed-forward networks.
        dropout (float):
            Dropout rates. Used for pre-process, activation and inside attention.
        weight_sharing (bool):
            Whether to use weight sharing. 
        bos_id (int, optional):
            The start token id, which is also used as the padding id. Defaults to 0.
        eos_id (int, optional):
            The end token id. Defaults to 1.
        beam_size (int, optional):
            The beam width for beam search. Defaults to 4. 
        max_out_len (int, optional):
            The maximum output length. Defaults to 256.
        kwargs:
            The key word arguments can be `output_time_major`, `use_ft`, `use_fp16_decoding`,
            `rel_len`, `alpha`:

            - `output_time_major(bool, optional)`: Indicates the data layout of the predicted
            Tensor. If `False`, the data layout would be batch major with shape
            `[batch_size, seq_len, beam_size]`. If `True`, the data layout would
            be time major with shape `[seq_len, batch_size, beam_size]`. Defaults
            to `False`.

            - `use_ft(bool, optional)`: Whether to use FasterTransformer
            for decoding. Defaults to `True` if not set.

            - `use_fp16_decoding(bool, optional)`: Whether to use fp16
            for decoding. Only works when using FasterTransformer.

            - `beam_search_version(str, optional)`: Indicates the strategy of
            beam search. It can be 'v1' or 'v2'. 'v2' would select the top
            `beam_size * 2` beams and process the top `beam_size` alive and
            finished beams in them separately, while 'v1' would only select the
            top `beam_size` beams and mix up the alive and finished beams. 'v2' always
            searches more and gets better results, since the number of alive beams is
            always `beam_size`, while the number of alive beams in 'v1' might
            decrease when meeting the end token. However, 'v2' always generates
            longer results and thus might do more computation and be slower.

            - `rel_len(bool, optional)`: Indicates whether `max_out_len` is
            the length relative to that of the source text. Only works in 'v2' temporarily.
            It is suggested to set a small `max_out_len` and use `rel_len=True`.
            Defaults to `False` if not set.

            - `alpha(float, optional)`: The power number in length penalty
            calculation. Refer to `GNMT <https://arxiv.org/pdf/1609.08144.pdf>`_.
            Only works in 'v2' temporarily. Defaults to 0.6 if not set.
    """
    def __init__(self,
                 src_vocab_size,
                 trg_vocab_size,
                 max_length,
                 num_encoder_layers,
                 num_decoder_layers,
                 n_head,
                 d_model,
                 d_inner_hid,
                 dropout,
                 weight_sharing,
                 bos_id=0,
                 eos_id=1,
                 beam_size=4,
                 max_out_len=256,
                 **kwargs):
        logger.warning(
            "TransformerGenerator is an experimental API and subject to change."
        )
        # `kwargs` can include output_time_major, use_fp16_decoding, topk, topp.
        # The latter three arguments only work when using FasterTransformer;
        # topk and topp will be exposed later.
        super(TransformerGenerator, self).__init__()
        self.d_model = d_model
        self.max_length = max_length
        self.output_time_major = kwargs.pop("output_time_major", True)
        use_fp16_decoding = kwargs.pop("use_fp16_decoding", False)
        use_ft = kwargs.pop("use_ft", True)
        beam_search_version = kwargs.pop("beam_search_version", "v1")
        rel_len = kwargs.pop("rel_len", False)
        alpha = kwargs.pop("alpha", 0.6)

        if use_ft:
            try:
                load("FasterTransformer", verbose=True)
                decoding_strategy = ("beam_search_v2" if beam_search_version
                                     == "v2" else "beam_search")
                self.transformer = FasterTransformer(
                    src_vocab_size=src_vocab_size,
                    trg_vocab_size=trg_vocab_size,
                    max_length=max_length,
                    num_encoder_layers=num_encoder_layers,
                    num_decoder_layers=num_decoder_layers,
                    n_head=n_head,
                    d_model=d_model,
                    d_inner_hid=d_inner_hid,
                    dropout=dropout,
                    weight_sharing=weight_sharing,
                    bos_id=bos_id,
                    eos_id=eos_id,
                    beam_size=beam_size,
                    max_out_len=max_out_len,
                    decoding_strategy=decoding_strategy,
                    use_fp16_decoding=use_fp16_decoding,
                    rel_len=rel_len,
                    alpha=alpha)
            except Exception:
                logger.warning(
                    "Exception occurs when using Faster Transformer. " \
                    "The original forward will be involved. ")
                self.transformer = InferTransformerModel(
                    src_vocab_size=src_vocab_size,
                    trg_vocab_size=trg_vocab_size,
                    max_length=max_length,
                    num_encoder_layers=num_encoder_layers,
                    num_decoder_layers=num_decoder_layers,
                    n_head=n_head,
                    d_model=d_model,
                    d_inner_hid=d_inner_hid,
                    dropout=dropout,
                    weight_sharing=weight_sharing,
                    bos_id=bos_id,
                    eos_id=eos_id,
                    beam_size=beam_size,
                    max_out_len=max_out_len,
                    output_time_major=self.output_time_major,
                    beam_search_version=beam_search_version,
                    rel_len=rel_len,
                    alpha=alpha)
        else:
            self.transformer = InferTransformerModel(
                src_vocab_size=src_vocab_size,
                trg_vocab_size=trg_vocab_size,
                max_length=max_length,
                num_encoder_layers=num_encoder_layers,
                num_decoder_layers=num_decoder_layers,
                n_head=n_head,
                d_model=d_model,
                d_inner_hid=d_inner_hid,
                dropout=dropout,
                weight_sharing=weight_sharing,
                bos_id=bos_id,
                eos_id=eos_id,
                beam_size=beam_size,
                max_out_len=max_out_len,
                output_time_major=self.output_time_major,
                beam_search_version=beam_search_version,
                rel_len=rel_len,
                alpha=alpha)

    def forward(self, src_word):
        r"""
        Performs decoding for transformer model.

        Args:
            src_word (Tensor):
                The ids of source sequence words. It is a tensor with shape
                `[batch_size, source_sequence_length]` and its data type can be
                int or int64.
        
        Returns:
            Tensor:
                An int64 tensor indicating the predicted ids. Its shape is
                `[batch_size, seq_len, beam_size]` or `[seq_len, batch_size, beam_size]`
                according to `output_time_major`. However, when using FasterTransformer
                and beam search v2, the beam dimension is doubled to include
                both the top `beam_size` alive and finished beams, thus the tensor
                shape is `[batch_size, seq_len, beam_size * 2]` or `[seq_len, batch_size, beam_size * 2]`.
        
        Example:
            .. code-block::

                import paddle
                from paddlenlp.ops import TransformerGenerator

                transformer = TransformerGenerator(
                    src_vocab_size=30000,
                    trg_vocab_size=30000,
                    max_length=256,
                    num_encoder_layers=6,
                    num_decoder_layers=6,
                    n_head=8,
                    d_model=512,
                    d_inner_hid=2048,
                    dropout=0.1,
                    weight_sharing=True,
                    bos_id=0,
                    eos_id=1,
                    beam_size=4,
                    max_out_len=256)

                batch_size = 5
                seq_len = 10
                transformer(
                    src_word=paddle.randint(low=3, high=30000, shape=[batch_size, seq_len]))
        """
        out = self.transformer(src_word)
        # TODO(guosheng): FasterTransformer has an output with layout
        # `[seq_len, batch_size, beam_size]`, while the output layout of the
        # original model is `[batch_size, seq_len, beam_size]`. Maybe we need
        # to unify them later.
        if not self.output_time_major and isinstance(self.transformer,
                                                     FasterTransformer):
            out = paddle.transpose(out, [1, 0, 2])
        return out

    def load(self, path):
        if isinstance(self.transformer, FasterTransformer):
            self.transformer.load(path)
        else:
            model_dict = paddle.load(path)
            self.transformer.load_dict(model_dict)
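
The keyword arguments documented in the class docstring are consumed via **kwargs; a minimal construction sketch passing them explicitly (all values here are illustrative, not recommended settings):

generator = TransformerGenerator(
    src_vocab_size=30000,
    trg_vocab_size=30000,
    max_length=256,
    num_encoder_layers=6,
    num_decoder_layers=6,
    n_head=8,
    d_model=512,
    d_inner_hid=2048,
    dropout=0.1,
    weight_sharing=True,
    beam_size=4,
    max_out_len=64,
    use_ft=False,               # skip FasterTransformer and use InferTransformerModel
    output_time_major=False,    # output shape [batch_size, seq_len, beam_size]
    beam_search_version="v2",
    rel_len=True,               # interpret max_out_len relative to the source length
    alpha=0.6)                  # length-penalty power, only used by beam search v2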