Example #1
    def __init__(self, args, src_dict, dst_dict, embed_tokens, left_pad=False):
        super().__init__(dst_dict)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        embed_dim = embed_tokens.embedding_dim
        padding_idx = embed_tokens.padding_idx

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = fairseq_transformer.PositionalEmbedding(
            1024,
            embed_dim,
            padding_idx,
            left_pad=left_pad,
            learned=args.decoder_learned_pos,
        )

        self.layers = nn.ModuleList([])
        self.layers.extend([
            fairseq_transformer.TransformerDecoderLayer(args)
            for i in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dst_dict),
                args.decoder_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.dropout,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dst_dict), embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=embed_dim**-0.5)

        self.vocab_reduction_module = None
        if args.vocab_reduction_params:
            assert (
                self.adaptive_softmax is None
            ), "vocabulary reduction not compatible with adaptive softmax!"
            self.vocab_reduction_module = vocab_reduction.VocabReduction(
                src_dict, dst_dict, args.vocab_reduction_params)

        self.onnx_trace = False
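
The constructor above only sets up the output parameters; how they are applied is not shown here. As a minimal, hypothetical sketch (names and sizes invented, adaptive-softmax branch omitted), the usual fairseq-style choice between a shared input embedding and a separate embed_out matrix looks roughly like this:

import torch
import torch.nn.functional as F

def project_to_vocab(features, embed_tokens_weight, embed_out=None,
                     share_input_output_embed=True):
    # Shared case: reuse the (vocab, embed_dim) input embedding as output weights.
    if share_input_output_embed:
        return F.linear(features, embed_tokens_weight)
    # Otherwise use the separately initialized embed_out parameter.
    return F.linear(features, embed_out)

# Toy usage with made-up sizes.
vocab, dim = 100, 16
logits = project_to_vocab(torch.randn(4, dim), torch.randn(vocab, dim))  # (4, 100)
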
Example #2
    def _init_components(self, args, src_dict, dst_dict, embed_tokens):
        self.initial_rnn_layer = nn.LSTM(input_size=self.initial_input_dim,
                                         hidden_size=self.lstm_units)

        self.proj_encoder_layer = None
        if self.attention_dim != self.encoder_output_dim:
            self.proj_encoder_layer = fairseq_transformer.Linear(
                self.encoder_output_dim, self.attention_dim)

        self.proj_layer = None
        if self.lstm_units != self.attention_dim:
            self.proj_layer = fairseq_transformer.Linear(
                self.lstm_units, self.attention_dim)

        self.attention = fairseq_transformer.MultiheadAttention(
            self.attention_dim,
            self.num_attention_heads,
            dropout=args.attention_dropout)

        self.extra_rnn_layers = nn.ModuleList([])
        for _ in range(self.num_layers - 1):
            self.extra_rnn_layers.append(
                nn.LSTM(input_size=self.input_dim,
                        hidden_size=self.lstm_units))

        self.bottleneck_layer = None
        if self.bottleneck_dim is not None:
            self.out_embed_dim = self.bottleneck_dim
            self.bottleneck_layer = fairseq_transformer.Linear(
                self.input_dim, self.out_embed_dim)
        else:
            self.out_embed_dim = self.input_dim

        self.embed_out = nn.Parameter(
            torch.Tensor(len(dst_dict), self.out_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=self.out_embed_dim**-0.5)

        self.vocab_reduction_module = None
        if args.vocab_reduction_params:
            self.vocab_reduction_module = vocab_reduction.VocabReduction(
                src_dict,
                dst_dict,
                args.vocab_reduction_params,
                fp16=args.fp16)

        self.onnx_trace = False
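
As a rough shape-level illustration of how the optional encoder projection and the attention module above fit together, here is a standalone sketch using torch.nn.MultiheadAttention and invented dimensions (the snippet itself uses fairseq_transformer.MultiheadAttention and the LSTM state as the query):

import torch
import torch.nn as nn

encoder_output_dim, attention_dim, num_heads = 512, 256, 4
proj_encoder_layer = nn.Linear(encoder_output_dim, attention_dim)
attention = nn.MultiheadAttention(attention_dim, num_heads, dropout=0.1)

encoder_out = torch.randn(20, 3, encoder_output_dim)   # (src_len, batch, dim)
query = torch.randn(1, 3, attention_dim)               # one decoder time step

key = value = proj_encoder_layer(encoder_out)          # project to attention_dim
attn_out, attn_weights = attention(query, key, value)  # attn_out: (1, 3, attention_dim)
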
Example #3
    def __init__(
        self,
        src_dict,
        dst_dict,
        vocab_reduction_params=None,
        out_embed_dim=512,
        project_output=True,
    ):
        super().__init__(dst_dict)
        self.project_output = project_output
        if project_output:
            self.num_embeddings = len(dst_dict)
            self.out_embed_dim = out_embed_dim
            self.vocab_reduction_module = None
            if vocab_reduction_params:
                self.vocab_reduction_module = vocab_reduction.VocabReduction(
                    src_dict, dst_dict, vocab_reduction_params)

            self.output_projection_w = nn.Parameter(
                torch.FloatTensor(self.num_embeddings,
                                  self.out_embed_dim).uniform_(-0.1, 0.1))
            self.output_projection_b = nn.Parameter(
                torch.FloatTensor(self.num_embeddings).zero_())
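
A hedged sketch of how parameters like output_projection_w / output_projection_b are typically consumed, including the row selection that a vocabulary-reduction step implies (all sizes and the possible_translation_tokens tensor here are invented for illustration):

import torch
import torch.nn.functional as F

num_embeddings, out_embed_dim = 1000, 512
x = torch.randn(8, out_embed_dim)                                  # decoder features
output_projection_w = torch.empty(num_embeddings, out_embed_dim).uniform_(-0.1, 0.1)
output_projection_b = torch.zeros(num_embeddings)

# Full-vocabulary logits.
logits = F.linear(x, output_projection_w, output_projection_b)     # (8, 1000)

# With vocabulary reduction, only a candidate subset of rows is scored.
possible_translation_tokens = torch.randint(0, num_embeddings, (128,))
reduced_logits = F.linear(
    x,
    output_projection_w[possible_translation_tokens],
    output_projection_b[possible_translation_tokens],
)                                                                  # (8, 128)
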
Example #4
    def __init__(
        self,
        src_dict,
        dst_dict,
        decoders,
        combination_strategy,
        split_encoder=False,
        vocab_reduction_params=None,
    ):
        """Create a new multi-decoder instance.

        Args:
            src_dict (Dictionary): Source language dictionary.
            dst_dict (Dictionary): Target language dictionary.
            decoders (list): List of DecoderWithOutputProjection.
            combination_strategy (string): Name of the combination strategy.
                Passed through to `create_strategy()`.
            split_encoder (bool): If true, split encoder output, each decoder
                gets its own split.
            vocab_reduction_params: For vocabulary reduction.
        """
        super().__init__(dst_dict)
        assert not any(decoder.project_output for decoder in decoders)
        self.decoders = nn.ModuleList(decoders)
        vocab_reduction_module = None
        if vocab_reduction_params:
            vocab_reduction_module = vocab_reduction.VocabReduction(
                src_dict, dst_dict, vocab_reduction_params
            )
        self.combi_strat = create_strategy(
            combination_strategy,
            [decoder.out_embed_dim for decoder in decoders],
            len(dst_dict),
            vocab_reduction_module,
        )
        self.split_encoder = split_encoder
Example #5
    def __init__(
        self,
        src_dict,
        dst_dict,
        vocab_reduction_params=None,
        encoder_hidden_dim=512,
        embed_dim=512,
        freeze_embed=False,
        hidden_dim=512,
        out_embed_dim=512,
        cell_type="lstm",
        num_layers=1,
        dropout_in=0.1,
        dropout_out=0.1,
        attention_type="dot",
        residual_level=None,
        averaging_encoder=False,
    ):
        super().__init__(dst_dict)
        self.encoder_hidden_dim = encoder_hidden_dim
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.out_embed_dim = out_embed_dim
        self.dropout_in = dropout_in
        self.dropout_out = dropout_out
        self.attention_type = attention_type
        self.residual_level = residual_level

        num_embeddings = len(dst_dict)
        padding_idx = dst_dict.pad()
        self.embed_tokens = Embedding(
            num_embeddings=num_embeddings,
            embedding_dim=embed_dim,
            padding_idx=padding_idx,
            freeze_embed=freeze_embed,
        )
        self.hidden_dim = hidden_dim
        self.averaging_encoder = averaging_encoder

        if cell_type == "lstm":
            cell_class = rnn_cell.LSTMCell
        elif cell_type == "milstm":
            cell_class = rnn_cell.MILSTMCell
        elif cell_type == "layer_norm_lstm":
            cell_class = rnn_cell.LayerNormLSTMCell

        if hidden_dim != encoder_hidden_dim:
            hidden_init_fc_list = []
            cell_init_fc_list = []
            for _ in range(num_layers):
                hidden_init_fc_list.append(
                    Linear(encoder_hidden_dim, hidden_dim))
                cell_init_fc_list.append(Linear(encoder_hidden_dim,
                                                hidden_dim))
            self.hidden_init_fc_list = nn.ModuleList(hidden_init_fc_list)
            self.cell_init_fc_list = nn.ModuleList(cell_init_fc_list)
        self.initial_attn_context = nn.Parameter(
            torch.Tensor(encoder_hidden_dim).zero_())

        if attention_type is not None:
            self.attention = attention.build_attention(
                attention_type=attention_type,
                decoder_hidden_state_dim=hidden_dim,
                encoder_output_dim=encoder_hidden_dim,
            )
            self.combined_output_and_context_dim = encoder_hidden_dim + hidden_dim
        else:
            self.attention = None
            self.combined_output_and_context_dim = hidden_dim

        layers = []
        for layer in range(num_layers):
            if layer == 0:
                if self.attention is not None:
                    cell_input_dim = encoder_hidden_dim + embed_dim
                else:
                    cell_input_dim = embed_dim
            else:
                cell_input_dim = hidden_dim
            layers.append(
                cell_class(input_dim=cell_input_dim, hidden_dim=hidden_dim))
        self.layers = nn.ModuleList(layers)

        if self.combined_output_and_context_dim != out_embed_dim:
            self.additional_fc = Linear(self.combined_output_and_context_dim,
                                        out_embed_dim)

        self.vocab_reduction_module = None
        if vocab_reduction_params:
            self.vocab_reduction_module = vocab_reduction.VocabReduction(
                src_dict, dst_dict, vocab_reduction_params)

        self.output_projection_w = nn.Parameter(
            torch.FloatTensor(num_embeddings,
                              out_embed_dim).uniform_(-0.1, 0.1))
        self.output_projection_b = nn.Parameter(
            torch.FloatTensor(num_embeddings).zero_())
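
The hidden_init_fc_list / cell_init_fc_list branch above only exists when the encoder and decoder widths differ; here is a self-contained sketch of that per-layer mapping of encoder final states into decoder initial states (dimensions and the encoder_final tensor are made up):

import torch
import torch.nn as nn

encoder_hidden_dim, hidden_dim, num_layers, batch = 256, 512, 2, 4
encoder_final = torch.randn(num_layers, batch, encoder_hidden_dim)

hidden_init_fc_list = nn.ModuleList(
    [nn.Linear(encoder_hidden_dim, hidden_dim) for _ in range(num_layers)])
cell_init_fc_list = nn.ModuleList(
    [nn.Linear(encoder_hidden_dim, hidden_dim) for _ in range(num_layers)])

init_hiddens = [fc(encoder_final[i]) for i, fc in enumerate(hidden_init_fc_list)]
init_cells = [fc(encoder_final[i]) for i, fc in enumerate(cell_init_fc_list)]
# Each entry is (batch, hidden_dim) and can seed the corresponding decoder layer.
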
Example #6
    def __init__(
        self,
        src_dict,
        dst_dict,
        vocab_reduction_params=None,
        out_embed_dim=512,
        project_output=True,
        pretrained_embed=None,
        out_embed_norm=None,
        att_weighted_src_embeds=False,
        src_embed_dim=512,
        att_weighted_activation_type="tanh",
        predictor=None,
        fp16: bool = False,
    ):
        super().__init__(dst_dict)
        self.project_output = project_output
        if project_output:
            self.num_embeddings = len(dst_dict)
            self.out_embed_dim = out_embed_dim
            self.out_embed_norm = out_embed_norm
            self.att_weighted_src_embeds = att_weighted_src_embeds
            self.src_embed_dim = src_embed_dim
            self.vocab_reduction_module = None
            if vocab_reduction_params or predictor is not None:
                self.vocab_reduction_module = vocab_reduction.VocabReduction(
                    src_dict=src_dict,
                    dst_dict=dst_dict,
                    vocab_reduction_params=vocab_reduction_params,
                    predictor=predictor,
                    fp16=fp16,
                )

            projection_weights = torch.FloatTensor(
                self.num_embeddings, self.out_embed_dim
            ).uniform_(-0.1, 0.1)
            if isinstance(pretrained_embed, nn.Embedding):
                # nn.Embedding stores its table in .weight
                projection_weights.data = pretrained_embed.weight.data
            elif pretrained_embed is not None:
                embed_dict = utils.parse_embedding(pretrained_embed)
                # equivalent to utils.load_embedding but for nn.Parameter
                for idx in range(len(dst_dict)):
                    token = dst_dict[idx]
                    if token in embed_dict:
                        projection_weights[idx] = embed_dict[token]
            self.output_projection_w = nn.Parameter(projection_weights)
            self.output_projection_b = nn.Parameter(
                torch.FloatTensor(self.num_embeddings).zero_()
            )
            if att_weighted_activation_type == "tanh":
                activation_fn = nn.Tanh
                self.att_weighted_activation_fn = torch.tanh
            elif att_weighted_activation_type == "relu":
                activation_fn = nn.ReLU
                self.att_weighted_activation_fn = torch.relu
            else:
                raise Exception(
                    "att_weighted_activation_type '%s' not implemented"
                    % att_weighted_activation_type
                )
            if att_weighted_src_embeds:
                self.lexical_layer = NonlinearLayer(
                    self.src_embed_dim,
                    self.out_embed_dim,
                    bias=False,
                    activation_fn=activation_fn,
                )
                self.output_projection_w_lex = nn.Parameter(
                    torch.FloatTensor(self.num_embeddings, self.out_embed_dim).uniform_(
                        -0.1, 0.1
                    )
                )
                self.output_projection_b_lex = nn.Parameter(
                    torch.FloatTensor(self.num_embeddings).zero_()
                )
Example #7
    def __init__(self, args, src_dict, dst_dict, embed_tokens):
        super().__init__(dst_dict)
        self.dropout = args.dropout
        self.decoder_layerdrop = 0
        if hasattr(args, "decoder_layerdrop") and args.decoder_layerdrop > 0:
            self.decoder_layerdrop = args.decoder_layerdrop

        self.share_input_output_embed = args.share_decoder_input_output_embed

        embed_dim = embed_tokens.embedding_dim
        padding_idx = embed_tokens.padding_idx

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = fairseq_transformer.PositionalEmbedding(
            1024, embed_dim, padding_idx, learned=args.decoder_learned_pos)

        self.aan = args.aan
        decoder_layer_class = (AANDecoderLayer if self.aan else
                               fairseq_transformer.TransformerDecoderLayer)

        self.layers = nn.ModuleList([])
        self.layers.extend(
            [decoder_layer_class(args) for i in range(args.decoder_layers)])
        if hasattr(args,
                   "decoder_layers_to_keep") and args.decoder_layers_to_keep:
            layers_to_keep = sorted(
                int(x) for x in args.decoder_layers_to_keep.split(","))
            self.decoder_layers_to_keep = {
                layer_id: layer_idx
                for layer_idx, layer_id in enumerate(layers_to_keep)
            }

        self.adaptive_softmax = None

        self.bottleneck_layer = None
        out_embed_dim = embed_dim
        if args.decoder_out_embed_dim is not None:
            assert (
                not args.share_all_embeddings
                and not args.share_decoder_input_output_embed
            ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
            self.bottleneck_layer = fairseq_transformer.Linear(
                embed_dim, args.decoder_out_embed_dim)
            out_embed_dim = args.decoder_out_embed_dim

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dst_dict),
                out_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.dropout,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dst_dict), out_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim**-0.5)

        self.vocab_reduction_module = None
        if args.vocab_reduction_params:
            assert (
                self.adaptive_softmax is None
            ), "vocabulary reduction not compatible with adaptive softmax!"
            self.vocab_reduction_module = vocab_reduction.VocabReduction(
                src_dict,
                dst_dict,
                args.vocab_reduction_params,
                fp16=args.fp16)

        self.onnx_trace = False

        # Use quantizable nn.Linear for output projection instead of F.linear
        self.output_projection = None
        if self.vocab_reduction_module is None:
            if self.share_input_output_embed:
                self.output_projection = nn.Linear(
                    self.embed_tokens.weight.shape[1],
                    self.embed_tokens.weight.shape[0])
                self.output_projection.weight = self.embed_tokens.weight
            else:
                self.output_projection = nn.Linear(self.embed_out.shape[1],
                                                   self.embed_out.shape[0])
                self.output_projection.weight = self.embed_out
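
The quantizable output projection at the end ties an nn.Linear to already existing weights instead of calling F.linear directly; a minimal standalone sketch of that weight-tying pattern with invented sizes:

import torch
import torch.nn as nn

vocab_size, embed_dim = 1000, 512
embed_tokens = nn.Embedding(vocab_size, embed_dim)

output_projection = nn.Linear(embed_dim, vocab_size, bias=False)
output_projection.weight = embed_tokens.weight   # same Parameter object, weights stay shared

features = torch.randn(8, embed_dim)
logits = output_projection(features)             # (8, vocab_size)
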
Example #8
    def __init__(
        self,
        src_dict,
        dst_dict,
        decoders,
        combination_strategy,
        is_lm=None,
        split_encoder=False,
        vocab_reduction_params=None,
        training_schedule="complete",
        fixed_weights=None,
    ):
        """Create a new multi-decoder instance.

        Args:
            src_dict (Dictionary): Source language dictionary.
            dst_dict (Dictionary): Target language dictionary.
            decoders (list): List of DecoderWithOutputProjection.
            combination_strategy (string): Name of the combination strategy.
                Passed through to `create_strategy()`.
            is_lm (list): List of booleans determining whether the n-th
                decoder is a language model. If None, none of the decoders are
                considered an LM.
            split_encoder (bool): If true, split encoder output, each decoder
                gets its own split.
            vocab_reduction_params: For vocabulary reduction.
            training_schedule (str): Training strategy.
            fixed_weights (list): None or list of floats. If specified, use
                these fixed model weights in weighted* combination strategies.
        """
        super().__init__(dst_dict)
        if is_lm is None:
            is_lm = [False] * len(decoders)
        assert not any(decoder.project_output for decoder in decoders)
        assert len(is_lm) == len(decoders)
        self.attentive_decoder_ids = [i for i, b in enumerate(is_lm) if not b]
        self.decoders_is_lm = is_lm
        self.decoders = nn.ModuleList(decoders)
        vocab_reduction_module = None
        if vocab_reduction_params:
            vocab_reduction_module = vocab_reduction.VocabReduction(
                src_dict, dst_dict, vocab_reduction_params)
        self.combi_strat = create_strategy(
            combination_strategy,
            [decoder.out_embed_dim for decoder in decoders],
            len(dst_dict),
            vocab_reduction_module,
            fixed_weights,
        )
        self.split_encoder = split_encoder
        self.unfreeze_single = False
        self.separate_training = False
        self.unfreeze_idx = -1
        if self.training:
            if training_schedule in ["freeze_all", "freeze_all_decoders"]:
                self.freeze_decoders()
            elif training_schedule.startswith(
                    "unfreeze_dec_") or training_schedule.startswith(
                        "unfreeze_encdec_"):
                _, _, n = training_schedule.split("_")
                self.freeze_decoders(int(n))
            elif training_schedule in [
                    "unfreeze_single", "unfreeze_single_decoder"
            ]:
                self.unfreeze_single = True
                self.unfreeze_mod = len(decoders)
            elif training_schedule == "separate":
                self.unfreeze_single = True
                self.unfreeze_mod = len(decoders) + 1
                self.separate_training = True
            elif training_schedule != "complete":
                raise RuntimeError(
                    f"Unknown training schedule '{training_schedule}'")
Example #9
    def __init__(
        self,
        args,
        src_dict,
        dst_dict,
        embed_tokens,
        no_encoder_attn=False,
        left_pad=False,
        final_norm=True,
    ):
        super().__init__(dst_dict)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim

        self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                               if embed_dim != input_embed_dim else None)

        self.embed_positions = fairseq_transformer.PositionalEmbedding(
            1024, embed_dim, padding_idx, learned=args.decoder_learned_pos)

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerAANDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.bottleneck_layer = None
        out_embed_dim = embed_dim
        if args.decoder_out_embed_dim is not None:
            assert (
                not args.share_all_embeddings
                and not args.share_decoder_input_output_embed
            ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
            self.bottleneck_layer = Linear(embed_dim,
                                           args.decoder_out_embed_dim)
            out_embed_dim = args.decoder_out_embed_dim

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dst_dict),
                out_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens
                if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dst_dict), out_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim**-0.5)
        self.register_buffer("version", torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)

        self.vocab_reduction_module = None
        if args.vocab_reduction_params:
            assert (
                self.adaptive_softmax is None
            ), "vocabulary reduction not compatible with adaptive softmax!"
            self.vocab_reduction_module = vocab_reduction.VocabReduction(
                src_dict,
                dst_dict,
                args.vocab_reduction_params,
                fp16=args.fp16)

        self.onnx_trace = False