def __init__(
        self,
        dictionary,
        embed_dim=512,
        hidden_size=512,
        out_embed_dim=512,
        num_layers=1,
        dropout_in=0.1,
        dropout_out=0.1,
        attention=True,
        encoder_embed_dim=512,
        encoder_output_units=512,
        pretrained_embed=None,
        share_input_output_embed=False,
        adaptive_softmax_cutoff=None,
    ):
        super().__init__(dictionary)
        self.dropout_in = dropout_in
        self.dropout_out = dropout_out
        self.hidden_size = hidden_size
        self.share_input_output_embed = share_input_output_embed
        self.need_attn = True
        self.num_layers = num_layers
        self.adaptive_softmax = None
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        if pretrained_embed is None:
            self.embed_tokens = Embedding(num_embeddings, embed_dim,
                                          padding_idx)
        else:
            self.embed_tokens = pretrained_embed

        self.encoder_output_units = encoder_output_units
        assert encoder_output_units == hidden_size, \
            'encoder_output_units ({}) != hidden_size ({})'.format(encoder_output_units, hidden_size)
        # TODO another Linear layer if not equal

        self.layers = nn.ModuleList([
            LSTMCell(
                input_size=(encoder_output_units + embed_dim)
                if layer == 0 else hidden_size,
                hidden_size=hidden_size,
            ) for layer in range(num_layers)
        ])

        self.attention_layers = nn.ModuleList()
        for i in range(num_layers):
            self.attention_layers.append(
                AttentionLayer(encoder_output_units, hidden_size))
        # self.attention = AttentionLayer(encoder_output_units, hidden_size) if attention else None
        if hidden_size != out_embed_dim:
            self.additional_fc = Linear(hidden_size, out_embed_dim)
        if adaptive_softmax_cutoff is not None:
            # setting adaptive_softmax dropout to dropout_out for now but can be redefined
            self.adaptive_softmax = AdaptiveSoftmax(num_embeddings,
                                                    embed_dim,
                                                    adaptive_softmax_cutoff,
                                                    dropout=dropout_out)
        elif not self.share_input_output_embed:
            self.fc_out = Linear(out_embed_dim,
                                 num_embeddings,
                                 dropout=dropout_out)

        self.re_fc3 = Linear(out_embed_dim,
                             num_embeddings,
                             dropout=dropout_out)
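
For reference, a minimal standalone sketch (assuming the defaults embed_dim=512, hidden_size=512, encoder_output_units=512 above, and plain torch.nn.LSTMCell in place of the project's LSTMCell wrapper) of the per-layer input widths: only the first layer consumes the concatenated attention context and token embedding, while deeper layers consume the previous layer's hidden state.

import torch.nn as nn

embed_dim = hidden_size = encoder_output_units = 512
layers = nn.ModuleList([
    nn.LSTMCell(
        input_size=(encoder_output_units + embed_dim) if layer == 0 else hidden_size,
        hidden_size=hidden_size,
    ) for layer in range(2)
])
assert layers[0].input_size == 1024 and layers[1].input_size == 512
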
Example #2
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
        self.args = args
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))
        self._future_mask = torch.empty(0)

        self.dropout = args.dropout
        self.decoder_layerdrop = args.decoder_layerdrop
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        self.embed_dim = embed_dim
        self.output_embed_dim = args.decoder_output_dim

        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(
            embed_dim)

        if not args.adaptive_input and args.quant_noise_pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(embed_dim, embed_dim, bias=False),
                args.quant_noise_pq,
                args.quant_noise_pq_block_size,
            )
        else:
            self.quant_noise = None

        self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                               if embed_dim != input_embed_dim else None)

        self.embed_positions = (PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            self.padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None)

        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None

        self.cross_self_attention = getattr(args, "cross_self_attention",
                                            False)

        if self.decoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend([
            self.build_decoder_layer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])
        self.num_layers = len(self.layers)

        if args.decoder_normalize_before and not getattr(
                args, "no_decoder_final_norm", False):
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None

        self.project_out_dim = (Linear(
            embed_dim, self.output_embed_dim, bias=False)
                                if embed_dim != self.output_embed_dim
                                and not args.tie_adaptive_weights else None)

        self.adaptive_softmax = None
        self.output_projection = None
        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens
                if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif self.share_input_output_embed:
            self.output_projection = nn.Linear(
                self.embed_tokens.weight.shape[1],
                self.embed_tokens.weight.shape[0],
                bias=False,
            )
            self.output_projection.weight = self.embed_tokens.weight
        else:
            self.output_projection = nn.Linear(self.output_embed_dim,
                                               len(dictionary),
                                               bias=False)
            nn.init.normal_(self.output_projection.weight,
                            mean=0,
                            std=self.output_embed_dim**-0.5)
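
When share_decoder_input_output_embed is set, the branch above reuses the token-embedding matrix as the output projection, so logits are dot products against the embedding table. A small standalone check of that tying (plain torch, toy sizes, not fairseq's Linear helper):

import torch
import torch.nn as nn
import torch.nn.functional as F

emb = nn.Embedding(100, 16)             # vocab x dim token embeddings
proj = nn.Linear(16, 100, bias=False)   # dim -> vocab output projection
proj.weight = emb.weight                # tie the two matrices
h = torch.randn(2, 5, 16)               # decoder states
assert torch.allclose(proj(h), F.linear(h, emb.weight))
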
    def __init__(self,
                 args,
                 dictionary,
                 embed_tokens,
                 no_encoder_attn=False,
                 left_pad=False,
                 final_norm=True):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        output_embed_dim = args.decoder_output_dim

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(
            embed_dim)  # todo: try with input_embed_dim

        self.project_in_dim = Linear(
            input_embed_dim, embed_dim,
            bias=False) if embed_dim != input_embed_dim else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            padding_idx,
            left_pad=left_pad,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.layers = nn.ModuleList([])
        self.layers.extend([
            ContextTransformerDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \
            if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens
                if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), output_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim**-0.5)
        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)
Example #4
class transformer_with_copyDecoder(FairseqIncrementalDecoder):
    """
    transformer_with_copy decoder consisting of *args.decoder_layers* layers. Each layer
    is a :class:`transformer_with_copyDecoderLayer`.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        dictionary (~fairseq.data.Dictionary): decoding dictionary
        embed_tokens (torch.nn.Embedding): output embedding
        no_encoder_attn (bool, optional): whether to attend to encoder outputs
            (default: False).
        final_norm (bool, optional): apply layer norm to the output of the
            final decoder layer (default: True).
    """
    def __init__(self,
                 args,
                 dictionary,
                 embed_tokens,
                 no_encoder_attn=False,
                 final_norm=True):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        output_embed_dim = args.decoder_output_dim

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(
            embed_dim)  # todo: try with input_embed_dim

        self.project_in_dim = Linear(
            input_embed_dim, embed_dim,
            bias=False) if embed_dim != input_embed_dim else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.layers = nn.ModuleList([])
        self.layers.extend([
            transformer_with_copyDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.copy_attention = MultiheadOnlyAttention(
            embed_dim,
            1,
            dropout=0,
        )
        self.copy_or_generate = nn.Sequential(nn.Linear(embed_dim, 1),
                                              nn.Sigmoid())

        self.adaptive_softmax = None

        self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \
            if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens
                if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), output_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim**-0.5)
        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)

    def forward(self,
                prev_output_tokens,
                encoder_out=None,
                incremental_state=None):
        """
        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for input feeding/teacher forcing
            encoder_out (Tensor, optional): output from the encoder, used for
                encoder-side attention
            incremental_state (dict): dictionary used for storing state during
                :ref:`Incremental decoding`

        Returns:
            tuple:
                - the last decoder layer's output of shape `(batch, tgt_len,
                  vocab)`
                - the last decoder layer's attention weights of shape `(batch,
                  tgt_len, src_len)`
        """
        # embed positions
        positions = self.embed_positions(
            prev_output_tokens,
            incremental_state=incremental_state,
        ) if self.embed_positions is not None else None

        if incremental_state is not None:
            prev_output_tokens = prev_output_tokens[:, -1:]
            if positions is not None:
                positions = positions[:, -1:]

        # embed tokens and positions
        x = self.embed_scale * self.embed_tokens(prev_output_tokens)

        if self.project_in_dim is not None:
            x = self.project_in_dim(x)

        if positions is not None:
            x += positions
        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        inner_states = [x]

        # decoder layers
        for layer in self.layers:
            x, _ = layer(
                x,
                encoder_out['encoder_out']
                if encoder_out is not None else None,
                encoder_out['encoder_padding_mask']
                if encoder_out is not None else None,
                incremental_state,
                self_attn_mask=self.buffered_future_mask(x)
                if incremental_state is None else None,
            )
            inner_states.append(x)

        if self.normalize:
            x = self.layer_norm(x)

        _, copy = self.copy_attention(
            query=x,
            key=encoder_out['encoder_out']
            if encoder_out is not None else None,
            value=encoder_out['encoder_out']
            if encoder_out is not None else None,
            key_padding_mask=encoder_out['encoder_padding_mask']
            if encoder_out is not None else None,
            incremental_state=incremental_state,
            static_kv=True,
            need_weights=True,
        )

        copy_or_generate = self.copy_or_generate(x).transpose(0, 1)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        if self.project_out_dim is not None:
            x = self.project_out_dim(x)

        if self.adaptive_softmax is None:
            # project back to size of vocabulary
            if self.share_input_output_embed:
                x = F.linear(x, self.embed_tokens.weight)
            else:
                x = F.linear(x, self.embed_out)

        return x, {
            'attn': copy,
            'inner_states': inner_states,
            'copy_or_generate': copy_or_generate
        }

    def get_normalized_probs(self, net_output, log_probs, sample):
        """Get normalized probabilities (or log probs) from a net's output."""
        # print('enter normalized.')
        if 'net_input' in sample.keys():
            enc_seq_ids = sample['net_input']['src_tokens']
        else:
            enc_seq_ids = sample['src_tokens']

        # wvocab_size = net_output[0].size(2)
        # batch_size = enc_seq_ids.size(0)
        # seq_len = enc_seq_ids.size(1)
        # one_hot = torch.zeros(batch_size, seq_len, wvocab_size).cuda().scatter_(dim=2, index=enc_seq_ids.unsqueeze(-1), value=1)
        #
        # copy_probs = torch.matmul(net_output[1]['attn'], one_hot)

        # final_dist = vocab_dist.scatter_add(1, encoder_batch_extend_vocab, attn_dist)

        if hasattr(self,
                   'adaptive_softmax') and self.adaptive_softmax is not None:
            if sample is not None:
                assert 'target' in sample
                target = sample['target']
            else:
                target = None
            out = self.adaptive_softmax.get_log_prob(net_output[0],
                                                     target=target)
            return out.exp_() if not log_probs else out

        logits = net_output[0]
        if log_probs:
            generate = utils.softmax(
                logits, dim=-1,
                onnx_trace=self.onnx_trace) * net_output[1]['copy_or_generate']
            copy = net_output[1]['attn'] * (1 -
                                            net_output[1]['copy_or_generate'])
            enc_seq_ids = enc_seq_ids.unsqueeze(1).repeat(
                1, net_output[1]['attn'].size(1), 1)
            final = generate.scatter_add(2, enc_seq_ids, copy)
            final = torch.log(final + 1e-15)
            return final
        else:
            # mix in probability space here as well; log-probabilities would corrupt the copy mixture
            generate = utils.softmax(
                logits, dim=-1,
                onnx_trace=self.onnx_trace) * net_output[1]['copy_or_generate']
            copy = net_output[1]['attn'] * (1 -
                                            net_output[1]['copy_or_generate'])
            enc_seq_ids = enc_seq_ids.unsqueeze(1).repeat(
                1, net_output[1]['attn'].size(1), 1)
            final = generate.scatter_add(2, enc_seq_ids, copy)
            return final
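
A toy-shaped sketch (plain torch, hypothetical sizes) of the copy/generate mixture computed above: the generation distribution is scaled by the copy_or_generate gate, the copy attention is scatter-added onto the source token ids, and the result still sums to one.

import torch

B, T, S, V = 2, 3, 4, 10                                  # batch, target len, source len, vocab
gen = torch.softmax(torch.randn(B, T, V), dim=-1)         # generation distribution
copy_attn = torch.softmax(torch.randn(B, T, S), dim=-1)   # attention over source positions
gate = torch.rand(B, T, 1)                                # copy_or_generate in (0, 1)
src_tokens = torch.randint(0, V, (B, S))

enc_seq_ids = src_tokens.unsqueeze(1).repeat(1, T, 1)     # B x T x S
final = (gen * gate).scatter_add(2, enc_seq_ids, copy_attn * (1 - gate))
assert torch.allclose(final.sum(-1), torch.ones(B, T))    # still a proper distribution
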

    def max_positions(self):
        """Maximum output length supported by the decoder."""
        if self.embed_positions is None:
            return self.max_target_positions
        return min(self.max_target_positions,
                   self.embed_positions.max_positions())

    def buffered_future_mask(self, tensor):
        dim = tensor.size(0)
        if (not hasattr(self, '_future_mask')
                or self._future_mask is None
                or self._future_mask.device != tensor.device):
            self._future_mask = torch.triu(
                utils.fill_with_neg_inf(tensor.new(dim, dim)), 1)
        if self._future_mask.size(0) < dim:
            self._future_mask = torch.triu(
                utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)),
                1)
        return self._future_mask[:dim, :dim]
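
buffered_future_mask caches an upper-triangular matrix of -inf that is added to the decoder self-attention scores. A standalone sketch of the same mask, using torch.full in place of utils.fill_with_neg_inf:

import torch

dim = 4
mask = torch.triu(torch.full((dim, dim), float('-inf')), diagonal=1)
# row i leaves positions <= i untouched and sends future positions to -inf,
# so they receive zero weight after the attention softmax
print(mask)
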

    def upgrade_state_dict_named(self, state_dict, name):
        """Upgrade a (possibly old) state dict for new versions of fairseq."""
        if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
            weights_key = '{}.embed_positions.weights'.format(name)
            if weights_key in state_dict:
                del state_dict[weights_key]
            state_dict['{}.embed_positions._float_tensor'.format(
                name)] = torch.FloatTensor(1)

        for i in range(len(self.layers)):
            # update layer norms
            layer_norm_map = {
                '0': 'self_attn_layer_norm',
                '1': 'encoder_attn_layer_norm',
                '2': 'final_layer_norm'
            }
            for old, new in layer_norm_map.items():
                for m in ('weight', 'bias'):
                    k = '{}.layers.{}.layer_norms.{}.{}'.format(
                        name, i, old, m)
                    if k in state_dict:
                        state_dict['{}.layers.{}.{}.{}'.format(
                            name, i, new, m)] = state_dict[k]
                        del state_dict[k]
        if utils.item(
                state_dict.get('{}.version'.format(name), torch.Tensor(
                    [1]))[0]) < 2:
            # earlier checkpoints did not normalize after the stack of layers
            self.layer_norm = None
            self.normalize = False
            state_dict['{}.version'.format(name)] = torch.Tensor([1])

        return state_dict
Example #5
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
        super().__init__(dictionary)
        self.register_buffer('version', torch.Tensor([3]))

        self.dropout = args.dropout
        self.decoder_layerdrop = args.decoder_layerdrop
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        self.output_embed_dim = args.decoder_output_dim

        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(
            embed_dim)

        self.project_in_dim = Linear(
            input_embed_dim, embed_dim,
            bias=False) if embed_dim != input_embed_dim else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            self.padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.cross_self_attention = getattr(args, 'cross_self_attention',
                                            False)
        self.layer_wise_attention = getattr(args, 'layer_wise_attention',
                                            False)

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \
            if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens
                if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), self.output_embed_dim))
            nn.init.normal_(self.embed_out,
                            mean=0,
                            std=self.output_embed_dim**-0.5)

        if args.decoder_normalize_before and not getattr(
                args, 'no_decoder_final_norm', False):
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None
        if getattr(args, 'layernorm_embedding', False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None
Example #6
    def __init__(
        self,
        dictionary,
        embed_dim=512,
        hidden_size=512,
        out_embed_dim=512,
        num_layers=1,
        dropout_in=0.1,
        dropout_out=0.1,
        encoder_output_units=0,
        attn_type=None,
        attn_dim=0,
        need_attn=False,
        residual=False,
        pretrained_embed=None,
        share_input_output_embed=False,
        adaptive_softmax_cutoff=None,
        max_target_positions=DEFAULT_MAX_TARGET_POSITIONS,
        scheduled_sampling_rate_scheduler=None,
    ):
        super().__init__(dictionary)
        self.dropout_in = dropout_in
        self.dropout_out = dropout_out
        self.hidden_size = hidden_size
        self.share_input_output_embed = share_input_output_embed
        if attn_type is None or attn_type.lower() == 'none':
            # no attention, no encoder output needed (language model case)
            need_attn = False
            encoder_output_units = 0
        self.need_attn = need_attn
        self.residual = residual
        self.max_target_positions = max_target_positions

        self.adaptive_softmax = None
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        if pretrained_embed is None:
            self.embed_tokens = Embedding(num_embeddings, embed_dim,
                                          padding_idx)
        else:
            self.embed_tokens = pretrained_embed

        self.encoder_output_units = encoder_output_units

        self.layers = nn.ModuleList([
            LSTMCell(
                input_size=encoder_output_units +
                (embed_dim if layer == 0 else hidden_size),
                hidden_size=hidden_size,
            ) for layer in range(num_layers)
        ])
        if attn_type is None or attn_type.lower() == 'none':
            self.attention = None
        elif attn_type.lower() == 'bahdanau':
            self.attention = speech_attention.BahdanauAttention(
                hidden_size,
                encoder_output_units,
                attn_dim,
            )
        elif attn_type.lower() == 'luong':
            self.attention = speech_attention.LuongAttention(
                hidden_size,
                encoder_output_units,
            )
        else:
            raise ValueError('unrecognized attention type.')
        if hidden_size + encoder_output_units != out_embed_dim:
            self.additional_fc = Linear(hidden_size + encoder_output_units,
                                        out_embed_dim)
        if adaptive_softmax_cutoff is not None:
            # setting adaptive_softmax dropout to dropout_out for now but can be redefined
            self.adaptive_softmax = AdaptiveSoftmax(num_embeddings,
                                                    hidden_size,
                                                    adaptive_softmax_cutoff,
                                                    dropout=dropout_out)
        elif not self.share_input_output_embed:
            self.fc_out = Linear(out_embed_dim,
                                 num_embeddings,
                                 dropout=dropout_out)

        self.scheduled_sampling_rate_scheduler = scheduled_sampling_rate_scheduler
class FConvCustomDecoder(FairseqIncrementalDecoder):
    """Convolutional decoder"""
    def __init__(
        self,
        dictionary,
        embed_dim=512,
        embed_dict=None,
        out_embed_dim=256,
        max_positions=1024,
        convolutions=((512, 3), ) * 20,
        attention=True,
        dropout=0.1,
        share_embed=False,
        positional_embeddings=True,
        adaptive_softmax_cutoff=None,
        normalization_constant=0.5,
        left_pad=False,
    ):
        super().__init__(dictionary)
        self.register_buffer('version', torch.Tensor([2]))
        self.dropout = dropout
        self.normalization_constant = normalization_constant
        self.left_pad = left_pad

        convolutions = extend_conv_spec(convolutions)
        in_channels = convolutions[0][0]
        if isinstance(attention, bool):
            # expand True into [True, True, ...] and do the same with False
            attention = [attention] * len(convolutions)
        if not isinstance(attention,
                          list) or len(attention) != len(convolutions):
            raise ValueError(
                'Attention is expected to be a list of booleans of '
                'length equal to the number of layers.')

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if embed_dict:
            self.embed_tokens = utils.load_embedding(embed_dict,
                                                     self.dictionary,
                                                     self.embed_tokens)

        self.embed_positions = PositionalEmbedding(
            max_positions,
            embed_dim,
            padding_idx,
            left_pad=self.left_pad,
        ) if positional_embeddings else None

        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        self.auxattention = nn.ModuleList()
        self.attention = nn.ModuleList()
        self.auxgates = nn.ModuleList()
        self.residuals = []

        layer_in_channels = [in_channels]
        for i, (out_channels, kernel_size,
                residual) in enumerate(convolutions):
            if residual == 0:
                residual_dim = out_channels
            else:
                residual_dim = layer_in_channels[-residual]
            self.projections.append(
                Linear(residual_dim, out_channels
                       ) if residual_dim != out_channels else None)
            self.convolutions.append(
                LinearizedConv1d(in_channels,
                                 out_channels * 2,
                                 kernel_size,
                                 padding=(kernel_size - 1),
                                 dropout=dropout))
            self.auxattention.append(
                AttentionLayer(out_channels, embed_dim, self.
                               normalization_constant
                               ) if attention[i] else None)

            self.attention.append(
                AttentionLayer(out_channels, embed_dim, self.
                               normalization_constant
                               ) if attention[i] else None)
            self.auxgates.append(
                Gating(gate_dim=out_channels, inputs_dim=out_channels))

            self.residuals.append(residual)
            in_channels = out_channels
            layer_in_channels.append(out_channels)

        self.adaptive_softmax = None
        self.fc2 = self.fc3 = None

        if adaptive_softmax_cutoff is not None:
            assert not share_embed
            self.adaptive_softmax = AdaptiveSoftmax(num_embeddings,
                                                    in_channels,
                                                    adaptive_softmax_cutoff,
                                                    dropout=dropout)
        else:
            self.fc2 = Linear(in_channels, out_embed_dim)
            if share_embed:
                assert out_embed_dim == embed_dim, \
                    "Shared embed weights implies same dimensions " \
                    " out_embed_dim={} vs embed_dim={}".format(out_embed_dim, embed_dim)
                self.fc3 = nn.Linear(out_embed_dim, num_embeddings)
                self.fc3.weight = self.embed_tokens.weight
            else:
                self.fc3 = Linear(out_embed_dim,
                                  num_embeddings,
                                  dropout=dropout)
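
Each LinearizedConv1d above emits 2 * out_channels so that F.glu in the forward pass below can split the result into a value half and a gate half. A minimal shape check (plain torch, assuming out_channels=512):

import torch
import torch.nn.functional as F

x = torch.randn(2, 5, 2 * 512)   # B x T x (2 * out_channels), as produced by the conv
y = F.glu(x, dim=2)              # value half * sigmoid(gate half)
assert y.shape == (2, 5, 512)
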

    def forward(self,
                prev_output_tokens,
                auxencoder_out_dict=None,
                encoder_out_dict=None,
                incremental_state=None):

        if auxencoder_out_dict is not None:
            auxencoder_out = auxencoder_out_dict['encoder_out']
            auxencoder_padding_mask = auxencoder_out_dict[
                'encoder_padding_mask']

            # split and transpose aux. encoder outputs
            auxencoder_a, auxencoder_b = self._split_encoder_out(
                auxencoder_out, incremental_state, aux=True)

        if encoder_out_dict is not None:
            encoder_out = encoder_out_dict['encoder_out']
            encoder_padding_mask = encoder_out_dict['encoder_padding_mask']

            # split and transpose encoder outputs
            encoder_a, encoder_b = self._split_encoder_out(
                encoder_out, incremental_state)

        if self.embed_positions is not None:
            pos_embed = self.embed_positions(prev_output_tokens,
                                             incremental_state)
        else:
            pos_embed = 0

        if incremental_state is not None:
            prev_output_tokens = prev_output_tokens[:, -1:]
        x = self._embed_tokens(prev_output_tokens, incremental_state)

        # embed tokens and combine with positional embeddings
        x += pos_embed
        x = F.dropout(x, p=self.dropout, training=self.training)
        target_embedding = x

        # project to size of convolution
        x = self.fc1(x)

        # B x T x C -> T x B x C
        x = self._transpose_if_training(x, incremental_state)

        # temporal convolutions
        avg_attn_scores = None
        avg_auxattn_scores = None
        residuals = [x]
        for proj, conv, attention, auxattention, res_layer, auxgate in zip(
                self.projections, self.convolutions, self.attention,
                self.auxattention, self.residuals, self.auxgates):
            if res_layer > 0:
                residual = residuals[-res_layer]
                residual = residual if proj is None else proj(residual)
            else:
                residual = None

            x = F.dropout(x, p=self.dropout, training=self.training)
            x = conv(x, incremental_state)
            x = F.glu(x, dim=2)

            # auxiliary attention
            acx = None
            if auxattention is not None:
                x = self._transpose_if_training(x, incremental_state)
                acx, auxattn_scores = auxattention(
                    x, target_embedding, (auxencoder_a, auxencoder_b),
                    auxencoder_padding_mask)
                auxattn_scores = auxattn_scores / len(self.auxattention)
                if avg_auxattn_scores is None:
                    avg_auxattn_scores = auxattn_scores
                else:
                    avg_auxattn_scores.add_(auxattn_scores)
                x = self._transpose_if_training(x, incremental_state)

            # attention
            if attention is not None:
                x = self._transpose_if_training(x, incremental_state)

                cx, attn_scores = attention(x, target_embedding,
                                            (encoder_a, encoder_b),
                                            encoder_padding_mask)
                attn_scores = attn_scores / len(self.attention)
                if avg_attn_scores is None:
                    avg_attn_scores = attn_scores
                else:
                    avg_attn_scores.add_(attn_scores)

                if acx is not None:
                    auxgt = auxgate(decoder_state=x, attn=cx)

                x = (x + cx) * math.sqrt(self.normalization_constant)
                if acx is not None:
                    x = (x + auxgt * acx) * math.sqrt(
                        self.normalization_constant)

                x = self._transpose_if_training(x, incremental_state)

            # residual
            if residual is not None:
                x = (x + residual) * math.sqrt(self.normalization_constant)
            residuals.append(x)

        # T x B x C -> B x T x C
        x = self._transpose_if_training(x, incremental_state)

        # project back to size of vocabulary if not using adaptive softmax
        if self.fc2 is not None and self.fc3 is not None:
            x = self.fc2(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
            x = self.fc3(x)

        return x, avg_attn_scores

    def get_normalized_probs(self, net_output, log_probs, sample):
        """Get normalized probabilities (or log probs) from a net's output."""

        if self.adaptive_softmax is not None:
            assert sample is not None and 'target' in sample
            out = self.adaptive_softmax.get_log_prob(net_output[0],
                                                     sample['target'])
            return out.exp_() if not log_probs else out
        else:
            return super().get_normalized_probs(net_output, log_probs, sample)

    def reorder_incremental_state(self, incremental_state, new_order):
        super().reorder_incremental_state(incremental_state, new_order)
        auxencoder_out = utils.get_incremental_state(self, incremental_state,
                                                     'auxencoder_out')
        encoder_out = utils.get_incremental_state(self, incremental_state,
                                                  'encoder_out')
        if auxencoder_out is not None:
            auxencoder_out = tuple(
                aeo.index_select(0, new_order) for aeo in auxencoder_out)
            utils.set_incremental_state(self, incremental_state,
                                        'auxencoder_out', auxencoder_out)
        if encoder_out is not None:
            encoder_out = tuple(
                eo.index_select(0, new_order) for eo in encoder_out)
            utils.set_incremental_state(self, incremental_state, 'encoder_out',
                                        encoder_out)

    def reorder_encoder_out(self, encoder_out_dict, new_order):
        if encoder_out_dict and encoder_out_dict[
                'encoder_padding_mask'] is not None:
            encoder_out_dict['encoder_padding_mask'] = \
                encoder_out_dict['encoder_padding_mask'].index_select(0, new_order)
        return encoder_out_dict

    def reorder_auxencoder_out(self, auxencoder_out_dict, new_order):
        if auxencoder_out_dict and auxencoder_out_dict[
                'encoder_padding_mask'] is not None:
            auxencoder_out_dict['encoder_padding_mask'] = \
                auxencoder_out_dict['encoder_padding_mask'].index_select(0, new_order)
        return auxencoder_out_dict

    def max_positions(self):
        """Maximum output length supported by the decoder."""
        return self.embed_positions.max_positions(
        ) if self.embed_positions is not None else float('inf')

    def upgrade_state_dict(self, state_dict):
        if state_dict.get('decoder.version', torch.Tensor([1]))[0] < 2:
            # old models use incorrect weight norm dimension
            for i, conv in enumerate(self.convolutions):
                # reconfigure weight norm
                nn.utils.remove_weight_norm(conv)
                self.convolutions[i] = nn.utils.weight_norm(conv, dim=0)
            state_dict['decoder.version'] = torch.Tensor([1])
        return state_dict

    def _embed_tokens(self, tokens, incremental_state):
        if incremental_state is not None:
            # keep only the last token for incremental forward pass
            tokens = tokens[:, -1:]
        return self.embed_tokens(tokens)

    def _split_encoder_out(self, encoder_out, incremental_state, aux=False):
        """Split and transpose encoder outputs.

        This is cached when doing incremental inference.
        """
        state_name = 'encoder_out'
        if aux:
            state_name = 'aux' + state_name

        cached_result = utils.get_incremental_state(self, incremental_state,
                                                    state_name)

        if cached_result is not None:
            return cached_result

        # transpose only once to speed up attention layers
        encoder_a, encoder_b = encoder_out
        encoder_a = encoder_a.transpose(1, 2).contiguous()
        result = (encoder_a, encoder_b)

        if incremental_state is not None:
            utils.set_incremental_state(self, incremental_state, state_name,
                                        result)
        return result

    def _transpose_if_training(self, x, incremental_state):
        if incremental_state is None:
            x = x.transpose(0, 1)
        return x
Example #8
    def __init__(self, args, dictionary, embed_tokens, embed_scale=None, no_encoder_attn=False, left_pad=False,
                 final_norm=True):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        self.embed_dim = args.decoder_embed_dim
        output_embed_dim = args.decoder_output_dim

        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions
        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__
        )

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(self.embed_dim) if embed_scale is None else embed_scale

        self.project_in_dim = nn.Linear(input_embed_dim, self.embed_dim,
                                        bias=False) if self.embed_dim != input_embed_dim else None

        #self.project_hid_dim = nn.Linear(self.embed_dim,self.embed_dim,bias=True)
        self.embed_positions = PositionalEmbedding(
            args.max_target_positions, self.embed_dim, self.padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_dec_token_positional_embeddings else None

        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(self.embed_dim)
        else:
            self.layernorm_embedding = None

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        if args.decoder_normalize_before and not getattr(
                args, "no_decoder_final_norm", False
        ):
            self.layer_norm = LayerNorm(self.embed_dim)
        else:
            self.layer_norm = None

        self.project_out_dim = (
            nn.Linear(self.embed_dim, output_embed_dim, bias=False)
            if self.embed_dim != output_embed_dim and not args.tie_adaptive_weights
            else None
        )

        self.load_softmax = not getattr(args, 'remove_head', False)

        if self.load_softmax:
            if args.adaptive_softmax_cutoff is not None:
                self.adaptive_softmax = AdaptiveSoftmax(
                    len(dictionary),
                    output_embed_dim,
                    options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                    dropout=args.adaptive_softmax_dropout,
                    adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
                    factor=args.adaptive_softmax_factor,
                    tie_proj=args.tie_adaptive_proj,
                )
            elif not self.share_input_output_embed:
                self.output_projection = nn.Linear(
                    output_embed_dim, len(dictionary), bias=False
                )
                nn.init.normal_(
                    self.output_projection.weight, mean=0, std=output_embed_dim ** -0.5
                )
            else:
                self.output_projection = nn.Linear(
                    self.embed_tokens.weight.shape[1],
                    self.embed_tokens.weight.shape[0],
                    bias=False,
                )
                self.output_projection.weight = self.embed_tokens.weight
        self.register_buffer('version', torch.Tensor([2]))
Example #9
    def build_single_decoder(args,
                             src_dict,
                             dst_dict,
                             ngram_decoder=None,
                             project_output=True,
                             is_lm=False):
        if args.adaptive_softmax_cutoff is not None:
            project_output = False
        attention_type = args.attention_type
        encoder_hidden_dim = args.encoder_hidden_dim
        if is_lm:
            attention_type = "no"
            encoder_hidden_dim = 0
        if ngram_decoder:
            if args.ngram_activation_type == "relu":
                activation_fn = nn.ReLU
            elif args.ngram_activation_type == "tanh":
                activation_fn = nn.Tanh
            else:
                raise Exception("ngram_activation_type '%s' not implemented" %
                                args.ngram_activation_type)
            decoder = NGramDecoder(
                src_dict=src_dict,
                dst_dict=dst_dict,
                n=ngram_decoder,
                encoder_hidden_dim=encoder_hidden_dim,
                embed_dim=args.decoder_embed_dim,
                freeze_embed=args.decoder_freeze_embed,
                out_embed_dim=args.decoder_out_embed_dim,
                num_layers=args.decoder_layers,
                hidden_dim=args.decoder_hidden_dim,
                attention_type=attention_type,
                dropout_in=args.decoder_dropout_in,
                dropout_out=args.decoder_dropout_out,
                residual_level=args.residual_level,
                activation_fn=activation_fn,
                project_output=project_output,
                pretrained_embed=args.decoder_pretrained_embed,
                projection_pretrained_embed=args.decoder_out_pretrained_embed,
            )
        else:
            decoder = RNNDecoder(
                src_dict=src_dict,
                dst_dict=dst_dict,
                vocab_reduction_params=args.vocab_reduction_params,
                encoder_hidden_dim=encoder_hidden_dim,
                embed_dim=args.decoder_embed_dim,
                freeze_embed=args.decoder_freeze_embed,
                out_embed_dim=args.decoder_out_embed_dim,
                cell_type=args.cell_type,
                num_layers=args.decoder_layers,
                hidden_dim=args.decoder_hidden_dim,
                attention_type=attention_type,
                dropout_in=args.decoder_dropout_in,
                dropout_out=args.decoder_dropout_out,
                residual_level=args.residual_level,
                averaging_encoder=args.averaging_encoder,
                project_output=project_output,
                pretrained_embed=args.decoder_pretrained_embed,
                projection_pretrained_embed=args.decoder_out_pretrained_embed,
                tie_embeddings=args.decoder_tie_embeddings,
                att_weighted_src_embeds=args.att_weighted_src_embeds,
                src_embed_dim=args.encoder_embed_dim,
                att_weighted_activation_type=args.att_weighted_activation_type,
            )

        # Allow the RNN decoder to use an adaptive softmax output layer
        decoder.adaptive_softmax = None

        if args.adaptive_softmax_cutoff is not None:
            decoder.adaptive_softmax = AdaptiveSoftmax(
                len(dst_dict),
                args.decoder_out_embed_dim or args.decoder_hidden_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.dropout,
            )
        return decoder
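
For intuition about the cutoff list handed to AdaptiveSoftmax, here is a sketch with PyTorch's built-in analogue, nn.AdaptiveLogSoftmaxWithLoss (illustration only; fairseq's AdaptiveSoftmax has a different interface). The cutoffs split the vocabulary into a frequent head and progressively cheaper tail clusters:

import torch
import torch.nn as nn

vocab, dim = 10000, 512
adaptive = nn.AdaptiveLogSoftmaxWithLoss(dim, vocab, cutoffs=[100, 1000])
hidden = torch.randn(8, dim)              # decoder output features
targets = torch.randint(0, vocab, (8,))
result = adaptive(hidden, targets)
print(result.output.shape, result.loss)   # per-example target log-probs, mean NLL
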
Example #10
    def __init__(self,
                 args,
                 dictionary,
                 embed_tokens,
                 left_pad=False,
                 final_norm=True):
        super().__init__(dictionary)
        self.padding_idx = embed_tokens.padding_idx
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.embed_dim
        output_embed_dim = args.output_dim

        padding_idx = embed_tokens.padding_idx

        self.embed_tokens = embed_tokens
        #self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim

        self.max_positions = args.max_positions + 1

        self.embed_segment = nn.Embedding(
            args.num_segment,
            embed_dim,
            self.padding_idx,
        ) if args.num_segment > 0 else None

        self.project_in_dim = nn.Linear(
            input_embed_dim, embed_dim,
            bias=False) if embed_dim != input_embed_dim else None
        self.prediction_word_embedding = nn.Parameter(
            torch.Tensor(1, 1, embed_dim).zero_())
        self.embed_positions = PositionalEmbedding(
            self.max_positions,
            embed_dim,
            padding_idx,
            left_pad=left_pad,
        ) if not args.no_token_positional_embeddings else None

        def make_layers(args, layers, needs_key_values):
            if args.universal:
                layers = [
                    ShuffleTransformerDecoderLayer(
                        args, needs_key_values=needs_key_values)
                ] * layers
            else:
                layers = [
                    ShuffleTransformerDecoderLayer(
                        args, needs_key_values=needs_key_values)
                    for _ in range(layers)
                ]
            return nn.ModuleList(layers)

        self.stacked_decoder = args.stacked_decoder
        self.encoder_layers = make_layers(args,
                                          args.encoder_layers,
                                          needs_key_values=True)
        self.decoder_layers = make_layers(
            args, args.decoder_layers,
            needs_key_values=False) if args.asymmetric else self.encoder_layers

        if not args.stacked_decoder and args.encoder_layers != args.decoder_layers:
            raise ValueError(
                "If not using stacked-decoder, encoder and decoder must have the same number of layers"
            )
        if not args.asymmetric and args.encoder_layers != args.decoder_layers:
            raise ValueError(
                "If not using asymmetric, encoder and decoder must have the same number of layers"
            )

        if args.relative_position == 'sinusoidal':
            num_positions = self.max_positions
            sinusoidal_positions = SinusoidalPositionalEmbedding.get_embedding(
                num_positions, args.embed_dim // args.attention_heads)
            sinusoidal_relative_positions = []
            for i in range(num_positions):
                sinusoidal_relative_positions.append(
                    torch.cat([
                        sinusoidal_positions[num_positions - i:],
                        sinusoidal_positions[:num_positions - i]
                    ], 0))
                # Make sentinel token have same relative position to everything
                sinusoidal_relative_positions[-1][0] = 0
                assert sinusoidal_relative_positions[-1].size(
                ) == sinusoidal_positions.size()
            sinusoidal_relative_positions = torch.stack(
                sinusoidal_relative_positions, 0)
            self.sinusoidal_relative_positions = nn.Parameter(
                sinusoidal_relative_positions)

            assert sinusoidal_relative_positions.size() == (
                num_positions, num_positions,
                args.embed_dim // args.attention_heads)
            #assert (sinusoidal_relative_positions[0] == sinusoidal_positions).all()
            assert (sinusoidal_relative_positions[7, 7] ==
                    sinusoidal_relative_positions[11, 11]).all()
            assert (sinusoidal_relative_positions[5, 11] ==
                    sinusoidal_relative_positions[6, 12]).all()
        else:
            self.sinusoidal_relative_positions = None

        self.adaptive_softmax = None

        self.project_out_dim = nn.Linear(embed_dim, output_embed_dim, bias=False) \
            if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None

        self.load_softmax = not getattr(args, 'remove_head', False)
        if self.load_softmax:
            if args.adaptive_softmax_cutoff is not None:
                self.adaptive_softmax = AdaptiveSoftmax(
                    len(dictionary),
                    output_embed_dim,
                    options.eval_str_list(args.adaptive_softmax_cutoff,
                                          type=int),
                    dropout=args.adaptive_softmax_dropout,
                    adaptive_inputs=embed_tokens
                    if args.tie_adaptive_weights else None,
                    factor=args.adaptive_softmax_factor,
                    tie_proj=args.tie_adaptive_proj,
                )
            elif not self.share_input_output_embed:
                self.embed_out = nn.Parameter(
                    torch.Tensor(len(dictionary), output_embed_dim))
                nn.init.normal_(self.embed_out,
                                mean=0,
                                std=output_embed_dim**-0.5)

            #if args.sentence_class_num > 0:
            #    self.sentence_projection_layer = Linear(embed_dim, args.sentence_class_num, bias=False)

        self.normalize = args.normalize_before and final_norm
        if self.normalize:
            self.layer_norm = BertLayerNorm(embed_dim)

        self.apply(self.init_bert_weights)
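
The make_layers helper above relies on list multiplication to share a single layer instance in the universal setting, while the list comprehension builds independent layers. A minimal check of that distinction (using nn.Linear as a stand-in for ShuffleTransformerDecoderLayer):

import torch.nn as nn

shared = nn.ModuleList([nn.Linear(4, 4)] * 3)                     # one instance, repeated
independent = nn.ModuleList([nn.Linear(4, 4) for _ in range(3)])  # three instances
assert shared[0] is shared[2]
assert independent[0] is not independent[2]
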
Example #11
    def __init__(
        self,
        dictionary,
        rnn_type="lstm",
        embed_dim=512,
        hidden_size=512,
        out_embed_dim=512,
        num_layers=1,
        dropout_in=0.1,
        dropout_out=0.1,
        attention_type="luong-dot",
        encoder_output_units=512,
        pretrained_embed=None,
        share_input_output_embed=False,
        adaptive_softmax_cutoff=None,
        max_target_positions=DEFAULT_MAX_TARGET_POSITIONS,
        residuals=False,
    ):
        super().__init__(dictionary)
        self.dropout_in_module = FairseqDropout(
            dropout_in, module_name=self.__class__.__name__
        )
        self.dropout_out_module = FairseqDropout(
            dropout_out, module_name=self.__class__.__name__
        )
        self.hidden_size = hidden_size
        self.share_input_output_embed = share_input_output_embed
        self.need_attn = True
        self.max_target_positions = max_target_positions
        self.residuals = residuals
        self.num_layers = num_layers

        self.adaptive_softmax = None
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        if pretrained_embed is None:
            self.embed_tokens = torch.nn.Embedding(num_embeddings, embed_dim, padding_idx)
        else:
            self.embed_tokens = pretrained_embed

        self.encoder_output_units = encoder_output_units
        if encoder_output_units != hidden_size and encoder_output_units != 0:
            self.encoder_hidden_proj = torch.nn.Linear(encoder_output_units, hidden_size)
            self.encoder_cell_proj = torch.nn.Linear(encoder_output_units, hidden_size)
        else:
            self.encoder_hidden_proj = self.encoder_cell_proj = None

        # input feeding is described in arxiv.org/abs/1508.04025
        input_feed_size = 0 if encoder_output_units == 0 else hidden_size
        # For Bahdanau, we compute the context on the input feed
        bahd_factor = hidden_size \
            if attention_type in ["bahdanau-dot", "bahdanau-concat", "bahdanau-general", "bahdanau"] \
            else 0
        self.rnn_type = rnn_type
        if rnn_type == "lstm":
            self.layers = LSTM(
                input_size=input_feed_size + embed_dim + bahd_factor,
                hidden_size=hidden_size,
                num_layers=num_layers
            )
        else:
            self.layers = GRU(
                input_size=input_feed_size + embed_dim + bahd_factor,
                hidden_size=hidden_size,
                num_layers=num_layers
            )

        if attention_type == "none":
            self.attention_type = "none"
            self.attention = None
        else:
            self.attention_type = attention_type
            self.attention = Attention(self.attention_type, hidden_size)

        if hidden_size != out_embed_dim:
            self.additional_fc = torch.nn.Linear(hidden_size, out_embed_dim)

        if adaptive_softmax_cutoff is not None:
            # setting adaptive_softmax dropout to dropout_out for now but can be redefined
            self.adaptive_softmax = AdaptiveSoftmax(
                num_embeddings,
                hidden_size,
                adaptive_softmax_cutoff,
                dropout=dropout_out,
            )
        elif not self.share_input_output_embed:
            self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
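
A small helper (hypothetical, mirroring the comments above) that spells out how the RNN input width is assembled from the token embedding, the input-feeding vector, and the extra Bahdanau context:

def rnn_input_size(embed_dim, hidden_size, encoder_output_units, attention_type):
    input_feed_size = 0 if encoder_output_units == 0 else hidden_size
    bahd_factor = hidden_size if attention_type.startswith("bahdanau") else 0
    return input_feed_size + embed_dim + bahd_factor

assert rnn_input_size(512, 512, 512, "luong-dot") == 1024     # embedding + input feed
assert rnn_input_size(512, 512, 512, "bahdanau-dot") == 1536  # plus Bahdanau context width
assert rnn_input_size(512, 512, 0, "luong-dot") == 512        # no encoder (LM setting)
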
Example #12
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))

        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        self.output_embed_dim = args.decoder_output_dim

        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(
            embed_dim)  # todo: try with input_embed_dim

        self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                               if embed_dim != input_embed_dim else None)

        self.embed_positions = (PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            self.padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None)

        self.cross_self_attention = getattr(args, "cross_self_attention",
                                            False)
        self.layer_wise_attention = getattr(args, "layer_wise_attention",
                                            False)

        # We combine the q/k/v flows of decoder self-attention with the q flow
        # of encoder-decoder attention. So there are four flows in total.
        self.proj_bias_selfattn = nn.Parameter(
            torch.randn(4, args.decoder_layers, embed_dim))
        # The remaining k/v flows of encoder-decoder attention.
        self.proj_bias_encattn_kv = nn.Parameter(
            torch.randn(2, args.decoder_layers, embed_dim))
        multiplier = 2
        max_seq_len = 255  #255 or 490
        self.register_buffer("time_dt", torch.tensor(0.01))
        self.register_buffer("max_seq_len", torch.tensor(max_seq_len))
        #self.register_buffer("max_seq_len", torch.tensor(490))
        #self.register_buffer("update_flow_every", torch.tensor(100))
        self.register_buffer("update_flow_every", torch.tensor(400))
        self.register_buffer("k_updates", torch.tensor(0))
        self.register_buffer(
            "cached_bias_selfattn",
            torch.randn(max_seq_len, 4, args.decoder_layers, embed_dim))
        self.register_buffer(
            "cached_bias_encattn",
            torch.randn(max_seq_len, 2, args.decoder_layers, embed_dim))
        self.proj_flow_selfattn = flow_func_linear(args.decoder_layers,
                                                   multiplier, embed_dim)
        self.proj_flow_encattn_kv = flow_func_linear(args.decoder_layers,
                                                     multiplier, embed_dim)
        #self.proj_flow_selfattn = id_flow(args.decoder_layers, multiplier, embed_dim)
        #self.proj_flow_encattn_kv = id_flow(
        #    args.decoder_layers, multiplier, embed_dim
        #)
        self.layers = nn.ModuleList([])
        self.layers.extend([
            FlowTransformerDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.project_out_dim = (Linear(
            embed_dim, self.output_embed_dim, bias=False)
                                if embed_dim != self.output_embed_dim
                                and not args.tie_adaptive_weights else None)

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens
                if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), self.output_embed_dim))
            nn.init.normal_(self.embed_out,
                            mean=0,
                            std=self.output_embed_dim**-0.5)

        if args.decoder_normalize_before and not getattr(
                args, "no_decoder_final_norm", False):
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None

        self.ode_args = {
            "method": "rk4",
            "options": {
                "step_size": self.time_dt.item() / 5.0
            },
        }
        self.reset_parameters()
class TransformerDecoder(FairseqIncrementalDecoder):
    """
    Transformer decoder consisting of *args.decoder_layers* layers. Each layer
    is a :class:`TransformerDecoderLayer`.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        dictionary (~fairseq.data.Dictionary): decoding dictionary
        embed_tokens (torch.nn.Embedding): output embedding
        no_encoder_attn (bool, optional): whether to attend to encoder outputs
            (default: False).
        final_norm (bool, optional): apply layer norm to the output of the
            final decoder layer (default: True).
    """
    def __init__(self,
                 args,
                 dictionary,
                 embed_tokens,
                 no_encoder_attn=False,
                 final_norm=True):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        self.output_embed_dim = args.decoder_output_dim

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(
            embed_dim)  # todo: try with input_embed_dim

        self.project_in_dim = Linear(
            input_embed_dim, embed_dim,
            bias=False) if embed_dim != input_embed_dim else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \
            if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens
                if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), self.output_embed_dim))
            nn.init.normal_(self.embed_out,
                            mean=0,
                            std=self.output_embed_dim**-0.5)

        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)
        self.onnx_trace = False
        self.decoder_max_order = args.decoder_max_order
        self.clamp_value = getattr(args, 'clamp_value', 0.01)
        self.gs_clamp = args.gs_clamp

    def set_perm_order(self, perm_order=0):
        assert isinstance(perm_order, int) and 0 <= perm_order <= 5
        for layer in self.layers:
            layer.set_perm_order(perm_order)

    def forward(self,
                prev_output_tokens,
                encoder_out=None,
                incremental_state=None,
                **unused):
        """
        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for input feeding/teacher forcing
            encoder_out (Tensor, optional): output from the encoder, used for
                encoder-side attention
            incremental_state (dict): dictionary used for storing state during
                :ref:`Incremental decoding`

        Returns:
            tuple:
                - the decoder's output of shape `(batch, tgt_len, vocab)`
                - a dictionary with any model-specific outputs
        """
        x, extra = self.extract_features(prev_output_tokens, encoder_out,
                                         incremental_state)
        x = self.output_layer(x, encoder_out)
        return x, extra

    def extract_features(self,
                         prev_output_tokens,
                         encoder_out=None,
                         incremental_state=None,
                         **unused):
        """
        Similar to *forward* but only return features.

        Returns:
            tuple:
                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
                - a dictionary with any model-specific outputs
        """
        # embed positions
        positions = self.embed_positions(
            prev_output_tokens,
            incremental_state=incremental_state,
        ) if self.embed_positions is not None else None

        if incremental_state is not None:
            prev_output_tokens = prev_output_tokens[:, -1:]
            if positions is not None:
                positions = positions[:, -1:]

        # embed tokens and positions
        x = self.embed_scale * self.embed_tokens(prev_output_tokens)

        if self.project_in_dim is not None:
            x = self.project_in_dim(x)

        if positions is not None:
            x += positions
        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)
        attn = None

        inner_states = [x]

        # decoder layers
        for layer in self.layers:
            x, attn = layer(
                x,
                encoder_out['encoder_out']
                if encoder_out is not None else None,
                encoder_out['encoder_padding_mask']
                if encoder_out is not None else None,
                incremental_state,
                self_attn_mask=self.buffered_future_mask(x)
                if incremental_state is None else None,
            )
            inner_states.append(x)

        if self.normalize:
            x = self.layer_norm(x)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        if self.project_out_dim is not None:
            x = self.project_out_dim(x)

        return x, {'attn': attn, 'inner_states': inner_states}

    def output_layer(self, features, encoder_out, **kwargs):
        """Project features to the vocabulary size."""
        if self.adaptive_softmax is None:
            # project back to size of vocabulary
            if self.share_input_output_embed:
                return [
                    F.linear(features, self.embed_tokens.weight),
                    encoder_out['encoder_pred_order']
                ]
            else:
                return F.linear(features, self.embed_out)
        else:
            return features

    def max_positions(self):
        """Maximum output length supported by the decoder."""
        if self.embed_positions is None:
            return self.max_target_positions
        return min(self.max_target_positions,
                   self.embed_positions.max_positions())

    def buffered_future_mask(self, tensor):
        dim = tensor.size(0)
        if not hasattr(
                self, '_future_mask'
        ) or self._future_mask is None or self._future_mask.device != tensor.device:
            self._future_mask = torch.triu(
                utils.fill_with_neg_inf(tensor.new(dim, dim)), 1)
        if self._future_mask.size(0) < dim:
            self._future_mask = torch.triu(
                utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)),
                1)
        return self._future_mask[:dim, :dim]

    def upgrade_state_dict_named(self, state_dict, name):
        """Upgrade a (possibly old) state dict for new versions of fairseq."""
        if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
            weights_key = '{}.embed_positions.weights'.format(name)
            if weights_key in state_dict:
                del state_dict[weights_key]
            state_dict['{}.embed_positions._float_tensor'.format(
                name)] = torch.FloatTensor(1)

        for i in range(len(self.layers)):
            # update layer norms
            layer_norm_map = {
                '0': 'self_attn_layer_norm',
                '1': 'encoder_attn_layer_norm',
                '2': 'final_layer_norm'
            }
            for old, new in layer_norm_map.items():
                for m in ('weight', 'bias'):
                    k = '{}.layers.{}.layer_norms.{}.{}'.format(
                        name, i, old, m)
                    if k in state_dict:
                        state_dict['{}.layers.{}.{}.{}'.format(
                            name, i, new, m)] = state_dict[k]
                        del state_dict[k]
        if utils.item(
                state_dict.get('{}.version'.format(name), torch.Tensor(
                    [1]))[0]) < 2:
            # earlier checkpoints did not normalize after the stack of layers
            self.layer_norm = None
            self.normalize = False
            state_dict['{}.version'.format(name)] = torch.Tensor([1])

        return state_dict

    def get_normalized_probs(self,
                             net_output,
                             log_probs,
                             sample,
                             gs_tau=0.5,
                             gs_hard=False):
        """Get normalized probabilities (or log probs) from a net's output."""

        if hasattr(self,
                   'adaptive_softmax') and self.adaptive_softmax is not None:
            if sample is not None:
                assert 'target' in sample
                target = sample['target']
            else:
                target = None
            out = self.adaptive_softmax.get_log_prob(net_output[0],
                                                     target=target)
            return out.exp_() if not log_probs else out

        logits = net_output[0][0]
        orders = net_output[0][1]
        if log_probs:
            return (utils.log_softmax(logits,
                                      dim=-1,
                                      onnx_trace=self.onnx_trace),
                    *self.gumbel_softmax(
                        orders, gs_tau=gs_tau, gs_hard=gs_hard, dim=-1))
        else:
            return (utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace),
                    *self.gumbel_softmax(
                        orders, gs_tau=gs_tau, gs_hard=gs_hard, dim=-1))

    def gumbel_softmax(self, logits, gs_tau=0.5, gs_hard=False, dim=-1):
        if not gs_hard:
            prob = utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace)
            prob_clamp = torch.clamp(
                prob, self.clamp_value,
                1. - (self.decoder_max_order - 1) * self.clamp_value)
            logprob = torch.log(prob_clamp if self.gs_clamp else prob)
            gs = F.gumbel_softmax(
                logprob,
                tau=gs_tau,
                hard=False,
            )
        else:
            prob = utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace)
            prob_clamp = torch.clamp(
                prob, self.clamp_value,
                1. - (self.decoder_max_order - 1) * self.clamp_value)
            max_idx = torch.argmax(logits, -1, keepdim=True)
            one_hot = logits.new_zeros(logits.size())
            gs = one_hot.scatter(-1, max_idx, 1)
        return gs, prob, prob_clamp
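The gumbel_softmax helper above clamps probabilities into [clamp_value, 1 - (decoder_max_order - 1) * clamp_value] before taking the log, so the Gumbel relaxation never sees log(0) and no order is driven to exactly zero probability. A minimal standalone sketch of that clamping; decoder_max_order = 2, clamp_value = 0.01 and gs_tau = 0.5 are illustrative assumptions, not values taken from the arguments.

import torch
import torch.nn.functional as F

decoder_max_order, clamp_value, gs_tau = 2, 0.01, 0.5

logits = torch.tensor([[8.0, -8.0]])          # nearly deterministic order scores
prob = F.softmax(logits, dim=-1)              # ~[1.0, 0.0]
prob_clamp = torch.clamp(
    prob, clamp_value, 1.0 - (decoder_max_order - 1) * clamp_value
)                                             # [0.99, 0.01]
logprob = torch.log(prob_clamp)               # finite, unlike log(0)

# soft sample: a relaxed one-hot over the two orders
gs_soft = F.gumbel_softmax(logprob, tau=gs_tau, hard=False)
# hard sample, as in the gs_hard branch: exact one-hot of the argmax
gs_hard = torch.zeros_like(logits).scatter(-1, logits.argmax(-1, keepdim=True), 1)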
Example No. 14
    def __init__(
        self, dictionary, embed_dim=512, hidden_size=512, out_embed_dim=512,
        num_layers=1, dropout_in=0.1, dropout_out=0.1, num_attentions=1,
        encoder_output_units=512, pretrained_embed=None,
        share_input_output_embed=False, adaptive_softmax_cutoff=None,
        max_target_positions=DEFAULT_MAX_TARGET_POSITIONS,
        token_map=None,
        granularity_flags=None,
        double_learning=False
    ):
        super().__init__(dictionary)
        self.dropout_in = dropout_in
        self.dropout_out = dropout_out
        self.hidden_size = hidden_size
        self.share_input_output_embed = share_input_output_embed
        self.need_attn = True
        self.max_target_positions = max_target_positions
        self.token2components_map = token_map
        self.token_sequences = granularity_flags[0] if granularity_flags is not None else False
        self.char_sequences = granularity_flags[1] if granularity_flags is not None else False
        self.g_id = 'char' if ((not self.token_sequences) and self.char_sequences) else 'token'
        self.merge_flag = False
        self.double_learning = double_learning 

        self.adaptive_softmax = None
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        if pretrained_embed is None:
            self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx) 
        else:
            self.embed_tokens = pretrained_embed

        self.encoder_output_units = encoder_output_units
        if encoder_output_units != hidden_size and encoder_output_units != 0:
            self.encoder_hidden_proj = Linear(encoder_output_units, hidden_size)
            self.encoder_cell_proj = Linear(encoder_output_units, hidden_size)
        else:
            self.encoder_hidden_proj = self.encoder_cell_proj = None

        # disable input feeding if there is no encoder
        # input feeding is described in arxiv.org/abs/1508.04025
        input_feed_size = 0 if encoder_output_units == 0 else hidden_size * num_attentions
        total_embed_size = 2*embed_dim if self.token_sequences and self.char_sequences else embed_dim
        self.layers = nn.ModuleList([
            LSTMCell(
                input_size=input_feed_size + total_embed_size if layer == 0 else hidden_size,
                hidden_size=hidden_size,
            )
            for layer in range(num_layers)
        ])

        self.num_attentions = num_attentions
        self.attentions = nn.ModuleList()
        for i in range(num_attentions):
            # TODO make bias configurable
            query_size = hidden_size
            key_size = encoder_output_units if i == 0 else out_embed_dim
            value_size = hidden_size
            self.attentions.append(AttentionLayer(query_size, key_size, value_size, bias=False))
 
        if self.double_learning or hidden_size != out_embed_dim:
            if hidden_size != out_embed_dim:
                self.tk_additional_fc = Linear(hidden_size, out_embed_dim)
            if self.double_learning:
                # input: char embed, $c_{i-1}$ (input-feed) => $h_i$
                # (alternative: Linear(embed_dim + hidden_size, hidden_size))
                self.char_rnn = LSTMCell(input_size=hidden_size, hidden_size=hidden_size)
                # input: $h_i$, x => $c_i$
                self.char2tok_att = AttentionLayer(hidden_size, hidden_size, hidden_size, bias=False)
                if hidden_size != out_embed_dim:
                    # input: $c_i$ => $o_i$
                    self.char_out = Linear(hidden_size, out_embed_dim)
        if adaptive_softmax_cutoff is not None:
            # setting adaptive_softmax dropout to dropout_out for now but can be redefined
            self.adaptive_softmax = AdaptiveSoftmax(num_embeddings, hidden_size, adaptive_softmax_cutoff, dropout=dropout_out)
        elif not self.share_input_output_embed:
            self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
    def __init__(self,
                 args,
                 dictionary,
                 embed_tokens,
                 classification_head=None):
        super().__init__(dictionary)
        self.onnx_trace = False
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        self.embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.self_target = args.self_target
        self.future_target = args.future_target
        self.past_target = args.past_target
        self.char_inputs = args.char_inputs

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(self.embed_dim)

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions,
            self.embed_dim,
            self.padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.forward_layers = nn.ModuleList([
            TransformerDecoderLayer(
                args,
                no_encoder_attn=True,
                add_bias_kv=not args.no_bias_kv,
                add_zero_attn=args.no_bias_kv,
            ) for _ in range(args.decoder_layers)
        ])
        self.backward_layers = nn.ModuleList([
            TransformerDecoderLayer(
                args,
                no_encoder_attn=True,
                add_bias_kv=not args.no_bias_kv,
                add_zero_attn=args.no_bias_kv,
            ) for _ in range(args.decoder_layers)
        ])

        self.full_attn_layer = None
        self.full_linear_layer = None

        if self.self_target:
            if args.linear_final_layer:
                self.full_linear_layer = Linear(self.embed_dim * 2,
                                                self.embed_dim,
                                                args.linear_final_layer_bias)
            else:
                self.full_attn_layer = BidirectionalTransformerDecoderLayer(
                    args)

        self.load_softmax = not getattr(args, 'remove_head', False)
        self.embed_out = None
        self.adaptive_softmax = None
        self.classification_head = classification_head

        if self.load_softmax:
            if args.adaptive_softmax_cutoff is not None:
                self.adaptive_softmax = AdaptiveSoftmax(
                    len(dictionary),
                    args.decoder_embed_dim,
                    options.eval_str_list(args.adaptive_softmax_cutoff,
                                          type=int),
                    dropout=args.adaptive_softmax_dropout,
                )
            elif not self.share_input_output_embed:
                self.embed_out = nn.Parameter(
                    torch.Tensor(len(dictionary), self.embed_dim))
                nn.init.normal_(self.embed_out,
                                mean=0,
                                std=self.embed_dim**-0.5)
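When self_target is set and linear_final_layer is chosen above, the forward and backward towers are merged by a Linear(embed_dim * 2, embed_dim). The forward pass is not part of the snippet, so the sketch below assumes the two directions are concatenated on the feature dimension before that projection; all sizes are illustrative.

import torch
import torch.nn as nn

embed_dim, T, B = 8, 5, 2                               # illustrative sizes
full_linear_layer = nn.Linear(embed_dim * 2, embed_dim)

fwd = torch.randn(T, B, embed_dim)                      # forward-tower features
bwd = torch.randn(T, B, embed_dim)                      # backward-tower features

# assumed combination: concatenate both directions, then project back to embed_dim
self_features = full_linear_layer(torch.cat([fwd, bwd], dim=-1))   # T x B x embed_dim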
Example No. 16
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
        self.args = args
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))
        self._future_mask = torch.empty(0)

        self.dropout = args.dropout
        self.decoder_layerdrop = args.decoder_layerdrop
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        self.embed_dim = embed_dim
        self.output_embed_dim = args.decoder_output_dim

        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(
            embed_dim)

        self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                               if embed_dim != input_embed_dim else None)

        self.embed_positions = (PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            self.padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None)

        self.cross_self_attention = getattr(args, "cross_self_attention",
                                            False)
        self.layer_wise_attention = getattr(args, "layer_wise_attention",
                                            False)

        self.layers = nn.ModuleList([])
        self.layers.extend([
            self.build_decoder_layer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])
        self.num_layers = len(self.layers)

        self.adaptive_softmax = None

        self.project_out_dim = (Linear(
            embed_dim, self.output_embed_dim, bias=False)
                                if embed_dim != self.output_embed_dim
                                and not args.tie_adaptive_weights else None)

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens
                if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), self.output_embed_dim))
            nn.init.normal_(self.embed_out,
                            mean=0,
                            std=self.output_embed_dim**-0.5)

        if args.decoder_normalize_before and not getattr(
                args, "no_decoder_final_norm", False):
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None
        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None

        self.n_experts = 1
        self.nhidlast = self.embed_dim
        self.ninp = self.embed_dim
        self.ntoken = 9744
        self.prior = nn.Linear(self.nhidlast, self.n_experts, bias=False)
        self.latent = nn.Sequential(
            nn.Linear(self.nhidlast, self.n_experts * self.ninp), nn.Tanh())
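The prior/latent heads added at the end of this constructor follow the usual mixture-of-softmaxes pattern (a prior over experts plus a Tanh latent projection per expert). The snippet does not show the forward pass, so the following is only a hedged sketch of how such heads are typically combined, with small illustrative sizes.

import torch
import torch.nn as nn
import torch.nn.functional as F

n_experts, nhidlast, ninp, ntoken = 3, 8, 8, 20   # small illustrative sizes

prior = nn.Linear(nhidlast, n_experts, bias=False)
latent = nn.Sequential(nn.Linear(nhidlast, n_experts * ninp), nn.Tanh())
out_proj = nn.Linear(ninp, ntoken, bias=False)    # stand-in for the (tied) output embedding

h = torch.randn(2, 5, nhidlast)                   # decoder features: B x T x C

# each expert gets its own projected latent, then its own softmax over the vocabulary
latent_h = latent(h).view(2, 5, n_experts, ninp)
expert_probs = F.softmax(out_proj(latent_h), dim=-1)          # B x T x E x V
# mixture weights over the experts come from the prior head
pi = F.softmax(prior(h), dim=-1).unsqueeze(-1)                # B x T x E x 1
probs = (pi * expert_probs).sum(dim=2)                        # B x T x V
assert torch.allclose(probs.sum(-1), torch.ones(2, 5), atol=1e-5)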
Example No. 17
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))
        self._future_mask = torch.empty(0)

        if args.PRUNE_BOOL:
            self.decoder_self_attn_path = args.DECODER_SELF_ATTN_PATH
            self.encoder_decoder_attn_path = args.ENCODER_DECODER_ATTN_PATH
            # each pattern has shape (no_layers, 1, no_head, 1024, 1024)
            self.decoder_self_attn_pattern = torch.from_numpy(np.load(self.decoder_self_attn_path))
            self.encoder_decoder_attn_pattern = torch.from_numpy(np.load(self.encoder_decoder_attn_path))
            if args.CUDA:
                self.decoder_self_attn_pattern = self.decoder_self_attn_pattern.cuda()
                self.encoder_decoder_attn_pattern = self.encoder_decoder_attn_pattern.cuda()

        self.dropout = args.dropout
        self.decoder_layerdrop = args.decoder_layerdrop
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        self.embed_dim = embed_dim
        self.output_embed_dim = args.decoder_output_dim

        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

        self.project_in_dim = (
            Linear(input_embed_dim, embed_dim, bias=False)
            if embed_dim != input_embed_dim
            else None
        )

        self.embed_positions = (
            PositionalEmbedding(
                args.max_target_positions,
                embed_dim,
                self.padding_idx,
                learned=args.decoder_learned_pos,
            )
            if not args.no_token_positional_embeddings
            else None
        )

        self.cross_self_attention = getattr(args, "cross_self_attention", False)
        self.layer_wise_attention = getattr(args, "layer_wise_attention", False)

        self.layers = nn.ModuleList([])
        if args.PRUNE_BOOL:
            self.layers.extend(
                [
                    TransformerDecoderLayer(args, self.decoder_self_attn_pattern[i], self.encoder_decoder_attn_pattern[i], no_encoder_attn)
                    for i in range(args.decoder_layers)
                ]
            )
        else:
            self.layers.extend(
                [
                    TransformerDecoderLayer(args, None, None, no_encoder_attn)
                    for i in range(args.decoder_layers)
                ]
            )

        self.num_layers = len(self.layers)

        self.adaptive_softmax = None

        self.project_out_dim = (
            Linear(embed_dim, self.output_embed_dim, bias=False)
            if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights
            else None
        )

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), self.output_embed_dim)
            )
            nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5)

        if args.decoder_normalize_before and not getattr(
            args, "no_decoder_final_norm", False
        ):
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None
        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None
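The pruning patterns loaded above, of shape (no_layers, 1, no_head, 1024, 1024), are handed one slice per layer to TransformerDecoderLayer. How a layer consumes them is not shown here; a common way to apply such a fixed keep/drop pattern is to mask the pruned positions to -inf before the attention softmax, which the hedged sketch below illustrates with small sizes.

import torch
import torch.nn.functional as F

num_heads, tgt_len, src_len = 2, 4, 4
# a fixed 0/1 keep-pattern for one layer: (1, num_heads, tgt_len, src_len)
pattern = torch.randint(0, 2, (1, num_heads, tgt_len, src_len)).bool()

scores = torch.randn(1, num_heads, tgt_len, src_len)      # raw attention scores
scores = scores.masked_fill(~pattern, float("-inf"))      # drop the pruned positions
attn = F.softmax(scores, dim=-1)
attn = torch.nan_to_num(attn)                             # rows that were fully pruned become 0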
Example No. 18
    def __init__(self,
                 dictionary,
                 embed_tokens,
                 embed_dim=512,
                 num_layers=1,
                 dropout_in=0.1,
                 dropout_out=0.1,
                 bidirectional=False,
                 left_pad=False,
                 padding_value=0.,
                 adaptive_softmax=False,
                 adaptive_softmax_cutoff=[],
                 adaptive_softmax_dropout=0.1,
                 adaptive_softmax_factor=None):
        super(LSTMTaggerDecoder, self).__init__(dictionary=dictionary)

        if hasattr(embed_tokens, "embedded_dim"):
            self.in_embed_dim = embed_tokens.embedded_dim
        elif hasattr(embed_tokens, "embed_dim"):
            self.in_embed_dim = embed_tokens.embed_dim
        elif hasattr(embed_tokens, "embedding_dim"):
            self.in_embed_dim = embed_tokens.embedding_dim
        else:
            raise Exception("embed_tokens must expose embedded_dim, embed_dim, or embedding_dim")
        self.output_units = self.embed_dim = embed_dim
        self.out_embed_dim = len(dictionary)

        self.num_layers = num_layers
        self.dropout_in = dropout_in
        self.dropout_out = dropout_out

        self.bidirectional = bidirectional
        if self.bidirectional:
            #self.output_units *= 2
            pass

        self.padding_idx = dictionary.pad()
        self.padding_value = padding_value
        self.left_pad = left_pad

        self.embed_tokens = embed_tokens

        self.fc_in = self.fc_out1 = self.fc_out2 = None
        if self.in_embed_dim != self.embed_dim:
            self.fc_in = Linear(self.in_embed_dim, self.embed_dim)
        if self.output_units != self.embed_dim:
            self.fc_out1 = Linear(self.output_units, self.embed_dim)
        if self.embed_dim != self.out_embed_dim:
            self.fc_out2 = Linear(self.embed_dim, self.out_embed_dim)

        self.lstm = LSTM(
            input_size=embed_dim,
            hidden_size=embed_dim,
            num_layers=num_layers,
            dropout=self.dropout_out if num_layers > 1 else 0.,
            bidirectional=bidirectional,
        )

        self.adaptive_softmax = None

        if adaptive_softmax:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.embed_dim,
                adaptive_softmax_cutoff,
                dropout=adaptive_softmax_dropout,
                adaptive_inputs=None,
                factor=adaptive_softmax_factor,
                tie_proj=False,
            )
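The tagger above projects from the incoming embedding size to embed_dim (fc_in), runs the LSTM at embed_dim, and maps to one score per tag with fc_out2 (out_embed_dim == len(dictionary)). A minimal sketch of that dimension flow, assuming a unidirectional stack and pre-embedded inputs; the sizes are illustrative.

import torch
import torch.nn as nn

in_embed_dim, embed_dim, num_tags = 300, 512, 17   # illustrative sizes

fc_in = nn.Linear(in_embed_dim, embed_dim)         # created only when the sizes differ
lstm = nn.LSTM(input_size=embed_dim, hidden_size=embed_dim, num_layers=1)
fc_out2 = nn.Linear(embed_dim, num_tags)           # out_embed_dim == len(dictionary)

tokens = torch.randn(7, 2, in_embed_dim)           # T x B x in_embed_dim (pre-embedded)
x, _ = lstm(fc_in(tokens))                         # T x B x embed_dim
scores = fc_out2(x)                                # T x B x num_tags, one score per tag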
    def __init__(self, args, dictionary, embed_tokens, left_pad=False):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx
        self.unk_idx = dictionary.unk()
        self.eos_idx = dictionary.eos()
        self.max_target_positions = args.max_target_positions
        self.output_dim = args.decoder_embed_dim

        self.self_target = args.self_target
        self.future_target = args.future_target
        self.past_target = args.past_target

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)

        self.input_dropout = torch.tensor(
            args.input_dropout) if args.input_dropout > 0 else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            self.padding_idx,
            left_pad=left_pad,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.forward_layers = nn.ModuleList([
            TransformerDecoderLayer(args) for _ in range(args.decoder_layers)
        ])
        self.backward_layers = nn.ModuleList([
            TransformerDecoderLayer(args) for _ in range(args.decoder_layers)
        ]) if not args.single_tower else self.forward_layers
        self.single_tower = args.single_tower

        self.full_attn_layer = None
        self.full_linear_layer = None

        if self.self_target:
            if args.linear_final_layer:
                self.full_linear_layer = Linear(embed_dim * 2, embed_dim,
                                                args.linear_final_layer_bias)
            else:
                self.full_attn_layer = BidirectionalTransformerDecoderLayer(
                    args)

        self.load_softmax = not getattr(args, 'remove_head', False)
        self.embed_out = None
        self.adaptive_softmax = None

        if self.load_softmax:
            if args.adaptive_softmax_cutoff is not None:
                self.adaptive_softmax = AdaptiveSoftmax(
                    len(dictionary),
                    args.decoder_embed_dim,
                    options.eval_str_list(args.adaptive_softmax_cutoff,
                                          type=int),
                    dropout=args.adaptive_softmax_dropout,
                    adaptive_inputs=embed_tokens
                    if args.tie_adaptive_weights else None,
                    factor=args.adaptive_softmax_factor,
                    tie_proj=args.tie_adaptive_proj,
                )
            elif not self.share_input_output_embed:
                self.embed_out = nn.Parameter(
                    torch.Tensor(len(dictionary), embed_dim))
                nn.init.normal_(self.embed_out, mean=0, std=embed_dim**-0.5)
        else:
            self.share_input_output_embed = False
Example No. 20
    def __init__(self,
                 args,
                 dictionary,
                 embed_tokens,
                 no_encoder_attn=False,
                 final_norm=True):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        output_embed_dim = args.decoder_output_dim

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(
            embed_dim)  # todo: try with input_embed_dim

        self.project_in_dim = Linear(
            input_embed_dim, embed_dim,
            bias=False) if embed_dim != input_embed_dim else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.layers = nn.ModuleList([])
        self.layers.extend([
            LightConvDecoderLayer(args,
                                  no_encoder_attn,
                                  kernel_size=args.decoder_kernel_size_list[i])
            for i in range(args.decoder_layers)
        ])
        self.decoder_dynamic_combination = args.decoder_dynamic_combination
        self.decoder_linear_combination = args.decoder_linear_combination
        assert not (self.decoder_dynamic_combination
                    and self.decoder_linear_combination)
        if self.decoder_linear_combination or self.decoder_dynamic_combination:
            self.weight_ffn = nn.Sequential(
                nn.Linear(embed_dim, args.decoder_ffn_embed_dim),
                nn.ReLU(),
                nn.Linear(args.decoder_ffn_embed_dim, embed_dim),
            )
        if self.decoder_dynamic_combination:
            self.proj = nn.ModuleList([
                nn.Sequential(
                    nn.Linear(embed_dim * args.decoder_layers, embed_dim * 2),
                    nn.ReLU(), nn.Linear(embed_dim * 2, embed_dim))
                for _ in range(args.decoder_layers)
            ])
        if self.decoder_linear_combination:
            self.weights = nn.ParameterList([
                nn.Parameter(torch.randn(1, 1, embed_dim), requires_grad=True)
                for _ in range(args.decoder_layers)
            ])
        self.adaptive_softmax = None

        self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \
            if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens
                if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), output_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim**-0.5)
        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)
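When decoder_linear_combination is enabled above, each layer owns a learned (1, 1, embed_dim) weight. The forward pass is not part of the snippet, but a plausible reading is that each layer's output is gated element-wise by its weight and the results are summed; the sketch below illustrates that assumption with small sizes.

import torch
import torch.nn as nn

decoder_layers, embed_dim, T, B = 4, 8, 5, 2       # illustrative sizes

weights = nn.ParameterList([
    nn.Parameter(torch.randn(1, 1, embed_dim), requires_grad=True)
    for _ in range(decoder_layers)
])

# stand-ins for the per-layer decoder outputs (T x B x C each)
inner_states = [torch.randn(T, B, embed_dim) for _ in range(decoder_layers)]

# element-wise gate each layer's output with its learned weight, then sum
combined = sum(w * h for w, h in zip(weights, inner_states))   # T x B x C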
Example No. 21
class TransformerDecoder(nn.Module):
    """
    Transformer decoder consisting of *args.decoder_layers* layers. Each layer
    is a :class:`TransformerDecoderLayer`.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        dictionary (~fairseq.data.Dictionary): decoding dictionary
        embed_tokens (torch.nn.Embedding): output embedding
        no_encoder_attn (bool, optional): whether to attend to encoder outputs
            (default: False).
    """

    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
        super().__init__()
        self.register_buffer('version', torch.Tensor([3]))

        self.dictionary = dictionary
        self.onnx_trace = False

        self.dropout = args.dropout
        self.decoder_layerdrop = args.decoder_layerdrop
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        self.output_embed_dim = args.decoder_output_dim

        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

        self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions, embed_dim, self.padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.cross_self_attention = getattr(args, 'cross_self_attention', False)
        self.layer_wise_attention = getattr(args, 'layer_wise_attention', False)

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \
            if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), self.output_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5)

        if args.decoder_normalize_before and not getattr(args, 'no_decoder_final_norm', False):
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None
        if getattr(args, 'layernorm_embedding', False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None

    def forward(
        self,
        prev_output_tokens,
        encoder_out=None,
        incremental_state=None,
        features_only=False,
        **extra_args
    ):
        """
        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for teacher forcing
            encoder_out (optional): output from the encoder, used for
                encoder-side attention
            incremental_state (dict): dictionary used for storing state during
                :ref:`Incremental decoding`
            features_only (bool, optional): only return features without
                applying output layer (default: False).

        Returns:
            tuple:
                - the decoder's output of shape `(batch, tgt_len, vocab)`
                - a dictionary with any model-specific outputs
        """
        x, extra = self.extract_features(
            prev_output_tokens,
            encoder_out=encoder_out,
            incremental_state=incremental_state,
            **extra_args
        )
        if not features_only:
            x = self.output_layer(x)
        return x, extra

    def extract_features(
        self,
        prev_output_tokens,
        encoder_out=None,
        incremental_state=None,
        full_context_alignment=False,
        alignment_layer=None,
        alignment_heads=None,
        **unused,
    ):
        """
        Similar to *forward* but only return features.

        Includes several features from "Jointly Learning to Align and
        Translate with Transformer Models" (Garg et al., EMNLP 2019).

        Args:
            full_context_alignment (bool, optional): don't apply
                auto-regressive mask to self-attention (default: False).
            alignment_layer (int, optional): return mean alignment over
                heads at this layer (default: last layer).
            alignment_heads (int, optional): only average alignment over
                this many heads (default: all heads).

        Returns:
            tuple:
                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
                - a dictionary with any model-specific outputs
        """
        if alignment_layer is None:
            alignment_layer = len(self.layers) - 1

        # embed positions
        positions = self.embed_positions(
            prev_output_tokens,
            incremental_state=incremental_state,
        ) if self.embed_positions is not None else None

        if incremental_state is not None:
            prev_output_tokens = prev_output_tokens[:, -1:]
            if positions is not None:
                positions = positions[:, -1:]

        # embed tokens and positions
        x = self.embed_scale * self.embed_tokens(prev_output_tokens)

        if self.project_in_dim is not None:
            x = self.project_in_dim(x)

        if positions is not None:
            x += positions

        if self.layernorm_embedding:
            x = self.layernorm_embedding(x)

        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        self_attn_padding_mask = None
        if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any():
            self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx)

        # decoder layers
        attn = None
        inner_states = [x]
        for idx, layer in enumerate(self.layers):
            encoder_state = None
            if encoder_out is not None:
                if self.layer_wise_attention:
                    encoder_state = encoder_out.encoder_states[idx]
                else:
                    encoder_state = encoder_out.encoder_out

            if incremental_state is None and not full_context_alignment:
                self_attn_mask = self.buffered_future_mask(x)
            else:
                self_attn_mask = None

            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = random.uniform(0, 1)
            if not self.training or (dropout_probability > self.decoder_layerdrop):
                x, layer_attn = layer(
                    x,
                    encoder_state,
                    encoder_out.encoder_padding_mask if encoder_out is not None else None,
                    incremental_state,
                    self_attn_mask=self_attn_mask,
                    self_attn_padding_mask=self_attn_padding_mask,
                    need_attn=(idx == alignment_layer),
                    need_head_weights=(idx == alignment_layer),
                )
                inner_states.append(x)
                if layer_attn is not None and idx == alignment_layer:
                    attn = layer_attn.float()

        if attn is not None:
            if alignment_heads is not None:
                attn = attn[:alignment_heads]

            # average probabilities over heads
            attn = attn.mean(dim=0)

        if self.layer_norm:
            x = self.layer_norm(x)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        if self.project_out_dim is not None:
            x = self.project_out_dim(x)

        return x, {'attn': attn, 'inner_states': inner_states}

    def output_layer(self, features, **kwargs):
        """Project features to the vocabulary size."""
        if self.adaptive_softmax is None:
            # project back to size of vocabulary
            if self.share_input_output_embed:
                return F.linear(features, self.embed_tokens.weight)
            else:
                return F.linear(features, self.embed_out)
        else:
            return features

    def get_normalized_probs(self, net_output, log_probs, sample):
        """Get normalized probabilities (or log probs) from a net's output."""

        if hasattr(self, 'adaptive_softmax') and self.adaptive_softmax is not None:
            if sample is not None:
                assert 'target' in sample
                target = sample['target']
            else:
                target = None
            out = self.adaptive_softmax.get_log_prob(net_output[0], target=target)
            return out.exp_() if not log_probs else out

        logits = net_output[0]
        if log_probs:
            return utils.log_softmax(logits, dim=-1, onnx_trace=self.onnx_trace)
        else:
            return utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace)

    def max_positions(self):
        """Maximum output length supported by the decoder."""
        if self.embed_positions is None:
            return self.max_target_positions
        return min(self.max_target_positions, self.embed_positions.max_positions())

    def buffered_future_mask(self, tensor):
        dim = tensor.size(0)
        if (
            not hasattr(self, '_future_mask')
            or self._future_mask is None
            or self._future_mask.device != tensor.device
            or self._future_mask.size(0) < dim
        ):
            self._future_mask = torch.triu(utils.fill_with_neg_inf(tensor.new(dim, dim)), 1)
        return self._future_mask[:dim, :dim]

    def upgrade_state_dict(self, state_dict):
        """Upgrade a (possibly old) state dict for new versions of fairseq."""
        return state_dict

    def prepare_for_onnx_export_(self):
        self.onnx_trace = True

    def upgrade_state_dict_named(self, state_dict, name):
        """Upgrade a (possibly old) state dict for new versions of fairseq."""
        if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
            weights_key = '{}.embed_positions.weights'.format(name)
            if weights_key in state_dict:
                del state_dict[weights_key]
            state_dict['{}.embed_positions._float_tensor'.format(name)] = torch.FloatTensor(1)

        for i in range(len(self.layers)):
            # update layer norms
            layer_norm_map = {
                '0': 'self_attn_layer_norm',
                '1': 'encoder_attn_layer_norm',
                '2': 'final_layer_norm'
            }
            for old, new in layer_norm_map.items():
                for m in ('weight', 'bias'):
                    k = '{}.layers.{}.layer_norms.{}.{}'.format(name, i, old, m)
                    if k in state_dict:
                        state_dict['{}.layers.{}.{}.{}'.format(name, i, new, m)] = state_dict[k]
                        del state_dict[k]

        version_key = '{}.version'.format(name)
        if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) <= 2:
            # earlier checkpoints did not normalize after the stack of layers
            self.layer_norm = None
            self.normalize = False
            state_dict[version_key] = torch.Tensor([1])

        return state_dict

    def reorder_incremental_state(self, incremental_state, new_order):
        """Reorder incremental state.

        This should be called when the order of the input has changed from the
        previous time step. A typical use case is beam search, where the input
        order changes between time steps based on the selection of beams.
        """
        seen = set()

        def apply_reorder_incremental_state(module):
            if module != self and hasattr(module, 'reorder_incremental_state') \
                    and module not in seen:
                seen.add(module)
                module.reorder_incremental_state(incremental_state, new_order)

        self.apply(apply_reorder_incremental_state)

    def set_beam_size(self, beam_size):
        """Sets the beam size in the decoder and all children."""
        if getattr(self, '_beam_size', -1) != beam_size:
            seen = set()

            def apply_set_beam_size(module):
                if module != self and hasattr(module, 'set_beam_size') \
                        and module not in seen:
                    seen.add(module)
                    module.set_beam_size(beam_size)

            self.apply(apply_set_beam_size)
            self._beam_size = beam_size
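The buffered_future_mask above caches an upper-triangular matrix whose entries above the diagonal are -inf (fairseq's utils.fill_with_neg_inf fills a tensor with -inf). A minimal standalone equivalent in plain torch:

import torch

dim = 4
future_mask = torch.triu(torch.full((dim, dim), float("-inf")), diagonal=1)
# tensor([[0., -inf, -inf, -inf],
#         [0.,   0., -inf, -inf],
#         [0.,   0.,   0., -inf],
#         [0.,   0.,   0.,   0.]])
# added to the attention scores, it blocks each position from attending to the future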
Example No. 22
    def __init__(
        self,
        args,
        src_dict,
        dst_dict,
        embed_tokens,
        no_encoder_attn=False,
        left_pad=False,
        final_norm=True,
    ):
        super().__init__(dst_dict)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(
            embed_dim)  # todo: try with input_embed_dim

        self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                               if embed_dim != input_embed_dim else None)

        self.embed_positions = fairseq_transformer.PositionalEmbedding(
            1024, embed_dim, padding_idx, learned=args.decoder_learned_pos)

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerAANDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.bottleneck_layer = None
        out_embed_dim = embed_dim
        if args.decoder_out_embed_dim is not None:
            assert (
                not args.share_all_embeddings
                and not args.share_decoder_input_output_embed
            ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
            self.bottleneck_layer = Linear(embed_dim,
                                           args.decoder_out_embed_dim)
            out_embed_dim = args.decoder_out_embed_dim

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dst_dict),
                out_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens
                if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dst_dict), out_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim**-0.5)
        self.register_buffer("version", torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)

        self.vocab_reduction_module = None
        if args.vocab_reduction_params:
            assert (
                self.adaptive_softmax is None
            ), "vocabulary reduction not compatible with adaptive softmax!"
            self.vocab_reduction_module = vocab_reduction.VocabReduction(
                src_dict,
                dst_dict,
                args.vocab_reduction_params,
                fp16=args.fp16)

        self.onnx_trace = False
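
The layers above are TransformerAANDecoderLayer modules: decoder self-attention is replaced by an average attention network (AAN), whose core is a cumulative average over the preceding target positions followed by a gated feed-forward block. The layer itself is not included in this snippet; the following is a minimal sketch of just the averaging step, with illustrative names:

import torch

def cumulative_average(x):
    """Average-attention summary: position t sees the mean of positions 1..t.

    x: (batch, tgt_len, dim) -> (batch, tgt_len, dim)
    """
    steps = torch.arange(1, x.size(1) + 1, device=x.device, dtype=x.dtype)
    return x.cumsum(dim=1) / steps.view(1, -1, 1)
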
Example #23
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False,
                 left_pad=False, final_norm=True):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        output_embed_dim = args.decoder_output_dim

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim

        self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions, embed_dim, padding_idx,
            left_pad=left_pad,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        """
        MODIFIED: add copying mechanism as a separate multi-head attention
        """
        if args.use_copy_scores:
            assert not no_encoder_attn, \
                "copy scores cannot be computed if " \
                "there is no encoder-decoder attention"
            # Number of heads in copy attention layer is an optional argument
            self.copy_attention_heads = (args.decoder_attention_heads
                                         if args.copy_attention_heads == 0
                                         else args.copy_attention_heads)
            self.copy_attention = MultiheadAttention(
                embed_dim, self.copy_attention_heads,
                dropout=args.attention_dropout,
            )
            # (NOTE) For computing alpha.
            self.copy_balancing_layer = Linear(input_embed_dim, 1)
            if args.decode_with_edit_labels:
                raise NotImplementedError
        else:
            self.copy_attention = None
            self.copy_balancing_layer = None

        # Alpha scheduler & diagnostic checker
        self.alpha_warmup = args.alpha_warmup
        self.num_batches = 0
        self.num_copies = 0
        self.mean_alpha = 0.0

        # Zero out generative probability of a word if also in source sentence
        self.pad_copied_words = args.pad_copied_words

        self.adaptive_softmax = None

        self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \
            if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            """
            MODIFIED: require share_input_output_embed for copying mechanism
            """
            raise NotImplementedError(
                "copying mechanism requires share_input_output_embed"
            )
        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)
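
The copy attention and balancing layer registered above are typically combined at output time in pointer-generator fashion: a per-position gate mixes the generative softmax with the copy-attention weights scattered onto the source token ids. The helper below is only an illustrative sketch of that mixing step; the names (mix_copy_and_generate, gen_logits, copy_attn, alpha) are assumptions and do not come from this decoder's forward pass.

import torch

def mix_copy_and_generate(gen_logits, copy_attn, src_tokens, alpha):
    """Sketch of pointer-generator mixing (names are illustrative).

    gen_logits: (batch, tgt_len, vocab)   decoder output logits
    copy_attn:  (batch, tgt_len, src_len) copy-attention weights (sum to 1 over src)
    src_tokens: (batch, src_len)          source token ids
    alpha:      (batch, tgt_len, 1)       copy gate in [0, 1]
    """
    p_gen = torch.softmax(gen_logits, dim=-1)
    # scatter the copy weights onto the vocabulary positions of the source tokens
    index = src_tokens.unsqueeze(1).expand(-1, copy_attn.size(1), -1)
    p_copy = torch.zeros_like(p_gen).scatter_add(-1, index, copy_attn)
    return (1.0 - alpha) * p_gen + alpha * p_copy
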
Example #24
    def __init__(self, args, src_dict, dst_dict, embed_tokens):
        super().__init__(dst_dict)
        self.dropout = args.dropout
        self.decoder_layerdrop = 0
        if hasattr(args, "decoder_layerdrop") and args.decoder_layerdrop > 0:
            self.decoder_layerdrop = args.decoder_layerdrop

        self.share_input_output_embed = args.share_decoder_input_output_embed

        embed_dim = embed_tokens.embedding_dim
        padding_idx = embed_tokens.padding_idx

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = fairseq_transformer.PositionalEmbedding(
            1024, embed_dim, padding_idx, learned=args.decoder_learned_pos)

        self.aan = args.aan
        decoder_layer_class = (AANDecoderLayer if self.aan else
                               fairseq_transformer.TransformerDecoderLayer)

        self.layers = nn.ModuleList([])
        self.layers.extend(
            [decoder_layer_class(args) for i in range(args.decoder_layers)])
        if hasattr(args,
                   "decoder_layers_to_keep") and args.decoder_layers_to_keep:
            layers_to_keep = sorted(
                int(x) for x in args.decoder_layers_to_keep.split(","))
            self.decoder_layers_to_keep = {
                layer_id: layer_idx
                for layer_idx, layer_id in enumerate(layers_to_keep)
            }

        self.adaptive_softmax = None

        self.bottleneck_layer = None
        out_embed_dim = embed_dim
        if args.decoder_out_embed_dim is not None:
            assert (
                not args.share_all_embeddings
                and not args.share_decoder_input_output_embed
            ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
            self.bottleneck_layer = fairseq_transformer.Linear(
                embed_dim, args.decoder_out_embed_dim)
            out_embed_dim = args.decoder_out_embed_dim

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dst_dict),
                out_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.dropout,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dst_dict), out_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim**-0.5)

        self.vocab_reduction_module = None
        if args.vocab_reduction_params:
            assert (
                self.adaptive_softmax is None
            ), "vocabulary reduction not compatible with adaptive softmax!"
            self.vocab_reduction_module = vocab_reduction.VocabReduction(
                src_dict,
                dst_dict,
                args.vocab_reduction_params,
                fp16=args.fp16)

        self.onnx_trace = False

        # Use quantizable nn.Linear for output projection instead of F.linear
        self.output_projection = None
        if self.vocab_reduction_module is None:
            if self.share_input_output_embed:
                self.output_projection = nn.Linear(
                    self.embed_tokens.weight.shape[1],
                    self.embed_tokens.weight.shape[0])
                self.output_projection.weight = self.embed_tokens.weight
            else:
                self.output_projection = nn.Linear(self.embed_out.shape[1],
                                                   self.embed_out.shape[0])
                self.output_projection.weight = self.embed_out
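
When share_input_output_embed is set, assigning embed_tokens.weight to output_projection.weight means the quantizable nn.Linear computes the same logits as F.linear against the embedding matrix. A small self-contained check of that equivalence (illustrative; it uses bias=False, whereas the snippet above keeps nn.Linear's default bias):

import torch
import torch.nn as nn
import torch.nn.functional as F

emb = nn.Embedding(100, 16)              # toy vocabulary of 100, dim 16
proj = nn.Linear(16, 100, bias=False)
proj.weight = emb.weight                 # share the same Parameter, as above
x = torch.randn(2, 5, 16)                # (batch, tgt_len, dim)
assert torch.allclose(proj(x), F.linear(x, emb.weight))
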
Example #25
    def __init__(
            self, dictionary, embed_dim=512, embed_dict=None, out_embed_dim=256,
            max_positions=1024, convolutions=((512, 3),) * 20, attention=True,
            dropout=0.1, share_embed=False, positional_embeddings=True,
            adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0,
            left_pad=False,
    ):
        super().__init__(dictionary)
        self.register_buffer('version', torch.Tensor([2]))
        self.dropout = dropout
        self.left_pad = left_pad
        self.need_attn = True

        convolutions = extend_conv_spec(convolutions)
        in_channels = convolutions[0][0]
        if isinstance(attention, bool):
            # expand True into [True, True, ...] and do the same with False
            attention = [attention] * len(convolutions)
        if not isinstance(attention, list) or len(attention) != len(convolutions):
            raise ValueError('Attention is expected to be a list of booleans of '
                             'length equal to the number of layers.')

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if embed_dict:
            self.embed_tokens = utils.load_embedding(embed_dict, self.dictionary, self.embed_tokens)

        self.embed_positions = PositionalEmbedding(
            max_positions,
            embed_dim,
            padding_idx,
            left_pad=self.left_pad,
        ) if positional_embeddings else None

        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        self.attention = nn.ModuleList()
        self.residuals = []

        layer_in_channels = [in_channels]
        for i, (out_channels, kernel_size, residual) in enumerate(convolutions):
            if residual == 0:
                residual_dim = out_channels
            else:
                residual_dim = layer_in_channels[-residual]
            self.projections.append(Linear(residual_dim, out_channels)
                                    if residual_dim != out_channels else None)
            self.convolutions.append(
                LinearizedConv1d(in_channels, out_channels * 2, kernel_size,
                                 padding=(kernel_size - 1), dropout=dropout)
            )
            self.attention.append(AttentionLayer(out_channels, embed_dim)
                                  if attention[i] else None)
            self.residuals.append(residual)
            in_channels = out_channels
            layer_in_channels.append(out_channels)

        self.adaptive_softmax = None
        self.fc2 = self.fc3 = None

        if adaptive_softmax_cutoff is not None:
            assert not share_embed
            self.adaptive_softmax = AdaptiveSoftmax(num_embeddings, in_channels, adaptive_softmax_cutoff,
                                                    dropout=adaptive_softmax_dropout)
        else:
            self.fc2 = Linear(in_channels, out_embed_dim)
            if share_embed:
                assert out_embed_dim == embed_dim, \
                    "Shared embed weights implies same dimensions " \
                    " out_embed_dim={} vs embed_dim={}".format(out_embed_dim, embed_dim)
                self.fc3 = nn.Linear(out_embed_dim, num_embeddings)
                self.fc3.weight = self.embed_tokens.weight
            else:
                self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout)
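
extend_conv_spec is not shown in this snippet. A minimal sketch of what the constructor expects from it, assuming the usual convention of padding each (out_channels, kernel_size) pair with a default residual distance of 1 so the layer loop can unpack three values:

def extend_conv_spec(convolutions):
    """Pad 2-tuples (out_channels, kernel_size) to 3-tuples with residual=1."""
    extended = []
    for spec in convolutions:
        if len(spec) == 3:
            extended.append(spec)
        elif len(spec) == 2:
            extended.append(spec + (1,))
        else:
            raise ValueError('invalid convolution spec: expected 2 or 3 values, '
                             'got {}'.format(len(spec)))
    return tuple(extended)
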
Example #26
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))
        self._future_mask = torch.empty(0)

#        self.dropout = [0.05, 0.1, 0.25, 0.3]
        self.dropout = [0, 0, 0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3]
#        self.dropout = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3]
        self.index = None
        self.decoder_layerdrop = args.decoder_layerdrop
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        self.embed_dim = embed_dim
        self.output_embed_dim = args.decoder_output_dim

        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

#        self.embedding_hidden_mapping_out = SlimmableLinear([int(embed_dim / 4), int(embed_dim * 2 / 4), int(embed_dim * 3 / 4), embed_dim], 
#                                                            [embed_dim, embed_dim, embed_dim,  embed_dim])
#        self.embedding_hidden_mapping_in = SlimmableLinear([embed_dim, embed_dim, embed_dim, embed_dim],
#                                                           [int(embed_dim / 4), int(embed_dim * 2 / 4), int(embed_dim * 3 / 4),  embed_dim])

        # width options: 4/16, 5/16, ..., 15/16 of embed_dim, plus the full embed_dim
        slim_widths = [int(embed_dim * k / 16) for k in range(4, 16)] + [embed_dim]
        self.embedding_hidden_mapping_in = SlimmableLinear(
            [embed_dim] * len(slim_widths), slim_widths)
        self.embedding_hidden_mapping_out = SlimmableLinear(
            slim_widths, [embed_dim] * len(slim_widths))

        self.project_in_dim = (
            Linear(input_embed_dim, embed_dim, bias=False)
            if embed_dim != input_embed_dim
            else None
        )

        self.embed_positions = (
            PositionalEmbedding(
                args.max_target_positions,
                embed_dim,
                self.padding_idx,
                learned=args.decoder_learned_pos,
            )
            if not args.no_token_positional_embeddings
            else None
        )

        self.cross_self_attention = getattr(args, "cross_self_attention", False)
        self.layer_wise_attention = getattr(args, "layer_wise_attention", False)

        self.layers = nn.ModuleList([])
        self.layers.extend(
            [
                TransformerDecoderLayer(args, no_encoder_attn)
                for _ in range(args.decoder_layers)
            ]
        )
        self.num_layers = len(self.layers)

        self.adaptive_softmax = None

        self.project_out_dim = (
            Linear(embed_dim, self.output_embed_dim, bias=False)
            if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights
            else None
        )

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), self.output_embed_dim)
            )
            nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5)

        if args.decoder_normalize_before and not getattr(
            args, "no_decoder_final_norm", False
        ):
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None
        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None
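
SlimmableLinear is not defined in this snippet. A minimal sketch, assuming the usual slimmable-network formulation: a single full-size weight is allocated and sliced down to the (in, out) width pair selected by an index that the owning module switches at run time (the decoder above keeps such an index in self.index):

import torch.nn as nn
import torch.nn.functional as F

class SlimmableLinear(nn.Linear):
    """Sketch: a Linear whose active (in, out) width is chosen by `index`."""

    def __init__(self, in_features_list, out_features_list, bias=True):
        super().__init__(max(in_features_list), max(out_features_list), bias=bias)
        self.in_features_list = in_features_list
        self.out_features_list = out_features_list
        self.index = len(in_features_list) - 1   # default: widest configuration

    def forward(self, x):
        in_dim = self.in_features_list[self.index]
        out_dim = self.out_features_list[self.index]
        weight = self.weight[:out_dim, :in_dim]
        bias = self.bias[:out_dim] if self.bias is not None else None
        return F.linear(x, weight, bias)
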
Example #27
    def __init__(self,
                 dictionary,
                 embed_dim=512,
                 hidden_size=512,
                 out_embed_dim=512,
                 num_layers=1,
                 dropout_in=0.1,
                 dropout_out=0.1,
                 attention=True,
                 encoder_output_units=512,
                 pretrained_embed=None,
                 share_input_output_embed=False,
                 adaptive_softmax_cutoff=None,
                 max_target_positions=DEFAULT_MAX_TARGET_POSITIONS):
        super().__init__(dictionary)
        self.dropout_in = dropout_in
        self.dropout_out = dropout_out
        self.hidden_size = hidden_size
        self.share_input_output_embed = share_input_output_embed
        self.need_attn = True
        self.max_target_positions = max_target_positions

        self.adaptive_softmax = None
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        if pretrained_embed is None:
            self.embed_tokens = Embedding(num_embeddings, embed_dim,
                                          padding_idx)
        else:
            self.embed_tokens = pretrained_embed

        self.encoder_output_units = encoder_output_units
        if encoder_output_units != hidden_size and encoder_output_units != 0:
            self.encoder_hidden_proj = Linear(encoder_output_units,
                                              hidden_size)
            self.encoder_cell_proj = Linear(encoder_output_units, hidden_size)
        else:
            self.encoder_hidden_proj = self.encoder_cell_proj = None

        # disable input feeding if there is no encoder
        # input feeding is described in arxiv.org/abs/1508.04025
        input_feed_size = 0 if encoder_output_units == 0 else hidden_size
        self.layers = nn.ModuleList([
            LSTMCell(
                input_size=input_feed_size +
                embed_dim if layer == 0 else hidden_size,
                hidden_size=hidden_size,
            ) for layer in range(num_layers)
        ])
        if attention:
            # TODO make bias configurable
            self.attention = AttentionLayer(hidden_size,
                                            encoder_output_units,
                                            hidden_size,
                                            bias=False)
        else:
            self.attention = None
        if hidden_size != out_embed_dim:
            self.additional_fc = Linear(hidden_size, out_embed_dim)
        if adaptive_softmax_cutoff is not None:
            # setting adaptive_softmax dropout to dropout_out for now but can be redefined
            self.adaptive_softmax = AdaptiveSoftmax(num_embeddings,
                                                    hidden_size,
                                                    adaptive_softmax_cutoff,
                                                    dropout=dropout_out)
        elif not self.share_input_output_embed:
            self.fc_out = Linear(out_embed_dim,
                                 num_embeddings,
                                 dropout=dropout_out)
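
The input_feed_size logic above implements input feeding (arxiv.org/abs/1508.04025): the previous step's attentional output is concatenated with the current token embedding before the first LSTM layer. Below is a simplified, standalone sketch of a single decoding step; the names are illustrative and this is not the decoder's actual forward():

import torch
import torch.nn.functional as F

def input_feed_step(layers, attention, emb_t, input_feed, hiddens, cells,
                    encoder_outs, encoder_padding_mask, dropout_out, training=True):
    """One step: emb_t (B, E), input_feed (B, H), hiddens/cells are lists of (B, H)."""
    x = torch.cat([emb_t, input_feed], dim=1)
    for i, rnn in enumerate(layers):
        hiddens[i], cells[i] = rnn(x, (hiddens[i], cells[i]))
        x = F.dropout(hiddens[i], p=dropout_out, training=training)
    if attention is not None:
        x, attn_scores = attention(x, encoder_outs, encoder_padding_mask)
    return x, hiddens, cells   # x is fed back as input_feed at the next step
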
Example #28
    def __init__(self,
                 args,
                 dictionary,
                 embed_tokens,
                 embed_scale=None,
                 no_encoder_attn=False,
                 left_pad=False,
                 final_norm=True,
                 light=False,
                 masker=False):
        super().__init__(dictionary)
        self.args = args
        self.light = light
        self.masker = masker
        self.pnet = getattr(args, "pnet", False)
        self.pnet2 = getattr(args, "pnet2", False)
        if self.pnet2:
            assert not self.light
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        self.embed_dim = args.decoder_embed_dim
        output_embed_dim = args.decoder_output_dim

        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(
            self.embed_dim) if embed_scale is None else embed_scale

        self.project_in_dim = nn.Linear(
            input_embed_dim, self.embed_dim,
            bias=False) if self.embed_dim != input_embed_dim else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions,
            self.embed_dim,
            self.padding_idx,
            #learned=args.decoder_learned_pos,
        ) if not args.no_dec_token_positional_embeddings else None

        if hasattr(args,
                   "decoding_iterations") and args.decoding_iterations > 0:
            args.refinetot = args.decoding_iterations

        self.selected_decoder = 0
        if self.light:
            self.layers = nn.ModuleList()
            self.layers.extend([
                TransformerDecoderLayer(args, no_encoder_attn)
                for _ in range(args.decoder_layers)
            ])
            # self.layers.requires_grad_(False)

            self.last_decoder_layers = nn.ModuleList()
            self.last_decoder_layers.extend([
                TransformerDecoderLayer(args, no_encoder_attn)
                for _ in range(args.refinetot)
            ])
        else:
            self.layers = None  # switchable
            self.layer_stack = nn.ModuleList()
            for _ in range(args.refinetot):
                layers = nn.ModuleList([])
                layers.extend([
                    TransformerDecoderLayer(args, no_encoder_attn)
                    for _ in range(args.decoder_layers)
                ])
                self.layer_stack.append(layers)

        if self.pnet:
            self.pnet_layers = nn.ModuleList([
                TransformerDecoderLayer(args, no_encoder_attn)
                for _ in range(3)
            ])
            self.pnet_pred = nn.Linear(self.embed_dim, 1)

        if self.pnet2:
            self.pnet_stack = nn.ModuleList()
            self.pnet_stack.extend([
                nn.Linear(self.embed_dim, 1)
                for _ in range(args.decoder_layers)
            ])

        if self.masker:
            self.masker_stack = nn.ModuleList()
            if hasattr(self.args, "masker_light") and self.args.masker_light:
                masker_layers = 1
            else:
                masker_layers = args.refinetot - 1
            for _ in range(masker_layers):
                layers = nn.ModuleList([])
                layers.extend([
                    TransformerDecoderLayer(args, no_encoder_attn)
                    for _ in range(3)
                ])
                self.masker_stack.append(layers)
            self.masker_predict_stack = nn.ModuleList()
            for _ in range(masker_layers):
                self.masker_predict_stack.append(
                    nn.Linear(args.decoder_output_dim, 1))

        self.adaptive_softmax = None

        self.project_out_dim = nn.Linear(self.embed_dim, output_embed_dim, bias=False) \
            if self.embed_dim != output_embed_dim and not args.tie_adaptive_weights else None

        self.load_softmax = not getattr(args, 'remove_head', False)

        if self.load_softmax:
            if args.adaptive_softmax_cutoff is not None:
                self.adaptive_softmax = AdaptiveSoftmax(
                    len(dictionary),
                    output_embed_dim,
                    options.eval_str_list(args.adaptive_softmax_cutoff,
                                          type=int),
                    dropout=args.adaptive_softmax_dropout,
                    adaptive_inputs=embed_tokens
                    if args.tie_adaptive_weights else None,
                    factor=args.adaptive_softmax_factor,
                    tie_proj=args.tie_adaptive_proj,
                )
            elif not self.share_input_output_embed:
                self.embed_out = nn.Parameter(
                    torch.Tensor(len(dictionary), output_embed_dim))
                #nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)
        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = BertLayerNorm(self.embed_dim)
        if not self.share_input_output_embed:
            self.embed_out.requires_grad_(False)
        self.embed_tokens.requires_grad_(False)
        if self.pnet:
            for name, param in self.named_parameters():
                if "pnet" not in name:
                    param.requires_grad_(False)
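
The final loop above freezes every parameter whose name does not contain "pnet", so only the pnet branch remains trainable. The same selective-freezing pattern in isolation, as a toy, self-contained illustration (the module names are made up):

import torch.nn as nn

model = nn.ModuleDict({
    'pnet_pred': nn.Linear(8, 1),
    'backbone': nn.Linear(8, 8),
})
for name, param in model.named_parameters():
    param.requires_grad_('pnet' in name)

assert model['pnet_pred'].weight.requires_grad
assert not model['backbone'].weight.requires_grad
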
Example #29
    def __init__(self, args, dictionary, embed_tokens):
        self.args = args
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))
        self._future_mask = torch.empty(0)

        self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
        self.dropword_module = FairseqFeatureDropout(args.word_dropout, module_name=self.__class__.__name__)
        self.decoder_layerdrop = args.decoder_layerdrop
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        self.output_embed_dim = args.decoder_output_dim

        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

        self.project_in_dim = (
            Linear(input_embed_dim, embed_dim, bias=False)
            if embed_dim != input_embed_dim
            else None
        )

        self.embed_positions = (
            PositionalEmbedding(
                args.max_target_positions,
                embed_dim,
                self.padding_idx,
                learned=args.decoder_learned_pos,
            )
            if not args.no_token_positional_embeddings
            else None
        )

        assert not args.layernorm_embedding or not args.decoder_normalize_before

        if args.layernorm_embedding:
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None

        if self.decoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend([self.build_decoder_layer(i, args) for i in range(args.decoder_layers)])
        self.num_layers = len(self.layers)

        if args.decoder_normalize_before:
            self.layer_norm = LayerNorm(embed_dim)
            self.proj_layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None
            self.proj_layer_norm = None

        self.project_out_dim = (
            Linear(embed_dim, self.output_embed_dim, bias=False)
            if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None
        )

        self.adaptive_softmax = None
        self.output_projection = None
        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif self.share_input_output_embed:
            self.output_projection = nn.Linear(
                self.embed_tokens.weight.shape[1],
                self.embed_tokens.weight.shape[0],
                bias=False,
            )
            self.output_projection.weight = self.embed_tokens.weight
        else:
            self.output_projection = nn.Linear(
                self.output_embed_dim, len(dictionary), bias=False
            )
            nn.init.normal_(
                self.output_projection.weight, mean=0, std=self.output_embed_dim ** -0.5
            )
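
LayerDropModuleList, used above when decoder_layerdrop > 0, implements LayerDrop: during training each layer is skipped independently with probability p, while at inference every layer runs. A minimal sketch of that behaviour (the class below is illustrative, not fairseq's implementation):

import torch
import torch.nn as nn

class LayerDropList(nn.ModuleList):
    """Iterating yields each layer only if it survives the drop on this pass."""

    def __init__(self, p, modules=None):
        super().__init__(modules)
        self.p = p

    def __iter__(self):
        dropout_probs = torch.empty(len(self)).uniform_()
        for i, layer in enumerate(super().__iter__()):
            if not self.training or dropout_probs[i] > self.p:
                yield layer
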
Example #30
    def __init__(
        self,
        rnn_type: Union[str, Namespace],
        dictionary,
        embed_dim=512,
        hidden_size=512,
        out_embed_dim=512,
        num_layers=1,
        dropout_in=0.1,
        dropout_out=0.1,
        attention=True,
        attention_bias=False,  # todo
        encoder_output_units=512,
        pretrained_embed=None,
        share_input_output_embed=False,
        adaptive_softmax_cutoff=None,
        max_target_positions=DEFAULT_MAX_TARGET_POSITIONS,
        residuals=False,
    ):
        super().__init__(dictionary)
        rnn_type = rnn_type.rnn_type if isinstance(rnn_type,
                                                   Namespace) else rnn_type
        self.rnn_type = rnn_type.lower().strip()
        self.is_lstm = self.rnn_type == 'lstm'  # convenience flag for handling the LSTM cell state later
        self.dropout_in_module = FairseqDropout(
            dropout_in, module_name=self.__class__.__name__)
        self.dropout_out_module = FairseqDropout(
            dropout_out, module_name=self.__class__.__name__)
        self.hidden_size = hidden_size
        self.share_input_output_embed = share_input_output_embed
        self.need_attn = True
        self.max_target_positions = max_target_positions
        self.residuals = residuals
        self.num_layers = num_layers

        self.adaptive_softmax = None
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        if pretrained_embed is None:
            self.embed_tokens = JqEmbedding(num_embeddings, embed_dim,
                                            padding_idx)
        else:
            self.embed_tokens = pretrained_embed

        self.encoder_output_units = encoder_output_units
        if encoder_output_units != hidden_size and encoder_output_units != 0:
            self.encoder_hidden_proj = Linear(encoder_output_units,
                                              hidden_size)
            self.encoder_cell_proj = Linear(
                encoder_output_units, hidden_size) if self.is_lstm else None
        else:
            self.encoder_hidden_proj = self.encoder_cell_proj = None

        # disable input feeding if there is no encoder
        # input feeding is described in arxiv.org/abs/1508.04025
        input_feed_size = 0 if encoder_output_units == 0 else hidden_size

        # get_rnn_cell returns the RNN classes; the first item (the full JQRNN
        # module) is unused here, only the per-step cell class is kept.
        _, JQRNNCell = get_rnn_cell(self.rnn_type)

        self.layers = nn.ModuleList([
            JQRNNCell(
                input_size=input_feed_size +
                embed_dim if layer == 0 else hidden_size,
                hidden_size=hidden_size,
            ) for layer in range(num_layers)
        ])

        if attention:
            # TODO make bias configurable: Done
            self.attention = AttentionLayer(hidden_size,
                                            encoder_output_units,
                                            hidden_size,
                                            bias=attention_bias)
        else:
            self.attention = None

        if hidden_size != out_embed_dim:
            self.additional_fc = Linear(hidden_size, out_embed_dim)

        if adaptive_softmax_cutoff is not None:
            # setting adaptive_softmax dropout to dropout_out for now but can be redefined
            self.adaptive_softmax = AdaptiveSoftmax(
                num_embeddings,
                hidden_size,
                adaptive_softmax_cutoff,
                dropout=dropout_out,
            )
        elif not self.share_input_output_embed:
            self.fc_out = Linear(out_embed_dim,
                                 num_embeddings,
                                 dropout=dropout_out)
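
get_rnn_cell is an external helper not shown here. Given how its result is unpacked earlier in this constructor (the full RNN class is discarded and only the per-step cell class is kept), a plausible minimal version could look like the following; the exact mapping is an assumption:

import torch.nn as nn

def get_rnn_cell(rnn_type):
    """Map an rnn_type string to (full RNN module class, single-step cell class)."""
    mapping = {
        'lstm': (nn.LSTM, nn.LSTMCell),
        'gru': (nn.GRU, nn.GRUCell),
        'rnn': (nn.RNN, nn.RNNCell),
    }
    return mapping[rnn_type.lower().strip()]
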