class DenseTransformerDecoder(FairseqIncrementalDecoder):
    """
    Transformer decoder consisting of *args.decoder_layers* layers. Each layer
    is a :class:`TransformerDecoderLayer`.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        dictionary (~fairseq.data.Dictionary): decoding dictionary
        embed_tokens (torch.nn.Embedding): output embedding
        no_encoder_attn (bool, optional): whether to attend to encoder outputs
            (default: False).
        left_pad (bool, optional): whether the input is left-padded
            (default: False).
        final_norm (bool, optional): apply layer norm to the output of the
            final decoder layer (default: True).
    """

    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False,
                 left_pad=False, final_norm=True):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        output_embed_dim = args.decoder_output_dim

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim

        self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) \
            if embed_dim != input_embed_dim else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions, embed_dim, padding_idx,
            left_pad=left_pad,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        # create decoder layer history
        self.history = CreateLayerHistory(args, is_encoder=False)

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \
            if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), output_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)
        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)

    def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None):
        """
        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for input feeding/teacher forcing
            encoder_out (Tensor, optional): output from the encoder, used for
                encoder-side attention
            incremental_state (dict): dictionary used for storing state during
                :ref:`Incremental decoding`

        Returns:
            tuple:
                - the last decoder layer's output of shape `(batch, tgt_len,
                  vocab)`
                - the last decoder layer's attention weights of shape `(batch,
                  tgt_len, src_len)`
        """
        # embed positions
        if self.history is not None:
            self.history.clean()

        positions = self.embed_positions(
            prev_output_tokens,
            incremental_state=incremental_state,
        ) if self.embed_positions is not None else None

        if incremental_state is not None:
            prev_output_tokens = prev_output_tokens[:, -1:]
            if positions is not None:
                positions = positions[:, -1:]
        # embed tokens and positions
        x = self.embed_scale * self.embed_tokens(prev_output_tokens)

        if self.project_in_dim is not None:
            x = self.project_in_dim(x)

        if positions is not None:
            x += positions
        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)
        attn = None

        inner_states = [x]

        # add emb into history
        if self.history is not None:
            self.history.add(x)

        # decoder layers
        for layer in self.layers:
            if self.history is not None:
                x = self.history.pop()
            x, attn = layer(
                x,
                encoder_out['encoder_out'] if encoder_out is not None else None,
                encoder_out['encoder_padding_mask'] if encoder_out is not None else None,
                incremental_state,
                self_attn_mask=self.buffered_future_mask(x) if incremental_state is None else None,
            )
            inner_states.append(x)
            if self.history is not None:
                self.history.add(x)

        if self.history is not None:
            x = self.history.pop()

        if self.normalize:
            x = self.layer_norm(x)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        if self.project_out_dim is not None:
            x = self.project_out_dim(x)

        if self.adaptive_softmax is None:
            # project back to size of vocabulary
            if self.share_input_output_embed:
                x = F.linear(x, self.embed_tokens.weight)
            else:
                x = F.linear(x, self.embed_out)

        return x, {'attn': attn, 'inner_states': inner_states}

    def max_positions(self):
        """Maximum output length supported by the decoder."""
        if self.embed_positions is None:
            return self.max_target_positions
        return min(self.max_target_positions, self.embed_positions.max_positions())

    def buffered_future_mask(self, tensor):
        dim = tensor.size(0)
        if not hasattr(self, '_future_mask') or self._future_mask is None \
                or self._future_mask.device != tensor.device:
            self._future_mask = torch.triu(
                utils.fill_with_neg_inf(tensor.new(dim, dim)), 1)
        if self._future_mask.size(0) < dim:
            self._future_mask = torch.triu(
                utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1)
        return self._future_mask[:dim, :dim]

    def upgrade_state_dict_named(self, state_dict, name):
        """Upgrade a (possibly old) state dict for new versions of fairseq."""
        if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
            weights_key = '{}.embed_positions.weights'.format(name)
            if weights_key in state_dict:
                del state_dict[weights_key]
            state_dict['{}.embed_positions._float_tensor'.format(name)] = torch.FloatTensor(1)

        for i in range(len(self.layers)):
            # update layer norms
            layer_norm_map = {
                '0': 'self_attn_layer_norm',
                '1': 'encoder_attn_layer_norm',
                '2': 'final_layer_norm',
            }
            for old, new in layer_norm_map.items():
                for m in ('weight', 'bias'):
                    k = '{}.layers.{}.layer_norms.{}.{}'.format(name, i, old, m)
                    if k in state_dict:
                        state_dict['{}.layers.{}.{}.{}'.format(name, i, new, m)] = state_dict[k]
                        del state_dict[k]
        if utils.item(state_dict.get('{}.version'.format(name), torch.Tensor([1]))[0]) < 2:
            # earlier checkpoints did not normalize after the stack of layers
            self.layer_norm = None
            self.normalize = False
            state_dict['{}.version'.format(name)] = torch.Tensor([1])

        return state_dict
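# NOTE: hedged sketch, not part of the original module. The helper below only
# rebuilds the causal mask that ``buffered_future_mask`` above caches, without
# ``utils.fill_with_neg_inf`` or the device/size bookkeeping, so the masking
# behaviour is easy to verify in isolation.
def _example_future_mask(dim=4):
    """Illustrative only: -inf above the main diagonal, 0 elsewhere, so each
    target position may attend only to itself and earlier positions.

    _example_future_mask(3) ->
    tensor([[0., -inf, -inf],
            [0.,   0., -inf],
            [0.,   0.,   0.]])
    """
    return torch.triu(torch.full((dim, dim), float('-inf')), diagonal=1)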
class DlclTransformerDecoder(FairseqIncrementalDecoder):
    """Transformer decoder."""

    def __init__(self, args, dictionary, embed_tokens, left_pad=False):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        embed_dim = embed_tokens.embedding_dim
        padding_idx = embed_tokens.padding_idx

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = PositionalEmbedding(
            1024, embed_dim, padding_idx,
            left_pad=left_pad,
            learned=args.decoder_learned_pos,
        )

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerDecoderLayer(args)
            for i in range(args.decoder_layers)
        ])

        self.history = CreateLayerHistory(args, is_encoder=False)

        self.normalize = args.decoder_normalize_before
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)

        if not self.share_input_output_embed:
            self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=embed_dim ** -0.5)

        self.inspected_grads = OrderedDict() if getattr(args, 'inspect_grad', False) else None

    def forward(self, prev_output_tokens, encoder_out, incremental_state=None):
        self.history.clean()

        # embed positions
        positions = self.embed_positions(
            prev_output_tokens,
            incremental_state=incremental_state,
        )

        if incremental_state is not None:
            prev_output_tokens = prev_output_tokens[:, -1:]
            positions = positions[:, -1:]

        # embed tokens and positions
        x = self.embed_scale * self.embed_tokens(prev_output_tokens)
        x += positions
        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        util.inspect_grad('decoder_0', x, self.inspected_grads)
        # push embedding layer into memory
        self.history.add(x)

        # decoder layers
        for layer_id, layer in enumerate(self.layers):
            # read from memory
            x = self.history.pop()
            x, attn = layer(
                x,
                encoder_out['encoder_out'],
                encoder_out['encoder_padding_mask'],
                incremental_state,
            )
            # write into memory
            self.history.add(x)
            util.inspect_grad('decoder_%d' % (layer_id + 1), x, self.inspected_grads)

        # read from memory
        x = self.history.pop()

        if self.normalize:
            x = self.layer_norm(x)
        util.inspect_grad('decoder_top', x, self.inspected_grads)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        # project back to size of vocabulary
        if self.share_input_output_embed:
            x = F.linear(x, self.embed_tokens.weight)
        else:
            x = F.linear(x, self.embed_out)

        return x, attn

    def reorder_encoder_out(self, encoder_out_dict, new_order):
        if encoder_out_dict['encoder_padding_mask'] is not None:
            encoder_out_dict['encoder_padding_mask'] = \
                encoder_out_dict['encoder_padding_mask'].index_select(0, new_order)
        return encoder_out_dict

    def max_positions(self):
        """Maximum output length supported by the decoder."""
        return self.embed_positions.max_positions()

    def upgrade_state_dict(self, state_dict):
        if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
            if 'decoder.embed_positions.weights' in state_dict:
                del state_dict['decoder.embed_positions.weights']
            if 'decoder.embed_positions._float_tensor' not in state_dict:
                state_dict['decoder.embed_positions._float_tensor'] = torch.FloatTensor()
        return state_dict
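# NOTE: hedged sketch only. ``CreateLayerHistory`` is defined elsewhere in
# this repo and its real behaviour may differ; the toy class below just
# illustrates the clean()/add()/pop() contract the encoders and decoders in
# this file rely on, under the assumption that pop() returns a learned
# weighted combination of every representation added so far (the dense/DLCL
# idea). Names here (_ToyLayerHistory, self.weight, self.states) are
# illustrative, not taken from the repo.
class _ToyLayerHistory(nn.Module):

    def __init__(self, num_layers):
        super().__init__()
        # one weight row per pop() call; row k-1 mixes the k states seen so
        # far (the embedding plus k-1 layer outputs)
        self.weight = nn.Parameter(torch.zeros(num_layers + 1, num_layers + 1))
        self.states = []

    def clean(self):
        self.states = []

    def add(self, x):
        self.states.append(x)

    def pop(self):
        k = len(self.states)
        w = F.softmax(self.weight[k - 1, :k], dim=-1)
        return sum(w[i] * h for i, h in enumerate(self.states))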
class DenseTransformerEncoder(FairseqEncoder):
    """
    Transformer encoder consisting of *args.encoder_layers* layers. Each layer
    is a :class:`TransformerEncoderLayer`.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        dictionary (~fairseq.data.Dictionary): encoding dictionary
        embed_tokens (torch.nn.Embedding): input embedding
        left_pad (bool, optional): whether the input is left-padded
            (default: True).
    """

    def __init__(self, args, dictionary, embed_tokens, left_pad=True):
        super().__init__(dictionary)
        self.dropout = args.dropout

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx
        self.max_source_positions = args.max_source_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = PositionalEmbedding(
            args.max_source_positions, embed_dim, self.padding_idx,
            left_pad=left_pad,
            learned=args.encoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        # create encoder layer history
        self.history = CreateLayerHistory(args, is_encoder=True)

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerEncoderLayer(args)
            for i in range(args.encoder_layers)
        ])
        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.encoder_normalize_before
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)

    def forward(self, src_tokens, src_lengths):
        """
        Args:
            src_tokens (LongTensor): tokens in the source language of shape
                `(batch, src_len)`
            src_lengths (torch.LongTensor): lengths of each source sentence of
                shape `(batch)`

        Returns:
            dict:
                - **encoder_out** (Tensor): the last encoder layer's output of
                  shape `(src_len, batch, embed_dim)`
                - **encoder_padding_mask** (ByteTensor): the positions of
                  padding elements of shape `(batch, src_len)`
        """
        if self.history is not None:
            self.history.clean()

        # embed tokens and positions
        x = self.embed_scale * self.embed_tokens(src_tokens)
        if self.embed_positions is not None:
            x += self.embed_positions(src_tokens)
        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # add emb into history
        if self.history is not None:
            self.history.add(x)

        # compute padding mask
        encoder_padding_mask = src_tokens.eq(self.padding_idx)
        if not encoder_padding_mask.any():
            encoder_padding_mask = None

        # encoder layers
        for layer in self.layers:
            if self.history is not None:
                x = self.history.pop()
            x = layer(x, encoder_padding_mask)
            if self.history is not None:
                self.history.add(x)

        if self.history is not None:
            x = self.history.pop()

        if self.normalize:
            x = self.layer_norm(x)

        return {
            'encoder_out': x,  # T x B x C
            'encoder_padding_mask': encoder_padding_mask,  # B x T
        }
    def reorder_encoder_out(self, encoder_out, new_order):
        """
        Reorder encoder output according to *new_order*.

        Args:
            encoder_out: output from the ``forward()`` method
            new_order (LongTensor): desired order

        Returns:
            *encoder_out* rearranged according to *new_order*
        """
        if encoder_out['encoder_out'] is not None:
            encoder_out['encoder_out'] = \
                encoder_out['encoder_out'].index_select(1, new_order)
        if encoder_out['encoder_padding_mask'] is not None:
            encoder_out['encoder_padding_mask'] = \
                encoder_out['encoder_padding_mask'].index_select(0, new_order)
        return encoder_out

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        if self.embed_positions is None:
            return self.max_source_positions
        return min(self.max_source_positions, self.embed_positions.max_positions())

    def upgrade_state_dict_named(self, state_dict, name):
        """Upgrade a (possibly old) state dict for new versions of fairseq."""
        if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
            weights_key = '{}.embed_positions.weights'.format(name)
            if weights_key in state_dict:
                del state_dict[weights_key]
            state_dict['{}.embed_positions._float_tensor'.format(name)] = torch.FloatTensor(1)
        version_key = '{}.version'.format(name)
        if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) < 2:
            # earlier checkpoints did not normalize after the stack of layers
            self.layer_norm = None
            self.normalize = False
            state_dict[version_key] = torch.Tensor([1])
        return state_dict
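# Usage note (hedged, describing how fairseq-style beam search is expected to
# call ``reorder_encoder_out`` above, not a verified call site): the encoder
# output is time-major (T x B x C), so beam reordering selects along dim 1,
# while the padding mask is batch-major (B x T) and is selected along dim 0.
# For example, a ``new_order`` like ``torch.tensor([0, 0, 1, 1])`` duplicates
# each source sentence once per beam hypothesis before decoding starts.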
class DlclTransformerEncoder(FairseqEncoder):
    """Transformer encoder."""

    def __init__(self, args, dictionary, embed_tokens, left_pad=True):
        super().__init__(dictionary)
        self.dropout = args.dropout

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = PositionalEmbedding(
            1024, embed_dim, self.padding_idx,
            left_pad=left_pad,
            learned=args.encoder_learned_pos,
        )

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerEncoderLayer(args)
            for i in range(args.encoder_layers)
        ])

        self.history = CreateLayerHistory(args, is_encoder=True)

        self.normalize = args.encoder_normalize_before
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)

        self.inspected_grads = OrderedDict() if getattr(args, 'inspect_grad', False) else None

    def forward(self, src_tokens, src_lengths):
        # clean layer history
        self.history.clean()

        # embed tokens and positions
        x = self.embed_scale * self.embed_tokens(src_tokens)
        x += self.embed_positions(src_tokens)
        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        util.inspect_grad("encoder_0", x, self.inspected_grads)
        # push embedding layer into memory
        self.history.add(x)

        # compute padding mask
        encoder_padding_mask = src_tokens.eq(self.padding_idx)
        if not encoder_padding_mask.any():
            encoder_padding_mask = None

        # encoder layers
        for layer_id, layer in enumerate(self.layers):
            # fetch combined input from memory for the next layer
            x = self.history.pop()
            x = layer(x, encoder_padding_mask)
            # push into memory
            self.history.add(x)
            util.inspect_grad("encoder_%d" % (layer_id + 1), x, self.inspected_grads)

        # read from memory
        x = self.history.pop()

        if self.normalize:
            x = self.layer_norm(x)
        util.inspect_grad("encoder_top", x, self.inspected_grads)

        return {
            'encoder_out': x,  # T x B x C
            'encoder_padding_mask': encoder_padding_mask,  # B x T
        }

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        return self.embed_positions.max_positions()

    def upgrade_state_dict(self, state_dict):
        if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
            if 'encoder.embed_positions.weights' in state_dict:
                del state_dict['encoder.embed_positions.weights']
            if 'encoder.embed_positions._float_tensor' not in state_dict:
                state_dict['encoder.embed_positions._float_tensor'] = torch.FloatTensor()
        return state_dict
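# NOTE: hedged sketch only. ``util.inspect_grad`` lives elsewhere in this repo
# and its real implementation is not shown here. Judging from its call sites
# above, it should be a no-op when the container is None and otherwise record
# something about x's gradient under the given name. A minimal, purely
# illustrative stand-in with that shape could look like this (the helper name
# and the "store the gradient norm" choice are assumptions):
def _toy_inspect_grad(name, x, container):
    if container is None or not x.requires_grad:
        return
    # record the gradient norm for x once the backward pass reaches it
    x.register_hook(lambda grad: container.__setitem__(name, grad.norm().item()))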