def __init__(
    self, dictionary, embed_dim=512, hidden_size=512, out_embed_dim=512,
    num_layers=1, dropout_in=0.1, dropout_out=0.1, attention=True,
    encoder_embed_dim=512, encoder_output_units=512, pretrained_embed=None,
    share_input_output_embed=False, adaptive_softmax_cutoff=None,
):
    super().__init__(dictionary)
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    self.need_attn = True
    self.num_layers = num_layers

    self.adaptive_softmax = None
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    else:
        self.embed_tokens = pretrained_embed

    self.encoder_output_units = encoder_output_units
    assert encoder_output_units == hidden_size, \
        'encoder_output_units ({}) != hidden_size ({})'.format(
            encoder_output_units, hidden_size)
    # TODO another Linear layer if not equal

    self.layers = nn.ModuleList([
        LSTMCell(
            input_size=encoder_output_units + embed_dim if layer == 0 else hidden_size,
            hidden_size=hidden_size,
        )
        for layer in range(num_layers)
    ])
    self.attention_layers = nn.ModuleList()
    for i in range(num_layers):
        self.attention_layers.append(
            AttentionLayer(encoder_output_units, hidden_size))
    # self.attention = AttentionLayer(encoder_output_units, hidden_size) if attention else None

    if hidden_size != out_embed_dim:
        self.additional_fc = Linear(hidden_size, out_embed_dim)
    if adaptive_softmax_cutoff is not None:
        # setting adaptive_softmax dropout to dropout_out for now but can be redefined
        self.adaptive_softmax = AdaptiveSoftmax(
            num_embeddings, embed_dim, adaptive_softmax_cutoff,
            dropout=dropout_out)
    elif not self.share_input_output_embed:
        self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
        self.re_fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
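# A minimal sketch (plain PyTorch, not part of the decoder above) of why the
# first LSTMCell takes `encoder_output_units + embed_dim` inputs: at each step
# the attention context is concatenated with the current token embedding.
# All names and sizes below are illustrative.
import torch
import torch.nn as nn

embed_dim, hidden_size, encoder_output_units = 512, 512, 512
cell = nn.LSTMCell(input_size=encoder_output_units + embed_dim,
                   hidden_size=hidden_size)
batch = 4
token_embed = torch.randn(batch, embed_dim)               # current target embedding
attn_context = torch.randn(batch, encoder_output_units)   # attention context
hx = torch.zeros(batch, hidden_size)
cx = torch.zeros(batch, hidden_size)
hx, cx = cell(torch.cat([token_embed, attn_context], dim=1), (hx, cx))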
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
    self.args = args
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    self._future_mask = torch.empty(0)

    self.dropout = args.dropout
    self.decoder_layerdrop = args.decoder_layerdrop
    self.share_input_output_embed = args.share_decoder_input_output_embed

    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    self.embed_dim = embed_dim
    self.output_embed_dim = args.decoder_output_dim

    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

    if not args.adaptive_input and args.quant_noise_pq > 0:
        self.quant_noise = apply_quant_noise_(
            nn.Linear(embed_dim, embed_dim, bias=False),
            args.quant_noise_pq,
            args.quant_noise_pq_block_size,
        )
    else:
        self.quant_noise = None

    self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                           if embed_dim != input_embed_dim else None)

    self.embed_positions = (PositionalEmbedding(
        args.max_target_positions,
        embed_dim,
        self.padding_idx,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None)

    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None

    self.cross_self_attention = getattr(args, "cross_self_attention", False)

    if self.decoder_layerdrop > 0.0:
        self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend([
        self.build_decoder_layer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])
    self.num_layers = len(self.layers)

    if args.decoder_normalize_before and not getattr(args, "no_decoder_final_norm", False):
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None

    self.project_out_dim = (
        Linear(embed_dim, self.output_embed_dim, bias=False)
        if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights
        else None
    )

    self.adaptive_softmax = None
    self.output_projection = None
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            self.output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif self.share_input_output_embed:
        self.output_projection = nn.Linear(
            self.embed_tokens.weight.shape[1],
            self.embed_tokens.weight.shape[0],
            bias=False,
        )
        self.output_projection.weight = self.embed_tokens.weight
    else:
        self.output_projection = nn.Linear(
            self.output_embed_dim, len(dictionary), bias=False)
        nn.init.normal_(self.output_projection.weight, mean=0,
                        std=self.output_embed_dim ** -0.5)
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False,
             left_pad=False, final_norm=True):
    super().__init__(dictionary)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed

    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    output_embed_dim = args.decoder_output_dim

    padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim

    self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) \
        if embed_dim != input_embed_dim else None

    self.embed_positions = PositionalEmbedding(
        args.max_target_positions, embed_dim, padding_idx,
        left_pad=left_pad,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None

    self.layers = nn.ModuleList([])
    self.layers.extend([
        ContextTransformerDecoderLayer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])

    self.adaptive_softmax = None

    self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \
        if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None

    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dictionary), output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)
    self.register_buffer('version', torch.Tensor([2]))
    self.normalize = args.decoder_normalize_before and final_norm
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)
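# The adaptive softmax used throughout these decoders partitions the vocabulary
# into frequency-ordered clusters so rare words get cheaper projections. A
# self-contained sketch using PyTorch's built-in analogue (fairseq's
# AdaptiveSoftmax has a different interface); all sizes here are illustrative.
import torch
import torch.nn as nn

vocab_size, hidden = 50000, 512
criterion = nn.AdaptiveLogSoftmaxWithLoss(
    in_features=hidden,
    n_classes=vocab_size,
    cutoffs=[10000, 20000],  # analogous to adaptive_softmax_cutoff
)
features = torch.randn(8, hidden)           # decoder output features
targets = torch.randint(0, vocab_size, (8,))
out = criterion(features, targets)          # out.loss is the mean NLL
log_probs = criterion.log_prob(features)    # (8, vocab_size) log-probabilities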
class transformer_with_copyDecoder(FairseqIncrementalDecoder):
    """
    transformer_with_copy decoder consisting of *args.decoder_layers* layers.
    Each layer is a :class:`transformer_with_copyDecoderLayer`.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        dictionary (~fairseq.data.Dictionary): decoding dictionary
        embed_tokens (torch.nn.Embedding): output embedding
        no_encoder_attn (bool, optional): whether to attend to encoder outputs
            (default: False).
        final_norm (bool, optional): apply layer norm to the output of the
            final decoder layer (default: True).
    """

    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False,
                 final_norm=True):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        output_embed_dim = args.decoder_output_dim

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim

        self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) \
            if embed_dim != input_embed_dim else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions, embed_dim, padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.layers = nn.ModuleList([])
        self.layers.extend([
            transformer_with_copyDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.copy_attention = MultiheadOnlyAttention(
            embed_dim, 1, dropout=0,
        )
        self.copy_or_generate = nn.Sequential(
            nn.Linear(embed_dim, 1), nn.Sigmoid())

        self.adaptive_softmax = None

        self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \
            if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), output_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)
        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)

    def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None):
        """
        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for input feeding/teacher forcing
            encoder_out (Tensor, optional): output from the encoder, used for
                encoder-side attention
            incremental_state (dict): dictionary used for storing state during
                :ref:`Incremental decoding`

        Returns:
            tuple:
                - the last decoder layer's output of shape
                  `(batch, tgt_len, vocab)`
                - the last decoder layer's attention weights of shape
                  `(batch, tgt_len, src_len)`
        """
        # embed positions
        positions = self.embed_positions(
            prev_output_tokens,
            incremental_state=incremental_state,
        ) if self.embed_positions is not None else None

        if incremental_state is not None:
            prev_output_tokens = prev_output_tokens[:, -1:]
            if positions is not None:
                positions = positions[:, -1:]

        # embed tokens and positions
        x = self.embed_scale * self.embed_tokens(prev_output_tokens)

        if self.project_in_dim is not None:
            x = self.project_in_dim(x)

        if positions is not None:
            x += positions
        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        inner_states = [x]

        # decoder layers
        for layer in self.layers:
            x, _ = layer(
                x,
                encoder_out['encoder_out'] if encoder_out is not None else None,
                encoder_out['encoder_padding_mask'] if encoder_out is not None else None,
                incremental_state,
                self_attn_mask=self.buffered_future_mask(x)
                if incremental_state is None else None,
            )
            inner_states.append(x)

        if self.normalize:
            x = self.layer_norm(x)

        _, copy = self.copy_attention(
            query=x,
            key=encoder_out['encoder_out'] if encoder_out is not None else None,
            value=encoder_out['encoder_out'] if encoder_out is not None else None,
            key_padding_mask=encoder_out['encoder_padding_mask']
            if encoder_out is not None else None,
            incremental_state=incremental_state,
            static_kv=True,
            need_weights=True,
        )
        copy_or_generate = self.copy_or_generate(x).transpose(0, 1)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        if self.project_out_dim is not None:
            x = self.project_out_dim(x)

        if self.adaptive_softmax is None:
            # project back to size of vocabulary
            if self.share_input_output_embed:
                x = F.linear(x, self.embed_tokens.weight)
            else:
                x = F.linear(x, self.embed_out)

        return x, {
            'attn': copy,
            'inner_states': inner_states,
            'copy_or_generate': copy_or_generate,
        }

    def get_normalized_probs(self, net_output, log_probs, sample):
        """Get normalized probabilities (or log probs) from a net's output."""
        if 'net_input' in sample.keys():
            enc_seq_ids = sample['net_input']['src_tokens']
        else:
            enc_seq_ids = sample['src_tokens']
        # wvocab_size = net_output[0].size(2)
        # batch_size = enc_seq_ids.size(0)
        # seq_len = enc_seq_ids.size(1)
        # one_hot = torch.zeros(batch_size, seq_len, wvocab_size).cuda().scatter_(
        #     dim=2, index=enc_seq_ids.unsqueeze(-1), value=1)
        # copy_probs = torch.matmul(net_output[1]['attn'], one_hot)
        # final_dist = vocab_dist.scatter_add(1, encoder_batch_extend_vocab, attn_dist)

        if hasattr(self, 'adaptive_softmax') and self.adaptive_softmax is not None:
            if sample is not None:
                assert 'target' in sample
                target = sample['target']
            else:
                target = None
            out = self.adaptive_softmax.get_log_prob(net_output[0], target=target)
            return out.exp_() if not log_probs else out

        logits = net_output[0]
        generate = utils.softmax(
            logits, dim=-1, onnx_trace=self.onnx_trace
        ) * net_output[1]['copy_or_generate']
        copy = net_output[1]['attn'] * (1 - net_output[1]['copy_or_generate'])
        enc_seq_ids = enc_seq_ids.unsqueeze(1).repeat(
            1, net_output[1]['attn'].size(1), 1)
        final = generate.scatter_add(2, enc_seq_ids, copy)
        if log_probs:
            final = torch.log(final + 1e-15)
        return final

    def max_positions(self):
        """Maximum output length supported by the decoder."""
        if self.embed_positions is None:
            return self.max_target_positions
        return min(self.max_target_positions, self.embed_positions.max_positions())

    def buffered_future_mask(self, tensor):
        dim = tensor.size(0)
        if not hasattr(self, '_future_mask') or self._future_mask is None \
                or self._future_mask.device != tensor.device:
            self._future_mask = torch.triu(
                utils.fill_with_neg_inf(tensor.new(dim, dim)), 1)
        if self._future_mask.size(0) < dim:
            self._future_mask = torch.triu(
                utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1)
        return self._future_mask[:dim, :dim]

    def upgrade_state_dict_named(self, state_dict, name):
        """Upgrade a (possibly old) state dict for new versions of fairseq."""
        if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
            weights_key = '{}.embed_positions.weights'.format(name)
            if weights_key in state_dict:
                del state_dict[weights_key]
            state_dict['{}.embed_positions._float_tensor'.format(name)] = \
                torch.FloatTensor(1)

        for i in range(len(self.layers)):
            # update layer norms
            layer_norm_map = {
                '0': 'self_attn_layer_norm',
                '1': 'encoder_attn_layer_norm',
                '2': 'final_layer_norm',
            }
            for old, new in layer_norm_map.items():
                for m in ('weight', 'bias'):
                    k = '{}.layers.{}.layer_norms.{}.{}'.format(name, i, old, m)
                    if k in state_dict:
                        state_dict['{}.layers.{}.{}.{}'.format(name, i, new, m)] = \
                            state_dict[k]
                        del state_dict[k]

        if utils.item(state_dict.get('{}.version'.format(name),
                                     torch.Tensor([1]))[0]) < 2:
            # earlier checkpoints did not normalize after the stack of layers
            self.layer_norm = None
            self.normalize = False
            state_dict['{}.version'.format(name)] = torch.Tensor([1])

        return state_dict
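# A self-contained sketch (plain PyTorch, illustrative names and sizes) of the
# pointer/copy mixture computed in get_normalized_probs above: the generation
# distribution is gated by p_gen and the copy-attention mass is scattered back
# onto the source token ids with scatter_add, yielding a valid distribution.
import torch

batch, tgt_len, src_len, vocab = 2, 3, 4, 10
gen_logits = torch.randn(batch, tgt_len, vocab)
copy_attn = torch.softmax(torch.randn(batch, tgt_len, src_len), dim=-1)
p_gen = torch.sigmoid(torch.randn(batch, tgt_len, 1))   # copy-or-generate gate
src_tokens = torch.randint(0, vocab, (batch, src_len))

generate = torch.softmax(gen_logits, dim=-1) * p_gen
copy = copy_attn * (1 - p_gen)
index = src_tokens.unsqueeze(1).repeat(1, tgt_len, 1)    # (B, T, S)
final = generate.scatter_add(2, index, copy)
assert torch.allclose(final.sum(-1), torch.ones(batch, tgt_len))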
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
    super().__init__(dictionary)
    self.register_buffer('version', torch.Tensor([3]))

    self.dropout = args.dropout
    self.decoder_layerdrop = args.decoder_layerdrop
    self.share_input_output_embed = args.share_decoder_input_output_embed

    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    self.output_embed_dim = args.decoder_output_dim

    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

    self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) \
        if embed_dim != input_embed_dim else None

    self.embed_positions = PositionalEmbedding(
        args.max_target_positions, embed_dim, self.padding_idx,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None

    self.cross_self_attention = getattr(args, 'cross_self_attention', False)
    self.layer_wise_attention = getattr(args, 'layer_wise_attention', False)

    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerDecoderLayer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])

    self.adaptive_softmax = None

    self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \
        if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None

    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            self.output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dictionary), self.output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5)

    if args.decoder_normalize_before and not getattr(args, 'no_decoder_final_norm', False):
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None

    if getattr(args, 'layernorm_embedding', False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None
def __init__(
    self, dictionary, embed_dim=512, hidden_size=512, out_embed_dim=512,
    num_layers=1, dropout_in=0.1, dropout_out=0.1, encoder_output_units=0,
    attn_type=None, attn_dim=0, need_attn=False, residual=False,
    pretrained_embed=None, share_input_output_embed=False,
    adaptive_softmax_cutoff=None,
    max_target_positions=DEFAULT_MAX_TARGET_POSITIONS,
    scheduled_sampling_rate_scheduler=None,
):
    super().__init__(dictionary)
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    if attn_type is None or attn_type.lower() == 'none':
        # no attention, no encoder output needed (language model case)
        need_attn = False
        encoder_output_units = 0
    self.need_attn = need_attn
    self.residual = residual
    self.max_target_positions = max_target_positions

    self.adaptive_softmax = None
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    else:
        self.embed_tokens = pretrained_embed

    self.encoder_output_units = encoder_output_units

    self.layers = nn.ModuleList([
        LSTMCell(
            input_size=encoder_output_units + (embed_dim if layer == 0 else hidden_size),
            hidden_size=hidden_size,
        )
        for layer in range(num_layers)
    ])

    if attn_type is None or attn_type.lower() == 'none':
        self.attention = None
    elif attn_type.lower() == 'bahdanau':
        self.attention = speech_attention.BahdanauAttention(
            hidden_size, encoder_output_units, attn_dim,
        )
    elif attn_type.lower() == 'luong':
        self.attention = speech_attention.LuongAttention(
            hidden_size, encoder_output_units,
        )
    else:
        raise ValueError('unrecognized attention type.')

    if hidden_size + encoder_output_units != out_embed_dim:
        self.additional_fc = Linear(hidden_size + encoder_output_units, out_embed_dim)

    if adaptive_softmax_cutoff is not None:
        # setting adaptive_softmax dropout to dropout_out for now but can be redefined
        self.adaptive_softmax = AdaptiveSoftmax(
            num_embeddings, hidden_size, adaptive_softmax_cutoff,
            dropout=dropout_out)
    elif not self.share_input_output_embed:
        self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)

    self.scheduled_sampling_rate_scheduler = scheduled_sampling_rate_scheduler
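# A minimal sketch of the Luong-style (dot-product) attention referenced above
# (plain PyTorch, illustrative shapes): score the decoder hidden state against
# the encoder outputs, then concatenate the resulting context with the hidden
# state, which is why additional_fc projects from hidden_size + encoder_output_units.
import torch

batch, src_len, hidden = 2, 5, 512
h = torch.randn(batch, hidden)                 # current decoder hidden state
enc_out = torch.randn(batch, src_len, hidden)  # encoder outputs

scores = torch.bmm(enc_out, h.unsqueeze(2)).squeeze(2)          # (B, S) dot scores
weights = torch.softmax(scores, dim=-1)
context = torch.bmm(weights.unsqueeze(1), enc_out).squeeze(1)   # (B, H)
features = torch.cat([h, context], dim=1)      # input to additional_fc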
class FConvCustomDecoder(FairseqIncrementalDecoder):
    """Convolutional decoder"""

    def __init__(
        self, dictionary, embed_dim=512, embed_dict=None, out_embed_dim=256,
        max_positions=1024, convolutions=((512, 3),) * 20, attention=True,
        dropout=0.1, share_embed=False, positional_embeddings=True,
        adaptive_softmax_cutoff=None, normalization_constant=0.5,
        left_pad=False,
    ):
        super().__init__(dictionary)
        self.register_buffer('version', torch.Tensor([2]))
        self.dropout = dropout
        self.normalization_constant = normalization_constant
        self.left_pad = left_pad

        convolutions = extend_conv_spec(convolutions)
        in_channels = convolutions[0][0]
        if isinstance(attention, bool):
            # expand True into [True, True, ...] and do the same with False
            attention = [attention] * len(convolutions)
        if not isinstance(attention, list) or len(attention) != len(convolutions):
            raise ValueError(
                'Attention is expected to be a list of booleans of '
                'length equal to the number of layers.')

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if embed_dict:
            self.embed_tokens = utils.load_embedding(
                embed_dict, self.dictionary, self.embed_tokens)

        self.embed_positions = PositionalEmbedding(
            max_positions,
            embed_dim,
            padding_idx,
            left_pad=self.left_pad,
        ) if positional_embeddings else None

        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        self.auxattention = nn.ModuleList()
        self.attention = nn.ModuleList()
        self.auxgates = nn.ModuleList()
        self.residuals = []

        layer_in_channels = [in_channels]
        for i, (out_channels, kernel_size, residual) in enumerate(convolutions):
            if residual == 0:
                residual_dim = out_channels
            else:
                residual_dim = layer_in_channels[-residual]
            self.projections.append(
                Linear(residual_dim, out_channels)
                if residual_dim != out_channels else None)
            self.convolutions.append(
                LinearizedConv1d(in_channels, out_channels * 2, kernel_size,
                                 padding=(kernel_size - 1), dropout=dropout))
            self.auxattention.append(
                AttentionLayer(out_channels, embed_dim, self.normalization_constant)
                if attention[i] else None)
            self.attention.append(
                AttentionLayer(out_channels, embed_dim, self.normalization_constant)
                if attention[i] else None)
            self.auxgates.append(
                Gating(gate_dim=out_channels, inputs_dim=out_channels))
            self.residuals.append(residual)
            in_channels = out_channels
            layer_in_channels.append(out_channels)

        self.adaptive_softmax = None
        self.fc2 = self.fc3 = None
        if adaptive_softmax_cutoff is not None:
            assert not share_embed
            self.adaptive_softmax = AdaptiveSoftmax(
                num_embeddings, in_channels, adaptive_softmax_cutoff,
                dropout=dropout)
        else:
            self.fc2 = Linear(in_channels, out_embed_dim)
            if share_embed:
                assert out_embed_dim == embed_dim, \
                    "Shared embed weights implies same dimensions " \
                    "out_embed_dim={} vs embed_dim={}".format(out_embed_dim, embed_dim)
                self.fc3 = nn.Linear(out_embed_dim, num_embeddings)
                self.fc3.weight = self.embed_tokens.weight
            else:
                self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout)

    def forward(self, prev_output_tokens, auxencoder_out_dict=None,
                encoder_out_dict=None, incremental_state=None):
        if auxencoder_out_dict is not None:
            auxencoder_out = auxencoder_out_dict['encoder_out']
            auxencoder_padding_mask = auxencoder_out_dict['encoder_padding_mask']
            # split and transpose aux. encoder outputs
            auxencoder_a, auxencoder_b = self._split_encoder_out(
                auxencoder_out, incremental_state, aux=True)
        if encoder_out_dict is not None:
            encoder_out = encoder_out_dict['encoder_out']
            encoder_padding_mask = encoder_out_dict['encoder_padding_mask']
            # split and transpose encoder outputs
            encoder_a, encoder_b = self._split_encoder_out(
                encoder_out, incremental_state)

        if self.embed_positions is not None:
            pos_embed = self.embed_positions(prev_output_tokens, incremental_state)
        else:
            pos_embed = 0

        if incremental_state is not None:
            prev_output_tokens = prev_output_tokens[:, -1:]
        x = self._embed_tokens(prev_output_tokens, incremental_state)

        # embed tokens and combine with positional embeddings
        x += pos_embed
        x = F.dropout(x, p=self.dropout, training=self.training)
        target_embedding = x

        # project to size of convolution
        x = self.fc1(x)

        # B x T x C -> T x B x C
        x = self._transpose_if_training(x, incremental_state)

        # temporal convolutions
        avg_attn_scores = None
        avg_auxattn_scores = None
        residuals = [x]
        for proj, conv, attention, auxattention, res_layer, auxgate in zip(
                self.projections, self.convolutions, self.attention,
                self.auxattention, self.residuals, self.auxgates):
            if res_layer > 0:
                residual = residuals[-res_layer]
                residual = residual if proj is None else proj(residual)
            else:
                residual = None

            x = F.dropout(x, p=self.dropout, training=self.training)
            x = conv(x, incremental_state)
            x = F.glu(x, dim=2)

            # auxiliary attention
            acx = None
            if auxattention is not None:
                x = self._transpose_if_training(x, incremental_state)
                acx, auxattn_scores = auxattention(
                    x, target_embedding, (auxencoder_a, auxencoder_b),
                    auxencoder_padding_mask)
                auxattn_scores = auxattn_scores / len(self.auxattention)
                if avg_auxattn_scores is None:
                    avg_auxattn_scores = auxattn_scores
                else:
                    avg_auxattn_scores.add_(auxattn_scores)
                x = self._transpose_if_training(x, incremental_state)

            # attention
            if attention is not None:
                x = self._transpose_if_training(x, incremental_state)
                cx, attn_scores = attention(
                    x, target_embedding, (encoder_a, encoder_b),
                    encoder_padding_mask)
                attn_scores = attn_scores / len(self.attention)
                if avg_attn_scores is None:
                    avg_attn_scores = attn_scores
                else:
                    avg_attn_scores.add_(attn_scores)
                if acx is not None:
                    auxgt = auxgate(decoder_state=x, attn=cx)
                x = (x + cx) * math.sqrt(self.normalization_constant)
                if acx is not None:
                    x = (x + auxgt * acx) * math.sqrt(self.normalization_constant)
                x = self._transpose_if_training(x, incremental_state)

            # residual
            if residual is not None:
                x = (x + residual) * math.sqrt(self.normalization_constant)
            residuals.append(x)

        # T x B x C -> B x T x C
        x = self._transpose_if_training(x, incremental_state)

        # project back to size of vocabulary if not using adaptive softmax
        if self.fc2 is not None and self.fc3 is not None:
            x = self.fc2(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
            x = self.fc3(x)

        return x, avg_attn_scores

    def get_normalized_probs(self, net_output, log_probs, sample):
        """Get normalized probabilities (or log probs) from a net's output."""
        if self.adaptive_softmax is not None:
            assert sample is not None and 'target' in sample
            out = self.adaptive_softmax.get_log_prob(net_output[0], sample['target'])
            return out.exp_() if not log_probs else out
        else:
            return super().get_normalized_probs(net_output, log_probs, sample)

    def reorder_incremental_state(self, incremental_state, new_order):
        super().reorder_incremental_state(incremental_state, new_order)
        auxencoder_out = utils.get_incremental_state(
            self, incremental_state, 'auxencoder_out')
        encoder_out = utils.get_incremental_state(
            self, incremental_state, 'encoder_out')
        if auxencoder_out is not None:
            auxencoder_out = tuple(
                aeo.index_select(0, new_order) for aeo in auxencoder_out)
            utils.set_incremental_state(
                self, incremental_state, 'auxencoder_out', auxencoder_out)
        if encoder_out is not None:
            encoder_out = tuple(
                eo.index_select(0, new_order) for eo in encoder_out)
            utils.set_incremental_state(
                self, incremental_state, 'encoder_out', encoder_out)

    def reorder_encoder_out(self, encoder_out_dict, new_order):
        if encoder_out_dict and encoder_out_dict['encoder_padding_mask'] is not None:
            encoder_out_dict['encoder_padding_mask'] = \
                encoder_out_dict['encoder_padding_mask'].index_select(0, new_order)
        return encoder_out_dict

    def reorder_auxencoder_out(self, auxencoder_out_dict, new_order):
        if auxencoder_out_dict and auxencoder_out_dict['encoder_padding_mask'] is not None:
            auxencoder_out_dict['encoder_padding_mask'] = \
                auxencoder_out_dict['encoder_padding_mask'].index_select(0, new_order)
        return auxencoder_out_dict

    def max_positions(self):
        """Maximum output length supported by the decoder."""
        return self.embed_positions.max_positions() \
            if self.embed_positions is not None else float('inf')

    def upgrade_state_dict(self, state_dict):
        if state_dict.get('decoder.version', torch.Tensor([1]))[0] < 2:
            # old models use incorrect weight norm dimension
            for i, conv in enumerate(self.convolutions):
                # reconfigure weight norm
                nn.utils.remove_weight_norm(conv)
                self.convolutions[i] = nn.utils.weight_norm(conv, dim=0)
            state_dict['decoder.version'] = torch.Tensor([1])
        return state_dict

    def _embed_tokens(self, tokens, incremental_state):
        if incremental_state is not None:
            # keep only the last token for incremental forward pass
            tokens = tokens[:, -1:]
        return self.embed_tokens(tokens)

    def _split_encoder_out(self, encoder_out, incremental_state, aux=False):
        """Split and transpose encoder outputs.

        This is cached when doing incremental inference.
        """
        state_name = 'encoder_out'
        if aux:
            state_name = 'aux' + state_name
        cached_result = utils.get_incremental_state(
            self, incremental_state, state_name)
        if cached_result is not None:
            return cached_result

        # transpose only once to speed up attention layers
        encoder_a, encoder_b = encoder_out
        encoder_a = encoder_a.transpose(1, 2).contiguous()
        result = (encoder_a, encoder_b)

        if incremental_state is not None:
            utils.set_incremental_state(
                self, incremental_state, state_name, result)
        return result

    def _transpose_if_training(self, x, incremental_state):
        if incremental_state is None:
            x = x.transpose(0, 1)
        return x
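# A minimal sketch (plain PyTorch, illustrative sizes) of the building block
# used by the convolutional decoder above: a causal 1-D convolution whose
# doubled channel output is split by a GLU, halving the channels back. The
# stock nn.Conv1d stands in for fairseq's LinearizedConv1d.
import torch
import torch.nn.functional as F

batch, t, in_channels, out_channels, kernel_size = 2, 7, 512, 512, 3
conv = torch.nn.Conv1d(in_channels, out_channels * 2, kernel_size,
                       padding=kernel_size - 1)  # left-pad for causality
x = torch.randn(batch, in_channels, t)
y = conv(x)[:, :, :t]   # trim the right overhang -> strictly causal
y = F.glu(y, dim=1)     # (B, out_channels, T); the decoder uses dim=2 on T x B x C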
def __init__(self, args, dictionary, embed_tokens, embed_scale=None,
             no_encoder_attn=False, left_pad=False, final_norm=True):
    super().__init__(dictionary)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed

    input_embed_dim = embed_tokens.embedding_dim
    self.embed_dim = args.decoder_embed_dim
    self.output_embed_dim = args.decoder_output_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.dropout_module = FairseqDropout(
        args.dropout, module_name=self.__class__.__name__)

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(self.embed_dim) if embed_scale is None else embed_scale

    self.project_in_dim = nn.Linear(input_embed_dim, self.embed_dim, bias=False) \
        if self.embed_dim != input_embed_dim else None
    # self.project_hid_dim = nn.Linear(self.embed_dim, self.embed_dim, bias=True)

    self.embed_positions = PositionalEmbedding(
        args.max_target_positions,
        self.embed_dim,
        self.padding_idx,
        learned=args.decoder_learned_pos,
    ) if not args.no_dec_token_positional_embeddings else None

    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(self.embed_dim)
    else:
        self.layernorm_embedding = None

    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerDecoderLayer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])

    self.adaptive_softmax = None

    if args.decoder_normalize_before and not getattr(args, "no_decoder_final_norm", False):
        self.layer_norm = LayerNorm(self.embed_dim)
    else:
        self.layer_norm = None

    self.project_out_dim = (
        nn.Linear(self.embed_dim, self.output_embed_dim, bias=False)
        if self.embed_dim != self.output_embed_dim and not args.tie_adaptive_weights
        else None
    )

    self.load_softmax = not getattr(args, 'remove_head', False)
    if self.load_softmax:
        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.output_projection = nn.Linear(
                self.output_embed_dim, len(dictionary), bias=False)
            nn.init.normal_(
                self.output_projection.weight, mean=0,
                std=self.output_embed_dim ** -0.5)
        else:
            self.output_projection = nn.Linear(
                self.embed_tokens.weight.shape[1],
                self.embed_tokens.weight.shape[0],
                bias=False,
            )
            self.output_projection.weight = self.embed_tokens.weight
    self.register_buffer('version', torch.Tensor([2]))
def build_single_decoder(args, src_dict, dst_dict, ngram_decoder=None,
                         project_output=True, is_lm=False):
    if args.adaptive_softmax_cutoff is not None:
        project_output = False
    attention_type = args.attention_type
    encoder_hidden_dim = args.encoder_hidden_dim
    if is_lm:
        attention_type = "no"
        encoder_hidden_dim = 0
    if ngram_decoder:
        if args.ngram_activation_type == "relu":
            activation_fn = nn.ReLU
        elif args.ngram_activation_type == "tanh":
            activation_fn = nn.Tanh
        else:
            raise Exception(
                "ngram_activation_type '%s' not implemented"
                % args.ngram_activation_type)
        decoder = NGramDecoder(
            src_dict=src_dict,
            dst_dict=dst_dict,
            n=ngram_decoder,
            encoder_hidden_dim=encoder_hidden_dim,
            embed_dim=args.decoder_embed_dim,
            freeze_embed=args.decoder_freeze_embed,
            out_embed_dim=args.decoder_out_embed_dim,
            num_layers=args.decoder_layers,
            hidden_dim=args.decoder_hidden_dim,
            attention_type=attention_type,
            dropout_in=args.decoder_dropout_in,
            dropout_out=args.decoder_dropout_out,
            residual_level=args.residual_level,
            activation_fn=activation_fn,
            project_output=project_output,
            pretrained_embed=args.decoder_pretrained_embed,
            projection_pretrained_embed=args.decoder_out_pretrained_embed,
        )
    else:
        decoder = RNNDecoder(
            src_dict=src_dict,
            dst_dict=dst_dict,
            vocab_reduction_params=args.vocab_reduction_params,
            encoder_hidden_dim=encoder_hidden_dim,
            embed_dim=args.decoder_embed_dim,
            freeze_embed=args.decoder_freeze_embed,
            out_embed_dim=args.decoder_out_embed_dim,
            cell_type=args.cell_type,
            num_layers=args.decoder_layers,
            hidden_dim=args.decoder_hidden_dim,
            attention_type=attention_type,
            dropout_in=args.decoder_dropout_in,
            dropout_out=args.decoder_dropout_out,
            residual_level=args.residual_level,
            averaging_encoder=args.averaging_encoder,
            project_output=project_output,
            pretrained_embed=args.decoder_pretrained_embed,
            projection_pretrained_embed=args.decoder_out_pretrained_embed,
            tie_embeddings=args.decoder_tie_embeddings,
            att_weighted_src_embeds=args.att_weighted_src_embeds,
            src_embed_dim=args.encoder_embed_dim,
            att_weighted_activation_type=args.att_weighted_activation_type,
        )
    # Being able to use adaptive softmax for RNN decoder
    decoder.adaptive_softmax = None
    if args.adaptive_softmax_cutoff is not None:
        decoder.adaptive_softmax = AdaptiveSoftmax(
            len(dst_dict),
            args.decoder_out_embed_dim or args.decoder_hidden_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.dropout,
        )
    return decoder
def __init__(self, args, dictionary, embed_tokens, left_pad=False, final_norm=True):
    super().__init__(dictionary)
    self.padding_idx = embed_tokens.padding_idx
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_input_output_embed

    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.embed_dim
    output_embed_dim = args.output_dim

    padding_idx = embed_tokens.padding_idx

    self.embed_tokens = embed_tokens
    # self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim
    self.max_positions = args.max_positions + 1

    self.embed_segment = nn.Embedding(
        args.num_segment, embed_dim, self.padding_idx,
    ) if args.num_segment > 0 else None

    self.project_in_dim = nn.Linear(input_embed_dim, embed_dim, bias=False) \
        if embed_dim != input_embed_dim else None

    self.prediction_word_embedding = nn.Parameter(
        torch.Tensor(1, 1, embed_dim).zero_())

    self.embed_positions = PositionalEmbedding(
        self.max_positions, embed_dim, padding_idx,
        left_pad=left_pad,
    ) if not args.no_token_positional_embeddings else None

    def make_layers(args, layers, needs_key_values):
        if args.universal:
            layers = [ShuffleTransformerDecoderLayer(
                args, needs_key_values=needs_key_values)] * layers
        else:
            layers = [
                ShuffleTransformerDecoderLayer(args, needs_key_values=needs_key_values)
                for _ in range(layers)
            ]
        return nn.ModuleList(layers)

    self.stacked_decoder = args.stacked_decoder
    self.encoder_layers = make_layers(args, args.encoder_layers, needs_key_values=True)
    self.decoder_layers = make_layers(args, args.decoder_layers, needs_key_values=False) \
        if args.asymmetric else self.encoder_layers

    if not args.stacked_decoder and args.encoder_layers != args.decoder_layers:
        raise ValueError(
            "If not using stacked-decoder, encoder and decoder must have "
            "the same number of layers")
    if not args.asymmetric and args.encoder_layers != args.decoder_layers:
        raise ValueError(
            "If not using asymmetric, encoder and decoder must have "
            "the same number of layers")

    if args.relative_position == 'sinusoidal':
        num_positions = self.max_positions
        sinusoidal_positions = SinusoidalPositionalEmbedding.get_embedding(
            num_positions, args.embed_dim // args.attention_heads)
        sinusoidal_relative_positions = []
        for i in range(num_positions):
            sinusoidal_relative_positions.append(
                torch.cat([
                    sinusoidal_positions[num_positions - i:],
                    sinusoidal_positions[:num_positions - i],
                ], 0))
        # Make sentinel token have same relative position to everything
        sinusoidal_relative_positions[-1][0] = 0
        assert sinusoidal_relative_positions[-1].size() == sinusoidal_positions.size()
        sinusoidal_relative_positions = torch.stack(sinusoidal_relative_positions, 0)
        self.sinusoidal_relative_positions = nn.Parameter(sinusoidal_relative_positions)
        assert sinusoidal_relative_positions.size() == (
            num_positions, num_positions, args.embed_dim // args.attention_heads)
        # assert (sinusoidal_relative_positions[0] == sinusoidal_positions).all()
        assert (sinusoidal_relative_positions[7, 7]
                == sinusoidal_relative_positions[11, 11]).all()
        assert (sinusoidal_relative_positions[5, 11]
                == sinusoidal_relative_positions[6, 12]).all()
    else:
        self.sinusoidal_relative_positions = None

    self.adaptive_softmax = None
    self.project_out_dim = nn.Linear(embed_dim, output_embed_dim, bias=False) \
        if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None

    self.load_softmax = not getattr(args, 'remove_head', False)
    if self.load_softmax:
        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), output_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)
    # if args.sentence_class_num > 0:
    #     self.sentence_projection_layer = Linear(
    #         embed_dim, args.sentence_class_num, bias=False)

    self.normalize = args.normalize_before and final_norm
    if self.normalize:
        self.layer_norm = BertLayerNorm(embed_dim)
    self.apply(self.init_bert_weights)
def __init__(
    self, dictionary, rnn_type="lstm", embed_dim=512, hidden_size=512,
    out_embed_dim=512, num_layers=1, dropout_in=0.1, dropout_out=0.1,
    attention_type="luong-dot", encoder_output_units=512,
    pretrained_embed=None, share_input_output_embed=False,
    adaptive_softmax_cutoff=None,
    max_target_positions=DEFAULT_MAX_TARGET_POSITIONS,
    residuals=False,
):
    super().__init__(dictionary)
    self.dropout_in_module = FairseqDropout(
        dropout_in, module_name=self.__class__.__name__)
    self.dropout_out_module = FairseqDropout(
        dropout_out, module_name=self.__class__.__name__)
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    self.need_attn = True
    self.max_target_positions = max_target_positions
    self.residuals = residuals
    self.num_layers = num_layers

    self.adaptive_softmax = None
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = torch.nn.Embedding(num_embeddings, embed_dim, padding_idx)
    else:
        self.embed_tokens = pretrained_embed

    self.encoder_output_units = encoder_output_units
    if encoder_output_units != hidden_size and encoder_output_units != 0:
        self.encoder_hidden_proj = torch.nn.Linear(encoder_output_units, hidden_size)
        self.encoder_cell_proj = torch.nn.Linear(encoder_output_units, hidden_size)
    else:
        self.encoder_hidden_proj = self.encoder_cell_proj = None

    # input feeding is described in arxiv.org/abs/1508.04025
    input_feed_size = 0 if encoder_output_units == 0 else hidden_size
    # For Bahdanau, we compute the context on the input feed
    bahd_factor = hidden_size \
        if attention_type in ["bahdanau-dot", "bahdanau-concat",
                              "bahdanau-general", "bahdanau"] \
        else 0
    self.rnn_type = rnn_type
    if rnn_type == "lstm":
        self.layers = LSTM(
            input_size=input_feed_size + embed_dim + bahd_factor,
            hidden_size=hidden_size,
            num_layers=num_layers,
        )
    else:
        self.layers = GRU(
            input_size=input_feed_size + embed_dim + bahd_factor,
            hidden_size=hidden_size,
            num_layers=num_layers,
        )

    if attention_type == "none":
        self.attention_type = "none"
        self.attention = None
    else:
        self.attention_type = attention_type
        self.attention = Attention(self.attention_type, hidden_size)

    if hidden_size != out_embed_dim:
        self.additional_fc = torch.nn.Linear(hidden_size, out_embed_dim)

    if adaptive_softmax_cutoff is not None:
        # setting adaptive_softmax dropout to dropout_out for now but can be redefined
        self.adaptive_softmax = AdaptiveSoftmax(
            num_embeddings, hidden_size, adaptive_softmax_cutoff,
            dropout=dropout_out,
        )
    elif not self.share_input_output_embed:
        self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
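# A minimal sketch of input feeding (arxiv.org/abs/1508.04025) as referenced
# above: the previous step's attentional hidden state is concatenated with the
# current token embedding before entering the RNN, which is why the RNN's
# input_size includes `input_feed_size`. Plain PyTorch; names are illustrative.
import torch
import torch.nn as nn

embed_dim, hidden = 512, 512
rnn = nn.LSTM(input_size=embed_dim + hidden, hidden_size=hidden, num_layers=1)
batch = 4
input_feed = torch.zeros(1, batch, hidden)   # previous attentional state
state = None
for step_embed in torch.randn(3, 1, batch, embed_dim):  # 3 decoding steps
    rnn_in = torch.cat([step_embed, input_feed], dim=2)
    out, state = rnn(rnn_in, state)
    input_feed = out   # a real decoder would pass this through attention first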
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))

    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed

    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    self.output_embed_dim = args.decoder_output_dim

    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim

    self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                           if embed_dim != input_embed_dim else None)

    self.embed_positions = (PositionalEmbedding(
        args.max_target_positions, embed_dim, self.padding_idx,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None)

    self.cross_self_attention = getattr(args, "cross_self_attention", False)
    self.layer_wise_attention = getattr(args, "layer_wise_attention", False)

    # We combine the q/k/v flows of decoder self-attention with the q flow
    # of encoder-decoder attention. So there are four flows in total.
    self.proj_bias_selfattn = nn.Parameter(
        torch.randn(4, args.decoder_layers, embed_dim))
    # The remaining k/v flows of encoder-decoder attention.
    self.proj_bias_encattn_kv = nn.Parameter(
        torch.randn(2, args.decoder_layers, embed_dim))

    multiplier = 2
    max_seq_len = 255  # 255 or 490
    self.register_buffer("time_dt", torch.tensor(0.01))
    self.register_buffer("max_seq_len", torch.tensor(max_seq_len))
    # self.register_buffer("max_seq_len", torch.tensor(490))
    # self.register_buffer("update_flow_every", torch.tensor(100))
    self.register_buffer("update_flow_every", torch.tensor(400))
    self.register_buffer("k_updates", torch.tensor(0))
    self.register_buffer(
        "cached_bias_selfattn",
        torch.randn(max_seq_len, 4, args.decoder_layers, embed_dim))
    self.register_buffer(
        "cached_bias_encattn",
        torch.randn(max_seq_len, 2, args.decoder_layers, embed_dim))
    self.proj_flow_selfattn = flow_func_linear(
        args.decoder_layers, multiplier, embed_dim)
    self.proj_flow_encattn_kv = flow_func_linear(
        args.decoder_layers, multiplier, embed_dim)
    # self.proj_flow_selfattn = id_flow(args.decoder_layers, multiplier, embed_dim)
    # self.proj_flow_encattn_kv = id_flow(args.decoder_layers, multiplier, embed_dim)

    self.layers = nn.ModuleList([])
    self.layers.extend([
        FlowTransformerDecoderLayer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])

    self.adaptive_softmax = None

    self.project_out_dim = (Linear(embed_dim, self.output_embed_dim, bias=False)
                            if embed_dim != self.output_embed_dim
                            and not args.tie_adaptive_weights else None)

    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            self.output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dictionary), self.output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5)

    if args.decoder_normalize_before and not getattr(args, "no_decoder_final_norm", False):
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None

    self.ode_args = {
        "method": "rk4",
        "options": {"step_size": self.time_dt.item() / 5.0},
    }
    self.reset_parameters()
class TransformerDecoder(FairseqIncrementalDecoder):
    """
    Transformer decoder consisting of *args.decoder_layers* layers. Each layer
    is a :class:`TransformerDecoderLayer`.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        dictionary (~fairseq.data.Dictionary): decoding dictionary
        embed_tokens (torch.nn.Embedding): output embedding
        no_encoder_attn (bool, optional): whether to attend to encoder outputs
            (default: False).
        final_norm (bool, optional): apply layer norm to the output of the
            final decoder layer (default: True).
    """

    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False,
                 final_norm=True):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        self.output_embed_dim = args.decoder_output_dim

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim

        self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) \
            if embed_dim != input_embed_dim else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions, embed_dim, padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \
            if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), self.output_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5)
        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)
        self.onnx_trace = False

        self.decoder_max_order = args.decoder_max_order
        self.clamp_value = getattr(args, 'clamp_value', 0.01)
        self.gs_clamp = args.gs_clamp

    def set_perm_order(self, perm_order=0):
        assert isinstance(perm_order, int) and 0 <= perm_order <= 5
        for layer in self.layers:
            layer.set_perm_order(perm_order)

    def forward(self, prev_output_tokens, encoder_out=None,
                incremental_state=None, **unused):
        """
        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for input feeding/teacher forcing
            encoder_out (Tensor, optional): output from the encoder, used for
                encoder-side attention
            incremental_state (dict): dictionary used for storing state during
                :ref:`Incremental decoding`

        Returns:
            tuple:
                - the decoder's output of shape `(batch, tgt_len, vocab)`
                - a dictionary with any model-specific outputs
        """
        x, extra = self.extract_features(prev_output_tokens, encoder_out,
                                         incremental_state)
        x = self.output_layer(x, encoder_out)
        return x, extra

    def extract_features(self, prev_output_tokens, encoder_out=None,
                         incremental_state=None, **unused):
        """
        Similar to *forward* but only return features.

        Returns:
            tuple:
                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
                - a dictionary with any model-specific outputs
        """
        # embed positions
        positions = self.embed_positions(
            prev_output_tokens,
            incremental_state=incremental_state,
        ) if self.embed_positions is not None else None

        if incremental_state is not None:
            prev_output_tokens = prev_output_tokens[:, -1:]
            if positions is not None:
                positions = positions[:, -1:]

        # embed tokens and positions
        x = self.embed_scale * self.embed_tokens(prev_output_tokens)

        if self.project_in_dim is not None:
            x = self.project_in_dim(x)

        if positions is not None:
            x += positions
        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)
        attn = None

        inner_states = [x]

        # decoder layers
        for layer in self.layers:
            x, attn = layer(
                x,
                encoder_out['encoder_out'] if encoder_out is not None else None,
                encoder_out['encoder_padding_mask'] if encoder_out is not None else None,
                incremental_state,
                self_attn_mask=self.buffered_future_mask(x)
                if incremental_state is None else None,
            )
            inner_states.append(x)

        if self.normalize:
            x = self.layer_norm(x)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        if self.project_out_dim is not None:
            x = self.project_out_dim(x)

        return x, {'attn': attn, 'inner_states': inner_states}

    def output_layer(self, features, encoder_out, **kwargs):
        """Project features to the vocabulary size."""
        if self.adaptive_softmax is None:
            # project back to size of vocabulary
            if self.share_input_output_embed:
                return [
                    F.linear(features, self.embed_tokens.weight),
                    encoder_out['encoder_pred_order'],
                ]
            else:
                return F.linear(features, self.embed_out)
        else:
            return features

    def max_positions(self):
        """Maximum output length supported by the decoder."""
        if self.embed_positions is None:
            return self.max_target_positions
        return min(self.max_target_positions, self.embed_positions.max_positions())

    def buffered_future_mask(self, tensor):
        dim = tensor.size(0)
        if not hasattr(self, '_future_mask') or self._future_mask is None \
                or self._future_mask.device != tensor.device:
            self._future_mask = torch.triu(
                utils.fill_with_neg_inf(tensor.new(dim, dim)), 1)
        if self._future_mask.size(0) < dim:
            self._future_mask = torch.triu(
                utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1)
        return self._future_mask[:dim, :dim]

    def upgrade_state_dict_named(self, state_dict, name):
        """Upgrade a (possibly old) state dict for new versions of fairseq."""
        if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
            weights_key = '{}.embed_positions.weights'.format(name)
            if weights_key in state_dict:
                del state_dict[weights_key]
            state_dict['{}.embed_positions._float_tensor'.format(name)] = \
                torch.FloatTensor(1)

        for i in range(len(self.layers)):
            # update layer norms
            layer_norm_map = {
                '0': 'self_attn_layer_norm',
                '1': 'encoder_attn_layer_norm',
                '2': 'final_layer_norm',
            }
            for old, new in layer_norm_map.items():
                for m in ('weight', 'bias'):
                    k = '{}.layers.{}.layer_norms.{}.{}'.format(name, i, old, m)
                    if k in state_dict:
                        state_dict['{}.layers.{}.{}.{}'.format(name, i, new, m)] = \
                            state_dict[k]
                        del state_dict[k]

        if utils.item(state_dict.get('{}.version'.format(name),
                                     torch.Tensor([1]))[0]) < 2:
            # earlier checkpoints did not normalize after the stack of layers
            self.layer_norm = None
            self.normalize = False
            state_dict['{}.version'.format(name)] = torch.Tensor([1])

        return state_dict

    def get_normalized_probs(self, net_output, log_probs, sample,
                             gs_tau=0.5, gs_hard=False):
        """Get normalized probabilities (or log probs) from a net's output."""
        if hasattr(self, 'adaptive_softmax') and self.adaptive_softmax is not None:
            if sample is not None:
                assert 'target' in sample
                target = sample['target']
            else:
                target = None
            out = self.adaptive_softmax.get_log_prob(net_output[0], target=target)
            return out.exp_() if not log_probs else out

        logits = net_output[0][0]
        orders = net_output[0][1]
        if log_probs:
            return (utils.log_softmax(logits, dim=-1, onnx_trace=self.onnx_trace),
                    *self.gumbel_softmax(orders, gs_tau=gs_tau, gs_hard=gs_hard, dim=-1))
        else:
            return (utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace),
                    *self.gumbel_softmax(orders, gs_tau=gs_tau, gs_hard=gs_hard, dim=-1))

    def gumbel_softmax(self, logits, gs_tau=0.5, gs_hard=False, dim=-1):
        if not gs_hard:
            prob = utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace)
            prob_clamp = torch.clamp(
                prob, self.clamp_value,
                1. - (self.decoder_max_order - 1) * self.clamp_value)
            logprob = torch.log(prob_clamp if self.gs_clamp else prob)
            gs = F.gumbel_softmax(logprob, tau=gs_tau, hard=False)
        else:
            prob = utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace)
            prob_clamp = torch.clamp(
                prob, self.clamp_value,
                1. - (self.decoder_max_order - 1) * self.clamp_value)
            max_idx = torch.argmax(logits, -1, keepdim=True)
            one_hot = logits.new_zeros(logits.size())
            gs = one_hot.scatter(-1, max_idx, 1)
        return gs, prob, prob_clamp
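# A minimal sketch (plain PyTorch, illustrative sizes) of the clamped
# Gumbel-softmax relaxation in gumbel_softmax above: probabilities are clamped
# away from 0 and 1 so their log stays finite before Gumbel noise is applied.
import torch
import torch.nn.functional as F

decoder_max_order, clamp_value, gs_tau = 6, 0.01, 0.5
logits = torch.randn(4, decoder_max_order)
prob = torch.softmax(logits, dim=-1)
prob_clamp = torch.clamp(prob, clamp_value,
                         1. - (decoder_max_order - 1) * clamp_value)
gs = F.gumbel_softmax(torch.log(prob_clamp), tau=gs_tau, hard=False)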
def __init__(
    self, dictionary, embed_dim=512, hidden_size=512, out_embed_dim=512,
    num_layers=1, dropout_in=0.1, dropout_out=0.1, num_attentions=1,
    encoder_output_units=512, pretrained_embed=None,
    share_input_output_embed=False, adaptive_softmax_cutoff=None,
    max_target_positions=DEFAULT_MAX_TARGET_POSITIONS,
    token_map=None, granularity_flags=None, double_learning=False,
):
    super().__init__(dictionary)
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    self.need_attn = True
    self.max_target_positions = max_target_positions
    self.token2components_map = token_map
    self.token_sequences = granularity_flags[0] if granularity_flags is not None else False
    self.char_sequences = granularity_flags[1] if granularity_flags is not None else False
    self.g_id = 'char' if ((not self.token_sequences) and self.char_sequences) else 'token'
    self.merge_flag = False
    self.double_learning = double_learning

    self.adaptive_softmax = None
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    else:
        self.embed_tokens = pretrained_embed

    self.encoder_output_units = encoder_output_units
    if encoder_output_units != hidden_size and encoder_output_units != 0:
        self.encoder_hidden_proj = Linear(encoder_output_units, hidden_size)
        self.encoder_cell_proj = Linear(encoder_output_units, hidden_size)
    else:
        self.encoder_hidden_proj = self.encoder_cell_proj = None

    # disable input feeding if there is no encoder
    # input feeding is described in arxiv.org/abs/1508.04025
    input_feed_size = 0 if encoder_output_units == 0 else hidden_size * num_attentions
    total_embed_size = 2 * embed_dim if self.token_sequences and self.char_sequences \
        else embed_dim
    self.layers = nn.ModuleList([
        LSTMCell(
            input_size=input_feed_size + total_embed_size if layer == 0 else hidden_size,
            hidden_size=hidden_size,
        )
        for layer in range(num_layers)
    ])

    self.num_attentions = num_attentions
    self.attentions = nn.ModuleList()
    for i in range(num_attentions):
        # TODO make bias configurable
        query_size = hidden_size
        key_size = encoder_output_units if i == 0 else out_embed_dim
        value_size = hidden_size
        self.attentions.append(
            AttentionLayer(query_size, key_size, value_size, bias=False))

    if self.double_learning or hidden_size != out_embed_dim:
        if hidden_size != out_embed_dim:
            self.tk_additional_fc = Linear(hidden_size, out_embed_dim)
        if self.double_learning:
            # input: char embed, $c_{i-1}$ (input-feed) => $h_i$
            self.char_rnn = LSTMCell(input_size=hidden_size, hidden_size=hidden_size)
            # alternative: Linear(embed_dim + hidden_size, hidden_size)
            # input: $h_i$, x => $c_i$
            self.char2tok_att = AttentionLayer(
                hidden_size, hidden_size, hidden_size, bias=False)
            if hidden_size != out_embed_dim:
                # input: $c_i$ => $o_i$
                self.char_out = Linear(hidden_size, out_embed_dim)

    if adaptive_softmax_cutoff is not None:
        # setting adaptive_softmax dropout to dropout_out for now but can be redefined
        self.adaptive_softmax = AdaptiveSoftmax(
            num_embeddings, hidden_size, adaptive_softmax_cutoff,
            dropout=dropout_out)
    elif not self.share_input_output_embed:
        self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
def __init__(self, args, dictionary, embed_tokens, classification_head=None):
    super().__init__(dictionary)
    self.onnx_trace = False
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed
    self.embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.self_target = args.self_target
    self.future_target = args.future_target
    self.past_target = args.past_target
    self.char_inputs = args.char_inputs

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(self.embed_dim)

    self.embed_positions = PositionalEmbedding(
        args.max_target_positions, self.embed_dim, self.padding_idx,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None

    self.forward_layers = nn.ModuleList([
        TransformerDecoderLayer(
            args,
            no_encoder_attn=True,
            add_bias_kv=not args.no_bias_kv,
            add_zero_attn=args.no_bias_kv,
        )
        for _ in range(args.decoder_layers)
    ])
    self.backward_layers = nn.ModuleList([
        TransformerDecoderLayer(
            args,
            no_encoder_attn=True,
            add_bias_kv=not args.no_bias_kv,
            add_zero_attn=args.no_bias_kv,
        )
        for _ in range(args.decoder_layers)
    ])

    self.full_attn_layer = None
    self.full_linear_layer = None
    if self.self_target:
        if args.linear_final_layer:
            self.full_linear_layer = Linear(
                self.embed_dim * 2, self.embed_dim, args.linear_final_layer_bias)
        else:
            self.full_attn_layer = BidirectionalTransformerDecoderLayer(args)

    self.load_softmax = not getattr(args, 'remove_head', False)
    self.embed_out = None
    self.adaptive_softmax = None
    self.classification_head = classification_head

    if self.load_softmax:
        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                args.decoder_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), self.embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=self.embed_dim ** -0.5)
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): self.args = args super().__init__(dictionary) self.register_buffer("version", torch.Tensor([3])) self._future_mask = torch.empty(0) self.dropout = args.dropout self.decoder_layerdrop = args.decoder_layerdrop self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.embed_dim = embed_dim self.output_embed_dim = args.decoder_output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt( embed_dim) self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None) self.embed_positions = (PositionalEmbedding( args.max_target_positions, embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None) self.cross_self_attention = getattr(args, "cross_self_attention", False) self.layer_wise_attention = getattr(args, "layer_wise_attention", False) self.layers = nn.ModuleList([]) self.layers.extend([ self.build_decoder_layer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.num_layers = len(self.layers) self.adaptive_softmax = None self.project_out_dim = (Linear( embed_dim, self.output_embed_dim, bias=False) if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None) if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim**-0.5) if args.decoder_normalize_before and not getattr( args, "no_decoder_final_norm", False): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, "layernorm_embedding", False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None self.n_experts = 1 self.nhidlast = self.embed_dim self.ninp = self.embed_dim self.ntoken = 9744 self.prior = nn.Linear(self.nhidlast, self.n_experts, bias=False) self.latent = nn.Sequential( nn.Linear(self.nhidlast, self.n_experts * self.ninp), nn.Tanh())
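Note that ntoken = 9744 is hardcoded above (presumably a dataset-specific vocabulary size; len(dictionary) would be the general choice). The prior/latent pair is the usual mixture-of-softmaxes setup (Yang et al., 2017); a hedged sketch of how such a head combines experts, with hypothetical sizes and n_experts > 1 for illustration:

import torch
import torch.nn as nn
import torch.nn.functional as F

nhidlast = ninp = 512
n_experts, ntoken = 3, 10000
prior = nn.Linear(nhidlast, n_experts, bias=False)
latent = nn.Sequential(nn.Linear(nhidlast, n_experts * ninp), nn.Tanh())
embed_out = nn.Linear(ninp, ntoken, bias=False)       # stand-in output projection

h = torch.randn(4, 7, nhidlast)                       # (batch, time, hidden)
pi = F.softmax(prior(h), dim=-1)                      # mixture weights (B, T, K)
z = latent(h).view(4, 7, n_experts, ninp)             # per-expert latents
expert_probs = F.softmax(embed_out(z), dim=-1)        # (B, T, K, V)
probs = (pi.unsqueeze(-1) * expert_probs).sum(dim=2)  # mixture over experts: (B, T, V)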
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__(dictionary) self.register_buffer("version", torch.Tensor([3])) self._future_mask = torch.empty(0) if args.PRUNE_BOOL: self.decoder_self_attn_path = args.DECODER_SELF_ATTN_PATH self.encoder_decoder_attn_path = args.ENCODER_DECODER_ATTN_PATH self.decoder_self_attn_pattern = torch.from_numpy(np.load(self.decoder_self_attn_path)) #(no_layers, 1, no_head, 1024, 1024) self.encoder_decoder_attn_pattern = torch.from_numpy(np.load(self.encoder_decoder_attn_path)) #(no_layers, 1, no_head, 1024, 1024) if args.CUDA: self.decoder_self_attn_pattern = self.decoder_self_attn_pattern.cuda() self.encoder_decoder_attn_pattern = self.encoder_decoder_attn_pattern.cuda() self.dropout = args.dropout self.decoder_layerdrop = args.decoder_layerdrop self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.embed_dim = embed_dim self.output_embed_dim = args.decoder_output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim) self.project_in_dim = ( Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None ) self.embed_positions = ( PositionalEmbedding( args.max_target_positions, embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None ) self.cross_self_attention = getattr(args, "cross_self_attention", False) self.layer_wise_attention = getattr(args, "layer_wise_attention", False) self.layers = nn.ModuleList([]) if args.PRUNE_BOOL: self.layers.extend( [ TransformerDecoderLayer(args, self.decoder_self_attn_pattern[i], self.encoder_decoder_attn_pattern[i], no_encoder_attn) for i in range(args.decoder_layers) ] ) else: self.layers.extend( [ TransformerDecoderLayer(args, None, None, no_encoder_attn) for i in range(args.decoder_layers) ] ) self.num_layers = len(self.layers) self.adaptive_softmax = None self.project_out_dim = ( Linear(embed_dim, self.output_embed_dim, bias=False) if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None ) if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), self.output_embed_dim) ) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5) if args.decoder_normalize_before and not getattr( args, "no_decoder_final_norm", False ): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, "layernorm_embedding", False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None
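The pruning path above deserializes one attention mask per layer; a minimal sketch of that loading pattern (the file name is hypothetical, the mask shape matches the shape comments above):

import numpy as np
import torch

# hypothetical file holding (num_layers, 1, num_heads, max_len, max_len) masks
pattern = torch.from_numpy(np.load("decoder_self_attn_pattern.npy"))
if torch.cuda.is_available():
    pattern = pattern.cuda()

# each decoder layer receives its own slice when the stack is built
per_layer_masks = [pattern[i] for i in range(pattern.size(0))]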
def __init__(self, dictionary, embed_tokens, embed_dim=512, num_layers=1, dropout_in=0.1, dropout_out=0.1, bidirectional=False, left_pad=False, padding_value=0., adaptive_softmax=False, adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0.1, adaptive_softmax_factor=None): super(LSTMTaggerDecoder, self).__init__(dictionary=dictionary) if hasattr(embed_tokens, "embedded_dim"): self.in_embed_dim = embed_tokens.embedded_dim elif hasattr(embed_tokens, "embed_dim"): self.in_embed_dim = embed_tokens.embed_dim elif hasattr(embed_tokens, "embedding_dim"): self.in_embed_dim = embed_tokens.embedding_dim else: raise AttributeError("embed_tokens must expose one of: embedded_dim, embed_dim, embedding_dim") self.output_units = self.embed_dim = embed_dim self.out_embed_dim = len(dictionary) self.num_layers = num_layers self.dropout_in = dropout_in self.dropout_out = dropout_out self.bidirectional = bidirectional if self.bidirectional: #self.output_units *= 2 pass self.padding_idx = dictionary.pad() self.padding_value = padding_value self.left_pad = left_pad self.embed_tokens = embed_tokens self.fc_in = self.fc_out1 = self.fc_out2 = None if self.in_embed_dim != self.embed_dim: self.fc_in = Linear(self.in_embed_dim, self.embed_dim) if self.output_units != self.embed_dim: self.fc_out1 = Linear(self.output_units, self.embed_dim) if self.embed_dim != self.out_embed_dim: self.fc_out2 = Linear(self.embed_dim, self.out_embed_dim) self.lstm = LSTM( input_size=embed_dim, hidden_size=embed_dim, num_layers=num_layers, dropout=self.dropout_out if num_layers > 1 else 0., bidirectional=bidirectional, ) self.adaptive_softmax = None if adaptive_softmax: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.embed_dim, adaptive_softmax_cutoff, dropout=adaptive_softmax_dropout, adaptive_inputs=None, factor=adaptive_softmax_factor, tie_proj=False, )
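The attribute probing above can be written more compactly; a sketch of an equivalent helper:

def probe_embed_dim(embed_tokens):
    # equivalent to the hasattr chain in the constructor above
    for attr in ("embedded_dim", "embed_dim", "embedding_dim"):
        if hasattr(embed_tokens, attr):
            return getattr(embed_tokens, attr)
    raise AttributeError(
        "embed_tokens must expose one of: embedded_dim, embed_dim, embedding_dim")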
def __init__(self, args, dictionary, embed_tokens, left_pad=False): super().__init__(dictionary) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed embed_dim = embed_tokens.embedding_dim self.padding_idx = embed_tokens.padding_idx self.unk_idx = dictionary.unk() self.eos_idx = dictionary.eos() self.max_target_positions = args.max_target_positions self.output_dim = args.decoder_embed_dim self.self_target = args.self_target self.future_target = args.future_target self.past_target = args.past_target self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) self.input_dropout = torch.tensor( args.input_dropout) if args.input_dropout > 0 else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, self.padding_idx, left_pad=left_pad, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.forward_layers = nn.ModuleList([ TransformerDecoderLayer(args) for _ in range(args.decoder_layers) ]) self.backward_layers = nn.ModuleList([ TransformerDecoderLayer(args) for _ in range(args.decoder_layers) ]) if not args.single_tower else self.forward_layers self.single_tower = args.single_tower self.full_attn_layer = None self.full_linear_layer = None if self.self_target: if args.linear_final_layer: self.full_linear_layer = Linear(embed_dim * 2, embed_dim, args.linear_final_layer_bias) else: self.full_attn_layer = BidirectionalTransformerDecoderLayer( args) self.load_softmax = not getattr(args, 'remove_head', False) self.embed_out = None self.adaptive_softmax = None if self.load_softmax: if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), args.decoder_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=embed_dim**-0.5) else: self.share_input_output_embed = False
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True): super().__init__(dictionary) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim output_embed_dim = args.decoder_output_dim padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt( embed_dim) # todo: try with input_embed_dim self.project_in_dim = Linear( input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ LightConvDecoderLayer(args, no_encoder_attn, kernel_size=args.decoder_kernel_size_list[i]) for i in range(args.decoder_layers) ]) self.decoder_dynamic_combination = args.decoder_dynamic_combination self.decoder_linear_combination = args.decoder_linear_combination assert not (self.decoder_dynamic_combination and self.decoder_linear_combination) if self.decoder_linear_combination or self.decoder_dynamic_combination: self.weight_ffn = nn.Sequential( nn.Linear(embed_dim, args.decoder_ffn_embed_dim), nn.ReLU(), nn.Linear(args.decoder_ffn_embed_dim, embed_dim), ) if self.decoder_dynamic_combination: self.proj = nn.ModuleList([ nn.Sequential( nn.Linear(embed_dim * args.decoder_layers, embed_dim * 2), nn.ReLU(), nn.Linear(embed_dim * 2, embed_dim)) for _ in range(args.decoder_layers) ]) if self.decoder_linear_combination: self.weights = nn.ParameterList([ nn.Parameter(torch.randn(1, 1, embed_dim), requires_grad=True) for _ in range(args.decoder_layers) ]) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \ if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim**-0.5) self.register_buffer('version', torch.Tensor([2])) self.normalize = args.decoder_normalize_before and final_norm if self.normalize: self.layer_norm = LayerNorm(embed_dim)
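The linear-combination path above allocates one learned (1, 1, embed_dim) weight per layer; the combination rule itself lives in the forward pass, which is not shown, so the following is only a sketch under the assumption that each weight gates its layer's output elementwise:

import torch
import torch.nn as nn

E, L = 512, 6
weights = nn.ParameterList(
    [nn.Parameter(torch.randn(1, 1, E)) for _ in range(L)])
layer_outs = [torch.randn(10, 4, E) for _ in range(L)]      # T x B x C per layer
combined = sum(w * h for w, h in zip(weights, layer_outs))  # broadcast over T and B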
class TransformerDecoder(nn.Module): """ Transformer decoder consisting of *args.decoder_layers* layers. Each layer is a :class:`TransformerDecoderLayer`. Args: args (argparse.Namespace): parsed command-line arguments dictionary (~fairseq.data.Dictionary): decoding dictionary embed_tokens (torch.nn.Embedding): output embedding no_encoder_attn (bool, optional): whether to attend to encoder outputs (default: False). """ def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__() self.register_buffer('version', torch.Tensor([3])) self.dictionary = dictionary self.onnx_trace = False self.dropout = args.dropout self.decoder_layerdrop = args.decoder_layerdrop self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.output_embed_dim = args.decoder_output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim) self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.cross_self_attention = getattr(args, 'cross_self_attention', False) self.layer_wise_attention = getattr(args, 'layer_wise_attention', False) self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \ if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5) if args.decoder_normalize_before and not getattr(args, 'no_decoder_final_norm', False): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, 'layernorm_embedding', False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None def forward( self, prev_output_tokens, encoder_out=None, incremental_state=None, features_only=False, **extra_args ): """ Args: prev_output_tokens (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for teacher forcing encoder_out (optional): output from the encoder, used for encoder-side attention incremental_state (dict): dictionary used for storing state during :ref:`Incremental decoding` features_only (bool, optional): only return features without applying output layer (default: False). 
Returns: tuple: - the decoder's output of shape `(batch, tgt_len, vocab)` - a dictionary with any model-specific outputs """ x, extra = self.extract_features( prev_output_tokens, encoder_out=encoder_out, incremental_state=incremental_state, **extra_args ) if not features_only: x = self.output_layer(x) return x, extra def extract_features( self, prev_output_tokens, encoder_out=None, incremental_state=None, full_context_alignment=False, alignment_layer=None, alignment_heads=None, **unused, ): """ Similar to *forward* but only return features. Includes several features from "Jointly Learning to Align and Translate with Transformer Models" (Garg et al., EMNLP 2019). Args: full_context_alignment (bool, optional): don't apply auto-regressive mask to self-attention (default: False). alignment_layer (int, optional): return mean alignment over heads at this layer (default: last layer). alignment_heads (int, optional): only average alignment over this many heads (default: all heads). Returns: tuple: - the decoder's features of shape `(batch, tgt_len, embed_dim)` - a dictionary with any model-specific outputs """ if alignment_layer is None: alignment_layer = len(self.layers) - 1 # embed positions positions = self.embed_positions( prev_output_tokens, incremental_state=incremental_state, ) if self.embed_positions is not None else None if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] if positions is not None: positions = positions[:, -1:] # embed tokens and positions x = self.embed_scale * self.embed_tokens(prev_output_tokens) if self.project_in_dim is not None: x = self.project_in_dim(x) if positions is not None: x += positions if self.layernorm_embedding: x = self.layernorm_embedding(x) x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) self_attn_padding_mask = None if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any(): self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) # decoder layers attn = None inner_states = [x] for idx, layer in enumerate(self.layers): encoder_state = None if encoder_out is not None: if self.layer_wise_attention: encoder_state = encoder_out.encoder_states[idx] else: encoder_state = encoder_out.encoder_out if incremental_state is None and not full_context_alignment: self_attn_mask = self.buffered_future_mask(x) else: self_attn_mask = None # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) dropout_probability = random.uniform(0, 1) if not self.training or (dropout_probability > self.decoder_layerdrop): x, layer_attn = layer( x, encoder_state, encoder_out.encoder_padding_mask if encoder_out is not None else None, incremental_state, self_attn_mask=self_attn_mask, self_attn_padding_mask=self_attn_padding_mask, need_attn=(idx == alignment_layer), need_head_weights=(idx == alignment_layer), ) inner_states.append(x) if layer_attn is not None and idx == alignment_layer: attn = layer_attn.float() if attn is not None: if alignment_heads is not None: attn = attn[:alignment_heads] # average probabilities over heads attn = attn.mean(dim=0) if self.layer_norm: x = self.layer_norm(x) # T x B x C -> B x T x C x = x.transpose(0, 1) if self.project_out_dim is not None: x = self.project_out_dim(x) return x, {'attn': attn, 'inner_states': inner_states} def output_layer(self, features, **kwargs): """Project features to the vocabulary size.""" if self.adaptive_softmax is None: # project back to size of vocabulary if self.share_input_output_embed: return 
F.linear(features, self.embed_tokens.weight) else: return F.linear(features, self.embed_out) else: return features def get_normalized_probs(self, net_output, log_probs, sample): """Get normalized probabilities (or log probs) from a net's output.""" if hasattr(self, 'adaptive_softmax') and self.adaptive_softmax is not None: if sample is not None: assert 'target' in sample target = sample['target'] else: target = None out = self.adaptive_softmax.get_log_prob(net_output[0], target=target) return out.exp_() if not log_probs else out logits = net_output[0] if log_probs: return utils.log_softmax(logits, dim=-1, onnx_trace=self.onnx_trace) else: return utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace) def max_positions(self): """Maximum output length supported by the decoder.""" if self.embed_positions is None: return self.max_target_positions return min(self.max_target_positions, self.embed_positions.max_positions()) def buffered_future_mask(self, tensor): dim = tensor.size(0) if ( not hasattr(self, '_future_mask') or self._future_mask is None or self._future_mask.device != tensor.device or self._future_mask.size(0) < dim ): self._future_mask = torch.triu(utils.fill_with_neg_inf(tensor.new(dim, dim)), 1) return self._future_mask[:dim, :dim] def upgrade_state_dict(self, state_dict): """Upgrade a (possibly old) state dict for new versions of fairseq.""" return state_dict def prepare_for_onnx_export_(self): self.onnx_trace = True def upgrade_state_dict_named(self, state_dict, name): """Upgrade a (possibly old) state dict for new versions of fairseq.""" if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): weights_key = '{}.embed_positions.weights'.format(name) if weights_key in state_dict: del state_dict[weights_key] state_dict['{}.embed_positions._float_tensor'.format(name)] = torch.FloatTensor(1) for i in range(len(self.layers)): # update layer norms layer_norm_map = { '0': 'self_attn_layer_norm', '1': 'encoder_attn_layer_norm', '2': 'final_layer_norm' } for old, new in layer_norm_map.items(): for m in ('weight', 'bias'): k = '{}.layers.{}.layer_norms.{}.{}'.format(name, i, old, m) if k in state_dict: state_dict['{}.layers.{}.{}.{}'.format(name, i, new, m)] = state_dict[k] del state_dict[k] version_key = '{}.version'.format(name) if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) <= 2: # earlier checkpoints did not normalize after the stack of layers self.layer_norm = None self.normalize = False state_dict[version_key] = torch.Tensor([1]) return state_dict def reorder_incremental_state(self, incremental_state, new_order): """Reorder incremental state. This should be called when the order of the input has changed from the previous time step. A typical use case is beam search, where the input order changes between time steps based on the selection of beams. """ seen = set() def apply_reorder_incremental_state(module): if module != self and hasattr(module, 'reorder_incremental_state') \ and module not in seen: seen.add(module) module.reorder_incremental_state(incremental_state, new_order) self.apply(apply_reorder_incremental_state) def set_beam_size(self, beam_size): """Sets the beam size in the decoder and all children.""" if getattr(self, '_beam_size', -1) != beam_size: seen = set() def apply_set_beam_size(module): if module != self and hasattr(module, 'set_beam_size') \ and module not in seen: seen.add(module) module.set_beam_size(beam_size) self.apply(apply_set_beam_size) self._beam_size = beam_size
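buffered_future_mask above caches a causal mask and regrows it on demand; a standalone equivalent showing the mask it produces:

import torch

def future_mask(dim, device=None):
    # -inf strictly above the diagonal: position t cannot attend to positions > t
    m = torch.full((dim, dim), float("-inf"), device=device)
    return torch.triu(m, diagonal=1)

print(future_mask(4))
# tensor([[0., -inf, -inf, -inf],
#         [0.,   0., -inf, -inf],
#         [0.,   0.,   0., -inf],
#         [0.,   0.,   0.,   0.]])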
def __init__( self, args, src_dict, dst_dict, embed_tokens, no_encoder_attn=False, left_pad=False, final_norm=True, ): super().__init__(dst_dict) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt( embed_dim) # todo: try with input_embed_dim self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None) self.embed_positions = fairseq_transformer.PositionalEmbedding( 1024, embed_dim, padding_idx, learned=args.decoder_learned_pos) self.layers = nn.ModuleList([]) self.layers.extend([ TransformerAANDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.adaptive_softmax = None self.bottleneck_layer = None out_embed_dim = embed_dim if args.decoder_out_embed_dim is not None: assert ( not args.share_all_embeddings and not args.share_decoder_input_output_embed ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!" self.bottleneck_layer = Linear(embed_dim, args.decoder_out_embed_dim) out_embed_dim = args.decoder_out_embed_dim if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dst_dict), out_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dst_dict), out_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim**-0.5) self.register_buffer("version", torch.Tensor([2])) self.normalize = args.decoder_normalize_before and final_norm if self.normalize: self.layer_norm = LayerNorm(embed_dim) self.vocab_reduction_module = None if args.vocab_reduction_params: assert ( self.adaptive_softmax is None ), "vocabulary reduction not compatible with adaptive softmax!" self.vocab_reduction_module = vocab_reduction.VocabReduction( src_dict, dst_dict, args.vocab_reduction_params, fp16=args.fp16) self.onnx_trace = False
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, left_pad=False, final_norm=True): super().__init__(dictionary) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim output_embed_dim = args.decoder_output_dim padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) # todo: try with input_embed_dim self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, padding_idx, left_pad=left_pad, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) """ MODIFIED: add copying mechanism as a separate multi-head attention """ if args.use_copy_scores: assert not no_encoder_attn, \ "copy scores cannot be computed if " \ "there is no encoder-decoder attention" # Number of heads in copy attention layer is an optional argument self.copy_attention_heads = (args.decoder_attention_heads if args.copy_attention_heads == 0 else args.copy_attention_heads) self.copy_attention = MultiheadAttention( embed_dim, self.copy_attention_heads, dropout=args.attention_dropout, ) # (NOTE) For computing alpha. self.copy_balancing_layer = Linear(input_embed_dim, 1) if args.decode_with_edit_labels: raise NotImplementedError else: self.copy_attention = None self.copy_balancing_layer = None # Alpha scheduler & diagnostic checker self.alpha_warmup = args.alpha_warmup self.num_batches = 0 self.num_copies = 0 self.mean_alpha = 0.0 # Zero out generative probability of a word if also in source sentence self.pad_copied_words = args.pad_copied_words self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \ if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: """ MODIFIED: require share_input_output_embed for copying mechanism """ raise NotImplementedError( "copying mechanism requires share_input_output_embed" ) self.register_buffer('version', torch.Tensor([2])) self.normalize = args.decoder_normalize_before and final_norm if self.normalize: self.layer_norm = LayerNorm(embed_dim)
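The copy_balancing_layer above produces a per-position scalar ("alpha"); the forward pass is not shown, so the following pointer-generator-style mixing (See et al., 2017) is an assumption about how such an alpha is typically used:

import torch

B, T, V = 2, 5, 100
p_gen = torch.softmax(torch.randn(B, T, V), dim=-1)   # generator distribution
p_copy = torch.softmax(torch.randn(B, T, V), dim=-1)  # copy attention scattered onto the vocab
alpha = torch.sigmoid(torch.randn(B, T, 1))           # stand-in for copy_balancing_layer output
p = alpha * p_copy + (1 - alpha) * p_gen              # still a distribution per position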
def __init__(self, args, src_dict, dst_dict, embed_tokens): super().__init__(dst_dict) self.dropout = args.dropout self.decoder_layerdrop = 0 if hasattr(args, "decoder_layerdrop") and args.decoder_layerdrop > 0: self.decoder_layerdrop = args.decoder_layerdrop self.share_input_output_embed = args.share_decoder_input_output_embed embed_dim = embed_tokens.embedding_dim padding_idx = embed_tokens.padding_idx self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) self.embed_positions = fairseq_transformer.PositionalEmbedding( 1024, embed_dim, padding_idx, learned=args.decoder_learned_pos) self.aan = args.aan decoder_layer_class = (AANDecoderLayer if self.aan else fairseq_transformer.TransformerDecoderLayer) self.layers = nn.ModuleList([]) self.layers.extend( [decoder_layer_class(args) for i in range(args.decoder_layers)]) if hasattr(args, "decoder_layers_to_keep") and args.decoder_layers_to_keep: layers_to_keep = sorted( int(x) for x in args.decoder_layers_to_keep.split(",")) self.decoder_layers_to_keep = { layer_id: layer_idx for layer_idx, layer_id in enumerate(layers_to_keep) } self.adaptive_softmax = None self.bottleneck_layer = None out_embed_dim = embed_dim if args.decoder_out_embed_dim is not None: assert ( not args.share_all_embeddings and not args.share_decoder_input_output_embed ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!" self.bottleneck_layer = fairseq_transformer.Linear( embed_dim, args.decoder_out_embed_dim) out_embed_dim = args.decoder_out_embed_dim if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dst_dict), out_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.dropout, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dst_dict), out_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim**-0.5) self.vocab_reduction_module = None if args.vocab_reduction_params: assert ( self.adaptive_softmax is None ), "vocabulary reduction not compatible with adaptive softmax!" self.vocab_reduction_module = vocab_reduction.VocabReduction( src_dict, dst_dict, args.vocab_reduction_params, fp16=args.fp16) self.onnx_trace = False # Use quantizable nn.Linear for output projection instead of F.linear self.output_projection = None if self.vocab_reduction_module is None: if self.share_input_output_embed: self.output_projection = nn.Linear( self.embed_tokens.weight.shape[1], self.embed_tokens.weight.shape[0]) self.output_projection.weight = self.embed_tokens.weight else: self.output_projection = nn.Linear(self.embed_out.shape[1], self.embed_out.shape[0]) self.output_projection.weight = self.embed_out
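The tied output projection above works because assignment shares the Parameter object rather than copying it; a minimal sketch (bias omitted for simplicity):

import torch
import torch.nn as nn

vocab, dim = 1000, 512
embed = nn.Embedding(vocab, dim)
proj = nn.Linear(dim, vocab, bias=False)
proj.weight = embed.weight   # same Parameter, not a copy

assert proj.weight.data_ptr() == embed.weight.data_ptr()
logits = proj(torch.randn(2, 5, dim))   # (2, 5, vocab)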
def __init__( self, dictionary, embed_dim=512, embed_dict=None, out_embed_dim=256, max_positions=1024, convolutions=((512, 3),) * 20, attention=True, dropout=0.1, share_embed=False, positional_embeddings=True, adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, left_pad=False, ): super().__init__(dictionary) self.register_buffer('version', torch.Tensor([2])) self.dropout = dropout self.left_pad = left_pad self.need_attn = True convolutions = extend_conv_spec(convolutions) in_channels = convolutions[0][0] if isinstance(attention, bool): # expand True into [True, True, ...] and do the same with False attention = [attention] * len(convolutions) if not isinstance(attention, list) or len(attention) != len(convolutions): raise ValueError('Attention is expected to be a list of booleans of ' 'length equal to the number of layers.') num_embeddings = len(dictionary) padding_idx = dictionary.pad() self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx) if embed_dict: self.embed_tokens = utils.load_embedding(embed_dict, self.dictionary, self.embed_tokens) self.embed_positions = PositionalEmbedding( max_positions, embed_dim, padding_idx, left_pad=self.left_pad, ) if positional_embeddings else None self.fc1 = Linear(embed_dim, in_channels, dropout=dropout) self.projections = nn.ModuleList() self.convolutions = nn.ModuleList() self.attention = nn.ModuleList() self.residuals = [] layer_in_channels = [in_channels] for i, (out_channels, kernel_size, residual) in enumerate(convolutions): if residual == 0: residual_dim = out_channels else: residual_dim = layer_in_channels[-residual] self.projections.append(Linear(residual_dim, out_channels) if residual_dim != out_channels else None) self.convolutions.append( LinearizedConv1d(in_channels, out_channels * 2, kernel_size, padding=(kernel_size - 1), dropout=dropout) ) self.attention.append(AttentionLayer(out_channels, embed_dim) if attention[i] else None) self.residuals.append(residual) in_channels = out_channels layer_in_channels.append(out_channels) self.adaptive_softmax = None self.fc2 = self.fc3 = None if adaptive_softmax_cutoff is not None: assert not share_embed self.adaptive_softmax = AdaptiveSoftmax(num_embeddings, in_channels, adaptive_softmax_cutoff, dropout=adaptive_softmax_dropout) else: self.fc2 = Linear(in_channels, out_embed_dim) if share_embed: assert out_embed_dim == embed_dim, \ "Shared embed weights implies same dimensions " \ " out_embed_dim={} vs embed_dim={}".format(out_embed_dim, embed_dim) self.fc3 = nn.Linear(out_embed_dim, num_embeddings) self.fc3.weight = self.embed_tokens.weight else: self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout)
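The residual bookkeeping above decides when a projection is needed: layer_in_channels[-residual] looks back residual layers for the skip connection's width. A pure-Python trace (assuming, as in fairseq's extend_conv_spec, that each spec is an (out_channels, kernel_size, residual) triple with residual defaulting to 1):

convolutions = [(512, 3, 1), (512, 3, 2), (768, 3, 1)]  # hypothetical spec

layer_in_channels = [convolutions[0][0]]  # fc1 maps embeddings to in_channels
for out_channels, kernel_size, residual in convolutions:
    residual_dim = out_channels if residual == 0 else layer_in_channels[-residual]
    print(residual_dim, "->", out_channels,
          "project" if residual_dim != out_channels else "identity")
    layer_in_channels.append(out_channels)
# 512 -> 512 identity
# 512 -> 512 identity
# 512 -> 768 project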
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__(dictionary) self.register_buffer("version", torch.Tensor([3])) self._future_mask = torch.empty(0) # self.dropout = [0.05, 0.1, 0.25, 0.3] self.dropout = [0, 0, 0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3] # self.dropout = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3] self.index = None self.decoder_layerdrop = args.decoder_layerdrop self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.embed_dim = embed_dim self.output_embed_dim = args.decoder_output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim) # self.embedding_hidden_mapping_out = SlimmableLinear([int(embed_dim / 4), int(embed_dim * 2 / 4), int(embed_dim * 3 / 4), embed_dim], # [embed_dim, embed_dim, embed_dim, embed_dim]) # self.embedding_hidden_mapping_in = SlimmableLinear([embed_dim, embed_dim, embed_dim, embed_dim], # [int(embed_dim / 4), int(embed_dim * 2 / 4), int(embed_dim * 3 / 4), embed_dim]) self.embedding_hidden_mapping_in = SlimmableLinear([embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim], [int(embed_dim * 4 / 16), int(embed_dim * 5 / 16), int(embed_dim * 6 / 16), int(embed_dim * 7 / 16), int(embed_dim * 8 / 16), int(embed_dim * 9 / 16), int(embed_dim * 10 / 16), int(embed_dim * 11 / 16), int(embed_dim * 12 / 16), int(embed_dim * 13 / 16), int(embed_dim * 14 / 16), int(embed_dim * 15 / 16), embed_dim]) self.embedding_hidden_mapping_out = SlimmableLinear([int(embed_dim * 4 / 16), int(embed_dim * 5 / 16), int(embed_dim * 6 / 16), int(embed_dim * 7 / 16), int(embed_dim * 8 / 16), int(embed_dim * 9 / 16), int(embed_dim * 10 / 16), int(embed_dim * 11 / 16), int(embed_dim * 12 / 16), int(embed_dim * 13 / 16), int(embed_dim * 14 / 16), int(embed_dim * 15 / 16), embed_dim], [embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim]) self.project_in_dim = ( Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None ) self.embed_positions = ( PositionalEmbedding( args.max_target_positions, embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None ) self.cross_self_attention = getattr(args, "cross_self_attention", False) self.layer_wise_attention = getattr(args, "layer_wise_attention", False) self.layers = nn.ModuleList([]) self.layers.extend( [ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ] ) self.num_layers = len(self.layers) self.adaptive_softmax = None self.project_out_dim = ( Linear(embed_dim, self.output_embed_dim, bias=False) if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None ) if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), 
self.output_embed_dim) ) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5) if args.decoder_normalize_before and not getattr( args, "no_decoder_final_norm", False ): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, "layernorm_embedding", False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None
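SlimmableLinear is not defined in this file; under the assumption that it is a width-switchable linear layer in the slimmable-networks sense (one full-size weight matrix, sliced to the active width at run time), a minimal sketch:

import torch
import torch.nn as nn
import torch.nn.functional as F

class SlimmableLinear(nn.Module):
    # sketch only: the real module may differ
    def __init__(self, in_features_list, out_features_list, bias=True):
        super().__init__()
        self.in_features_list = in_features_list
        self.out_features_list = out_features_list
        self.linear = nn.Linear(max(in_features_list), max(out_features_list), bias=bias)
        self.width_idx = len(in_features_list) - 1   # default: full width

    def forward(self, x):
        in_f = self.in_features_list[self.width_idx]
        out_f = self.out_features_list[self.width_idx]
        weight = self.linear.weight[:out_f, :in_f]
        bias = self.linear.bias[:out_f] if self.linear.bias is not None else None
        return F.linear(x, weight, bias)

layer = SlimmableLinear([256, 512], [512, 512])
layer.width_idx = 0
y = layer(torch.randn(8, 256))   # uses the 256 -> 512 slice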
def __init__(self, dictionary, embed_dim=512, hidden_size=512, out_embed_dim=512, num_layers=1, dropout_in=0.1, dropout_out=0.1, attention=True, encoder_output_units=512, pretrained_embed=None, share_input_output_embed=False, adaptive_softmax_cutoff=None, max_target_positions=DEFAULT_MAX_TARGET_POSITIONS): super().__init__(dictionary) self.dropout_in = dropout_in self.dropout_out = dropout_out self.hidden_size = hidden_size self.share_input_output_embed = share_input_output_embed self.need_attn = True self.max_target_positions = max_target_positions self.adaptive_softmax = None num_embeddings = len(dictionary) padding_idx = dictionary.pad() if pretrained_embed is None: self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx) else: self.embed_tokens = pretrained_embed self.encoder_output_units = encoder_output_units if encoder_output_units != hidden_size and encoder_output_units != 0: self.encoder_hidden_proj = Linear(encoder_output_units, hidden_size) self.encoder_cell_proj = Linear(encoder_output_units, hidden_size) else: self.encoder_hidden_proj = self.encoder_cell_proj = None # disable input feeding if there is no encoder # input feeding is described in arxiv.org/abs/1508.04025 input_feed_size = 0 if encoder_output_units == 0 else hidden_size self.layers = nn.ModuleList([ LSTMCell( input_size=input_feed_size + embed_dim if layer == 0 else hidden_size, hidden_size=hidden_size, ) for layer in range(num_layers) ]) if attention: # TODO make bias configurable self.attention = AttentionLayer(hidden_size, encoder_output_units, hidden_size, bias=False) else: self.attention = None if hidden_size != out_embed_dim: self.additional_fc = Linear(hidden_size, out_embed_dim) if adaptive_softmax_cutoff is not None: # setting adaptive_softmax dropout to dropout_out for now but can be redefined self.adaptive_softmax = AdaptiveSoftmax(num_embeddings, hidden_size, adaptive_softmax_cutoff, dropout=dropout_out) elif not self.share_input_output_embed: self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
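A single decoding step of the input-feeding scheme referenced above (arxiv.org/abs/1508.04025): the previous attentional context is concatenated with the current token embedding before entering the first LSTMCell. A sketch with a dot-product stand-in for AttentionLayer:

import torch
import torch.nn as nn
import torch.nn.functional as F

B, embed_dim, hidden_size, src_len = 4, 512, 512, 9
cell = nn.LSTMCell(hidden_size + embed_dim, hidden_size)  # input feed + embedding

emb = torch.randn(B, embed_dim)             # current target token embedding
input_feed = torch.zeros(B, hidden_size)    # previous attentional context
h = c = torch.zeros(B, hidden_size)
enc_out = torch.randn(src_len, B, hidden_size)

h, c = cell(torch.cat([emb, input_feed], dim=1), (h, c))
scores = (enc_out * h.unsqueeze(0)).sum(dim=2)       # (src_len, B)
alpha = F.softmax(scores, dim=0)
context = (alpha.unsqueeze(2) * enc_out).sum(dim=0)  # (B, hidden_size)
input_feed = context                                 # carried into the next step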
def __init__(self, args, dictionary, embed_tokens, embed_scale=None, no_encoder_attn=False, left_pad=False, final_norm=True, light=False, masker=False): super().__init__(dictionary) self.args = args self.light = light self.masker = masker self.pnet = getattr(args, "pnet", False) self.pnet2 = getattr(args, "pnet2", False) if self.pnet2: assert not self.light self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim self.embed_dim = args.decoder_embed_dim output_embed_dim = args.decoder_output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt( self.embed_dim) if embed_scale is None else embed_scale self.project_in_dim = nn.Linear( input_embed_dim, self.embed_dim, bias=False) if self.embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, self.embed_dim, self.padding_idx, #learned=args.decoder_learned_pos, ) if not args.no_dec_token_positional_embeddings else None if hasattr(args, "decoding_iterations") and args.decoding_iterations > 0: args.refinetot = args.decoding_iterations self.selected_decoder = 0 if self.light: self.layers = nn.ModuleList() self.layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) # self.layers.requires_grad_(False) self.last_decoder_layers = nn.ModuleList() self.last_decoder_layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.refinetot) ]) else: self.layers = None # switchable self.layer_stack = nn.ModuleList() for _ in range(args.refinetot): layers = nn.ModuleList([]) layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.layer_stack.append(layers) if self.pnet: self.pnet_layers = nn.ModuleList([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(3) ]) self.pnet_pred = nn.Linear(self.embed_dim, 1) if self.pnet2: self.pnet_stack = nn.ModuleList() self.pnet_stack.extend([ nn.Linear(self.embed_dim, 1) for _ in range(args.decoder_layers) ]) if self.masker: self.masker_stack = nn.ModuleList() if hasattr(self.args, "masker_light") and self.args.masker_light: masker_layers = 1 else: masker_layers = args.refinetot - 1 for _ in range(masker_layers): layers = nn.ModuleList([]) layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(3) ]) self.masker_stack.append(layers) self.masker_predict_stack = nn.ModuleList() for _ in range(masker_layers): self.masker_predict_stack.append( nn.Linear(args.decoder_output_dim, 1)) self.adaptive_softmax = None self.project_out_dim = nn.Linear(self.embed_dim, output_embed_dim, bias=False) \ if self.embed_dim != output_embed_dim and not args.tie_adaptive_weights else None self.load_softmax = not getattr(args, 'remove_head', False) if self.load_softmax: if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), output_embed_dim)) #nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5) self.register_buffer('version', 
torch.Tensor([2])) self.normalize = args.decoder_normalize_before and final_norm if self.normalize: self.layer_norm = BertLayerNorm(self.embed_dim) if not self.share_input_output_embed: self.embed_out.requires_grad_(False) self.embed_tokens.requires_grad_(False) if self.pnet: for name, param in self.named_parameters(): if "pnet" not in name: param.requires_grad_(False)
def __init__(self, args, dictionary, embed_tokens): self.args = args super().__init__(dictionary) self.register_buffer("version", torch.Tensor([3])) self._future_mask = torch.empty(0) self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__) self.dropword_module = FairseqFeatureDropout(args.word_dropout, module_name=self.__class__.__name__) self.decoder_layerdrop = args.decoder_layerdrop self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.output_embed_dim = args.decoder_output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim) self.project_in_dim = ( Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None ) self.embed_positions = ( PositionalEmbedding( args.max_target_positions, embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None ) assert not args.layernorm_embedding or not args.decoder_normalize_before if args.layernorm_embedding: self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None if self.decoder_layerdrop > 0.0: self.layers = LayerDropModuleList(p=self.decoder_layerdrop) else: self.layers = nn.ModuleList([]) self.layers.extend([self.build_decoder_layer(i, args) for i in range(args.decoder_layers)]) self.num_layers = len(self.layers) if args.decoder_normalize_before: self.layer_norm = LayerNorm(embed_dim) self.proj_layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None self.proj_layer_norm = None self.project_out_dim = ( Linear(embed_dim, self.output_embed_dim, bias=False) if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None ) self.adaptive_softmax = None self.output_projection = None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif self.share_input_output_embed: self.output_projection = nn.Linear( self.embed_tokens.weight.shape[1], self.embed_tokens.weight.shape[0], bias=False, ) self.output_projection.weight = self.embed_tokens.weight else: self.output_projection = nn.Linear( self.output_embed_dim, len(dictionary), bias=False ) nn.init.normal_( self.output_projection.weight, mean=0, std=self.output_embed_dim ** -0.5 )
def __init__( self, rnn_type: Union[str, Namespace], dictionary, embed_dim=512, hidden_size=512, out_embed_dim=512, num_layers=1, dropout_in=0.1, dropout_out=0.1, attention=True, attention_bias=False, encoder_output_units=512, pretrained_embed=None, share_input_output_embed=False, adaptive_softmax_cutoff=None, max_target_positions=DEFAULT_MAX_TARGET_POSITIONS, residuals=False, ): super().__init__(dictionary) rnn_type = rnn_type.rnn_type if isinstance(rnn_type, Namespace) else rnn_type self.rnn_type = rnn_type.lower().strip() self.is_lstm = self.rnn_type == 'lstm' # convenience flag used later when handling the LSTM cell state self.dropout_in_module = FairseqDropout( dropout_in, module_name=self.__class__.__name__) self.dropout_out_module = FairseqDropout( dropout_out, module_name=self.__class__.__name__) self.hidden_size = hidden_size self.share_input_output_embed = share_input_output_embed self.need_attn = True self.max_target_positions = max_target_positions self.residuals = residuals self.num_layers = num_layers self.adaptive_softmax = None num_embeddings = len(dictionary) padding_idx = dictionary.pad() if pretrained_embed is None: self.embed_tokens = JqEmbedding(num_embeddings, embed_dim, padding_idx) else: self.embed_tokens = pretrained_embed self.encoder_output_units = encoder_output_units if encoder_output_units != hidden_size and encoder_output_units != 0: self.encoder_hidden_proj = Linear(encoder_output_units, hidden_size) self.encoder_cell_proj = Linear( encoder_output_units, hidden_size) if self.is_lstm else None else: self.encoder_hidden_proj = self.encoder_cell_proj = None # disable input feeding if there is no encoder # input feeding is described in arxiv.org/abs/1508.04025 input_feed_size = 0 if encoder_output_units == 0 else hidden_size _, JQRNNCell = get_rnn_cell(self.rnn_type) # returns (JQRNN, JQRNNCell); only the step-cell class is needed here self.layers = nn.ModuleList([ JQRNNCell( input_size=input_feed_size + embed_dim if layer == 0 else hidden_size, hidden_size=hidden_size, ) for layer in range(num_layers) ]) if attention: # bias is configurable via attention_bias self.attention = AttentionLayer(hidden_size, encoder_output_units, hidden_size, bias=attention_bias) else: self.attention = None if hidden_size != out_embed_dim: self.additional_fc = Linear(hidden_size, out_embed_dim) if adaptive_softmax_cutoff is not None: # setting adaptive_softmax dropout to dropout_out for now but can be redefined self.adaptive_softmax = AdaptiveSoftmax( num_embeddings, hidden_size, adaptive_softmax_cutoff, dropout=dropout_out, ) elif not self.share_input_output_embed: self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
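get_rnn_cell is imported from elsewhere; given the unpacking above, it returns a (module class, cell class) pair. A hedged sketch of such a factory using the standard torch cells:

import torch.nn as nn

def get_rnn_cell(rnn_type):
    # sketch of a factory like the one used above
    table = {
        'lstm': (nn.LSTM, nn.LSTMCell),
        'gru': (nn.GRU, nn.GRUCell),
        'rnn': (nn.RNN, nn.RNNCell),
    }
    if rnn_type not in table:
        raise ValueError(f"unsupported rnn_type: {rnn_type}")
    return table[rnn_type]

_, cell_cls = get_rnn_cell('gru')
cell = cell_cls(input_size=1024, hidden_size=512)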