def __init__(self, args, src_dict, dst_dict, embed_tokens, left_pad=False):
    super().__init__(dst_dict)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed

    embed_dim = embed_tokens.embedding_dim
    padding_idx = embed_tokens.padding_idx

    self.embed_tokens = embed_tokens
    # Scale token embeddings by sqrt(embed_dim), as in the original Transformer.
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = fairseq_transformer.PositionalEmbedding(
        1024,
        embed_dim,
        padding_idx,
        left_pad=left_pad,
        learned=args.decoder_learned_pos,
    )

    self.layers = nn.ModuleList([])
    self.layers.extend([
        fairseq_transformer.TransformerDecoderLayer(args)
        for _ in range(args.decoder_layers)
    ])

    # Exactly one output path is active: adaptive softmax, a tied input/output
    # embedding, or a separate embed_out projection matrix.
    self.adaptive_softmax = None
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dst_dict),
            args.decoder_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.dropout,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(torch.Tensor(len(dst_dict), embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=embed_dim**-0.5)

    self.vocab_reduction_module = None
    if args.vocab_reduction_params:
        assert (
            self.adaptive_softmax is None
        ), "vocabulary reduction not compatible with adaptive softmax!"
        self.vocab_reduction_module = vocab_reduction.VocabReduction(
            src_dict, dst_dict, args.vocab_reduction_params)

    self.onnx_trace = False
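# A minimal sketch (not this module's actual forward) of how the three output
# paths set up above are typically consumed in fairseq-style decoders. `x` is
# assumed to be the final decoder state of shape (batch, tgt_len, embed_dim);
# the helper name `project_to_vocab` is hypothetical.
import torch
import torch.nn.functional as F

def project_to_vocab(decoder, x):
    if decoder.adaptive_softmax is not None:
        # AdaptiveSoftmax computes log-probs/loss internally; return features.
        return x
    if decoder.share_input_output_embed:
        # Tie the output projection to the input embedding table.
        return F.linear(x, decoder.embed_tokens.weight)
    return F.linear(x, decoder.embed_out)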
def _init_components(self, args, src_dict, dst_dict, embed_tokens):
    # First RNN layer consumes the initial input (token embedding plus any
    # attention context); later layers run at lstm_units width.
    self.initial_rnn_layer = nn.LSTM(
        input_size=self.initial_input_dim, hidden_size=self.lstm_units)

    # Project encoder outputs to the attention dimension if they differ.
    self.proj_encoder_layer = None
    if self.attention_dim != self.encoder_output_dim:
        self.proj_encoder_layer = fairseq_transformer.Linear(
            self.encoder_output_dim, self.attention_dim)

    # Likewise project the LSTM output (the attention query) if needed.
    self.proj_layer = None
    if self.lstm_units != self.attention_dim:
        self.proj_layer = fairseq_transformer.Linear(
            self.lstm_units, self.attention_dim)

    self.attention = fairseq_transformer.MultiheadAttention(
        self.attention_dim,
        self.num_attention_heads,
        dropout=args.attention_dropout,
    )

    self.extra_rnn_layers = nn.ModuleList([])
    for _ in range(self.num_layers - 1):
        self.extra_rnn_layers.append(
            nn.LSTM(input_size=self.input_dim, hidden_size=self.lstm_units))

    # Optional bottleneck before the (large) output projection.
    self.bottleneck_layer = None
    if self.bottleneck_dim is not None:
        self.out_embed_dim = self.bottleneck_dim
        self.bottleneck_layer = fairseq_transformer.Linear(
            self.input_dim, self.out_embed_dim)
    else:
        self.out_embed_dim = self.input_dim

    self.embed_out = nn.Parameter(
        torch.Tensor(len(dst_dict), self.out_embed_dim))
    nn.init.normal_(self.embed_out, mean=0, std=self.out_embed_dim**-0.5)

    self.vocab_reduction_module = None
    if args.vocab_reduction_params:
        self.vocab_reduction_module = vocab_reduction.VocabReduction(
            src_dict, dst_dict, args.vocab_reduction_params, fp16=args.fp16)

    self.onnx_trace = False
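# Hedged sketch of how the two projections above align dimensions for
# MultiheadAttention: the LSTM output (lstm_units) becomes the query and the
# encoder output (encoder_output_dim) becomes keys/values, both mapped into
# attention_dim. The helper name `attend` and the tensor names are
# hypothetical; fairseq's MultiheadAttention expects (time, batch, dim) inputs.
def attend(self, lstm_out, encoder_out, encoder_padding_mask=None):
    query = self.proj_layer(lstm_out) if self.proj_layer is not None else lstm_out
    kv = (self.proj_encoder_layer(encoder_out)
          if self.proj_encoder_layer is not None else encoder_out)
    attn_out, attn_weights = self.attention(
        query=query, key=kv, value=kv,
        key_padding_mask=encoder_padding_mask)
    return attn_out, attn_weights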
def __init__(
    self,
    src_dict,
    dst_dict,
    vocab_reduction_params=None,
    out_embed_dim=512,
    project_output=True,
):
    super().__init__(dst_dict)
    self.project_output = project_output
    if project_output:
        self.num_embeddings = len(dst_dict)
        self.out_embed_dim = out_embed_dim

        self.vocab_reduction_module = None
        if vocab_reduction_params:
            self.vocab_reduction_module = vocab_reduction.VocabReduction(
                src_dict, dst_dict, vocab_reduction_params)

        self.output_projection_w = nn.Parameter(
            torch.FloatTensor(self.num_embeddings,
                              self.out_embed_dim).uniform_(-0.1, 0.1))
        self.output_projection_b = nn.Parameter(
            torch.FloatTensor(self.num_embeddings).zero_())
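# Hedged sketch of how output_projection_w/b are usually applied together with
# vocabulary reduction: when a reduced candidate set is supplied, only those
# rows of the projection participate in the logits. The helper name and the
# `possible_translation_tokens` / `hidden` arguments are hypothetical.
import torch
import torch.nn.functional as F

def output_projection(self, hidden, possible_translation_tokens=None):
    w, b = self.output_projection_w, self.output_projection_b
    if possible_translation_tokens is not None:
        # Restrict logits to the reduced vocabulary.
        w = w.index_select(0, possible_translation_tokens)
        b = b.index_select(0, possible_translation_tokens)
    return F.linear(hidden, w, b)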
def __init__(
    self,
    src_dict,
    dst_dict,
    decoders,
    combination_strategy,
    split_encoder=False,
    vocab_reduction_params=None,
):
    """Create a new multi-decoder instance.

    Args:
        src_dict (Dictionary): Source language dictionary.
        dst_dict (Dictionary): Target language dictionary.
        decoders (list): List of DecoderWithOutputProjection.
        combination_strategy (string): Name of the combination strategy.
            Passed through to `create_strategy()`.
        split_encoder (bool): If true, split the encoder output so that
            each decoder gets its own split.
        vocab_reduction_params: Parameters for vocabulary reduction.
    """
    super().__init__(dst_dict)
    # The combination strategy owns the final output projection, so the
    # individual decoders must not project to the vocabulary themselves.
    assert not any(decoder.project_output for decoder in decoders)
    self.decoders = nn.ModuleList(decoders)
    vocab_reduction_module = None
    if vocab_reduction_params:
        vocab_reduction_module = vocab_reduction.VocabReduction(
            src_dict, dst_dict, vocab_reduction_params)
    self.combi_strat = create_strategy(
        combination_strategy,
        [decoder.out_embed_dim for decoder in decoders],
        len(dst_dict),
        vocab_reduction_module,
    )
    self.split_encoder = split_encoder
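# Hedged sketch of what split_encoder implies: the encoder output's feature
# dimension is partitioned so that each decoder consumes its own slice. The
# helper and the assumed (src_len, batch, encoder_dim) layout are assumptions,
# not this module's actual splitting code.
import torch

def split_encoder_out(encoder_out: torch.Tensor, num_decoders: int):
    # Chunk along the feature dimension; the last chunk may be smaller if
    # encoder_dim is not divisible by num_decoders.
    return torch.chunk(encoder_out, num_decoders, dim=2)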
def __init__(
    self,
    src_dict,
    dst_dict,
    vocab_reduction_params=None,
    encoder_hidden_dim=512,
    embed_dim=512,
    freeze_embed=False,
    hidden_dim=512,
    out_embed_dim=512,
    cell_type="lstm",
    num_layers=1,
    dropout_in=0.1,
    dropout_out=0.1,
    attention_type="dot",
    residual_level=None,
    averaging_encoder=False,
):
    super().__init__(dst_dict)
    self.encoder_hidden_dim = encoder_hidden_dim
    self.embed_dim = embed_dim
    self.hidden_dim = hidden_dim
    self.out_embed_dim = out_embed_dim
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.attention_type = attention_type
    self.residual_level = residual_level
    self.averaging_encoder = averaging_encoder

    num_embeddings = len(dst_dict)
    padding_idx = dst_dict.pad()
    self.embed_tokens = Embedding(
        num_embeddings=num_embeddings,
        embedding_dim=embed_dim,
        padding_idx=padding_idx,
        freeze_embed=freeze_embed,
    )

    if cell_type == "lstm":
        cell_class = rnn_cell.LSTMCell
    elif cell_type == "milstm":
        cell_class = rnn_cell.MILSTMCell
    elif cell_type == "layer_norm_lstm":
        cell_class = rnn_cell.LayerNormLSTMCell
    else:
        # Fail fast instead of hitting a NameError on cell_class below.
        raise ValueError(f"Unknown cell type '{cell_type}'")

    # Learned projections to map the encoder's final states to the decoder's
    # per-layer initial hidden/cell states when the dimensions differ.
    if hidden_dim != encoder_hidden_dim:
        hidden_init_fc_list = []
        cell_init_fc_list = []
        for _ in range(num_layers):
            hidden_init_fc_list.append(Linear(encoder_hidden_dim, hidden_dim))
            cell_init_fc_list.append(Linear(encoder_hidden_dim, hidden_dim))
        self.hidden_init_fc_list = nn.ModuleList(hidden_init_fc_list)
        self.cell_init_fc_list = nn.ModuleList(cell_init_fc_list)

    self.initial_attn_context = nn.Parameter(
        torch.Tensor(encoder_hidden_dim).zero_())

    if attention_type is not None:
        self.attention = attention.build_attention(
            attention_type=attention_type,
            decoder_hidden_state_dim=hidden_dim,
            encoder_output_dim=encoder_hidden_dim,
        )
        self.combined_output_and_context_dim = encoder_hidden_dim + hidden_dim
    else:
        self.attention = None
        self.combined_output_and_context_dim = hidden_dim

    layers = []
    for layer in range(num_layers):
        if layer == 0:
            # The first layer sees the token embedding, concatenated with
            # the attention context if attention is enabled.
            if self.attention is not None:
                cell_input_dim = encoder_hidden_dim + embed_dim
            else:
                cell_input_dim = embed_dim
        else:
            cell_input_dim = hidden_dim
        layers.append(
            cell_class(input_dim=cell_input_dim, hidden_dim=hidden_dim))
    self.layers = nn.ModuleList(layers)

    if self.combined_output_and_context_dim != out_embed_dim:
        self.additional_fc = Linear(
            self.combined_output_and_context_dim, out_embed_dim)

    self.vocab_reduction_module = None
    if vocab_reduction_params:
        self.vocab_reduction_module = vocab_reduction.VocabReduction(
            src_dict, dst_dict, vocab_reduction_params)

    self.output_projection_w = nn.Parameter(
        torch.FloatTensor(num_embeddings, out_embed_dim).uniform_(-0.1, 0.1))
    self.output_projection_b = nn.Parameter(
        torch.FloatTensor(num_embeddings).zero_())
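# A sketch, under assumed shapes, of how hidden_init_fc_list and
# cell_init_fc_list would map the encoder's final hidden state
# (encoder_hidden_dim) to per-layer initial decoder states (hidden_dim).
# The helper name and `encoder_final_hidden` are hypothetical, not this
# module's actual initialization code.
def init_decoder_states(self, encoder_final_hidden):
    hidden_states, cell_states = [], []
    for layer_idx in range(len(self.layers)):
        if self.hidden_dim != self.encoder_hidden_dim:
            h = self.hidden_init_fc_list[layer_idx](encoder_final_hidden)
            c = self.cell_init_fc_list[layer_idx](encoder_final_hidden)
        else:
            # Dimensions already match; reuse the encoder state directly.
            h = c = encoder_final_hidden
        hidden_states.append(h)
        cell_states.append(c)
    return hidden_states, cell_states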
def __init__(
    self,
    src_dict,
    dst_dict,
    vocab_reduction_params=None,
    out_embed_dim=512,
    project_output=True,
    pretrained_embed=None,
    out_embed_norm=None,
    att_weighted_src_embeds=False,
    src_embed_dim=512,
    att_weighted_activation_type="tanh",
    predictor=None,
    fp16: bool = False,
):
    super().__init__(dst_dict)
    self.project_output = project_output
    if project_output:
        self.num_embeddings = len(dst_dict)
        self.out_embed_dim = out_embed_dim
        self.out_embed_norm = out_embed_norm
        self.att_weighted_src_embeds = att_weighted_src_embeds
        self.src_embed_dim = src_embed_dim

        self.vocab_reduction_module = None
        if vocab_reduction_params or predictor is not None:
            self.vocab_reduction_module = vocab_reduction.VocabReduction(
                src_dict=src_dict,
                dst_dict=dst_dict,
                vocab_reduction_params=vocab_reduction_params,
                predictor=predictor,
                fp16=fp16,
            )

        projection_weights = torch.FloatTensor(
            self.num_embeddings, self.out_embed_dim).uniform_(-0.1, 0.1)
        if isinstance(pretrained_embed, nn.Embedding):
            # nn.Embedding stores its table in `weight` (not `weights`).
            projection_weights.data = pretrained_embed.weight.data
        elif pretrained_embed is not None:
            embed_dict = utils.parse_embedding(pretrained_embed)
            # Equivalent to utils.load_embedding, but for an nn.Parameter.
            for idx in range(len(dst_dict)):
                token = dst_dict[idx]
                if token in embed_dict:
                    projection_weights[idx] = embed_dict[token]

        self.output_projection_w = nn.Parameter(projection_weights)
        self.output_projection_b = nn.Parameter(
            torch.FloatTensor(self.num_embeddings).zero_())

        if att_weighted_activation_type == "tanh":
            activation_fn = nn.Tanh
            self.att_weighted_activation_fn = torch.tanh
        elif att_weighted_activation_type == "relu":
            activation_fn = nn.ReLU
            self.att_weighted_activation_fn = torch.relu
        else:
            raise Exception(
                "att_weighted_activation_type '%s' not implemented" %
                att_weighted_activation_type)

        if att_weighted_src_embeds:
            self.lexical_layer = NonlinearLayer(
                self.src_embed_dim,
                self.out_embed_dim,
                bias=False,
                activation_fn=activation_fn,
            )
            self.output_projection_w_lex = nn.Parameter(
                torch.FloatTensor(self.num_embeddings,
                                  self.out_embed_dim).uniform_(-0.1, 0.1))
            self.output_projection_b_lex = nn.Parameter(
                torch.FloatTensor(self.num_embeddings).zero_())
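# fairseq's utils.parse_embedding reads a GloVe/word2vec-style text file
# ("<token> <v1> <v2> ..." per line) into a {token: FloatTensor} dict. A
# hedged, self-contained illustration of the overwrite loop above; the
# vectors and token list here are made up:
import torch

embed_dict = {"hello": torch.tensor([0.1, -0.2, 0.3])}  # made-up vectors
dst_tokens = ["<pad>", "hello", "world"]                 # stand-in for dst_dict
projection_weights = torch.empty(len(dst_tokens), 3).uniform_(-0.1, 0.1)
for idx, token in enumerate(dst_tokens):
    if token in embed_dict:
        projection_weights[idx] = embed_dict[token]  # pretrained row wins
# Rows for tokens absent from the file keep their uniform(-0.1, 0.1) init.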
def __init__(self, args, src_dict, dst_dict, embed_tokens):
    super().__init__(dst_dict)
    self.dropout = args.dropout
    self.decoder_layerdrop = 0
    if hasattr(args, "decoder_layerdrop") and args.decoder_layerdrop > 0:
        self.decoder_layerdrop = args.decoder_layerdrop
    self.share_input_output_embed = args.share_decoder_input_output_embed

    embed_dim = embed_tokens.embedding_dim
    padding_idx = embed_tokens.padding_idx

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = fairseq_transformer.PositionalEmbedding(
        1024, embed_dim, padding_idx, learned=args.decoder_learned_pos)

    # Average attention network (AAN) layers replace the standard decoder
    # layers when --aan is set.
    self.aan = args.aan
    decoder_layer_class = (
        AANDecoderLayer if self.aan
        else fairseq_transformer.TransformerDecoderLayer)
    self.layers = nn.ModuleList([])
    self.layers.extend(
        [decoder_layer_class(args) for _ in range(args.decoder_layers)])

    if hasattr(args, "decoder_layers_to_keep") and args.decoder_layers_to_keep:
        layers_to_keep = sorted(
            int(x) for x in args.decoder_layers_to_keep.split(","))
        self.decoder_layers_to_keep = {
            layer_id: layer_idx
            for layer_idx, layer_id in enumerate(layers_to_keep)
        }

    self.adaptive_softmax = None
    self.bottleneck_layer = None
    out_embed_dim = embed_dim
    if args.decoder_out_embed_dim is not None:
        assert (
            not args.share_all_embeddings
            and not args.share_decoder_input_output_embed
        ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
        self.bottleneck_layer = fairseq_transformer.Linear(
            embed_dim, args.decoder_out_embed_dim)
        out_embed_dim = args.decoder_out_embed_dim

    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dst_dict),
            out_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.dropout,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dst_dict), out_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim**-0.5)

    self.vocab_reduction_module = None
    if args.vocab_reduction_params:
        assert (
            self.adaptive_softmax is None
        ), "vocabulary reduction not compatible with adaptive softmax!"
        self.vocab_reduction_module = vocab_reduction.VocabReduction(
            src_dict, dst_dict, args.vocab_reduction_params, fp16=args.fp16)

    self.onnx_trace = False

    # Use a quantizable nn.Linear for the output projection instead of
    # F.linear; the weight is tied to the embedding table when sharing.
    self.output_projection = None
    if self.vocab_reduction_module is None:
        if self.share_input_output_embed:
            self.output_projection = nn.Linear(
                self.embed_tokens.weight.shape[1],
                self.embed_tokens.weight.shape[0])
            self.output_projection.weight = self.embed_tokens.weight
        else:
            self.output_projection = nn.Linear(
                self.embed_out.shape[1], self.embed_out.shape[0])
            self.output_projection.weight = self.embed_out
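# Why nn.Linear rather than F.linear: PyTorch's dynamic quantization swaps
# out whole module types, so a functional projection would be left
# unquantized. A hedged usage sketch; the helper name is hypothetical:
import torch
import torch.nn as nn

def quantize_decoder(decoder: nn.Module) -> nn.Module:
    # Replace every nn.Linear (including output_projection) with a
    # dynamically quantized int8 version.
    return torch.quantization.quantize_dynamic(
        decoder, {nn.Linear}, dtype=torch.qint8)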
def __init__(
    self,
    src_dict,
    dst_dict,
    decoders,
    combination_strategy,
    is_lm=None,
    split_encoder=False,
    vocab_reduction_params=None,
    training_schedule="complete",
    fixed_weights=None,
):
    """Create a new multi-decoder instance.

    Args:
        src_dict (Dictionary): Source language dictionary.
        dst_dict (Dictionary): Target language dictionary.
        decoders (list): List of DecoderWithOutputProjection.
        combination_strategy (string): Name of the combination strategy.
            Passed through to `create_strategy()`.
        is_lm (list): List of booleans determining whether the n-th decoder
            is a language model. If None, none of the decoders are
            considered an LM.
        split_encoder (bool): If true, split the encoder output so that
            each decoder gets its own split.
        vocab_reduction_params: Parameters for vocabulary reduction.
        training_schedule (str): Training strategy.
        fixed_weights (list): None or list of floats. If specified, use
            these fixed model weights in weighted* combination strategies.
    """
    super().__init__(dst_dict)
    if is_lm is None:
        is_lm = [False] * len(decoders)
    assert not any(decoder.project_output for decoder in decoders)
    assert len(is_lm) == len(decoders)
    # Language-model decoders do not attend to the encoder output.
    self.attentive_decoder_ids = [i for i, b in enumerate(is_lm) if not b]
    self.decoders_is_lm = is_lm
    self.decoders = nn.ModuleList(decoders)
    vocab_reduction_module = None
    if vocab_reduction_params:
        vocab_reduction_module = vocab_reduction.VocabReduction(
            src_dict, dst_dict, vocab_reduction_params)
    self.combi_strat = create_strategy(
        combination_strategy,
        [decoder.out_embed_dim for decoder in decoders],
        len(dst_dict),
        vocab_reduction_module,
        fixed_weights,
    )
    self.split_encoder = split_encoder
    self.unfreeze_single = False
    self.separate_training = False
    self.unfreeze_idx = -1
    if self.training:
        if training_schedule in ["freeze_all", "freeze_all_decoders"]:
            self.freeze_decoders()
        elif training_schedule.startswith(
                "unfreeze_dec_") or training_schedule.startswith(
                    "unfreeze_encdec_"):
            _, _, n = training_schedule.split("_")
            self.freeze_decoders(int(n))
        elif training_schedule in [
                "unfreeze_single", "unfreeze_single_decoder"
        ]:
            self.unfreeze_single = True
            self.unfreeze_mod = len(decoders)
        elif training_schedule == "separate":
            self.unfreeze_single = True
            self.unfreeze_mod = len(decoders) + 1
            self.separate_training = True
        elif training_schedule != "complete":
            raise RuntimeError(
                f"Unknown training schedule '{training_schedule}'")
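# Hedged sketch of what the round-robin "unfreeze_single" schedule implies:
# at each step only one decoder (cycling through unfreeze_mod positions)
# receives gradients. This helper is an assumption for illustration, not the
# module's actual scheduling code.
def unfreeze_next(self):
    self.unfreeze_idx = (self.unfreeze_idx + 1) % self.unfreeze_mod
    for i, decoder in enumerate(self.decoders):
        requires_grad = (i == self.unfreeze_idx)
        for p in decoder.parameters():
            p.requires_grad = requires_grad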
def __init__(
    self,
    args,
    src_dict,
    dst_dict,
    embed_tokens,
    no_encoder_attn=False,
    left_pad=False,
    final_norm=True,
):
    super().__init__(dst_dict)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed

    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)  # TODO: try with input_embed_dim

    self.project_in_dim = (
        Linear(input_embed_dim, embed_dim, bias=False)
        if embed_dim != input_embed_dim else None)

    self.embed_positions = fairseq_transformer.PositionalEmbedding(
        1024, embed_dim, padding_idx, learned=args.decoder_learned_pos)

    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerAANDecoderLayer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])

    self.adaptive_softmax = None
    self.bottleneck_layer = None
    out_embed_dim = embed_dim
    if args.decoder_out_embed_dim is not None:
        assert (
            not args.share_all_embeddings
            and not args.share_decoder_input_output_embed
        ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
        self.bottleneck_layer = Linear(embed_dim, args.decoder_out_embed_dim)
        out_embed_dim = args.decoder_out_embed_dim

    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dst_dict),
            out_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dst_dict), out_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim**-0.5)

    self.register_buffer("version", torch.Tensor([2]))
    self.normalize = args.decoder_normalize_before and final_norm
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)

    self.vocab_reduction_module = None
    if args.vocab_reduction_params:
        assert (
            self.adaptive_softmax is None
        ), "vocabulary reduction not compatible with adaptive softmax!"
        self.vocab_reduction_module = vocab_reduction.VocabReduction(
            src_dict, dst_dict, args.vocab_reduction_params, fp16=args.fp16)

    self.onnx_trace = False