def __init__(
    self,
    out_embed_dims,
    vocab_size,
    vocab_reduction_module=None,
    hidden_layer_size=256,
):
    super().__init__(out_embed_dims, vocab_size, vocab_reduction_module)
    self.hidden_layer = NonlinearLayer(
        vocab_size, hidden_layer_size, bias=False, activation_fn=nn.ReLU
    )
    trans_dim = sum(out_embed_dims[1:])
    self.gating_network = NonlinearLayer(
        hidden_layer_size + trans_dim,
        hidden_layer_size,
        bias=True,
        activation_fn=nn.Sigmoid,
    )
    # output_projections is [LM projection, joint projection]. Keeping the LM
    # projection as a separate module is a trick that allows a pretrained LM
    # projection to be loaded directly.
    self.output_projections = nn.ModuleList(
        [
            OutputProjection(out_embed_dims[0], vocab_size),
            OutputProjection(
                hidden_layer_size + trans_dim, vocab_size, vocab_reduction_module
            ),
        ]
    )
    self.pre_softmax_activation = nn.ReLU()
def __init__(self, out_embed_dims, vocab_size, vocab_reduction_module=None):
    super().__init__(out_embed_dims, vocab_size, vocab_reduction_module)
    self.gating_network = NonlinearLayer(
        out_embed_dims[0], 1, bias=True, activation_fn=nn.Sigmoid
    )
    self.output_projection = OutputProjection(
        sum(out_embed_dims), vocab_size, vocab_reduction_module
    )
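# Hedged sketch, not part of this codebase: a minimal, self-contained illustration of
# the scalar-gate combination idea above, with plain nn.Linear standing in for
# NonlinearLayer/OutputProjection. The class and variable names below are hypothetical.
import torch
import torch.nn as nn


class ScalarGateCombiner(nn.Module):
    """Compute a sigmoid gate from the first decoder's output embedding and use it
    to weight the two embeddings before a shared projection to the vocabulary."""

    def __init__(self, dim_a, dim_b, vocab_size):
        super().__init__()
        self.gate = nn.Sequential(nn.Linear(dim_a, 1), nn.Sigmoid())
        self.proj = nn.Linear(dim_a + dim_b, vocab_size)

    def forward(self, out_a, out_b):
        g = self.gate(out_a)                                   # (batch, tgt_len, 1)
        combined = torch.cat([g * out_a, (1 - g) * out_b], dim=-1)
        return self.proj(combined)                             # (batch, tgt_len, vocab_size)


# Toy usage: two decoder output embeddings of size 256, vocabulary of 1000 types.
# combiner = ScalarGateCombiner(dim_a=256, dim_b=256, vocab_size=1000)
# logits = combiner(torch.randn(2, 5, 256), torch.randn(2, 5, 256))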
def __init__(
    self,
    src_dict,
    dst_dict,
    n=4,
    encoder_hidden_dim=512,
    embed_dim=512,
    freeze_embed=False,
    hidden_dim=512,
    out_embed_dim=512,
    num_layers=1,
    dropout_in=0.1,
    dropout_out=0.1,
    attention_type="dot",
    residual_level=None,
    activation_fn=nn.ReLU,
):
    super().__init__(dst_dict)
    self.history_len = n - 1
    self.encoder_hidden_dim = encoder_hidden_dim
    self.embed_dim = embed_dim
    self.hidden_dim = hidden_dim
    self.out_embed_dim = out_embed_dim
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.attention_type = attention_type
    self.residual_level = residual_level
    self.dst_dict = dst_dict
    self.activation_fn = activation_fn

    num_embeddings = len(dst_dict)
    padding_idx = dst_dict.pad()
    self.embed_tokens = Embedding(
        num_embeddings=num_embeddings,
        embedding_dim=embed_dim,
        padding_idx=padding_idx,
        freeze_embed=freeze_embed,
    )
    # Convolution over the embeddings of the previous n - 1 target tokens.
    self.history_conv = nn.Sequential(
        torch.nn.Conv1d(embed_dim, hidden_dim, self.history_len), activation_fn()
    )
    self.layers = nn.ModuleList(
        [
            NonlinearLayer(hidden_dim, hidden_dim, activation_fn=activation_fn)
            for _ in range(num_layers)
        ]
    )
    self.attention = attention.build_attention(
        attention_type=attention_type,
        decoder_hidden_state_dim=hidden_dim,
        encoder_output_dim=encoder_hidden_dim,
        force_projection=True,
    )
    self.combined_output_and_context_dim = (
        self.attention.encoder_output_dim + hidden_dim
    )
    if self.combined_output_and_context_dim != out_embed_dim:
        self.additional_fc = Linear(
            self.combined_output_and_context_dim, out_embed_dim
        )
    # Output projection parameters (weight and bias) over the target vocabulary.
    self.output_projection_w = nn.Parameter(
        torch.FloatTensor(num_embeddings, out_embed_dim).uniform_(-0.1, 0.1)
    )
    self.output_projection_b = nn.Parameter(
        torch.FloatTensor(num_embeddings).zero_()
    )
def __init__(
    self,
    src_dict,
    dst_dict,
    vocab_reduction_params=None,
    n=4,
    encoder_hidden_dim=512,
    embed_dim=512,
    freeze_embed=False,
    hidden_dim=512,
    out_embed_dim=512,
    num_layers=1,
    dropout_in=0.1,
    dropout_out=0.1,
    attention_type="dot",
    residual_level=None,
    activation_fn=nn.ReLU,
    project_output=True,
    pretrained_embed=None,
    projection_pretrained_embed=None,
):
    super().__init__(
        src_dict,
        dst_dict,
        vocab_reduction_params,
        out_embed_dim,
        project_output=project_output,
        pretrained_embed=projection_pretrained_embed,
    )
    self.history_len = n - 1
    self.encoder_hidden_dim = encoder_hidden_dim
    self.embed_dim = embed_dim
    self.hidden_dim = hidden_dim
    self.out_embed_dim = out_embed_dim
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.attention_type = attention_type
    self.residual_level = residual_level
    self.dst_dict = dst_dict
    self.activation_fn = activation_fn

    num_embeddings = len(dst_dict)
    padding_idx = dst_dict.pad()
    self.embed_tokens = Embedding(
        num_embeddings=num_embeddings,
        embedding_dim=embed_dim,
        padding_idx=padding_idx,
        freeze_embed=freeze_embed,
    )
    pytorch_translate_utils.load_embedding(
        embedding=self.embed_tokens,
        dictionary=dst_dict,
        pretrained_embed=pretrained_embed,
    )
    # Convolution over the embeddings of the previous n - 1 target tokens.
    self.history_conv = nn.Sequential(
        torch.nn.Conv1d(embed_dim, hidden_dim, self.history_len), activation_fn()
    )
    self.layers = nn.ModuleList(
        [
            NonlinearLayer(hidden_dim, hidden_dim, activation_fn=activation_fn)
            for _ in range(num_layers)
        ]
    )
    self.attention = attention.build_attention(
        attention_type=attention_type,
        decoder_hidden_state_dim=hidden_dim,
        context_dim=encoder_hidden_dim,
        force_projection=True,
    )
    self.combined_output_and_context_dim = self.attention.context_dim + hidden_dim
    if self.combined_output_and_context_dim != out_embed_dim:
        self.additional_fc = Linear(
            self.combined_output_and_context_dim, out_embed_dim
        )
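# Hedged sketch, not repository code: a quick check of the tensor shapes the
# history_conv block above expects. A Conv1d with in_channels=embed_dim,
# out_channels=hidden_dim, and kernel_size=history_len collapses the embeddings of
# the previous n - 1 tokens into a single hidden vector. The literals are illustrative.
import torch
import torch.nn as nn

embed_dim, hidden_dim, history_len = 512, 512, 3  # n = 4 -> history of 3 tokens
history_conv = nn.Sequential(
    nn.Conv1d(embed_dim, hidden_dim, history_len),
    nn.ReLU(),
)

# Input is (batch, embed_dim, history_len); output is (batch, hidden_dim, 1).
prev_token_embeddings = torch.randn(8, embed_dim, history_len)
print(history_conv(prev_token_embeddings).shape)  # torch.Size([8, 512, 1])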