def __init__(self, embedding_size, hidden_size, num_layers, num_heads,
             total_key_depth, total_value_depth, filter_size,
             max_length=3000, input_dropout=0.0, layer_dropout=0.0,
             attention_dropout=0.0, relu_dropout=0.0, use_mask=False,
             universal=False, concept=False):
    """
    Set up the encoder's timing signals, input projection and layer stack.

    Parameters:
        embedding_size: Size of embeddings (input width of embedding_proj)
        hidden_size: Hidden size (output width of embedding_proj)
        num_layers: Total layers in the Encoder
        num_heads: Number of attention heads
        total_key_depth: Size of last dimension of keys. Must be divisible
            by num_head. hidden_size is used instead when this is falsy
        total_value_depth: Size of last dimension of values. Must be
            divisible by num_head. hidden_size is used instead when this
            is falsy
        filter_size: Hidden size of the middle layer in FFN
        max_length: Max sequence length (required for timing signal)
        input_dropout: Dropout just after embedding
        layer_dropout: Dropout for each layer
        attention_dropout: Dropout probability after attention
            (Should be non-zero only during training)
        relu_dropout: Dropout probability after relu in FFN
            (Should be non-zero only during training)
        use_mask: Set to True to turn on future value masking; when False
            the layers receive None in place of a bias mask
        universal: When True a single EncoderLayer is built instead of
            num_layers of them, and an extra timing signal of length
            num_layers is stored as position_signal
        concept: Accepted but not used by this constructor
    """
    super(Encoder, self).__init__()

    self.universal = universal
    self.num_layers = num_layers

    self.timing_signal = _gen_timing_signal(max_length, hidden_size)
    if universal:
        # Second signal whose length is the number of layers rather than
        # the sequence length.
        self.position_signal = _gen_timing_signal(num_layers, hidden_size)

    # The bias mask is only generated when masking was requested.
    bias_mask = None
    if use_mask:
        bias_mask = _gen_bias_mask(max_length)

    key_depth = total_key_depth or hidden_size
    value_depth = total_value_depth or hidden_size
    layer_args = (
        hidden_size,
        key_depth,
        value_depth,
        filter_size,
        num_heads,
        bias_mask,
        layer_dropout,
        attention_dropout,
        relu_dropout,
    )

    self.embedding_proj = nn.Linear(embedding_size, hidden_size, bias=False)

    if universal:
        self.enc = EncoderLayer(*layer_args)
    else:
        stacked_layers = []
        for _ in range(num_layers):
            stacked_layers.append(EncoderLayer(*layer_args))
        self.enc = nn.ModuleList(stacked_layers)

    self.layer_norm = LayerNorm(hidden_size)
    self.input_dropout = nn.Dropout(input_dropout)
def __init__(self, embedding_size, hidden_size, num_layers, num_heads,
             total_key_depth, total_value_depth, filter_size,
             max_length=config.max_enc_steps, input_dropout=0.0,
             layer_dropout=0.0, attention_dropout=0.0, relu_dropout=0.0):
    """
    Set up the decoder's timing signal, masks, input projection and layers.

    Parameters:
        embedding_size: Size of embeddings (input width of embedding_proj)
        hidden_size: Hidden size (output width of embedding_proj)
        num_layers: Total number of DecoderLayer modules to stack
        num_heads: Number of attention heads
        total_key_depth: Size of last dimension of keys. Must be divisible
            by num_head. hidden_size is used instead when this is falsy
        total_value_depth: Size of last dimension of values. Must be
            divisible by num_head. hidden_size is used instead when this
            is falsy
        filter_size: Hidden size of the middle layer in FFN
        max_length: Max sequence length (required for timing signal);
            defaults to config.max_enc_steps
        input_dropout: Dropout just after embedding
        layer_dropout: Dropout for each layer
        attention_dropout: Dropout probability after attention
            (Should be non-zero only during training)
        relu_dropout: Dropout probability after relu in FFN
            (Should be non-zero only during training)
    """
    super(Decoder, self).__init__()

    self.num_layers = num_layers
    self.timing_signal = _gen_timing_signal(max_length, hidden_size)
    # Mask to hide future positions.
    self.mask = _get_attn_subsequent_mask(max_length)

    key_depth = total_key_depth or hidden_size
    value_depth = total_value_depth or hidden_size
    # Unlike an optional encoder mask, the bias mask is always supplied here.
    bias_mask = _gen_bias_mask(max_length)
    layer_args = (
        hidden_size,
        key_depth,
        value_depth,
        filter_size,
        num_heads,
        bias_mask,
        layer_dropout,
        attention_dropout,
        relu_dropout,
    )

    self.embedding_proj = nn.Linear(embedding_size, hidden_size, bias=False)

    # Input to the stacked decoder is a tuple consisting of the decoder
    # inputs and the encoder output.
    decoder_layers = []
    for _ in range(num_layers):
        decoder_layers.append(DecoderLayer(*layer_args))
    self.dec = nn.Sequential(*decoder_layers)

    self.layer_norm = LayerNorm(hidden_size)
    self.input_dropout = nn.Dropout(input_dropout)
def __init__(self, hidden_size, num_heads, total_key_depth,
             total_value_depth, filter_size, vocab_size, max_length=1000,
             input_dropout=0, layer_dropout=0, attention_dropout=0.1,
             relu_dropout=0.1, use_mask=False, universal=False,
             is_eval=False):
    """
    Build the latent module: a learned query, two decoder layers, two
    layer norms, four position-wise feed-forward heads and a softmax
    output layer.

    Parameters:
        hidden_size: Hidden size passed to the decoder layers, the layer
            norms and (doubled) to the softmax output layer
        num_heads: Number of attention heads
        total_key_depth: Size of last dimension of keys; hidden_size is
            used instead when this is falsy
        total_value_depth: Size of last dimension of values; hidden_size
            is used instead when this is falsy
        filter_size: Filter size passed to the decoder layers
        vocab_size: Output width of the z_supervision softmax layer
        max_length: Length used to generate the bias mask (only when
            use_mask is True)
        input_dropout: Accepted but not used by this constructor
        layer_dropout: Dropout for each decoder layer
        attention_dropout: Attention dropout for the decoder layers
        relu_dropout: ReLU dropout for the decoder layers
        use_mask: When True a bias mask is generated and given to the
            decoder layers; otherwise they receive None
        universal: Accepted but not used by this constructor
        is_eval: Stored on the instance as is_eval

    Note: the learned query and the feed-forward heads are sized from
    config (batch_size, max_seq_len, hidden_dim, filter), not from the
    hidden_size / filter_size arguments.
    """
    super(Latent, self).__init__()

    bias_mask = None
    if use_mask:
        bias_mask = _gen_bias_mask(max_length)

    layer_args = (
        hidden_size,
        total_key_depth or hidden_size,
        total_value_depth or hidden_size,
        filter_size,
        num_heads,
        bias_mask,
        layer_dropout,
        attention_dropout,
        relu_dropout,
    )

    def _make_ffn():
        # All four heads share exactly the same configuration.
        return PositionwiseFeedForward(config.hidden_dim,
                                       config.filter,
                                       config.hidden_dim,
                                       layer_config='lll',
                                       padding='left',
                                       dropout=0)

    query_init = torch.randn(config.batch_size,
                             config.max_seq_len,
                             config.hidden_dim)
    self.query = nn.Parameter(query_init)

    self.dec = DecoderLayer(*layer_args)
    self.var_dec = DecoderLayer(*layer_args)

    self.layer_norm1 = LayerNorm(hidden_size)
    self.layer_norm2 = LayerNorm(hidden_size)

    # Construction order is kept: mean, var, mean_p, var_p.
    self.mean = _make_ffn()
    self.var = _make_ffn()
    self.mean_p = _make_ffn()
    self.var_p = _make_ffn()

    self.z_supervision = SoftmaxOutputLayer(2 * hidden_size, vocab_size)
    self.is_eval = is_eval
def __init__(self, embedding_size, hidden_size, num_layers, num_heads,
             total_key_depth, total_value_depth, filter_size,
             max_length=1000, input_dropout=0.0, layer_dropout=0.0,
             attention_dropout=0.0, relu_dropout=0.0, use_mask=False,
             universal=False):
    """
    Set up the decoder's timing signals, masks, attention layers and
    input projection.

    Parameters:
        embedding_size: Size of embeddings (input width of embedding_proj)
        hidden_size: Hidden size (output width of embedding_proj)
        num_layers: Number of ComplexEmoAttentionLayer modules to stack
            when universal is False
        num_heads: Number of attention heads
        total_key_depth: Size of last dimension of keys; hidden_size is
            used instead when this is falsy
        total_value_depth: Size of last dimension of values; hidden_size
            is used instead when this is falsy
        filter_size: Filter size passed to each layer
        max_length: Max sequence length used for the timing signal and
            both masks
        input_dropout: Dropout probability of the input_dropout module
        layer_dropout: Dropout for each layer
        attention_dropout: Attention dropout passed to each layer
        relu_dropout: ReLU dropout passed to each layer
        use_mask: Accepted but not used here; the bias mask is always
            generated
        universal: When True a single ComplexEmoAttentionLayer is built
            instead of num_layers of them, and an extra timing signal of
            length num_layers is stored as position_signal
    """
    super(ComplexResDecoder, self).__init__()

    self.universal = universal
    self.num_layers = num_layers

    self.timing_signal = _gen_timing_signal(max_length, hidden_size)
    if universal:
        self.position_signal = _gen_timing_signal(num_layers, hidden_size)

    self.mask = _get_attn_subsequent_mask(max_length)

    key_depth = total_key_depth or hidden_size
    value_depth = total_value_depth or hidden_size
    # The bias mask is mandatory for these layers.
    bias_mask = _gen_bias_mask(max_length)
    layer_args = (
        hidden_size,
        key_depth,
        value_depth,
        filter_size,
        num_heads,
        bias_mask,
        layer_dropout,
        attention_dropout,
        relu_dropout,
    )

    if universal:
        self.dec = ComplexEmoAttentionLayer(*layer_args)
    else:
        attention_layers = []
        for _ in range(num_layers):
            attention_layers.append(ComplexEmoAttentionLayer(*layer_args))
        self.dec = nn.Sequential(*attention_layers)

    self.embedding_proj = nn.Linear(embedding_size, hidden_size, bias=False)
    self.layer_norm = LayerNorm(hidden_size)
    self.input_dropout = nn.Dropout(input_dropout)
def __init__(self, embedding_size, hidden_size, num_layers, num_heads,
             total_key_depth, total_value_depth, filter_size,
             max_length=512, input_dropout=0.0, layer_dropout=0.0,
             attention_dropout=0.0, relu_dropout=0.0, universal=False,
             multi_input=False, context_size=1,
             attention_fusion_type='mean'):
    """
    Set up the decoder's timing signals, masks, input projection and
    layers, with optional multi-input attention settings.

    Parameters:
        embedding_size: Size of embeddings (input width of embedding_proj)
        hidden_size: Hidden size (output width of embedding_proj)
        num_layers: Number of DecoderLayer modules to stack when
            universal is False
        num_heads: Number of attention heads
        total_key_depth: Size of last dimension of keys. Must be divisible
            by num_head. hidden_size is used instead when this is falsy
        total_value_depth: Size of last dimension of values. Must be
            divisible by num_head. hidden_size is used instead when this
            is falsy
        filter_size: Hidden size of the middle layer in FFN
        max_length: Max sequence length (required for timing signal)
        input_dropout: Dropout just after embedding
        layer_dropout: Dropout for each layer
        attention_dropout: Dropout probability after attention
            (Should be non-zero only during training)
        relu_dropout: Dropout probability after relu in FFN
            (Should be non-zero only during training)
        universal: When True a single DecoderLayer is built instead of
            num_layers of them, and an extra timing signal of length
            num_layers is stored as position_signal
        multi_input: Whether use multiple attention modules in the decoder
        context_size: The number of multiple inputs
        attention_fusion_type: Forwarded unchanged to each DecoderLayer
    """
    super(Decoder, self).__init__()

    self.universal = universal
    self.num_layers = num_layers

    self.timing_signal = _gen_timing_signal(max_length, hidden_size)
    if universal:
        # Second signal whose length is the number of layers rather than
        # the sequence length.
        self.position_signal = _gen_timing_signal(num_layers, hidden_size)

    self.mask = _get_attn_subsequent_mask(max_length)

    key_depth = total_key_depth or hidden_size
    value_depth = total_value_depth or hidden_size
    # The bias mask is mandatory for decoder layers.
    bias_mask = _gen_bias_mask(max_length)
    layer_args = (
        hidden_size,
        key_depth,
        value_depth,
        filter_size,
        num_heads,
        bias_mask,
        layer_dropout,
        attention_dropout,
        relu_dropout,
        multi_input,
        context_size,
        attention_fusion_type,
    )

    self.embedding_proj = nn.Linear(embedding_size, hidden_size, bias=False)

    if universal:
        self.dec = DecoderLayer(*layer_args)
    else:
        decoder_layers = []
        for _ in range(num_layers):
            decoder_layers.append(DecoderLayer(*layer_args))
        self.dec = nn.Sequential(*decoder_layers)

    self.layer_norm = LayerNorm(hidden_size)
    self.input_dropout = nn.Dropout(input_dropout)

    self.multi_input = multi_input
    self.context_size = context_size
def __init__(self, embedding_size, hidden_size, num_layers, num_heads,
             total_key_depth, total_value_depth, filter_size, vocab_size,
             max_length=200, input_dropout=0, layer_dropout=0,
             attention_dropout=0.1, relu_dropout=0.1, universal=False):
    """
    Set up the variational decoder: timing signals, masks, a stack of
    VarDecoderLayer modules followed by a stack of DecoderLayer modules,
    the input projection, two layer norms and the input dropout.

    Parameters:
        embedding_size: Size of embeddings (input width of embedding_proj)
        hidden_size: Hidden size (output width of embedding_proj)
        num_layers: Total layer count; config.num_var_layers of them are
            VarDecoderLayer and the remainder are DecoderLayer
        num_heads: Number of attention heads
        total_key_depth: Size of last dimension of keys. Must be divisible
            by num_head. hidden_size is used instead when this is falsy
        total_value_depth: Size of last dimension of values. Must be
            divisible by num_head. hidden_size is used instead when this
            is falsy
        filter_size: Hidden size of the middle layer in FFN
        vocab_size: Vocabulary size forwarded to every layer
        max_length: Max sequence length (required for timing signal)
        input_dropout: Dropout just after embedding
        layer_dropout: Dropout for each layer
        attention_dropout: Dropout probability after attention
            (Should be non-zero only during training)
        relu_dropout: Dropout probability after relu in FFN
            (Should be non-zero only during training)
        universal: When True an extra timing signal of length num_layers
            is stored as position_signal
    """
    super(VarDecoder, self).__init__()

    self.universal = universal
    self.num_layers = num_layers
    self.timing_signal = _gen_timing_signal(max_length, hidden_size)

    # The original computed this signal in two identical back-to-back
    # blocks; a single computation yields the same attribute.
    if self.universal:
        self.position_signal = _gen_timing_signal(num_layers, hidden_size)

    self.mask = _get_attn_subsequent_mask(max_length)

    # NOTE(review): the same tuple (with vocab_size in 7th position) is
    # passed to both VarDecoderLayer and DecoderLayer -- confirm that
    # DecoderLayer's signature expects vocab_size there.
    params = (
        hidden_size,
        total_key_depth or hidden_size,
        total_value_depth or hidden_size,
        filter_size,
        num_heads,
        _gen_bias_mask(max_length),  # mandatory
        vocab_size,
        layer_dropout,
        attention_dropout,
        relu_dropout,
    )

    self.var_dec = nn.Sequential(
        *[VarDecoderLayer(*params) for _ in range(config.num_var_layers)])
    # NOTE(review): if num_layers < config.num_var_layers the range below
    # is empty and self.dec holds no layers -- confirm this is intended.
    self.dec = nn.Sequential(
        *[DecoderLayer(*params)
          for _ in range(num_layers - config.num_var_layers)])

    self.embedding_proj = nn.Linear(embedding_size, hidden_size, bias=False)
    self.layer_norm1 = LayerNorm(hidden_size)
    self.layer_norm2 = LayerNorm(hidden_size)
    self.input_dropout = nn.Dropout(input_dropout)