def __init__(self, model_dim, num_heads, dropout=0.1, batch_first=False, masked_layers=False):
    super().__init__()
    self.num_heads = num_heads
    self.model_dim = model_dim
    self.batch_first = batch_first
    self.masked_layers = masked_layers

    # the model dimension must split evenly across the attention heads
    assert model_dim % num_heads == 0
    self.head_dim = model_dim // num_heads

    # bias-free projections for queries, keys, values, and the attention output
    self.query_projection = MaskedFunction(XavierLinear(model_dim, model_dim, bias=False))
    self.key_projection = MaskedFunction(XavierLinear(model_dim, model_dim, bias=False))
    self.value_projection = MaskedFunction(XavierLinear(model_dim, model_dim, bias=False))
    self.out_projection = MaskedFunction(XavierLinear(model_dim, model_dim, bias=False))

    self.attn_dropout = nn.Dropout(dropout)
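# A minimal sketch (not part of the original module) of how the projected tensors are
# typically split into per-head views before attention and merged back afterwards.
# The batch-first layout (batch, length, model_dim) is an assumption for illustration.
import torch

def split_heads(x: torch.Tensor, num_heads: int) -> torch.Tensor:
    # (batch, length, model_dim) -> (batch, num_heads, length, head_dim)
    batch, length, model_dim = x.size()
    head_dim = model_dim // num_heads
    return x.view(batch, length, num_heads, head_dim).transpose(1, 2)

def merge_heads(x: torch.Tensor) -> torch.Tensor:
    # (batch, num_heads, length, head_dim) -> (batch, length, num_heads * head_dim)
    batch, num_heads, length, head_dim = x.size()
    return x.transpose(1, 2).contiguous().view(batch, length, num_heads * head_dim)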
def __init__(self, h, d_model, attn_p=0.1, static=True, share=3):
    super(MultiHeadAttention, self).__init__()
    self.h = h
    self.d = d_model
    self.share = share

    # d_model must be divisible by the number of heads
    assert d_model % h == 0
    self.d_head = d_model // h

    self.fc_query = MaskedFunction(XavierLinear(d_model, h * self.d_head, bias=False))
    self.fc_key = MaskedFunction(XavierLinear(d_model, h * self.d_head, bias=False))
    self.fc_value = MaskedFunction(XavierLinear(d_model, h * self.d_head, bias=False))
    self.fc_concat = MaskedFunction(XavierLinear(h * self.d_head, d_model, bias=False))

    self.sm = nn.Softmax(dim=-1)

    # choose between the custom StaticDropout and standard nn.Dropout for attention weights
    if static:
        self.attn_dropout = StaticDropout(attn_p)
    else:
        self.attn_dropout = nn.Dropout(attn_p)
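# A minimal sketch (not the original forward()) of how the q/k/v projections, the
# softmax, and the attention dropout defined above are usually combined in scaled
# dot-product attention. Shapes assume per-head tensors of (batch * h, length, d_head);
# mask handling is simplified for illustration.
import math
import torch

def scaled_dot_product_attention(q, k, v, softmax, dropout, mask=None):
    d_head = q.size(-1)
    # attention scores scaled by sqrt(d_head), as in the standard Transformer
    scores = torch.bmm(q, k.transpose(1, 2)) / math.sqrt(d_head)
    if mask is not None:
        # masked positions receive -inf so the softmax assigns them ~zero weight
        scores = scores.masked_fill(mask, float('-inf'))
    weights = dropout(softmax(scores))
    return torch.bmm(weights, v)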
def build_feed_forward(self):
    self.preprocess_ffn = self.get_preprocessing_module()
    self.feed_forward = MaskedFunction(
        get_feed_forward(self.feed_forward_type,
                         self.model_dim,
                         self.feed_forward_dim,
                         self.feed_forward_dropout,
                         self.weight_norm))
    self.postprocess_ffn = self.get_postprocessing_module()
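# A minimal sketch of the kind of position-wise feed-forward block a factory such as
# get_feed_forward might return (linear -> ReLU -> dropout -> linear). The variants
# actually selected by feed_forward_type in the original code may differ.
import torch
import torch.nn as nn

class PositionWiseFeedForward(nn.Module):
    def __init__(self, model_dim, feed_forward_dim, dropout=0.1):
        super().__init__()
        self.fc_1 = nn.Linear(model_dim, feed_forward_dim)
        self.fc_2 = nn.Linear(feed_forward_dim, model_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # expand to the inner dimension, apply the non-linearity, project back down
        return self.fc_2(self.dropout(torch.relu(self.fc_1(x))))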
def __init__(self, model_dim, sequence='nda', dropout=0.0, elementwise_affine=True,
             gated_residuals=False, masking=False):
    super(PrePostProcessing, self).__init__()
    self.masking = masking
    self.gated_residuals = gated_residuals
    self.steps = sequence

    # learnable scalar gate for gated residual connections
    if self.gated_residuals:
        self.k = nn.Parameter(torch.ones(1))

    # 'n' step: layer normalization over the model dimension
    if 'n' in self.steps:
        layer_norm = nn.LayerNorm([model_dim], elementwise_affine=elementwise_affine)
        self.layer_norm = MaskedFunction(layer_norm)

    # 'd' step: dropout
    if 'd' in self.steps:
        self.dropout = nn.Dropout(dropout, inplace=False)
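# A minimal sketch (not the original forward()) of how the step string is commonly
# interpreted: 'n' applies layer normalization, 'd' applies dropout, and 'a' adds the
# residual input, scaled by the learned gate k when gated_residuals is set. The exact
# gating form and the masking path are assumptions, omitted or simplified here.
def apply_steps(module, tensor, residual=None):
    out = tensor
    for step in module.steps:
        if step == 'n':
            out = module.layer_norm(out)
        elif step == 'd':
            out = module.dropout(out)
        elif step == 'a' and residual is not None:
            if module.gated_residuals:
                out = module.k * out + residual  # illustrative gating, not the exact original
            else:
                out = out + residual
    return out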