def __init__(self, d_model, dropout_p, sequence='nda', variational=False, elementwise_affine=True, multilingual=False, n_languages=1):
    """Configure the pre/post-processing pipeline applied around a sublayer.

    Args:
        d_model: model (hidden) dimension used by the layer norm.
        dropout_p: dropout probability for the 'd' step.
        sequence: string of step codes, e.g. 'nda' — each character is one
            processing step ('n' = layer norm, 'd' = dropout; 'a' handled
            elsewhere at forward time).
        variational: if True, use VariationalDropout instead of nn.Dropout.
        elementwise_affine: passed through to LayerNorm (monolingual path).
        multilingual: if True, use a per-language MultilingualLayerNorm.
        n_languages: number of languages for the multilingual layer norm.
    """
    super(PrePostProcessing, self).__init__()
    self.d_model = d_model
    self.dropout_p = dropout_p
    self.multilingual = multilingual
    # Each character of `sequence` becomes one processing step.
    self.steps = list(sequence)

    # Gated residual: learnable scalar gate, initialized to one.
    if onmt.constants.residual_type == 'gated':
        self.k = nn.Parameter(torch.ones(1))

    if 'n' in self.steps:
        if multilingual:
            # One set of norm parameters per language.
            self.layer_norm = MultilingualLayerNorm((self.d_model,), eps=1e-5,
                                                    elementwise_affine=True,
                                                    n_languages=n_languages)
        else:
            norm = LayerNorm((self.d_model,), elementwise_affine=elementwise_affine)
            self.layer_norm = Bottle(norm)

    if 'd' in self.steps:
        if variational:
            self.dropout = VariationalDropout(self.dropout_p, batch_first=False)
        else:
            self.dropout = nn.Dropout(self.dropout_p)
def __init__(self, d_model, d_ff, p, variational=False):
    """Two-layer position-wise feed-forward block (ReLU activation).

    Args:
        d_model: input/output dimension.
        d_ff: inner (expansion) dimension.
        p: dropout probability.
        variational: if True, use VariationalDropout instead of nn.Dropout.
    """
    super(FeedForward, self).__init__()
    self.d_model = d_model
    self.d_ff = d_ff
    # Expansion then projection back to d_model.
    self.fc_1 = Linear(d_model, d_ff, nonlinearity="relu")
    self.fc_2 = Linear(d_ff, d_model)
    self.dropout = VariationalDropout(p) if variational else nn.Dropout(p)
def __init__(self, d_model, d_ff, p, variational=False):
    """Two-layer position-wise feed-forward block with SiLU (Swish) activation.

    Args:
        d_model: input/output dimension.
        d_ff: inner (expansion) dimension.
        p: dropout probability.
        variational: if True, use VariationalDropout instead of nn.Dropout.
    """
    super(FeedForwardSwish, self).__init__()
    self.d_model = d_model
    self.d_ff = d_ff
    self.fc_1 = XavierLinear(d_model, d_ff)
    self.fc_2 = XavierLinear(d_ff, d_model)
    # BUG FIX: the module is torch.nn.SiLU (capital L); torch.nn.SilU does
    # not exist and raised AttributeError on construction.
    self.swish = torch.nn.SiLU()
    if variational:
        self.dropout = VariationalDropout(p)
    else:
        self.dropout = nn.Dropout(p)