import torch
from torch import nn

# Linear, Dropout, GeLU and Scorer are assumed to be this repository's thin
# wrappers around the corresponding torch.nn modules, and the *_default
# constants (use_adv_act_default, inplace_after_GeLU, ieps_ln_default, etc.)
# are assumed to come from the repository's configuration module.

class ResidueCombiner(nn.Module):

    # fuses ncomb states of width isize back to a single isize-dimensional
    # state through a two-layer feed-forward block followed by LayerNorm
    def __init__(self, isize, ncomb=2, hsize=None, dropout=0.0, use_GeLU=use_adv_act_default, enable_bias=enable_prev_ln_bias_default):

        super(ResidueCombiner, self).__init__()

        _hsize = isize * 2 * ncomb if hsize is None else hsize

        # should dropout be placed in front of the sigmoid or not?
        if dropout > 0.0:
            self.net = nn.Sequential(
                Linear(isize * ncomb, _hsize),
                GeLU() if use_GeLU else nn.Sigmoid(),
                Dropout(dropout, inplace=inplace_after_GeLU),
                Linear(_hsize, isize, bias=enable_bias),
                Dropout(dropout, inplace=True),
            )
        else:
            self.net = nn.Sequential(
                Linear(isize * ncomb, _hsize),
                GeLU() if use_GeLU else nn.Sigmoid(),
                Linear(_hsize, isize, bias=enable_bias),
            )

        self.out_normer = nn.LayerNorm(isize, eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)
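    # A minimal sketch of the matching forward pass, not part of the original
    # excerpt: assuming the combiner adds a residual connection from every
    # input state to the FFN output before the final LayerNorm.
    def forward(self, *xl):

        # fuse the ncomb states through the feed-forward block
        out = self.net(torch.cat(xl, -1))
        # residual connection from every input state
        for inputu in xl:
            out = out + inputu

        return self.out_normer(out)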
class AverageAttn(nn.Module):

    # average attention network (Zhang et al., 2018): a cumulative average
    # over previous positions followed by an FFN and a gating layer
    def __init__(self, isize, hsize=None, dropout=0.0, num_pos=cache_len_default, use_GeLU=use_adv_act_default):

        super(AverageAttn, self).__init__()

        _hsize = isize if hsize is None else hsize

        self.num_pos = num_pos
        # position-wise averaging weights, filled by reset_parameters
        self.register_buffer('w', torch.Tensor(num_pos, num_pos))

        if dropout > 0.0:
            self.ffn = nn.Sequential(
                Linear(isize, _hsize),
                Dropout(dropout, inplace=True),
                GeLU() if use_GeLU else nn.ReLU(inplace=True),
                Linear(_hsize, isize),
                Dropout(dropout, inplace=True),
            )
        else:
            self.ffn = nn.Sequential(
                Linear(isize, _hsize),
                GeLU() if use_GeLU else nn.ReLU(inplace=True),
                Linear(_hsize, isize),
            )

        # produces the two gates of the gating layer from the concatenated
        # input and averaged context
        self.gw = Linear(isize * 2, isize * 2)

        self.reset_parameters()
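    # A hedged sketch of reset_parameters, which the constructor calls but the
    # excerpt omits: assuming 'w' holds the cumulative-average weights of the
    # average attention network, row i puts weight 1/(i+1) on positions 0..i
    # and zero elsewhere.
    def reset_parameters(self):

        with torch.no_grad():
            _w = torch.tril(torch.ones(self.num_pos, self.num_pos))
            self.w.copy_(_w / _w.sum(-1, keepdim=True))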
class PositionwiseFF(nn.Module):

    # standard Transformer position-wise feed-forward block with a pre-norm
    # residual connection
    def __init__(self, isize, hsize=None, dropout=0.0, norm_residual=norm_residual_default, use_GeLU=use_adv_act_default, enable_bias=enable_prev_ln_bias_default):

        super(PositionwiseFF, self).__init__()

        _hsize = isize * 4 if hsize is None else hsize

        if dropout > 0.0:
            self.net = nn.Sequential(
                Linear(isize, _hsize),
                GeLU() if use_GeLU else nn.ReLU(inplace=True),
                Dropout(dropout, inplace=inplace_after_GeLU),
                Linear(_hsize, isize, bias=enable_bias),
                Dropout(dropout, inplace=True),
            )
        else:
            self.net = nn.Sequential(
                Linear(isize, _hsize),
                GeLU() if use_GeLU else nn.ReLU(inplace=True),
                Linear(_hsize, isize, bias=enable_bias),
            )

        self.normer = nn.LayerNorm(isize, eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)

        self.norm_residual = norm_residual
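    # A minimal sketch of the pre-norm forward pass, not part of the original
    # excerpt: assuming norm_residual selects whether the residual branch
    # carries the normalized input (True) or the raw input (False).
    def forward(self, x):

        _x = self.normer(x)

        return self.net(_x) + (_x if self.norm_residual else x)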
class DATTNCombiner(nn.Module):

    # reduces the concatenation of two isize-dimensional states to a scalar
    # score through an FFN and a Scorer
    def __init__(self, isize, hsize=None, dropout=0.0, use_GeLU=use_adv_act_default):

        super(DATTNCombiner, self).__init__()

        _hsize = isize * 4 if hsize is None else hsize

        if dropout > 0.0:
            self.net = nn.Sequential(
                Linear(isize * 2, _hsize),
                Dropout(dropout, inplace=True),
                GeLU() if use_GeLU else nn.Sigmoid(),
                Scorer(_hsize, bias=False),
            )
        else:
            self.net = nn.Sequential(
                Linear(isize * 2, _hsize),
                GeLU() if use_GeLU else nn.Sigmoid(),
                Scorer(_hsize, bias=False),
            )
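    # A hedged usage sketch, not part of the original excerpt: assuming the
    # combiner scores a pair of states (argument names x1/x2 are hypothetical)
    # by concatenating them on the feature dimension.
    def forward(self, x1, x2):

        return self.net(torch.cat((x1, x2), -1))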
class LSTMCell4RNMT(nn.Module):

    # layer-normalized LSTM cell for RNMT-style recurrent layers
    def __init__(self, isize, osize=None, dropout=0.0, use_GeLU=use_adv_act_default, enable_bias=enable_residual_bias_default):

        super(LSTMCell4RNMT, self).__init__()

        _osize = isize if osize is None else osize

        # layer normalization is also applied to the computation of the hidden
        # state for efficiency; the Linear bias may be disabled since
        # LayerNorm already provides one
        self.trans = Linear(isize + _osize, _osize * 4, bias=enable_bias)
        self.normer = nn.LayerNorm((4, _osize), eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)

        self.act = GeLU() if use_GeLU else nn.Tanh()
        self.drop = Dropout(dropout, inplace=inplace_after_GeLU) if dropout > 0.0 else None

        self.osize = _osize
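    # A hedged sketch of the cell update, not part of the original excerpt:
    # assuming the four normalized projections split into input/forget/output
    # gates plus the candidate hidden state, as in a standard LSTM.
    def forward(self, inpute, state):

        _out, _cell = state

        # project, reshape to (..., 4, osize) and normalize jointly
        _comb = self.normer(self.trans(torch.cat((inpute, _out), -1)).view(*_out.size()[:-1], 4, self.osize))
        (ig, fg, og), hidden = _comb.narrow(-2, 0, 3).sigmoid().unbind(-2), self.act(_comb.select(-2, 3))
        if self.drop is not None:
            hidden = self.drop(hidden)

        _cell = fg * _cell + ig * hidden
        _out = og * _cell

        return _out, _cell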
class GRUCell4RNMT(nn.Module):

    # layer-normalized GRU cell for RNMT-style recurrent layers: 'trans'
    # projects to the two gates, 'transi'/'transh' build the candidate state
    # from the input and the hidden state
    def __init__(self, isize, osize=None, dropout=0.0, use_GeLU=use_adv_act_default, enable_bias=enable_residual_bias_default):

        super(GRUCell4RNMT, self).__init__()

        _osize = isize if osize is None else osize

        self.trans = Linear(isize + _osize, _osize * 2, bias=enable_bias)
        self.transi = Linear(isize, _osize)
        self.transh = Linear(_osize, _osize)

        self.normer = nn.LayerNorm((2, _osize), eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)

        self.act = GeLU() if use_GeLU else nn.Tanh()
        self.drop = Dropout(dropout, inplace=inplace_after_GeLU) if dropout > 0.0 else None

        self.osize = _osize
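    # A hedged sketch of the GRU-style update, not part of the original
    # excerpt: assuming 'trans' yields the reset and update gates and the
    # candidate state comes from the input plus the reset-gated hidden state.
    def forward(self, inpute, state):

        # project, reshape to (..., 2, osize), normalize and squash into gates
        _comb = self.normer(self.trans(torch.cat((inpute, state), -1)).view(*state.size()[:-1], 2, self.osize)).sigmoid()
        ig, fg = _comb.unbind(-2)

        hidden = self.act(self.transi(inpute) + ig * self.transh(state))
        if self.drop is not None:
            hidden = self.drop(hidden)

        # interpolate between the candidate and the previous state
        return (1.0 - fg) * hidden + fg * state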