def __init__(self, dim, in_dim, num_pixel, num_heads=12, in_num_head=4, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): super().__init__() # Inner transformer self.norm_in = norm_layer(in_dim) self.attn_in = Attention( in_dim, in_dim, num_heads=in_num_head, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) self.norm_mlp_in = norm_layer(in_dim) self.mlp_in = Mlp(in_features=in_dim, hidden_features=int(in_dim * 4), out_features=in_dim, act_layer=act_layer, drop=drop) self.norm1_proj = norm_layer(in_dim) self.proj = nn.Linear(in_dim * num_pixel, dim, bias=True) # Outer transformer self.norm_out = norm_layer(dim) self.attn_out = Attention( dim, dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() self.norm_mlp = norm_layer(dim) self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), out_features=dim, act_layer=act_layer, drop=drop)
def __init__(self, dim, num_heads, head_dim=None, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): super().__init__() self.norm1 = norm_layer(dim) self.attn = ClassAttention(dim, num_heads=num_heads, head_dim=head_dim, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) # NOTE: drop path for stochastic depth self.drop_path = DropPath( drop_path) if drop_path > 0. else nn.Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): super().__init__() self.norm1 = norm_layer(dim) self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop) # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here self.drop_path = DropPath( drop_path) if drop_path > 0. else nn.Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer)
def __init__(self, dim, kernel_size, padding, stride=1, num_heads=1, mlp_ratio=3., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, qkv_bias=False): super().__init__() self.norm1 = norm_layer(dim) self.attn = OutlookAttention(dim, num_heads, kernel_size=kernel_size, padding=padding, stride=stride, qkv_bias=qkv_bias, attn_drop=attn_drop) self.drop_path = DropPath( drop_path) if drop_path > 0. else nn.Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer)
def __init__(self, dims, num_heads, mlp_ratios=[], qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, shared_cpes=None, shared_crpes=None): super().__init__() # Conv-Attention. self.cpes = shared_cpes self.norm12 = norm_layer(dims[1]) self.norm13 = norm_layer(dims[2]) self.norm14 = norm_layer(dims[3]) self.factoratt_crpe2 = FactorAtt_ConvRelPosEnc( dims[1], num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, shared_crpe=shared_crpes[1]) self.factoratt_crpe3 = FactorAtt_ConvRelPosEnc( dims[2], num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, shared_crpe=shared_crpes[2]) self.factoratt_crpe4 = FactorAtt_ConvRelPosEnc( dims[3], num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, shared_crpe=shared_crpes[3]) self.drop_path = DropPath( drop_path) if drop_path > 0. else nn.Identity() # MLP. self.norm22 = norm_layer(dims[1]) self.norm23 = norm_layer(dims[2]) self.norm24 = norm_layer(dims[3]) # In parallel block, we assume dimensions are the same and share the linear transformation. assert dims[1] == dims[2] == dims[3] assert mlp_ratios[1] == mlp_ratios[2] == mlp_ratios[3] mlp_hidden_dim = int(dims[1] * mlp_ratios[1]) self.mlp2 = self.mlp3 = self.mlp4 = Mlp(in_features=dims[1], hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, shared_cpe=None, shared_crpe=None): super().__init__() # Conv-Attention. self.cpe = shared_cpe self.norm1 = norm_layer(dim) self.factoratt_crpe = FactorAtt_ConvRelPosEnc(dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, shared_crpe=shared_crpe) self.drop_path = DropPath( drop_path) if drop_path > 0. else nn.Identity() # MLP. self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)