def __init__(self, dim, in_dim, num_pixel, num_heads=12, in_num_head=4, mlp_ratio=4.,
             qkv_bias=False, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU,
             norm_layer=nn.LayerNorm):
    super().__init__()
    # Inner transformer: operates on the pixel (sub-patch) embeddings
    self.norm_in = norm_layer(in_dim)
    self.attn_in = Attention(
        in_dim, in_dim, num_heads=in_num_head, qkv_bias=qkv_bias,
        attn_drop=attn_drop, proj_drop=drop)

    self.norm_mlp_in = norm_layer(in_dim)
    self.mlp_in = Mlp(in_features=in_dim, hidden_features=int(in_dim * 4),
                      out_features=in_dim, act_layer=act_layer, drop=drop)

    # Projection of the flattened pixel embeddings into the patch embedding space
    self.norm1_proj = norm_layer(in_dim)
    self.proj = nn.Linear(in_dim * num_pixel, dim, bias=True)

    # Outer transformer: operates on the patch embeddings
    self.norm_out = norm_layer(dim)
    self.attn_out = Attention(
        dim, dim, num_heads=num_heads, qkv_bias=qkv_bias,
        attn_drop=attn_drop, proj_drop=drop)
    self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()

    self.norm_mlp = norm_layer(dim)
    self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio),
                   out_features=dim, act_layer=act_layer, drop=drop)
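# Hedged sketch (not part of this file): one way the TNT-style block above is typically
# applied in forward(). The tensor names `pixel_embed` / `patch_embed`, the pre-norm
# residual pattern, and the class-token offset in the projection are assumptions.
def forward(self, pixel_embed, patch_embed):
    # Inner transformer refines the pixel-level embeddings
    pixel_embed = pixel_embed + self.drop_path(self.attn_in(self.norm_in(pixel_embed)))
    pixel_embed = pixel_embed + self.drop_path(self.mlp_in(self.norm_mlp_in(pixel_embed)))
    # Flattened pixel embeddings are projected and added to the patch embeddings
    # (assumed to skip the class token at index 0)
    B, N, C = patch_embed.shape
    patch_embed = torch.cat([
        patch_embed[:, 0:1],
        patch_embed[:, 1:] + self.proj(self.norm1_proj(pixel_embed).reshape(B, N - 1, -1)),
    ], dim=1)
    # Outer transformer on the patch embeddings
    patch_embed = patch_embed + self.drop_path(self.attn_out(self.norm_out(patch_embed)))
    patch_embed = patch_embed + self.drop_path(self.mlp(self.norm_mlp(patch_embed)))
    return pixel_embed, patch_embed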
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0.,
             attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm,
             num_tokens=197):
    super().__init__()
    # First stage: channel-mixing MLP over the embedding dimension
    self.mlp1 = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio),
                    act_layer=act_layer, drop=drop)
    self.norm1 = norm_layer(dim)
    # Second stage: token-mixing MLP over the token dimension
    self.mlp2 = Mlp(in_features=num_tokens, hidden_features=int(num_tokens * mlp_ratio),
                    act_layer=act_layer, drop=drop)
    self.norm2 = norm_layer(num_tokens)
    # Stochastic depth: randomly drops the whole residual branch during training
    self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
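# Hedged sketch (not part of this file): a plausible forward() for the MLP block above.
# The ordering of the two stages and the transpose-based token mixing are assumptions
# inferred from the module shapes (norm2 and mlp2 act on the token dimension).
def forward(self, x):
    # Channel mixing: input is (B, N, C), MLP applied to the last (channel) dimension
    x = x + self.drop_path(self.mlp1(self.norm1(x)))
    # Token mixing: transpose to (B, C, N) so the MLP mixes across tokens, then transpose back
    x = x + self.drop_path(self.mlp2(self.norm2(x.transpose(1, 2))).transpose(1, 2))
    return x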
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0.,
             attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm,
             rpe_config=None):
    super().__init__()
    self.norm1 = norm_layer(dim)
    # Self-attention with relative position encoding, configured via rpe_config
    self.attn = RPEAttention(
        dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
        attn_drop=attn_drop, proj_drop=drop, rpe_config=rpe_config)
    # NOTE: drop path for stochastic depth; we shall see if this is better than dropout here
    self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
    self.norm2 = norm_layer(dim)
    mlp_hidden_dim = int(dim * mlp_ratio)
    self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim,
                   act_layer=act_layer, drop=drop)
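# Hedged sketch (not part of this file): the standard pre-norm residual forward()
# that the block above appears to implement; the exact composition is an assumption.
def forward(self, x):
    # Attention branch, then MLP branch, each pre-normalized with a residual connection
    x = x + self.drop_path(self.attn(self.norm1(x)))
    x = x + self.drop_path(self.mlp(self.norm2(x)))
    return x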
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0.,
             attn_drop=0., drop_paths=[0.], act_layer=nn.GELU, norm_layer=nn.LayerNorm,
             rpe_config=None, repeated_times=1, use_transform=False):
    super().__init__()
    assert len(drop_paths) == repeated_times
    # When the block's weights are shared across repeats, each repetition gets its own norm layers
    if repeated_times > 1:
        self.norm1 = RepeatedModuleList([norm_layer(dim) for _ in range(repeated_times)], repeated_times)
        self.norm2 = RepeatedModuleList([norm_layer(dim) for _ in range(repeated_times)], repeated_times)
    else:
        # Fallback: a single, unshared block uses plain norm layers
        self.norm1 = norm_layer(dim)
        self.norm2 = norm_layer(dim)
    self.attn = MiniAttention(
        dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
        attn_drop=attn_drop, proj_drop=drop, rpe_config=rpe_config,
        repeated_times=repeated_times, use_transform=use_transform)
    # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
    self.drop_paths = nn.ModuleList([DropPath(drop_path) if drop_path > 0. else nn.Identity()
                                     for drop_path in drop_paths])
    mlp_hidden_dim = int(dim * mlp_ratio)
    self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim,
                   act_layer=act_layer, drop=drop)
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0.,
             attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm,
             eta=None, tokens_norm=False):
    super().__init__()
    self.norm1 = norm_layer(dim)
    self.attn = ClassAttention(
        dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
        attn_drop=attn_drop, proj_drop=drop)
    self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
    self.norm2 = norm_layer(dim)
    mlp_hidden_dim = int(dim * mlp_ratio)
    self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim,
                   act_layer=act_layer, drop=drop)

    if eta is not None:
        # LayerScale initialization (no LayerScale when eta is None)
        self.gamma1 = nn.Parameter(eta * torch.ones(dim), requires_grad=True)
        self.gamma2 = nn.Parameter(eta * torch.ones(dim), requires_grad=True)
    else:
        self.gamma1, self.gamma2 = 1.0, 1.0

    # FIXME: a hack for models pre-trained with layernorm over all the tokens, not just the CLS token
    self.tokens_norm = tokens_norm
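# Hedged sketch (not part of this file): one way the class-attention block above is
# typically composed. The handling of the patch tokens in the second residual and the
# exact effect of `tokens_norm` are assumptions; only the CLS token is updated by the MLP.
def forward(self, x):
    # Class attention with optional LayerScale (gamma1 is the float 1.0 when eta is None)
    x = x + self.drop_path(self.gamma1 * self.attn(self.norm1(x)))
    # tokens_norm controls whether norm2 covers all tokens or only the CLS token
    if self.tokens_norm:
        x = self.norm2(x)
    else:
        x = torch.cat([self.norm2(x[:, 0:1]), x[:, 1:]], dim=1)
    # MLP applied to the CLS token only, again with LayerScale and a residual connection
    cls_token = x[:, 0:1] + self.drop_path(self.gamma2 * self.mlp(x[:, 0:1]))
    x = torch.cat([cls_token, x[:, 1:]], dim=1)
    return x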
def __init__(self, dim, drop=0., drop_path=0., act_layer=nn.GELU, init_values=1e-4,
             num_patches=196):
    super().__init__()
    self.norm1 = Affine(dim)
    # Token mixing: a single linear layer applied across the patch dimension
    self.attn = nn.Linear(num_patches, num_patches)
    self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
    self.norm2 = Affine(dim)
    self.mlp = Mlp(in_features=dim, hidden_features=int(4.0 * dim),
                   act_layer=act_layer, drop=drop)
    # LayerScale parameters, initialized to a small constant
    self.gamma_1 = nn.Parameter(init_values * torch.ones(dim), requires_grad=True)
    self.gamma_2 = nn.Parameter(init_values * torch.ones(dim), requires_grad=True)
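# Hedged sketch (not part of this file): a ResMLP-style forward() for the block above,
# assuming the linear token mixer is applied across patches via a transpose.
def forward(self, x):
    # Token mixing on (B, C, N), scaled by gamma_1
    x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x).transpose(1, 2)).transpose(1, 2))
    # Channel MLP, scaled by gamma_2
    x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
    return x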
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0.,
             attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm,
             eta=None):
    super().__init__()
    self.norm1 = norm_layer(dim)
    # Cross-covariance attention: attention computed over the channel dimension
    self.attn = XCA(dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
                    attn_drop=attn_drop, proj_drop=drop)
    self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
    self.norm2 = norm_layer(dim)
    mlp_hidden_dim = int(dim * mlp_ratio)
    self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim,
                   act_layer=act_layer, drop=drop)

    self.norm3 = norm_layer(dim)
    # Local patch interaction (LPI) branch
    self.local_mp = LPI(in_features=dim, act_layer=act_layer)

    # LayerScale parameters; eta is expected to be a small float (the default None would fail here)
    self.gamma1 = nn.Parameter(eta * torch.ones(dim), requires_grad=True)
    self.gamma2 = nn.Parameter(eta * torch.ones(dim), requires_grad=True)
    self.gamma3 = nn.Parameter(eta * torch.ones(dim), requires_grad=True)
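# Hedged sketch (not part of this file): an XCiT-style forward() for the block above.
# It assumes the local patch interaction module takes the spatial resolution (H, W);
# that call signature is an assumption about the custom LPI class.
def forward(self, x, H, W):
    # Cross-covariance attention over channels, with LayerScale
    x = x + self.drop_path(self.gamma1 * self.attn(self.norm1(x)))
    # Local patch interaction over the H x W token grid, with LayerScale
    x = x + self.drop_path(self.gamma3 * self.local_mp(self.norm3(x), H, W))
    # Feed-forward MLP, with LayerScale
    x = x + self.drop_path(self.gamma2 * self.mlp(self.norm2(x)))
    return x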