Example #1
 def __init__(self, dim, in_dim, num_pixel, num_heads=12, in_num_head=4, mlp_ratio=4.,
         qkv_bias=False, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
     super().__init__()
     # Inner transformer
     self.norm_in = norm_layer(in_dim)
     self.attn_in = Attention(
         in_dim, in_dim, num_heads=in_num_head, qkv_bias=qkv_bias,
         attn_drop=attn_drop, proj_drop=drop)
     
     self.norm_mlp_in = norm_layer(in_dim)
     self.mlp_in = Mlp(in_features=in_dim, hidden_features=int(in_dim * 4),
         out_features=in_dim, act_layer=act_layer, drop=drop)
     
     self.norm1_proj = norm_layer(in_dim)
     self.proj = nn.Linear(in_dim * num_pixel, dim, bias=True)
     # Outer transformer
     self.norm_out = norm_layer(dim)
     self.attn_out = Attention(
         dim, dim, num_heads=num_heads, qkv_bias=qkv_bias,
         attn_drop=attn_drop, proj_drop=drop)
     self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
     
     self.norm_mlp = norm_layer(dim)
     self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio),
         out_features=dim, act_layer=act_layer, drop=drop)
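
The class around this constructor is not shown, so the forward pass below is only a sketch of the usual Transformer-in-Transformer wiring implied by these modules: inner attention and MLP over the pixel embeddings, a projection that folds them back into the patch embeddings (skipping the class token), then the outer attention and MLP. The tensor shapes and the torch.cat handling of the class token are assumptions, not code from the source.

 def forward(self, pixel_embed, patch_embed):
     # Inner transformer: pre-norm residual attention + MLP over pixel embeddings
     pixel_embed = pixel_embed + self.drop_path(self.attn_in(self.norm_in(pixel_embed)))
     pixel_embed = pixel_embed + self.drop_path(self.mlp_in(self.norm_mlp_in(pixel_embed)))
     # Fold pixel embeddings back into the patch embeddings, leaving the class token untouched
     B, N, C = patch_embed.size()
     patch_embed = torch.cat(
         [patch_embed[:, 0:1],
          patch_embed[:, 1:] + self.proj(self.norm1_proj(pixel_embed).reshape(B, N - 1, -1))],
         dim=1)
     # Outer transformer: pre-norm residual attention + MLP over patch embeddings
     patch_embed = patch_embed + self.drop_path(self.attn_out(self.norm_out(patch_embed)))
     patch_embed = patch_embed + self.drop_path(self.mlp(self.norm_mlp(patch_embed)))
     return pixel_embed, patch_embed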
Example #2
    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, num_tokens=197):
        super().__init__()

        # First stage: channel-mixing MLP applied over the embedding dimension
        self.mlp1 = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
        self.norm1 = norm_layer(dim)

        # Second stage: token-mixing MLP applied over the token dimension (num_tokens)
        self.mlp2 = Mlp(in_features=num_tokens, hidden_features=int(
            num_tokens * mlp_ratio), act_layer=act_layer, drop=drop)
        self.norm2 = norm_layer(num_tokens)

        # Stochastic depth (drop path); identity when drop_path == 0
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
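
A hedged sketch of how this two-stage block would typically be applied: mlp1 mixes channels at each token, while mlp2 mixes tokens at each channel, so the input is transposed before norm2/mlp2 (whose feature size is num_tokens) and transposed back afterwards. The residual and drop-path wiring is an assumption based on the constructor, not code from the source.

    def forward(self, x):
        # x: (batch, num_tokens, dim), assumed layout
        # First stage: channel mixing
        x = x + self.drop_path(self.mlp1(self.norm1(x)))
        # Second stage: token mixing; transpose so norm2/mlp2 see num_tokens as the last dim
        x = x + self.drop_path(self.mlp2(self.norm2(x.transpose(1, 2))).transpose(1, 2))
        return x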
Example #3
 def __init__(self,
              dim,
              num_heads,
              mlp_ratio=4.,
              qkv_bias=False,
              qk_scale=None,
              drop=0.,
              attn_drop=0.,
              drop_path=0.,
              act_layer=nn.GELU,
              norm_layer=nn.LayerNorm,
              rpe_config=None):
     super().__init__()
     self.norm1 = norm_layer(dim)
     self.attn = RPEAttention(dim,
                              num_heads=num_heads,
                              qkv_bias=qkv_bias,
                              qk_scale=qk_scale,
                              attn_drop=attn_drop,
                              proj_drop=drop,
                              rpe_config=rpe_config)
     # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
     self.drop_path = DropPath(
         drop_path) if drop_path > 0. else nn.Identity()
     self.norm2 = norm_layer(dim)
     mlp_hidden_dim = int(dim * mlp_ratio)
     self.mlp = Mlp(in_features=dim,
                    hidden_features=mlp_hidden_dim,
                    act_layer=act_layer,
                    drop=drop)
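
This constructor follows the standard pre-norm ViT block layout (norm, relative-position-enhanced attention, drop path, norm, MLP), so the forward pass is presumably the usual two residual branches. The sketch below is an assumption, not code from the source.

 def forward(self, x):
     # Assumed standard pre-norm residual wiring
     x = x + self.drop_path(self.attn(self.norm1(x)))
     x = x + self.drop_path(self.mlp(self.norm2(x)))
     return x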
Example #4
    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_paths=[0.], act_layer=nn.GELU, norm_layer=nn.LayerNorm, rpe_config=None, repeated_times=1, use_transform=False):
        super().__init__()
        assert len(drop_paths) == repeated_times

        if repeated_times > 1:
            self.norm1 = RepeatedModuleList([norm_layer(dim) for _ in range(repeated_times)], repeated_times)
            self.norm2 = RepeatedModuleList([norm_layer(dim) for _ in range(repeated_times)], repeated_times)
        else:
            # Assumed fallback so norm1/norm2 also exist when the block is not repeated
            self.norm1 = norm_layer(dim)
            self.norm2 = norm_layer(dim)

        self.attn = MiniAttention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, rpe_config=rpe_config,
            repeated_times=repeated_times,
            use_transform=use_transform)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_paths = nn.ModuleList([DropPath(drop_path) if drop_path > 0. else nn.Identity() for drop_path in drop_paths])
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
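
How the repeated sub-modules are consumed is not shown in this snippet. One plausible reading is that the block is executed several times with shared attention/MLP weights while indexing into the per-repetition norms and drop paths; the sketch below, including the hypothetical block_idx argument, only illustrates that reading and assumes RepeatedModuleList can be indexed like an nn.ModuleList.

    def forward(self, x, block_idx=0):
        # Hypothetical wiring: shared attn/mlp weights, per-repetition norm and drop path
        norm1 = self.norm1[block_idx] if isinstance(self.norm1, nn.ModuleList) else self.norm1
        norm2 = self.norm2[block_idx] if isinstance(self.norm2, nn.ModuleList) else self.norm2
        x = x + self.drop_paths[block_idx](self.attn(norm1(x)))
        x = x + self.drop_paths[block_idx](self.mlp(norm2(x)))
        return x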
Example #5
    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 eta=None,
                 tokens_norm=False):
        super().__init__()
        self.norm1 = norm_layer(dim)

        self.attn = ClassAttention(dim,
                                   num_heads=num_heads,
                                   qkv_bias=qkv_bias,
                                   qk_scale=qk_scale,
                                   attn_drop=attn_drop,
                                   proj_drop=drop)

        self.drop_path = DropPath(
            drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim,
                       hidden_features=mlp_hidden_dim,
                       act_layer=act_layer,
                       drop=drop)

        if eta is not None:  # LayerScale Initialization (no layerscale when None)
            self.gamma1 = nn.Parameter(eta * torch.ones(dim),
                                       requires_grad=True)
            self.gamma2 = nn.Parameter(eta * torch.ones(dim),
                                       requires_grad=True)
        else:
            self.gamma1, self.gamma2 = 1.0, 1.0

        # FIXME: A hack for models pre-trained with layernorm over all the tokens not just the CLS
        self.tokens_norm = tokens_norm
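
A hedged sketch of the forward pass this constructor suggests: class attention updates the full sequence scaled by gamma1, norm2 is applied either to all tokens or only to the CLS token depending on tokens_norm, and the MLP branch (scaled by gamma2) touches only the CLS token while the other tokens pass through. This mirrors common class-attention blocks but is an assumption, not code from the source.

    def forward(self, x):
        # Class-attention branch over the whole sequence, scaled by gamma1 (LayerScale)
        x = x + self.drop_path(self.gamma1 * self.attn(self.norm1(x)))
        # Normalize all tokens or only the CLS token (see the FIXME above)
        if self.tokens_norm:
            x = self.norm2(x)
        else:
            x = torch.cat([self.norm2(x[:, 0:1]), x[:, 1:]], dim=1)
        # MLP branch on the CLS token only, scaled by gamma2; other tokens are passed through
        cls_token = x[:, 0:1] + self.drop_path(self.gamma2 * self.mlp(x[:, 0:1]))
        return torch.cat([cls_token, x[:, 1:]], dim=1)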
Example #6
 def __init__(self,
              dim,
              drop=0.,
              drop_path=0.,
              act_layer=nn.GELU,
              init_values=1e-4,
              num_patches=196):
     super().__init__()
     self.norm1 = Affine(dim)
     # Cross-patch sublayer: a plain linear layer over the patch dimension in place of attention
     self.attn = nn.Linear(num_patches, num_patches)
     self.drop_path = DropPath(
         drop_path) if drop_path > 0. else nn.Identity()
     self.norm2 = Affine(dim)
     self.mlp = Mlp(in_features=dim,
                    hidden_features=int(4.0 * dim),
                    act_layer=act_layer,
                    drop=drop)
     self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)),
                                 requires_grad=True)
     self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)),
                                 requires_grad=True)
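
A hedged sketch of the corresponding ResMLP-style forward pass: the nn.Linear stored as attn mixes patches, so the input is transposed to put the patch dimension last and transposed back, and each residual branch is scaled by its learned gamma. This is inferred from the constructor, not code from the source.

 def forward(self, x):
     # x: (batch, num_patches, dim), assumed layout
     # Cross-patch sublayer: linear over the patch dimension, scaled by gamma_1
     x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x).transpose(1, 2)).transpose(1, 2))
     # Cross-channel sublayer: MLP over the embedding dimension, scaled by gamma_2
     x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
     return x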
Example #7
    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 eta=None):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = XCA(dim,
                        num_heads=num_heads,
                        qkv_bias=qkv_bias,
                        qk_scale=qk_scale,
                        attn_drop=attn_drop,
                        proj_drop=drop)
        self.drop_path = DropPath(
            drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)

        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim,
                       hidden_features=mlp_hidden_dim,
                       act_layer=act_layer,
                       drop=drop)

        self.norm3 = norm_layer(dim)
        self.local_mp = LPI(in_features=dim, act_layer=act_layer)

        # LayerScale parameters; eta must be supplied by the caller (there is no None fallback here)
        self.gamma1 = nn.Parameter(eta * torch.ones(dim), requires_grad=True)
        self.gamma2 = nn.Parameter(eta * torch.ones(dim), requires_grad=True)
        self.gamma3 = nn.Parameter(eta * torch.ones(dim), requires_grad=True)
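
A hedged sketch of the forward pass implied by this constructor: cross-covariance attention, a local patch interaction branch, and the MLP, each pre-normed, residual, and scaled by its LayerScale parameter. Whether local_mp needs the spatial extent (H, W) depends on the LPI implementation, so its call signature here is an assumption.

    def forward(self, x, H, W):
        # Cross-covariance attention branch
        x = x + self.drop_path(self.gamma1 * self.attn(self.norm1(x)))
        # Local patch interaction branch (assumes LPI takes the spatial size H, W)
        x = x + self.drop_path(self.gamma3 * self.local_mp(self.norm3(x), H, W))
        # Feed-forward branch
        x = x + self.drop_path(self.gamma2 * self.mlp(self.norm2(x)))
        return x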