def __init__(
    self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None
):
    super().__init__()
    img_size = to_2tuple(img_size)
    patch_size = to_2tuple(patch_size)
    patches_resolution = [
        img_size[0] // patch_size[0],
        img_size[1] // patch_size[1],
    ]
    self.img_size = img_size
    self.patch_size = patch_size
    self.patches_resolution = patches_resolution
    self.num_patches = patches_resolution[0] * patches_resolution[1]

    self.in_chans = in_chans
    self.embed_dim = embed_dim

    self.proj = nn.Conv2D(
        in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
    )
    if norm_layer is not None:
        self.norm = norm_layer(embed_dim)
    else:
        self.norm = None
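# Minimal sketch (not part of the original class; names are illustrative) of how a
# patch-embedding module like the one above is typically applied: project with the
# strided conv, flatten the spatial grid into a token sequence, then apply the
# optional norm. Assumes `paddle` is imported as in the surrounding module.
def _patch_embed_forward_sketch(embed, x):
    # x: (B, C, H, W) -> strided conv projection: (B, embed_dim, H/ps, W/ps)
    x = embed.proj(x)
    # flatten the spatial grid and move channels last: (B, num_patches, embed_dim)
    x = x.flatten(2).transpose((0, 2, 1))
    if embed.norm is not None:
        x = embed.norm(x)
    return x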
def __init__(self, backbone, img_size=224, feature_size=None, in_chans=3,
             embed_dim=768):
    super().__init__()
    assert isinstance(backbone, nn.Layer)
    img_size = to_2tuple(img_size)
    self.img_size = img_size
    self.backbone = backbone
    if feature_size is None:
        with paddle.no_grad():
            # Run a dummy forward pass to infer the spatial size and channel
            # count of the backbone's last feature map.
            training = backbone.training
            if training:
                backbone.eval()
            o = self.backbone(
                paddle.zeros((1, in_chans, img_size[0], img_size[1])))[-1]
            feature_size = o.shape[-2:]
            feature_dim = o.shape[1]
            if training:
                # Paddle's Layer.train() takes no mode argument, so only
                # restore train mode if the backbone was originally training.
                backbone.train()
    else:
        feature_size = to_2tuple(feature_size)
        feature_dim = self.backbone.feature_info.channels()[-1]
    self.num_patches = feature_size[0] * feature_size[1]
    self.proj = nn.Conv2D(feature_dim, embed_dim, kernel_size=1)
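# Hypothetical usage sketch (not from the original source): the CNN backbone's last
# feature map is projected to embed_dim with the 1x1 conv and flattened into tokens.
def _hybrid_embed_forward_sketch(embed, x):
    feats = embed.backbone(x)
    if isinstance(feats, (list, tuple)):
        # the backbone returns per-stage feature maps; keep the last one,
        # matching the indexing used in __init__ above
        feats = feats[-1]
    x = embed.proj(feats)                   # (B, embed_dim, H', W')
    x = x.flatten(2).transpose((0, 2, 1))   # (B, H' * W', embed_dim)
    return x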
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
    super().__init__()
    img_size = to_2tuple(img_size)
    patch_size = to_2tuple(patch_size)
    num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
    self.img_size = img_size
    self.patch_size = patch_size
    self.num_patches = num_patches

    self.proj = nn.Conv2D(
        in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
    )
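# Worked example for the defaults above (illustrative only): a 224x224 input with
# 16x16 patches yields a 14x14 token grid, i.e.
#   num_patches = (224 // 16) * (224 // 16) = 14 * 14 = 196,
# and proj maps (B, 3, 224, 224) -> (B, 768, 14, 14) before any flattening.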
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
    super().__init__()
    new_patch_size = to_2tuple(patch_size // 2)

    img_size = to_2tuple(img_size)
    patch_size = to_2tuple(patch_size)
    num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
    self.img_size = img_size
    self.patch_size = patch_size
    self.num_patches = num_patches
    self.embed_dim = embed_dim

    self.conv1 = nn.Conv2D(
        in_chans, 128, kernel_size=7, stride=2, padding=3, bias_attr=False
    )  # 112x112
    self.bn1 = nn.BatchNorm2D(128)
    self.relu = nn.ReLU()
    self.conv2 = nn.Conv2D(
        128, 128, kernel_size=3, stride=1, padding=1, bias_attr=False
    )  # 112x112
    self.bn2 = nn.BatchNorm2D(128)
    self.conv3 = nn.Conv2D(
        128, 128, kernel_size=3, stride=1, padding=1, bias_attr=False
    )
    self.bn3 = nn.BatchNorm2D(128)

    self.proj = nn.Conv2D(
        128, embed_dim, kernel_size=new_patch_size, stride=new_patch_size
    )
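# Spatial-resolution trace for the conv stem above with the default 224x224 input
# (illustrative, not part of the original source):
#   conv1 (7x7, stride 2, pad 3): 224x224 -> 112x112, 128 channels
#   conv2 / conv3 (3x3, stride 1, pad 1): stay at 112x112
#   proj (kernel = stride = new_patch_size = patch_size // 2 = 8): 112x112 -> 14x14
# The overall stride is 2 * 8 = 16, matching patch_size, so the stem produces the
# same 14 * 14 = 196 tokens as a plain 16x16 patch embedding.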
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
    super().__init__()
    img_size = to_2tuple(img_size)
    patch_size = to_2tuple(patch_size)

    self.img_size = img_size
    self.patch_size = patch_size
    assert (
        img_size[0] % patch_size[0] == 0 and img_size[1] % patch_size[1] == 0
    ), f"img_size {img_size} should be divided by patch_size {patch_size}."
    # Note: self.H, self.W and self.num_patches are not used
    # since the image size may change on the fly.
    self.H, self.W = img_size[0] // patch_size[0], img_size[1] // patch_size[1]
    self.num_patches = self.H * self.W
    self.proj = nn.Conv2D(
        in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
    )
    self.norm = nn.LayerNorm(embed_dim)
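# Minimal sketch (assumed, not from the original class) of a forward pass for this
# embedding, which also returns the token grid size since the input resolution may
# change at runtime:
def _pvt_patch_embed_forward_sketch(embed, x):
    x = embed.proj(x)                        # (B, embed_dim, H // ps, W // ps)
    H, W = x.shape[2], x.shape[3]
    x = x.flatten(2).transpose((0, 2, 1))    # (B, H * W, embed_dim)
    x = embed.norm(x)
    return x, (H, W)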
def __init__(
    self,
    dim,
    input_resolution,
    num_heads,
    window_size=7,
    shift_size=0,
    mlp_ratio=4.0,
    qkv_bias=True,
    qk_scale=None,
    drop=0.0,
    attn_drop=0.0,
    drop_path=0.0,
    act_layer=nn.GELU,
    norm_layer=nn.LayerNorm,
):
    super().__init__()
    self.dim = dim
    self.input_resolution = input_resolution
    self.num_heads = num_heads
    self.window_size = window_size
    self.shift_size = shift_size
    self.mlp_ratio = mlp_ratio
    if min(self.input_resolution) <= self.window_size:
        # if window size is larger than input resolution, we don't partition windows
        self.shift_size = 0
        self.window_size = min(self.input_resolution)
    assert (
        0 <= self.shift_size < self.window_size
    ), "shift_size must be in [0, window_size)"

    self.norm1 = norm_layer(dim)
    self.attn = WindowAttention(
        dim,
        window_size=to_2tuple(self.window_size),
        num_heads=num_heads,
        qkv_bias=qkv_bias,
        qk_scale=qk_scale,
        attn_drop=attn_drop,
        proj_drop=drop,
    )
    self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
    self.norm2 = norm_layer(dim)
    mlp_hidden_dim = int(dim * mlp_ratio)
    self.mlp = Mlp(
        in_features=dim,
        hidden_features=mlp_hidden_dim,
        act_layer=act_layer,
        drop=drop,
    )

    if self.shift_size > 0:
        # calculate attention mask for SW-MSA
        H, W = self.input_resolution
        img_mask = paddle.zeros((1, H, W, 1))  # 1 H W 1
        h_slices = (
            slice(0, -self.window_size),
            slice(-self.window_size, -self.shift_size),
            slice(-self.shift_size, None),
        )
        w_slices = (
            slice(0, -self.window_size),
            slice(-self.window_size, -self.shift_size),
            slice(-self.shift_size, None),
        )
        cnt = 0
        for h in h_slices:
            for w in w_slices:
                img_mask[:, h, w, :] = cnt
                cnt += 1

        # nW, window_size, window_size, 1
        mask_windows = window_partition(img_mask, self.window_size)
        mask_windows = mask_windows.reshape(
            (-1, self.window_size * self.window_size)
        )
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        _h = paddle.full_like(attn_mask, -100.0, dtype="float32")
        _z = paddle.full_like(attn_mask, 0.0, dtype="float32")
        attn_mask = paddle.where(attn_mask != 0, _h, _z)
    else:
        attn_mask = None

    self.register_buffer("attn_mask", attn_mask)
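# Self-contained sketch (names and default sizes are assumptions, not the repo's
# actual helpers) of how the SW-MSA attention mask above is built: region labels
# are painted onto the image according to the shift, windows are partitioned, and
# any pair of positions with different labels gets -100 so it is suppressed after
# softmax. Example sizes: H = W = 8, window_size = 4, shift_size = 2.
def _build_attn_mask_sketch(H=8, W=8, window_size=4, shift_size=2):
    def window_partition_sketch(x, ws):
        # (1, H, W, 1) -> (num_windows, ws, ws, 1)
        B, H_, W_, C = x.shape
        x = x.reshape((B, H_ // ws, ws, W_ // ws, ws, C))
        return x.transpose((0, 1, 3, 2, 4, 5)).reshape((-1, ws, ws, C))

    img_mask = paddle.zeros((1, H, W, 1))
    h_slices = (
        slice(0, -window_size),
        slice(-window_size, -shift_size),
        slice(-shift_size, None),
    )
    w_slices = h_slices
    cnt = 0
    for h in h_slices:
        for w in w_slices:
            img_mask[:, h, w, :] = cnt  # label each of the 9 shifted regions
            cnt += 1

    mask_windows = window_partition_sketch(img_mask, window_size)
    mask_windows = mask_windows.reshape((-1, window_size * window_size))
    attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
    # nonzero difference => the two tokens come from different regions => mask out
    return paddle.where(
        attn_mask != 0,
        paddle.full_like(attn_mask, -100.0),
        paddle.full_like(attn_mask, 0.0),
    )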