Example #1
    def __init__(self,
                 dim,
                 in_dim,
                 head_cnt=1,
                 kernel_ratio=0.5,
                 dp1=0.1,
                 dp2=0.1):
        super().__init__()
        self.emb = in_dim * head_cnt  # head_cnt is 1 in practice, so emb == in_dim
        self.kqv = nn.Linear(dim, 3 * self.emb)
        self.dp = nn.Dropout(dp1)
        self.proj = nn.Linear(self.emb, self.emb)
        self.head_cnt = head_cnt
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(self.emb)
        self.epsilon = 1e-8  # for numerical stability in the division

        self.mlp = nn.Sequential(
            nn.Linear(self.emb, self.emb),
            nn.GELU(),
            nn.Linear(self.emb, self.emb),
            nn.Dropout(dp2),
        )

        self.m = int(self.emb * kernel_ratio)
        self.w = paddle.randn((self.m, self.emb))

        self.w = add_parameter(self, orthogonal_(self.w) * math.sqrt(self.m))
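
These snippets rely on project helpers that are not shown (`add_parameter`, `trunc_normal_`, `orthogonal_`). Below is a minimal sketch of `add_parameter`, assuming Paddle's standard `create_parameter` API; the upstream helper may differ in detail.

import paddle
import paddle.nn as nn


def add_parameter(layer, datas, name=None):
    # Wrap an initial tensor in a trainable parameter registered on `layer`.
    parameter = layer.create_parameter(
        shape=datas.shape,
        default_initializer=nn.initializer.Assign(datas))
    if name is not None:
        layer.add_parameter(name, parameter)
    return parameter
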
Example #2
    def __init__(self, in_planes, out_channels, groups=1, bias=True):
        super(GroupLinear, self).__init__()
        assert in_planes % groups == 0
        assert out_channels % groups == 0
        self.in_dim = in_planes
        self.out_dim = out_channels
        self.groups = groups
        self.group_in_dim = self.in_dim // self.groups
        self.group_out_dim = self.out_dim // self.groups

        self.group_weight = add_parameter(
            self,
            paddle.zeros((self.groups, self.group_in_dim, self.group_out_dim)))

        if bias:
            self.group_bias = add_parameter(self, paddle.zeros(
                (self.out_dim, )))
        else:
            self.group_bias = None
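
For context, a forward pass for this GroupLinear could look like the sketch below. The einsum formulation and the [batch, seq_len, in_dim] input shape are assumptions, not the upstream implementation (requires paddle.einsum, available since Paddle 2.3).

import paddle


def group_linear_forward(layer, x):
    # x: [batch, seq_len, in_dim]. Split the features into `groups` chunks,
    # apply one weight matrix per group, then concatenate the results.
    b, t, _ = x.shape
    x = x.reshape((b, t, layer.groups, layer.group_in_dim))
    out = paddle.einsum('btgi,gio->btgo', x, layer.group_weight)
    out = out.reshape((b, t, layer.out_dim))
    if layer.group_bias is not None:
        out = out + layer.group_bias
    return out
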
Example #3
    def __init__(self, *args, **kwargs):
        super(DistilledPoolingTransformer, self).__init__(*args, **kwargs)
        self.cls_token = add_parameter(
            self, paddle.randn((1, 2, self.base_dims[0] * self.heads[0])))

        if self.class_dim > 0:
            self.head_dist = nn.Linear(self.base_dims[-1] * self.heads[-1],
                                       self.class_dim)
            self.head_dist.apply(self._init_weights)

        trunc_normal_(self.cls_token)
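
A distilled model like this typically combines its two heads at inference; a hedged sketch in the DeiT style (`distilled_logits` and the averaging are assumptions, not the upstream forward):

def distilled_logits(model, cls_embed):
    # cls_embed: [B, 2, C] -- the class and distillation tokens.
    x_cls = model.head(cls_embed[:, 0])        # classification-token head
    x_dist = model.head_dist(cls_embed[:, 1])  # distillation-token head
    return (x_cls + x_dist) / 2                # averaged at inference
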
Example #4
    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.0,
        qkv_bias=False,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
        epsilon=1e-6,
        Attention_block=Attention_talking_head,
        Mlp_block=Mlp,
        init_values=1e-4,
    ):
        super().__init__()
        self.norm1 = norm_layer(dim, epsilon=epsilon)
        self.attn = Attention_block(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
        self.norm2 = norm_layer(dim, epsilon=epsilon)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp_block(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
        )
        self.gamma_1 = add_parameter(self, init_values * paddle.ones((dim, )))
        self.gamma_2 = add_parameter(self, init_values * paddle.ones((dim, )))
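
gamma_1 and gamma_2 implement LayerScale: each residual branch is scaled by a small learnable per-channel factor (initialized to init_values). A sketch of the matching forward, assuming this block follows the usual CaiT pattern:

    def forward(self, x):
        # Scale each residual branch by its learnable per-channel factor.
        x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x)))
        x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
        return x
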
Example #5
    def __init__(self,
                 img_size=224,
                 patch_size=16,
                 embed_dim=768,
                 depth=12,
                 num_heads=12,
                 mlp_ratio=4,
                 qkv_bias=False,
                 norm_layer=nn.LayerNorm,
                 epsilon=1e-5,
                 class_dim=1000,
                 **kwargs):
        super().__init__(img_size=img_size,
                         patch_size=patch_size,
                         class_dim=class_dim,
                         embed_dim=embed_dim,
                         depth=depth,
                         num_heads=num_heads,
                         mlp_ratio=mlp_ratio,
                         qkv_bias=qkv_bias,
                         norm_layer=norm_layer,
                         epsilon=epsilon,
                         **kwargs)

        self.pos_embed = add_parameter(
            self,
            paddle.zeros(
                (1, self.patch_embed.num_patches + 2, self.embed_dim)))
        self.dist_token = add_parameter(self,
                                        paddle.zeros((1, 1, self.embed_dim)))

        if class_dim > 0:
            self.head_dist = nn.Linear(self.embed_dim, self.class_dim)
            self.head_dist.apply(self._init_weights)

        trunc_normal_(self.dist_token)
        trunc_normal_(self.pos_embed)
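
The num_patches + 2 in pos_embed leaves room for a class token plus the distillation token. A sketch of the implied token layout (`prepare_tokens` is a hypothetical name):

def prepare_tokens(model, x):
    # x: [B, N, C] patch embeddings. Prepend class + distillation tokens so
    # the sequence length matches pos_embed's N + 2 slots.
    cls = model.cls_token.expand((x.shape[0], -1, -1))
    dist = model.dist_token.expand((x.shape[0], -1, -1))
    x = paddle.concat([cls, dist, x], axis=1)
    return model.pos_drop(x + model.pos_embed)
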
Example #6
    def __init__(
        self,
        dim,
        window_size,
        num_heads,
        qkv_bias=True,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
    ):

        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        # define a parameter table of relative position bias
        self.relative_position_bias_table = add_parameter(
            self,
            paddle.zeros(
                ((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
            ),
        )

        # get pair-wise relative position index for each token inside the window
        coords_h = paddle.arange(self.window_size[0])
        coords_w = paddle.arange(self.window_size[1])
        coords = paddle.stack(paddle.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = paddle.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten.unsqueeze(-1) - coords_flatten.unsqueeze(
            1
        )  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.transpose((1, 2, 0))  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1

        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table)
        self.softmax = nn.Softmax(axis=-1)
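
In the forward pass, the precomputed relative_position_index gathers rows of the learned bias table to form a per-head additive attention bias. A minimal sketch (the function name is an assumption):

def relative_position_bias(attn_module):
    # Build the (num_heads, Wh*Ww, Wh*Ww) bias added to the attention logits.
    n = attn_module.window_size[0] * attn_module.window_size[1]
    index = attn_module.relative_position_index.reshape((-1,))
    bias = paddle.index_select(attn_module.relative_position_bias_table, index)
    return bias.reshape((n, n, -1)).transpose((2, 0, 1))
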
Example #7
    def __init__(
        self,
        img_size=224,
        tokens_type="performer",
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        qkv_bias=False,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer=nn.LayerNorm,
        token_dim=64,
        class_dim=1000,
    ):
        super().__init__()
        self.class_dim = class_dim
        # num_features for consistency with other models
        self.num_features = self.embed_dim = embed_dim

        self.tokens_to_token = T2T_Layer(
            img_size=img_size,
            tokens_type=tokens_type,
            in_chans=in_chans,
            embed_dim=embed_dim,
            token_dim=token_dim,
        )

        num_patches = self.tokens_to_token.num_patches

        self.cls_token = add_parameter(self, paddle.zeros((1, 1, embed_dim)))
        self.pos_embed = add_parameter(
            self,
            get_sinusoid_encoding(n_position=num_patches + 1, d_hid=embed_dim))
        self.pos_drop = nn.Dropout(p=drop_rate)

        dpr = np.linspace(0, drop_path_rate,
                          depth)  # stochastic depth decay rule
        self.blocks = nn.LayerList([
            Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[i],
                norm_layer=norm_layer,
            ) for i in range(depth)
        ])
        self.norm = norm_layer(embed_dim)

        # Classifier head
        if class_dim > 0:
            self.head = nn.Linear(embed_dim, class_dim)

        trunc_normal_(self.cls_token)
        self.apply(self._init_weights)
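
The snippet assumes a get_sinusoid_encoding helper; a sketch of the standard fixed sinusoidal table it presumably builds:

import numpy as np
import paddle


def get_sinusoid_encoding(n_position, d_hid):
    # sin on even dimensions, cos on odd dimensions (Vaswani et al. style).
    position = np.arange(n_position)[:, None]
    dim = np.arange(d_hid)[None, :]
    angle = position / np.power(10000, 2 * (dim // 2) / d_hid)
    table = np.zeros((n_position, d_hid), dtype=np.float32)
    table[:, 0::2] = np.sin(angle[:, 0::2])
    table[:, 1::2] = np.cos(angle[:, 1::2])
    return paddle.to_tensor(table[None, ...])  # [1, n_position, d_hid]
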
Example #8
    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer=nn.LayerNorm,
        epsilon=1e-6,
        block_layers=LayerScale_Block,
        block_layers_token=LayerScale_Block_CA,
        Patch_layer=PatchEmbed,
        act_layer=nn.GELU,
        Attention_block=Attention_talking_head,
        Mlp_block=Mlp,
        init_scale=1e-4,
        Attention_block_token_only=Class_Attention,
        Mlp_block_token_only=Mlp,
        depth_token_only=2,
        mlp_ratio_clstk=4.0,
        class_dim=1000,
    ):
        super().__init__()

        self.class_dim = class_dim
        self.num_features = self.embed_dim = embed_dim

        self.patch_embed = Patch_layer(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
        )

        num_patches = self.patch_embed.num_patches

        self.cls_token = add_parameter(self, paddle.zeros((1, 1, embed_dim)))
        self.pos_embed = add_parameter(
            self, paddle.zeros((1, num_patches, embed_dim)))
        self.pos_drop = nn.Dropout(p=drop_rate)

        dpr = [drop_path_rate] * depth  # constant (no decay) drop-path rate per block
        self.blocks = nn.LayerList([
            block_layers(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                epsilon=epsilon,
                act_layer=act_layer,
                Attention_block=Attention_block,
                Mlp_block=Mlp_block,
                init_values=init_scale,
            ) for i in range(depth)
        ])

        self.blocks_token_only = nn.LayerList([
            block_layers_token(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio_clstk,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=0.0,
                attn_drop=0.0,
                drop_path=0.0,
                norm_layer=norm_layer,
                epsilon=epsilon,
                act_layer=act_layer,
                Attention_block=Attention_block_token_only,
                Mlp_block=Mlp_block_token_only,
                init_values=init_scale,
            ) for i in range(depth_token_only)
        ])

        self.norm = norm_layer(embed_dim, epsilon=epsilon)

        # Classifier head
        if class_dim > 0:
            self.head = nn.Linear(embed_dim, class_dim)

        trunc_normal_(self.pos_embed)
        trunc_normal_(self.cls_token)
        self.apply(self._init_weights)
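
blocks_token_only apply class attention: after the patch blocks, only the class token is refined. A hedged sketch of that stage (names other than the attributes above are assumptions):

def class_attention_stage(model, x):
    # x: [B, N, C] patch tokens after `model.blocks`.
    cls_tokens = model.cls_token.expand((x.shape[0], -1, -1))
    for blk in model.blocks_token_only:
        cls_tokens = blk(x, cls_tokens)  # only the class token is updated
    x = model.norm(paddle.concat([cls_tokens, x], axis=1))
    return model.head(x[:, 0])
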
Example #9
    def __init__(
        self,
        img_size=224,
        patch_size=4,
        in_chans=3,
        embed_dim=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=7,
        mlp_ratio=4.0,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.1,
        norm_layer=nn.LayerNorm,
        ape=False,
        patch_norm=True,
        class_dim=1000,
        with_pool=True,
        **kwargs,
    ):
        super().__init__()
        self.class_dim = class_dim
        self.with_pool = with_pool

        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.ape = ape
        self.patch_norm = patch_norm
        self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
        self.mlp_ratio = mlp_ratio

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None,
        )
        num_patches = self.patch_embed.num_patches
        patches_resolution = self.patch_embed.patches_resolution
        self.patches_resolution = patches_resolution

        # absolute position embedding
        if self.ape:
            self.absolute_pos_embed = add_parameter(
                self, paddle.zeros((1, num_patches, embed_dim))
            )
            trunc_normal_(self.absolute_pos_embed)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth
        dpr = np.linspace(0, drop_path_rate, sum(depths))

        # build layers
        self.layers = nn.LayerList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(
                dim=int(embed_dim * 2 ** i_layer),
                input_resolution=(
                    patches_resolution[0] // (2 ** i_layer),
                    patches_resolution[1] // (2 ** i_layer),
                ),
                depth=depths[i_layer],
                num_heads=num_heads[i_layer],
                window_size=window_size,
                mlp_ratio=self.mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
                norm_layer=norm_layer,
                downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
            )
            self.layers.append(layer)

        self.norm = norm_layer(self.num_features)

        if with_pool:
            self.avgpool = nn.AdaptiveAvgPool1D(1)

        if class_dim > 0:
            self.head = nn.Linear(self.num_features, class_dim)

        self.apply(self._init_weights)
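
The per-stage drop-path slicing is easy to misread; this standalone check shows how the global linspace schedule is partitioned across stages:

import numpy as np

depths = [2, 2, 6, 2]
dpr = np.linspace(0, 0.1, sum(depths))
for i_layer in range(len(depths)):
    start, end = sum(depths[:i_layer]), sum(depths[:i_layer + 1])
    print(i_layer, np.round(dpr[start:end], 3))
# Each stage gets a contiguous, monotonically increasing slice of rates.
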
Example #10
    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        qkv_bias=False,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer=nn.LayerNorm,
        epsilon=1e-5,
        class_dim=1000,
    ):
        super().__init__()
        self.class_dim = class_dim
        self.num_features = self.embed_dim = embed_dim

        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
        )
        num_patches = self.patch_embed.num_patches

        self.pos_embed = add_parameter(
            self, paddle.zeros((1, num_patches + 1, embed_dim))
        )
        self.cls_token = add_parameter(self, paddle.zeros((1, 1, embed_dim)))

        self.pos_drop = nn.Dropout(p=drop_rate)

        dpr = np.linspace(0, drop_path_rate, depth)

        self.blocks = nn.LayerList(
            [
                Block(
                    dim=embed_dim,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[i],
                    norm_layer=norm_layer,
                    epsilon=epsilon,
                )
                for i in range(depth)
            ]
        )

        self.norm = norm_layer(embed_dim, epsilon=epsilon)

        # Classifier head
        if class_dim > 0:
            self.head = nn.Linear(embed_dim, class_dim)

        if paddle.in_dynamic_mode():
            trunc_normal_(self.pos_embed)
            trunc_normal_(self.cls_token)
            self.apply(self._init_weights)
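
Hypothetical usage, assuming the enclosing class is the usual VisionTransformer (the snippet only shows __init__):

model = VisionTransformer(img_size=224, patch_size=16, embed_dim=768,
                          depth=12, num_heads=12, class_dim=1000)
x = paddle.randn((1, 3, 224, 224))
logits = model(x)  # expected shape: [1, 1000]
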
Example #11
    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dims=[0, 0, 0, 0],
        serial_depths=[0, 0, 0, 0],
        parallel_depth=0,
        num_heads=0,
        mlp_ratios=[0, 0, 0, 0],
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer=nn.LayerNorm,
        epsilon=1e-6,
        return_interm_layers=False,
        out_features=None,
        crpe_window={3: 2, 5: 3, 7: 3},
        class_dim=1000,
        **kwargs,
    ):
        super().__init__()
        self.return_interm_layers = return_interm_layers
        self.out_features = out_features
        self.class_dim = class_dim

        # Patch embeddings.
        self.patch_embed1 = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dims[0],
        )
        self.patch_embed2 = PatchEmbed(
            img_size=img_size // 4,
            patch_size=2,
            in_chans=embed_dims[0],
            embed_dim=embed_dims[1],
        )
        self.patch_embed3 = PatchEmbed(
            img_size=img_size // 8,
            patch_size=2,
            in_chans=embed_dims[1],
            embed_dim=embed_dims[2],
        )
        self.patch_embed4 = PatchEmbed(
            img_size=img_size // 16,
            patch_size=2,
            in_chans=embed_dims[2],
            embed_dim=embed_dims[3],
        )

        # Class tokens.
        self.cls_token1 = add_parameter(self,
                                        paddle.zeros((1, 1, embed_dims[0])))
        self.cls_token2 = add_parameter(self,
                                        paddle.zeros((1, 1, embed_dims[1])))
        self.cls_token3 = add_parameter(self,
                                        paddle.zeros((1, 1, embed_dims[2])))
        self.cls_token4 = add_parameter(self,
                                        paddle.zeros((1, 1, embed_dims[3])))

        # Convolutional position encodings.
        self.cpe1 = ConvPosEnc(dim=embed_dims[0], k=3)
        self.cpe2 = ConvPosEnc(dim=embed_dims[1], k=3)
        self.cpe3 = ConvPosEnc(dim=embed_dims[2], k=3)
        self.cpe4 = ConvPosEnc(dim=embed_dims[3], k=3)

        # Convolutional relative position encodings.
        self.crpe1 = ConvRelPosEnc(Ch=embed_dims[0] // num_heads,
                                   h=num_heads,
                                   window=crpe_window)
        self.crpe2 = ConvRelPosEnc(Ch=embed_dims[1] // num_heads,
                                   h=num_heads,
                                   window=crpe_window)
        self.crpe3 = ConvRelPosEnc(Ch=embed_dims[2] // num_heads,
                                   h=num_heads,
                                   window=crpe_window)
        self.crpe4 = ConvRelPosEnc(Ch=embed_dims[3] // num_heads,
                                   h=num_heads,
                                   window=crpe_window)

        # Disable stochastic depth.
        dpr = drop_path_rate
        assert dpr == 0.0

        # Serial blocks 1.
        self.serial_blocks1 = nn.LayerList([
            SerialBlock(
                dim=embed_dims[0],
                num_heads=num_heads,
                mlp_ratio=mlp_ratios[0],
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr,
                norm_layer=norm_layer,
                epsilon=epsilon,
                shared_cpe=self.cpe1,
                shared_crpe=self.crpe1,
            ) for _ in range(serial_depths[0])
        ])

        # Serial blocks 2.
        self.serial_blocks2 = nn.LayerList([
            SerialBlock(
                dim=embed_dims[1],
                num_heads=num_heads,
                mlp_ratio=mlp_ratios[1],
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr,
                norm_layer=norm_layer,
                epsilon=epsilon,
                shared_cpe=self.cpe2,
                shared_crpe=self.crpe2,
            ) for _ in range(serial_depths[1])
        ])

        # Serial blocks 3.
        self.serial_blocks3 = nn.LayerList([
            SerialBlock(
                dim=embed_dims[2],
                num_heads=num_heads,
                mlp_ratio=mlp_ratios[2],
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr,
                norm_layer=norm_layer,
                epsilon=epsilon,
                shared_cpe=self.cpe3,
                shared_crpe=self.crpe3,
            ) for _ in range(serial_depths[2])
        ])

        # Serial blocks 4.
        self.serial_blocks4 = nn.LayerList([
            SerialBlock(
                dim=embed_dims[3],
                num_heads=num_heads,
                mlp_ratio=mlp_ratios[3],
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr,
                norm_layer=norm_layer,
                epsilon=epsilon,
                shared_cpe=self.cpe4,
                shared_crpe=self.crpe4,
            ) for _ in range(serial_depths[3])
        ])

        # Parallel blocks.
        self.parallel_depth = parallel_depth
        if self.parallel_depth > 0:
            self.parallel_blocks = nn.LayerList([
                ParallelBlock(
                    dims=embed_dims,
                    num_heads=num_heads,
                    mlp_ratios=mlp_ratios,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr,
                    norm_layer=norm_layer,
                    epsilon=epsilon,
                    shared_cpes=[self.cpe1, self.cpe2, self.cpe3, self.cpe4],
                    shared_crpes=[
                        self.crpe1, self.crpe2, self.crpe3, self.crpe4
                    ],
                ) for _ in range(parallel_depth)
            ])

        # Classification head(s).
        if not self.return_interm_layers:
            self.norm1 = norm_layer(embed_dims[0], epsilon=epsilon)
            self.norm2 = norm_layer(embed_dims[1], epsilon=epsilon)
            self.norm3 = norm_layer(embed_dims[2], epsilon=epsilon)
            self.norm4 = norm_layer(embed_dims[3], epsilon=epsilon)

            # CoaT series: Aggregate features of last three scales for classification.
            if self.parallel_depth > 0:
                assert embed_dims[1] == embed_dims[2] == embed_dims[3]
                self.aggregate = nn.Conv1D(in_channels=3,
                                           out_channels=1,
                                           kernel_size=1)
                self.head = nn.Linear(embed_dims[3], class_dim)
            else:
                # CoaT-Lite series: Use feature of last scale for classification.
                self.head = nn.Linear(embed_dims[3], class_dim)

        # Initialize weights.
        trunc_normal_(self.cls_token1)
        trunc_normal_(self.cls_token2)
        trunc_normal_(self.cls_token3)
        trunc_normal_(self.cls_token4)
        self.apply(self._init_weights)
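
When parallel blocks are enabled, the aggregate head merges the class tokens of the last three scales. A sketch of what that could look like (`cls2`/`cls3`/`cls4` are assumed names for the per-scale class tokens, [B, C] each):

def aggregate_logits(model, cls2, cls3, cls4):
    feats = paddle.stack([cls2, cls3, cls4], axis=1)  # [B, 3, C]
    merged = model.aggregate(feats).squeeze(1)        # 1x1 Conv1D over scales -> [B, C]
    return model.head(merged)
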
Example #12
    def __init__(
        self,
        image_size,
        patch_size,
        stride,
        base_dims,
        depth,
        heads,
        mlp_ratio,
        in_chans=3,
        attn_drop_rate=0.0,
        drop_rate=0.0,
        drop_path_rate=0.0,
        class_dim=1000,
    ):
        super(PoolingTransformer, self).__init__()

        total_block = sum(depth)
        padding = 0
        block_idx = 0

        width = math.floor((image_size + 2 * padding - patch_size) / stride + 1)

        self.base_dims = base_dims
        self.heads = heads
        self.class_dim = class_dim

        self.patch_size = patch_size

        self.pos_embed = add_parameter(
            self, paddle.randn((1, base_dims[0] * heads[0], width, width)))

        self.patch_embed = conv_embedding(in_chans, base_dims[0] * heads[0],
                                          patch_size, stride, padding)

        self.cls_token = add_parameter(
            self, paddle.randn((1, 1, base_dims[0] * heads[0])))

        self.pos_drop = nn.Dropout(p=drop_rate)

        self.transformers = nn.LayerList([])
        self.pools = nn.LayerList([])

        for stage in range(len(depth)):
            drop_path_prob = [
                drop_path_rate * i / total_block
                for i in range(block_idx, block_idx + depth[stage])
            ]
            block_idx += depth[stage]

            self.transformers.append(
                Transformer(
                    base_dims[stage],
                    depth[stage],
                    heads[stage],
                    mlp_ratio,
                    drop_rate,
                    attn_drop_rate,
                    drop_path_prob,
                ))
            if stage < len(heads) - 1:
                self.pools.append(
                    conv_head_pooling(
                        base_dims[stage] * heads[stage],
                        base_dims[stage + 1] * heads[stage + 1],
                        stride=2,
                    ))

        self.norm = nn.LayerNorm(base_dims[-1] * heads[-1], epsilon=1e-6)
        self.embed_dim = base_dims[-1] * heads[-1]

        # Classifier head
        if class_dim > 0:
            self.head = nn.Linear(base_dims[-1] * heads[-1], class_dim)

        trunc_normal_(self.pos_embed)
        trunc_normal_(self.cls_token)
        self.apply(self._init_weights)
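
A quick standalone check of the linear drop-path schedule computed in the stage loop above:

depth, drop_path_rate = [2, 6, 4], 0.1
total_block, block_idx = sum(depth), 0
for stage in range(len(depth)):
    probs = [drop_path_rate * i / total_block
             for i in range(block_idx, block_idx + depth[stage])]
    block_idx += depth[stage]
    print(stage, [round(p, 3) for p in probs])
# 0 [0.0, 0.008]
# 1 [0.017, 0.025, 0.033, 0.042, 0.05, 0.058]
# 2 [0.067, 0.075, 0.083, 0.092]
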
Example #13
    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=3.0,
        qkv_bias=False,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        drop_path_decay="linear",
        hybrid_backbone=None,
        norm_layer=nn.LayerNorm,
        p_emb="4_2",
        head_dim=None,
        skip_lam=1.0,
        order=None,
        mix_token=True,
        return_dense=True,
        class_dim=1000,
    ):
        super().__init__()
        self.class_dim = class_dim
        # num_features for consistency with other models
        self.num_features = self.embed_dim = embed_dim
        self.output_dim = embed_dim if class_dim == 0 else class_dim

        if hybrid_backbone is not None:
            self.patch_embed = HybridEmbed(
                hybrid_backbone,
                img_size=img_size,
                in_chans=in_chans,
                embed_dim=embed_dim,
            )
        else:
            if p_emb == "4_2":
                patch_embed_fn = PatchEmbed4_2
            elif p_emb == "4_2_128":
                patch_embed_fn = PatchEmbed4_2_128
            else:
                patch_embed_fn = PatchEmbedNaive

            self.patch_embed = patch_embed_fn(
                img_size=img_size,
                patch_size=patch_size,
                in_chans=in_chans,
                embed_dim=embed_dim,
            )

        num_patches = self.patch_embed.num_patches

        self.cls_token = add_parameter(self, paddle.zeros((1, 1, embed_dim)))
        self.pos_embed = add_parameter(
            self, paddle.zeros((1, num_patches + 1, embed_dim)))
        self.pos_drop = nn.Dropout(p=drop_rate)

        if order is None:
            dpr = get_dpr(drop_path_rate, depth, drop_path_decay)
            self.blocks = nn.LayerList([
                Block(
                    dim=embed_dim,
                    num_heads=num_heads,
                    head_dim=head_dim,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[i],
                    norm_layer=norm_layer,
                    skip_lam=skip_lam,
                ) for i in range(depth)
            ])
        else:
            # use given order to sequentially generate modules
            dpr = get_dpr(drop_path_rate, len(order), drop_path_decay)
            self.blocks = nn.LayerList([
                get_block(
                    order[i],
                    dim=embed_dim,
                    num_heads=num_heads,
                    head_dim=head_dim,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[i],
                    norm_layer=norm_layer,
                    skip_lam=skip_lam,
                ) for i in range(len(order))
            ])

        self.norm = norm_layer(embed_dim)

        if class_dim > 0:
            self.head = nn.Linear(embed_dim, class_dim)

        self.return_dense = return_dense
        self.mix_token = mix_token

        if (return_dense) and (class_dim > 0):
            self.aux_head = nn.Linear(embed_dim, class_dim)

        if mix_token:
            self.beta = 1.0
            assert return_dense, "always return all features when mixtoken is enabled"

        trunc_normal_(self.pos_embed)
        trunc_normal_(self.cls_token)
        self.apply(self._init_weights)
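
The snippet assumes a get_dpr helper; a plausible implementation consistent with the "linear" decay named in the signature (the uniform fallback is an assumption):

import numpy as np


def get_dpr(drop_path_rate, depth, drop_path_decay='linear'):
    if drop_path_decay == 'linear':
        # Rates increase linearly from 0 to drop_path_rate across blocks.
        return np.linspace(0, drop_path_rate, depth).tolist()
    # Otherwise use a uniform rate for every block.
    return [drop_path_rate] * depth
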
Example #14
    def __init__(
        self,
        img_size=224,
        patch_size=4,
        in_chans=3,
        embed_dims=[64, 128, 320, 512],
        num_heads=[1, 2, 5, 8],
        mlp_ratios=[8, 8, 4, 4],
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer=nn.LayerNorm,
        epsilon=1e-6,
        depths=[3, 4, 6, 3],
        sr_ratios=[8, 4, 2, 1],
        class_dim=1000,
    ):
        super().__init__()
        self.class_dim = class_dim
        self.depths = depths

        # patch_embed
        self.patch_embed1 = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dims[0],
        )
        self.patch_embed2 = PatchEmbed(
            img_size=img_size // 4,
            patch_size=2,
            in_chans=embed_dims[0],
            embed_dim=embed_dims[1],
        )
        self.patch_embed3 = PatchEmbed(
            img_size=img_size // 8,
            patch_size=2,
            in_chans=embed_dims[1],
            embed_dim=embed_dims[2],
        )
        self.patch_embed4 = PatchEmbed(
            img_size=img_size // 16,
            patch_size=2,
            in_chans=embed_dims[2],
            embed_dim=embed_dims[3],
        )

        # pos_embed
        self.pos_embed1 = add_parameter(
            self,
            paddle.zeros((1, self.patch_embed1.num_patches, embed_dims[0])))
        self.pos_drop1 = nn.Dropout(p=drop_rate)

        self.pos_embed2 = add_parameter(
            self,
            paddle.zeros((1, self.patch_embed2.num_patches, embed_dims[1])))
        self.pos_drop2 = nn.Dropout(p=drop_rate)

        self.pos_embed3 = add_parameter(
            self,
            paddle.zeros((1, self.patch_embed3.num_patches, embed_dims[2])))
        self.pos_drop3 = nn.Dropout(p=drop_rate)

        self.pos_embed4 = add_parameter(
            self,
            paddle.zeros(
                (1, self.patch_embed4.num_patches + 1, embed_dims[3])))
        self.pos_drop4 = nn.Dropout(p=drop_rate)

        # transformer encoder
        dpr = np.linspace(0, drop_path_rate, sum(depths))
        cur = 0
        self.block1 = nn.LayerList([
            Block(
                dim=embed_dims[0],
                num_heads=num_heads[0],
                mlp_ratio=mlp_ratios[0],
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[cur + i],
                norm_layer=norm_layer,
                epsilon=epsilon,
                sr_ratio=sr_ratios[0],
            ) for i in range(depths[0])
        ])

        cur += depths[0]
        self.block2 = nn.LayerList([
            Block(
                dim=embed_dims[1],
                num_heads=num_heads[1],
                mlp_ratio=mlp_ratios[1],
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[cur + i],
                norm_layer=norm_layer,
                epsilon=epsilon,
                sr_ratio=sr_ratios[1],
            ) for i in range(depths[1])
        ])

        cur += depths[1]
        self.block3 = nn.LayerList([
            Block(
                dim=embed_dims[2],
                num_heads=num_heads[2],
                mlp_ratio=mlp_ratios[2],
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[cur + i],
                norm_layer=norm_layer,
                epsilon=epsilon,
                sr_ratio=sr_ratios[2],
            ) for i in range(depths[2])
        ])

        cur += depths[2]
        self.block4 = nn.LayerList([
            Block(
                dim=embed_dims[3],
                num_heads=num_heads[3],
                mlp_ratio=mlp_ratios[3],
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[cur + i],
                norm_layer=norm_layer,
                epsilon=epsilon,
                sr_ratio=sr_ratios[3],
            ) for i in range(depths[3])
        ])
        self.norm = norm_layer(embed_dims[3], epsilon=epsilon)

        # cls_token
        self.cls_token = add_parameter(self, paddle.zeros(
            (1, 1, embed_dims[3])))

        # classification head
        if class_dim > 0:
            self.head = nn.Linear(embed_dims[3], class_dim)

        # init weights
        trunc_normal_(self.pos_embed1)
        trunc_normal_(self.pos_embed2)
        trunc_normal_(self.pos_embed3)
        trunc_normal_(self.pos_embed4)
        trunc_normal_(self.cls_token)
        self.apply(self._init_weights)
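
pos_embed4 has num_patches + 1 slots because the class token is prepended only before the last stage. A sketch of that step (`stage4_tokens` is a hypothetical name):

def stage4_tokens(model, x):
    # x: [B, N4, C] tokens from patch_embed4.
    cls_tokens = model.cls_token.expand((x.shape[0], -1, -1))
    x = paddle.concat([cls_tokens, x], axis=1)  # [B, N4 + 1, C]
    return model.pos_drop4(x + model.pos_embed4)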