def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer=nn.LayerNorm,
        epsilon=1e-6,
        block_layers=LayerScale_Block,
        block_layers_token=LayerScale_Block_CA,
        Patch_layer=PatchEmbed,
        act_layer=nn.GELU,
        Attention_block=Attention_talking_head,
        Mlp_block=Mlp,
        init_scale=1e-4,
        Attention_block_token_only=Class_Attention,
        Mlp_block_token_only=Mlp,
        depth_token_only=2,
        mlp_ratio_clstk=4.0,
        class_dim=1000,
):
    super().__init__()

    self.class_dim = class_dim
    self.num_features = self.embed_dim = embed_dim

    self.patch_embed = Patch_layer(
        img_size=img_size,
        patch_size=patch_size,
        in_chans=in_chans,
        embed_dim=embed_dim,
    )
    num_patches = self.patch_embed.num_patches

    self.cls_token = add_parameter(self, paddle.zeros((1, 1, embed_dim)))
    self.pos_embed = add_parameter(
        self, paddle.zeros((1, num_patches, embed_dim)))
    self.pos_drop = nn.Dropout(p=drop_rate)

    dpr = [drop_path_rate for i in range(depth)]

    self.blocks = nn.LayerList([
        block_layers(
            dim=embed_dim,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr[i],
            norm_layer=norm_layer,
            epsilon=epsilon,
            act_layer=act_layer,
            Attention_block=Attention_block,
            Mlp_block=Mlp_block,
            init_values=init_scale,
        ) for i in range(depth)
    ])

    self.blocks_token_only = nn.LayerList([
        block_layers_token(
            dim=embed_dim,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio_clstk,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=0.0,
            attn_drop=0.0,
            drop_path=0.0,
            norm_layer=norm_layer,
            epsilon=epsilon,
            act_layer=act_layer,
            Attention_block=Attention_block_token_only,
            Mlp_block=Mlp_block_token_only,
            init_values=init_scale,
        ) for i in range(depth_token_only)
    ])

    self.norm = norm_layer(embed_dim, epsilon=epsilon)

    # Classifier head
    if class_dim > 0:
        self.head = nn.Linear(embed_dim, class_dim)

    trunc_normal_(self.pos_embed)
    trunc_normal_(self.cls_token)
    self.apply(self._init_weights)
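# Usage sketch for the constructor above (a CaiT-style model: LayerScale
# transformer blocks over patch tokens, followed by class-attention blocks
# that only update the cls token). The class name `CaiT` and the argument
# values are illustrative assumptions, not confirmed by this file:
#
#     model = CaiT(img_size=224, patch_size=16, embed_dim=192, depth=24,
#                  num_heads=4, init_scale=1e-5, depth_token_only=2)
#     x = paddle.randn((1, 3, 224, 224))
#     logits = model(x)  # (1, 1000) with the default class_dim
#
# Note that pos_embed here covers num_patches positions only; the cls token
# is introduced later by the class-attention stage, not at patch embedding.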
def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        in_dim=48,
        depth=12,
        num_heads=12,
        in_num_head=4,
        mlp_ratio=4.0,
        qkv_bias=False,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer=nn.LayerNorm,
        first_stride=4,
        class_dim=1000,
):
    super().__init__()

    self.class_dim = class_dim
    # num_features for consistency with other models
    self.num_features = self.embed_dim = embed_dim

    self.pixel_embed = PixelEmbed(
        img_size=img_size,
        patch_size=patch_size,
        in_chans=in_chans,
        in_dim=in_dim,
        stride=first_stride,
    )
    num_patches = self.pixel_embed.num_patches
    self.num_patches = num_patches
    new_patch_size = self.pixel_embed.new_patch_size
    num_pixel = new_patch_size ** 2

    self.norm1_proj = norm_layer(num_pixel * in_dim)
    self.proj = nn.Linear(num_pixel * in_dim, embed_dim)
    self.norm2_proj = norm_layer(embed_dim)

    self.cls_token = self.create_parameter(
        shape=(1, 1, embed_dim), default_initializer=zeros_
    )
    self.add_parameter("cls_token", self.cls_token)

    self.patch_pos = self.create_parameter(
        shape=(1, num_patches + 1, embed_dim), default_initializer=zeros_
    )
    self.add_parameter("patch_pos", self.patch_pos)

    self.pixel_pos = self.create_parameter(
        shape=(1, in_dim, new_patch_size, new_patch_size),
        default_initializer=zeros_,
    )
    self.add_parameter("pixel_pos", self.pixel_pos)

    self.pos_drop = nn.Dropout(p=drop_rate)

    # stochastic depth decay rule
    dpr = np.linspace(0, drop_path_rate, depth)

    blocks = []
    for i in range(depth):
        blocks.append(
            Block(
                dim=embed_dim,
                in_dim=in_dim,
                num_pixel=num_pixel,
                num_heads=num_heads,
                in_num_head=in_num_head,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[i],
                norm_layer=norm_layer,
            )
        )
    self.blocks = nn.LayerList(blocks)
    self.norm = norm_layer(embed_dim)

    if class_dim > 0:
        self.head = nn.Linear(embed_dim, class_dim)

    trunc_normal_(self.cls_token)
    trunc_normal_(self.patch_pos)
    trunc_normal_(self.pixel_pos)
    self.apply(self._init_weights)
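# Shape sketch for the pixel/patch embedding above (TNT-style: an inner
# transformer over "pixel" tokens inside each patch, projected into outer
# patch tokens). A hedged walk-through with the defaults, assuming PixelEmbed
# follows the standard TNT layout (these internals are not shown in this file):
#
#     num_patches    = (224 // 16) ** 2 = 196
#     new_patch_size = 16 // 4 = 4          # inner pixel grid per patch
#     num_pixel      = 4 ** 2 = 16
#     proj: 16 * 48 = 768 pixel features -> one embed_dim patch token
#
# patch_pos then has num_patches + 1 positions to cover the cls token.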
def __init__(
        self,
        img_size=224,
        patch_size=4,
        in_chans=3,
        embed_dim=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=7,
        mlp_ratio=4.0,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.1,
        norm_layer=nn.LayerNorm,
        ape=False,
        patch_norm=True,
        class_dim=1000,
        with_pool=True,
        **kwargs,
):
    super().__init__()

    self.class_dim = class_dim
    self.with_pool = with_pool
    self.num_layers = len(depths)
    self.embed_dim = embed_dim
    self.ape = ape
    self.patch_norm = patch_norm
    self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
    self.mlp_ratio = mlp_ratio

    # split image into non-overlapping patches
    self.patch_embed = PatchEmbed(
        img_size=img_size,
        patch_size=patch_size,
        in_chans=in_chans,
        embed_dim=embed_dim,
        norm_layer=norm_layer if self.patch_norm else None,
    )
    num_patches = self.patch_embed.num_patches
    patches_resolution = self.patch_embed.patches_resolution
    self.patches_resolution = patches_resolution

    # absolute position embedding
    if self.ape:
        self.absolute_pos_embed = add_parameter(
            self, paddle.zeros((1, num_patches, embed_dim))
        )
        trunc_normal_(self.absolute_pos_embed)

    self.pos_drop = nn.Dropout(p=drop_rate)

    # stochastic depth
    dpr = np.linspace(0, drop_path_rate, sum(depths))

    # build layers
    self.layers = nn.LayerList()
    for i_layer in range(self.num_layers):
        layer = BasicLayer(
            dim=int(embed_dim * 2 ** i_layer),
            input_resolution=(
                patches_resolution[0] // (2 ** i_layer),
                patches_resolution[1] // (2 ** i_layer),
            ),
            depth=depths[i_layer],
            num_heads=num_heads[i_layer],
            window_size=window_size,
            mlp_ratio=self.mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
            norm_layer=norm_layer,
            downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
        )
        self.layers.append(layer)

    self.norm = norm_layer(self.num_features)

    if with_pool:
        self.avgpool = nn.AdaptiveAvgPool1D(1)

    if class_dim > 0:
        self.head = nn.Linear(self.num_features, class_dim)

    self.apply(self._init_weights)
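# Stage-dimension sketch for the Swin-style constructor above: channel width
# doubles and spatial resolution halves at each PatchMerging downsample, which
# is exactly what dim=int(embed_dim * 2 ** i_layer) and the // (2 ** i_layer)
# input_resolution encode. With the defaults (embed_dim=96, depths=[2, 2, 6, 2],
# img_size=224, patch_size=4):
#
#     stage 0: dim  96, token grid 56x56
#     stage 1: dim 192, token grid 28x28
#     stage 2: dim 384, token grid 14x14
#     stage 3: dim 768, token grid  7x7   -> num_features = 96 * 2**3 = 768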
def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        qkv_bias=False,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer=nn.LayerNorm,
        epsilon=1e-5,
        class_dim=1000,
):
    super().__init__()

    self.class_dim = class_dim
    self.num_features = self.embed_dim = embed_dim

    self.patch_embed = PatchEmbed(
        img_size=img_size,
        patch_size=patch_size,
        in_chans=in_chans,
        embed_dim=embed_dim,
    )
    num_patches = self.patch_embed.num_patches

    self.pos_embed = add_parameter(
        self, paddle.zeros((1, num_patches + 1, embed_dim))
    )
    self.cls_token = add_parameter(self, paddle.zeros((1, 1, embed_dim)))
    self.pos_drop = nn.Dropout(p=drop_rate)

    dpr = np.linspace(0, drop_path_rate, depth)

    self.blocks = nn.LayerList([
        Block(
            dim=embed_dim,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr[i],
            norm_layer=norm_layer,
            epsilon=epsilon,
        ) for i in range(depth)
    ])

    self.norm = norm_layer(embed_dim, epsilon=epsilon)

    # Classifier head
    if class_dim > 0:
        self.head = nn.Linear(embed_dim, class_dim)

    if paddle.in_dynamic_mode():
        trunc_normal_(self.pos_embed)
        trunc_normal_(self.cls_token)
    self.apply(self._init_weights)
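# Usage sketch for the plain-ViT constructor above. `VisionTransformer` is an
# assumed class name for illustration only; the in_dynamic_mode() guard skips
# eager weight initialization when building a static graph:
#
#     model = VisionTransformer(patch_size=16, embed_dim=768, depth=12,
#                               num_heads=12, class_dim=1000)
#     x = paddle.randn((1, 3, 224, 224))
#     logits = model(x)  # (1, 1000); the cls token is prepended in forward,
#                        # hence pos_embed holds num_patches + 1 positions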
def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dims=[0, 0, 0, 0],
        serial_depths=[0, 0, 0, 0],
        parallel_depth=0,
        num_heads=0,
        mlp_ratios=[0, 0, 0, 0],
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer=nn.LayerNorm,
        epsilon=1e-6,
        return_interm_layers=False,
        out_features=None,
        crpe_window={3: 2, 5: 3, 7: 3},
        class_dim=1000,
        **kwargs,
):
    super().__init__()

    self.return_interm_layers = return_interm_layers
    self.out_features = out_features
    self.class_dim = class_dim

    # Patch embeddings.
    self.patch_embed1 = PatchEmbed(
        img_size=img_size,
        patch_size=patch_size,
        in_chans=in_chans,
        embed_dim=embed_dims[0],
    )
    self.patch_embed2 = PatchEmbed(
        img_size=img_size // 4,
        patch_size=2,
        in_chans=embed_dims[0],
        embed_dim=embed_dims[1],
    )
    self.patch_embed3 = PatchEmbed(
        img_size=img_size // 8,
        patch_size=2,
        in_chans=embed_dims[1],
        embed_dim=embed_dims[2],
    )
    self.patch_embed4 = PatchEmbed(
        img_size=img_size // 16,
        patch_size=2,
        in_chans=embed_dims[2],
        embed_dim=embed_dims[3],
    )

    # Class tokens.
    self.cls_token1 = add_parameter(self, paddle.zeros((1, 1, embed_dims[0])))
    self.cls_token2 = add_parameter(self, paddle.zeros((1, 1, embed_dims[1])))
    self.cls_token3 = add_parameter(self, paddle.zeros((1, 1, embed_dims[2])))
    self.cls_token4 = add_parameter(self, paddle.zeros((1, 1, embed_dims[3])))

    # Convolutional position encodings.
    self.cpe1 = ConvPosEnc(dim=embed_dims[0], k=3)
    self.cpe2 = ConvPosEnc(dim=embed_dims[1], k=3)
    self.cpe3 = ConvPosEnc(dim=embed_dims[2], k=3)
    self.cpe4 = ConvPosEnc(dim=embed_dims[3], k=3)

    # Convolutional relative position encodings.
    self.crpe1 = ConvRelPosEnc(
        Ch=embed_dims[0] // num_heads, h=num_heads, window=crpe_window)
    self.crpe2 = ConvRelPosEnc(
        Ch=embed_dims[1] // num_heads, h=num_heads, window=crpe_window)
    self.crpe3 = ConvRelPosEnc(
        Ch=embed_dims[2] // num_heads, h=num_heads, window=crpe_window)
    self.crpe4 = ConvRelPosEnc(
        Ch=embed_dims[3] // num_heads, h=num_heads, window=crpe_window)

    # Disable stochastic depth.
    dpr = drop_path_rate
    assert dpr == 0.0

    # Serial blocks 1.
    self.serial_blocks1 = nn.LayerList([
        SerialBlock(
            dim=embed_dims[0],
            num_heads=num_heads,
            mlp_ratio=mlp_ratios[0],
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr,
            norm_layer=norm_layer,
            epsilon=epsilon,
            shared_cpe=self.cpe1,
            shared_crpe=self.crpe1,
        ) for _ in range(serial_depths[0])
    ])

    # Serial blocks 2.
    self.serial_blocks2 = nn.LayerList([
        SerialBlock(
            dim=embed_dims[1],
            num_heads=num_heads,
            mlp_ratio=mlp_ratios[1],
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr,
            norm_layer=norm_layer,
            epsilon=epsilon,
            shared_cpe=self.cpe2,
            shared_crpe=self.crpe2,
        ) for _ in range(serial_depths[1])
    ])

    # Serial blocks 3.
    self.serial_blocks3 = nn.LayerList([
        SerialBlock(
            dim=embed_dims[2],
            num_heads=num_heads,
            mlp_ratio=mlp_ratios[2],
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr,
            norm_layer=norm_layer,
            epsilon=epsilon,
            shared_cpe=self.cpe3,
            shared_crpe=self.crpe3,
        ) for _ in range(serial_depths[2])
    ])

    # Serial blocks 4.
    self.serial_blocks4 = nn.LayerList([
        SerialBlock(
            dim=embed_dims[3],
            num_heads=num_heads,
            mlp_ratio=mlp_ratios[3],
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr,
            norm_layer=norm_layer,
            epsilon=epsilon,
            shared_cpe=self.cpe4,
            shared_crpe=self.crpe4,
        ) for _ in range(serial_depths[3])
    ])

    # Parallel blocks.
    self.parallel_depth = parallel_depth
    if self.parallel_depth > 0:
        self.parallel_blocks = nn.LayerList([
            ParallelBlock(
                dims=embed_dims,
                num_heads=num_heads,
                mlp_ratios=mlp_ratios,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr,
                norm_layer=norm_layer,
                epsilon=epsilon,
                shared_cpes=[self.cpe1, self.cpe2, self.cpe3, self.cpe4],
                shared_crpes=[self.crpe1, self.crpe2, self.crpe3, self.crpe4],
            ) for _ in range(parallel_depth)
        ])

    # Classification head(s).
    if not self.return_interm_layers:
        self.norm1 = norm_layer(embed_dims[0], epsilon=epsilon)
        self.norm2 = norm_layer(embed_dims[1], epsilon=epsilon)
        self.norm3 = norm_layer(embed_dims[2], epsilon=epsilon)
        self.norm4 = norm_layer(embed_dims[3], epsilon=epsilon)

        # CoaT series: Aggregate features of last three scales for classification.
        if self.parallel_depth > 0:
            assert embed_dims[1] == embed_dims[2] == embed_dims[3]
            self.aggregate = nn.Conv1D(
                in_channels=3, out_channels=1, kernel_size=1)
            self.head = nn.Linear(embed_dims[3], class_dim)
        else:
            # CoaT-Lite series: Use feature of last scale for classification.
            self.head = nn.Linear(embed_dims[3], class_dim)

    # Initialize weights.
    trunc_normal_(self.cls_token1)
    trunc_normal_(self.cls_token2)
    trunc_normal_(self.cls_token3)
    trunc_normal_(self.cls_token4)
    self.apply(self._init_weights)
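# Note on the aggregate head above: with parallel blocks enabled (CoaT series),
# the class tokens of the last three scales share one width
# (embed_dims[1] == embed_dims[2] == embed_dims[3]), so they can be stacked
# into a (B, 3, C) tensor that nn.Conv1D(3 -> 1, kernel_size=1) fuses into a
# single (B, C) feature before the linear head. A hedged shape walk-through,
# assuming the forward pass follows the upstream CoaT design (cls2/cls3/cls4
# are hypothetical names for the per-scale class tokens):
#
#     feats = paddle.stack([cls2, cls3, cls4], axis=1)  # (B, 3, C)
#     fused = self.aggregate(feats).squeeze(1)          # (B, C)
#     out   = self.head(fused)                          # (B, class_dim)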
def __init__(
        self,
        image_size,
        patch_size,
        stride,
        base_dims,
        depth,
        heads,
        mlp_ratio,
        in_chans=3,
        attn_drop_rate=0.0,
        drop_rate=0.0,
        drop_path_rate=0.0,
        class_dim=1000,
):
    super(PoolingTransformer, self).__init__()

    total_block = sum(depth)
    padding = 0
    block_idx = 0
    width = math.floor((image_size + 2 * padding - patch_size) / stride + 1)

    self.base_dims = base_dims
    self.heads = heads
    self.class_dim = class_dim
    self.patch_size = patch_size

    self.pos_embed = add_parameter(
        self, paddle.randn((1, base_dims[0] * heads[0], width, width)))
    self.patch_embed = conv_embedding(
        in_chans, base_dims[0] * heads[0], patch_size, stride, padding)
    self.cls_token = add_parameter(
        self, paddle.randn((1, 1, base_dims[0] * heads[0])))
    self.pos_drop = nn.Dropout(p=drop_rate)

    self.transformers = nn.LayerList([])
    self.pools = nn.LayerList([])

    for stage in range(len(depth)):
        drop_path_prob = [
            drop_path_rate * i / total_block
            for i in range(block_idx, block_idx + depth[stage])
        ]
        block_idx += depth[stage]

        self.transformers.append(
            Transformer(
                base_dims[stage],
                depth[stage],
                heads[stage],
                mlp_ratio,
                drop_rate,
                attn_drop_rate,
                drop_path_prob,
            ))
        if stage < len(heads) - 1:
            self.pools.append(
                conv_head_pooling(
                    base_dims[stage] * heads[stage],
                    base_dims[stage + 1] * heads[stage + 1],
                    stride=2,
                ))

    self.norm = nn.LayerNorm(base_dims[-1] * heads[-1], epsilon=1e-6)
    self.embed_dim = base_dims[-1] * heads[-1]

    # Classifier head
    if class_dim > 0:
        self.head = nn.Linear(base_dims[-1] * heads[-1], class_dim)

    trunc_normal_(self.pos_embed)
    trunc_normal_(self.cls_token)
    self.apply(self._init_weights)
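# Width sketch for PoolingTransformer above (PiT-style): the patch embedding
# is a strided convolution, so the token grid side follows the usual conv
# output formula already used for `width`. A hypothetical configuration,
# assuming image_size=224, patch_size=16, stride=8 (padding is fixed to 0):
#
#     width = floor((224 + 2*0 - 16) / 8 + 1) = 27   # pos_embed is 27x27
#
# Each conv_head_pooling stage then halves the grid (stride=2) while widening
# channels from base_dims[s] * heads[s] to base_dims[s + 1] * heads[s + 1].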
def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=3.0,
        qkv_bias=False,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        drop_path_decay="linear",
        hybrid_backbone=None,
        norm_layer=nn.LayerNorm,
        p_emb="4_2",
        head_dim=None,
        skip_lam=1.0,
        order=None,
        mix_token=True,
        return_dense=True,
        class_dim=1000,
):
    super().__init__()

    self.class_dim = class_dim
    # num_features for consistency with other models
    self.num_features = self.embed_dim = embed_dim
    self.output_dim = embed_dim if class_dim == 0 else class_dim

    if hybrid_backbone is not None:
        self.patch_embed = HybridEmbed(
            hybrid_backbone,
            img_size=img_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
        )
    else:
        if p_emb == "4_2":
            patch_embed_fn = PatchEmbed4_2
        elif p_emb == "4_2_128":
            patch_embed_fn = PatchEmbed4_2_128
        else:
            patch_embed_fn = PatchEmbedNaive
        self.patch_embed = patch_embed_fn(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
        )
    num_patches = self.patch_embed.num_patches

    self.cls_token = add_parameter(self, paddle.zeros((1, 1, embed_dim)))
    self.pos_embed = add_parameter(
        self, paddle.zeros((1, num_patches + 1, embed_dim)))
    self.pos_drop = nn.Dropout(p=drop_rate)

    if order is None:
        dpr = get_dpr(drop_path_rate, depth, drop_path_decay)
        self.blocks = nn.LayerList([
            Block(
                dim=embed_dim,
                num_heads=num_heads,
                head_dim=head_dim,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                skip_lam=skip_lam,
            ) for i in range(depth)
        ])
    else:
        # use given order to sequentially generate modules
        dpr = get_dpr(drop_path_rate, len(order), drop_path_decay)
        self.blocks = nn.LayerList([
            get_block(
                order[i],
                dim=embed_dim,
                num_heads=num_heads,
                head_dim=head_dim,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                skip_lam=skip_lam,
            ) for i in range(len(order))
        ])

    self.norm = norm_layer(embed_dim)

    if class_dim > 0:
        self.head = nn.Linear(embed_dim, class_dim)

    self.return_dense = return_dense
    self.mix_token = mix_token

    if return_dense and class_dim > 0:
        self.aux_head = nn.Linear(embed_dim, class_dim)
    if mix_token:
        self.beta = 1.0
        assert return_dense, "always return all features when mixtoken is enabled"

    trunc_normal_(self.pos_embed)
    trunc_normal_(self.cls_token)
    self.apply(self._init_weights)
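# Drop-path sketch for the LV-ViT-style constructor above: get_dpr distributes
# drop_path_rate across blocks according to drop_path_decay. A minimal
# stand-in, assuming "linear" ramps from 0 up to the target rate and any other
# value applies it uniformly (the real get_dpr in this repo may differ):
#
#     def get_dpr(drop_path_rate, depth, decay="linear"):
#         if decay == "linear":
#             return [drop_path_rate * i / max(depth - 1, 1)
#                     for i in range(depth)]
#         return [drop_path_rate] * depth
#
# mix_token requires return_dense because the CutMix-style token mixing is
# supervised through the per-token aux_head, not just the cls logits.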
def __init__(
        self,
        img_size=224,
        patch_size=4,
        in_chans=3,
        embed_dims=[64, 128, 320, 512],
        num_heads=[1, 2, 5, 8],
        mlp_ratios=[8, 8, 4, 4],
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer=nn.LayerNorm,
        epsilon=1e-6,
        depths=[3, 4, 6, 3],
        sr_ratios=[8, 4, 2, 1],
        class_dim=1000,
):
    super().__init__()

    self.class_dim = class_dim
    self.depths = depths

    # patch_embed
    self.patch_embed1 = PatchEmbed(
        img_size=img_size,
        patch_size=patch_size,
        in_chans=in_chans,
        embed_dim=embed_dims[0],
    )
    self.patch_embed2 = PatchEmbed(
        img_size=img_size // 4,
        patch_size=2,
        in_chans=embed_dims[0],
        embed_dim=embed_dims[1],
    )
    self.patch_embed3 = PatchEmbed(
        img_size=img_size // 8,
        patch_size=2,
        in_chans=embed_dims[1],
        embed_dim=embed_dims[2],
    )
    self.patch_embed4 = PatchEmbed(
        img_size=img_size // 16,
        patch_size=2,
        in_chans=embed_dims[2],
        embed_dim=embed_dims[3],
    )

    # pos_embed
    self.pos_embed1 = add_parameter(
        self, paddle.zeros((1, self.patch_embed1.num_patches, embed_dims[0])))
    self.pos_drop1 = nn.Dropout(p=drop_rate)
    self.pos_embed2 = add_parameter(
        self, paddle.zeros((1, self.patch_embed2.num_patches, embed_dims[1])))
    self.pos_drop2 = nn.Dropout(p=drop_rate)
    self.pos_embed3 = add_parameter(
        self, paddle.zeros((1, self.patch_embed3.num_patches, embed_dims[2])))
    self.pos_drop3 = nn.Dropout(p=drop_rate)
    self.pos_embed4 = add_parameter(
        self,
        paddle.zeros((1, self.patch_embed4.num_patches + 1, embed_dims[3])))
    self.pos_drop4 = nn.Dropout(p=drop_rate)

    # transformer encoder
    dpr = np.linspace(0, drop_path_rate, sum(depths))
    cur = 0
    self.block1 = nn.LayerList([
        Block(
            dim=embed_dims[0],
            num_heads=num_heads[0],
            mlp_ratio=mlp_ratios[0],
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr[cur + i],
            norm_layer=norm_layer,
            epsilon=epsilon,
            sr_ratio=sr_ratios[0],
        ) for i in range(depths[0])
    ])

    cur += depths[0]
    self.block2 = nn.LayerList([
        Block(
            dim=embed_dims[1],
            num_heads=num_heads[1],
            mlp_ratio=mlp_ratios[1],
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr[cur + i],
            norm_layer=norm_layer,
            epsilon=epsilon,
            sr_ratio=sr_ratios[1],
        ) for i in range(depths[1])
    ])

    cur += depths[1]
    self.block3 = nn.LayerList([
        Block(
            dim=embed_dims[2],
            num_heads=num_heads[2],
            mlp_ratio=mlp_ratios[2],
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr[cur + i],
            norm_layer=norm_layer,
            epsilon=epsilon,
            sr_ratio=sr_ratios[2],
        ) for i in range(depths[2])
    ])

    cur += depths[2]
    self.block4 = nn.LayerList([
        Block(
            dim=embed_dims[3],
            num_heads=num_heads[3],
            mlp_ratio=mlp_ratios[3],
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr[cur + i],
            norm_layer=norm_layer,
            epsilon=epsilon,
            sr_ratio=sr_ratios[3],
        ) for i in range(depths[3])
    ])

    self.norm = norm_layer(embed_dims[3], epsilon=epsilon)

    # cls_token
    self.cls_token = add_parameter(self, paddle.zeros((1, 1, embed_dims[3])))

    # classification head
    if class_dim > 0:
        self.head = nn.Linear(embed_dims[3], class_dim)

    # init weights
    trunc_normal_(self.pos_embed1)
    trunc_normal_(self.pos_embed2)
    trunc_normal_(self.pos_embed3)
    trunc_normal_(self.pos_embed4)
    trunc_normal_(self.cls_token)
    self.apply(self._init_weights)
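# Stage sketch for the PVT-style constructor above: the four PatchEmbed stages
# shrink the token grid 4x first, then 2x per stage, and only stage 4 reserves
# an extra position for the cls token (num_patches + 1 in pos_embed4). With
# the defaults (img_size=224):
#
#     stage 1: 56x56 tokens, dim  64, sr_ratio 8
#     stage 2: 28x28 tokens, dim 128, sr_ratio 4
#     stage 3: 14x14 tokens, dim 320, sr_ratio 2
#     stage 4:  7x7 tokens + cls, dim 512, sr_ratio 1
#
# The sr_ratios feed spatial-reduction attention: keys/values are downsampled
# by that factor in each block, which keeps early high-resolution stages cheap.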