def __init__(self,
             embed_dims,
             feedforward_channels,
             act_cfg=dict(type='GELU'),
             ffn_drop=0.,
             dropout_layer=None,
             use_conv=False,
             init_cfg=None):
    super(MixFFN, self).__init__(init_cfg=init_cfg)

    self.embed_dims = embed_dims
    self.feedforward_channels = feedforward_channels
    self.act_cfg = act_cfg
    activate = build_activation_layer(act_cfg)

    in_channels = embed_dims
    fc1 = Conv2d(
        in_channels=in_channels,
        out_channels=feedforward_channels,
        kernel_size=1,
        stride=1,
        bias=True)
    if use_conv:
        # 3x3 depth-wise conv to provide positional encoding information
        dw_conv = Conv2d(
            in_channels=feedforward_channels,
            out_channels=feedforward_channels,
            kernel_size=3,
            stride=1,
            padding=(3 - 1) // 2,
            bias=True,
            groups=feedforward_channels)
    fc2 = Conv2d(
        in_channels=feedforward_channels,
        out_channels=in_channels,
        kernel_size=1,
        stride=1,
        bias=True)
    drop = nn.Dropout(ffn_drop)
    layers = [fc1, activate, drop, fc2, drop]
    if use_conv:
        layers.insert(1, dw_conv)
    self.layers = Sequential(*layers)
    self.dropout_layer = build_dropout(
        dropout_layer) if dropout_layer else torch.nn.Identity()
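# Illustrative sketch (not part of the original file): constructing the MixFFN
# above with assumed example sizes. Only the constructor is shown in this
# excerpt; the commented call assumes the usual (x, hw_shape) convention for
# MixFFN-style modules, where x is a flattened (batch, H*W, C) token sequence
# that is reshaped to 2-D so the 1x1 and depth-wise 3x3 convs can be applied.
mix_ffn = MixFFN(embed_dims=64, feedforward_channels=256, use_conv=True)
# x = torch.randn(2, 32 * 32, 64)       # (batch, H*W, embed_dims)
# out = mix_ffn(x, hw_shape=(32, 32))   # assumed forward signature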
def __init__(self,
             embed_dims,
             num_heads,
             feedforward_channels,
             drop_rate=0.,
             attn_drop_rate=0.,
             drop_path_rate=0.,
             num_fcs=2,
             qkv_bias=True,
             act_cfg=dict(type='GELU'),
             norm_cfg=dict(type='LN'),
             sr_ratio=1.,
             init_cfg=None):
    super(GSAEncoderLayer, self).__init__(init_cfg=init_cfg)

    self.norm1 = build_norm_layer(norm_cfg, embed_dims, postfix=1)[1]
    self.attn = GlobalSubsampledAttention(
        embed_dims=embed_dims,
        num_heads=num_heads,
        attn_drop=attn_drop_rate,
        proj_drop=drop_rate,
        dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
        qkv_bias=qkv_bias,
        norm_cfg=norm_cfg,
        sr_ratio=sr_ratio)

    self.norm2 = build_norm_layer(norm_cfg, embed_dims, postfix=2)[1]
    self.ffn = FFN(
        embed_dims=embed_dims,
        feedforward_channels=feedforward_channels,
        num_fcs=num_fcs,
        ffn_drop=drop_rate,
        dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
        act_cfg=act_cfg,
        add_identity=False)

    self.drop_path = build_dropout(
        dict(type='DropPath', drop_prob=drop_path_rate)
    ) if drop_path_rate > 0. else nn.Identity()
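# Illustrative sketch (not part of the original file): a GSA block as
# configured above, using assumed example sizes. `sr_ratio` controls how much
# the key/value map is spatially subsampled inside GlobalSubsampledAttention.
gsa_block = GSAEncoderLayer(
    embed_dims=64,
    num_heads=2,
    feedforward_channels=256,
    drop_path_rate=0.1,
    sr_ratio=8)
# With add_identity=False the FFN returns only the branch output, so the
# residual additions are expected to happen in the layer's forward (not shown
# in this excerpt), each wrapped in self.drop_path for stochastic depth.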
def __init__(self,
             embed_dims,
             num_heads,
             feedforward_channels,
             attn_drop_rate=0.,
             drop_path_rate=0.,
             num_fcs=2,
             bias='qv_bias',
             act_cfg=dict(type='GELU'),
             norm_cfg=dict(type='LN'),
             window_size=None,
             attn_cfg=dict(),
             ffn_cfg=dict(add_identity=False),
             init_values=None):
    attn_cfg.update(dict(window_size=window_size, qk_scale=None))
    # Dropout and drop path are disabled in the parent layer; stochastic
    # depth is applied here together with the layer-scale parameters.
    super(BEiTTransformerEncoderLayer, self).__init__(
        embed_dims=embed_dims,
        num_heads=num_heads,
        feedforward_channels=feedforward_channels,
        attn_drop_rate=attn_drop_rate,
        drop_path_rate=0.,
        drop_rate=0.,
        num_fcs=num_fcs,
        qkv_bias=bias,
        act_cfg=act_cfg,
        norm_cfg=norm_cfg,
        attn_cfg=attn_cfg,
        ffn_cfg=ffn_cfg)

    # NOTE: drop path for stochastic depth, we shall see if
    # this is better than dropout here
    dropout_layer = dict(type='DropPath', drop_prob=drop_path_rate)
    self.drop_path = build_dropout(
        dropout_layer) if dropout_layer else nn.Identity()
    # Layer-scale parameters (one value per channel) for the attention and
    # FFN branches, initialised to `init_values`.
    self.gamma_1 = nn.Parameter(
        init_values * torch.ones((embed_dims)), requires_grad=True)
    self.gamma_2 = nn.Parameter(
        init_values * torch.ones((embed_dims)), requires_grad=True)
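# Illustrative sketch (not part of the original file): the layer-scale
# parameters created above are typically applied in a pre-norm residual
# forward of the form (assumed, the forward is not shown in this excerpt):
#   x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x)))
#   x = x + self.drop_path(self.gamma_2 * self.ffn(self.norm2(x)))
# `init_values` must therefore be a number (e.g. 0.1); keeping the default
# `init_values=None` would fail in the nn.Parameter lines above.
beit_layer = BEiTTransformerEncoderLayer(
    embed_dims=768,
    num_heads=12,
    feedforward_channels=3072,
    window_size=(14, 14),  # assumed patch-grid size for relative position bias
    init_values=0.1)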
def __init__(self,
             embed_dims,
             num_heads,
             feedforward_channels,
             drop_rate=0.,
             attn_drop_rate=0.,
             drop_path_rate=0.,
             num_fcs=2,
             qkv_bias=True,
             qk_scale=None,
             act_cfg=dict(type='GELU'),
             norm_cfg=dict(type='LN'),
             window_size=1,
             init_cfg=None):
    super(LSAEncoderLayer, self).__init__(init_cfg=init_cfg)

    self.norm1 = build_norm_layer(norm_cfg, embed_dims, postfix=1)[1]
    self.attn = LocallyGroupedSelfAttention(embed_dims, num_heads, qkv_bias,
                                            qk_scale, attn_drop_rate,
                                            drop_rate, window_size)

    self.norm2 = build_norm_layer(norm_cfg, embed_dims, postfix=2)[1]
    self.ffn = FFN(
        embed_dims=embed_dims,
        feedforward_channels=feedforward_channels,
        num_fcs=num_fcs,
        ffn_drop=drop_rate,
        dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
        act_cfg=act_cfg,
        add_identity=False)

    self.drop_path = build_dropout(
        dict(type='DropPath', drop_prob=drop_path_rate)
    ) if drop_path_rate > 0. else nn.Identity()
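# Illustrative sketch (not part of the original file): an LSA block as
# configured above, with assumed example sizes. `window_size` sets the side of
# the local window within which LocallyGroupedSelfAttention computes
# attention; Twins-style backbones typically alternate LSA and GSA blocks so
# that local and globally subsampled attention complement each other.
lsa_block = LSAEncoderLayer(
    embed_dims=64,
    num_heads=2,
    feedforward_channels=256,
    drop_path_rate=0.1,
    window_size=7)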