def __init__(
    self,
    in_channels=1,
    out_channels=32,
    input_dim=312,
    hidden_dim=32,
    output_dim=10,
):
    super(cnn1d_ser, self).__init__()
    self.classifier = nn.Sequential(
        nn.Conv1d(in_channels, out_channels, 5, stride=1, padding=2),
        nn.BatchNorm1d(out_channels),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Conv1d(out_channels, out_channels, 5, stride=1, padding=2),
        nn.BatchNorm1d(out_channels),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Flatten(),
        nn.Linear(input_dim * out_channels, hidden_dim),
        nn.BatchNorm1d(hidden_dim),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(hidden_dim, output_dim),
    )
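# Illustrative sketch (not in the original source): a forward pass implied by the
# cnn1d_ser constructor above. It assumes input of shape (batch, in_channels, input_dim),
# e.g. (N, 1, 312); both conv layers preserve the length (kernel 5, padding 2), so the
# flattened feature size entering the first Linear is input_dim * out_channels.
def forward(self, x):
    # x: (batch, 1, 312) -> logits: (batch, output_dim)
    return self.classifier(x)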
def __init__(self, num_classes: int = 1000) -> None:
    super(QuantizationAlexNet, self).__init__()
    self.features = nn.Sequential(
        nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Conv2d(64, 192, kernel_size=5, padding=2),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Conv2d(192, 384, kernel_size=3, padding=1),
        nn.ReLU(inplace=True),
        nn.Conv2d(384, 256, kernel_size=3, padding=1),
        nn.ReLU(inplace=True),
        nn.Conv2d(256, 256, kernel_size=3, padding=1),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=3, stride=2),
    )
    self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
    self.classifier = nn.Sequential(
        nn.Dropout(),
        nn.Linear(256 * 6 * 6, 4096),
        nn.ReLU(inplace=True),
        nn.Dropout(),
        nn.Linear(4096, 4096),
        nn.ReLU(inplace=True),
        nn.Linear(4096, num_classes),
    )
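# Illustrative sketch (not in the original source): the standard AlexNet-style forward
# pass this constructor supports. Any quantization-specific stubs suggested by the class
# name are omitted here because they are not defined in the snippet above.
def forward(self, x):
    x = self.features(x)       # (N, 256, h, w)
    x = self.avgpool(x)        # (N, 256, 6, 6)
    x = x.flatten(1)           # (N, 256 * 6 * 6)
    return self.classifier(x)  # (N, num_classes)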
def __init__(self, num_classes=10):
    super(AlexNet, self).__init__()
    self.features = nn.Sequential(
        nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=2),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=2),
        nn.Conv2d(64, 192, kernel_size=3, padding=2),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=2),
        nn.Conv2d(192, 384, kernel_size=3, padding=1),
        nn.ReLU(inplace=True),
        nn.Conv2d(384, 256, kernel_size=3, padding=1),
        nn.ReLU(inplace=True),
        nn.Conv2d(256, 256, kernel_size=3, padding=1),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=3, stride=2),
    )
    self.fc_layers = nn.Sequential(
        nn.Dropout(0.6),
        nn.Linear(4096, 2048),
        nn.ReLU(inplace=True),
        nn.Dropout(0.6),
        nn.Linear(2048, 2048),
        nn.ReLU(inplace=True),
        nn.Linear(2048, num_classes),
    )
def __init__(self, in_dim, mlp_dim, out_dim, dropout_rate=0.1):
    super(MlpBlock, self).__init__()
    # init layers
    self.fc1 = nn.Linear(in_dim, mlp_dim)
    self.fc2 = nn.Linear(mlp_dim, out_dim)
    self.act = nn.GELU()
    if dropout_rate > 0.0:
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)
    else:
        self.dropout1 = None
        self.dropout2 = None
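# Illustrative sketch (not in the original source): a forward pass consistent with the
# MlpBlock constructor above, skipping the dropout layers when they were set to None.
def forward(self, x):
    x = self.fc1(x)
    x = self.act(x)
    if self.dropout1 is not None:
        x = self.dropout1(x)
    x = self.fc2(x)
    if self.dropout2 is not None:
        x = self.dropout2(x)
    return x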
def __init__(
    self,
    max_position_embeddings,
    hidden_size,
    nheads,
    dropout=0,
    position_embedding_type="absolute",
    is_decoder=False,
):
    super(BertSelfAttention, self).__init__()
    if hidden_size % nheads != 0:
        raise ValueError(
            f"The hidden size ({hidden_size}) is not a multiple of the number of attention "
            f"heads ({nheads})"
        )

    self.num_attention_heads = nheads
    self.attention_head_size = int(hidden_size / nheads)
    self.all_head_size = self.num_attention_heads * self.attention_head_size

    self.query = nn.Linear(hidden_size, self.all_head_size)
    self.key = nn.Linear(hidden_size, self.all_head_size)
    self.value = nn.Linear(hidden_size, self.all_head_size)

    self.dropout = nn.Dropout(dropout)
    self.position_embedding_type = position_embedding_type
    if (
        self.position_embedding_type == "relative_key"
        or self.position_embedding_type == "relative_key_query"
    ):
        self.max_position_embeddings = max_position_embeddings
        self.distance_embedding = nn.Embedding(
            2 * max_position_embeddings - 1, self.attention_head_size
        )
    self.is_decoder = is_decoder
def __init__(
    self,
    spatial_feature_size=7,
    dropout_ratio=0.8,
    num_classes=101,
    with_avg_pool=False,
    temporal_feature_size=1,
    in_channels=2048,
    init_std=0.01,
    fcn_testing=False,
):
    super(ClsHead, self).__init__()
    self.with_avg_pool = with_avg_pool
    self.dropout_ratio = dropout_ratio
    self.in_channels = in_channels
    self.temporal_feature_size = temporal_feature_size
    self.spatial_feature_size = spatial_feature_size
    self.init_std = init_std
    self.fcn_testing = fcn_testing
    self.num_classes = num_classes

    if self.dropout_ratio != 0:
        self.dropout = nn.Dropout(p=self.dropout_ratio)
    else:
        self.dropout = None

    if self.with_avg_pool:
        self.avg_pool = nn.AvgPool3d(
            (temporal_feature_size, spatial_feature_size, spatial_feature_size)
        )

    self.fc_cls = nn.Linear(in_channels, num_classes)
    self.new_cls = None
def __init__(
    self,
    word_emb_dim,
    vocab_size,
    dim_channel,
    kernel_wins,
    dropout_rate,
    num_class,
    max_seq_len,
    training=True,
):
    super(textCNN, self).__init__()
    self.embed = nn.Embedding(vocab_size, word_emb_dim)
    self.convs = nn.ModuleList(
        [nn.Conv2d(1, dim_channel, (w, word_emb_dim)) for w in kernel_wins]
    )
    self.maxpool = nn.ModuleList(
        [nn.MaxPool2d((max_seq_len - w + 1, 1), stride=1) for w in kernel_wins]
    )
    # Dropout layer
    self.dropout = nn.Dropout(dropout_rate)
    # NOTE: this overwrites nn.Module's built-in `training` flag, which is normally
    # managed by .train()/.eval().
    self.training = training
    # FC layer
    self.fc = nn.Linear(len(kernel_wins) * dim_channel, num_class)
def __init__(
    self,
    c_in,
    c_cond,
    c_h,
    c_out,
    kernel_size,
    n_conv_blocks,
    upsample,
    act,
    sn,
    dropout_rate,
):
    super(Decoder, self).__init__()
    self.n_conv_blocks = n_conv_blocks
    self.upsample = upsample
    self.act = get_act(act)
    # NOTE: `f` is an identity wrapper here; the spectral-norm flag `sn` is not used.
    f = lambda x: x
    self.in_conv_layer = f(nn.Conv1d(c_in, c_h, kernel_size=1))
    self.first_conv_layers = nn.ModuleList(
        [f(nn.Conv1d(c_h, c_h, kernel_size=kernel_size)) for _ in range(n_conv_blocks)]
    )
    self.second_conv_layers = nn.ModuleList(
        [
            f(nn.Conv1d(c_h, c_h * up, kernel_size=kernel_size))
            for _, up in zip(range(n_conv_blocks), self.upsample)
        ]
    )
    self.norm_layer = nn.InstanceNorm1d(c_h, affine=False)
    self.conv_affine_layers = nn.ModuleList(
        [f(nn.Linear(c_cond, c_h * 2)) for _ in range(n_conv_blocks * 2)]
    )
    self.out_conv_layer = f(nn.Conv1d(c_h, c_out, kernel_size=1))
    self.dropout_layer = nn.Dropout(p=dropout_rate)
def __init__(self, intermediate_size, config):
    super().__init__()
    embed_dim = config.hidden_size
    self.c_fc = Conv1D(intermediate_size, embed_dim)
    self.c_proj = Conv1D(embed_dim, intermediate_size)
    self.act = gelu
    self.dropout = nn.Dropout(config.resid_pdrop)
def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
    super().__init__()
    self.n_head = n_head
    self.d_k = d_k
    self.d_v = d_v

    self.w_qs = nn.Linear(d_model, n_head * d_k)
    self.w_ks = nn.Linear(d_model, n_head * d_k)
    self.w_vs = nn.Linear(d_model, n_head * d_v)
    nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
    nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
    nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))

    self.attention = ScaledDotProductAttention(
        temperature=np.power(d_k, 0.5), attn_dropout=dropout
    )
    self.layer_norm = nn.LayerNorm(d_model)

    self.fc = nn.Linear(n_head * d_v, d_model)
    nn.init.xavier_normal_(self.fc.weight)

    self.dropout = nn.Dropout(dropout)
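# Illustrative sketch (not in the original source): a forward pass in the style of the
# classic "Attention Is All You Need" PyTorch implementations that use this constructor.
# It assumes q, k, v have shape (batch, len, d_model) and that ScaledDotProductAttention
# is called as self.attention(q, k, v, mask=mask) and returns (output, attn); that call
# signature is an assumption, not shown above.
def forward(self, q, k, v, mask=None):
    d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
    sz_b, len_q, _ = q.size()
    sz_b, len_k, _ = k.size()
    sz_b, len_v, _ = v.size()

    residual = q

    # project and split into heads
    q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
    k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
    v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)

    # fold the head dimension into the batch dimension: (n_head * b, len, d)
    q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k)
    k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k)
    v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v)

    if mask is not None:
        mask = mask.repeat(n_head, 1, 1)
    output, attn = self.attention(q, k, v, mask=mask)

    # un-fold heads and concatenate them back along the feature dimension
    output = output.view(n_head, sz_b, len_q, d_v)
    output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1)

    # output projection, dropout, then residual connection and layer norm
    output = self.dropout(self.fc(output))
    output = self.layer_norm(output + residual)
    return output, attn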
def __init__(
    self,
    input_size: int,
    hidden_size: int,
    num_layers: int = 1,
    bias: bool = True,
    batch_first: bool = False,
    dropout: float = 0,
    bidirectional: bool = False,
):
    super().__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.bias = bias
    self.batch_first = batch_first
    self.dropout = dropout
    self.bidirectional = bidirectional
    num_directions = 2 if bidirectional else 1
    gate_size = 3 * hidden_size
    self.drop = nn.Dropout(self.dropout)

    for layer in range(num_layers):
        for direction in range(num_directions):
            real_hidden_size = hidden_size
            layer_input_size = (
                input_size if layer == 0 else real_hidden_size * num_directions
            )
            # TODO: Modify after adding the stride attribute
            # w_ih = flow.nn.Parameter(flow.Tensor(gate_size, layer_input_size))
            # w_hh = flow.nn.Parameter(flow.Tensor(gate_size, real_hidden_size))
            # b_ih = flow.nn.Parameter(flow.Tensor(gate_size))
            # b_hh = flow.nn.Parameter(flow.Tensor(gate_size))
            w_ih = flow.nn.Parameter(flow.Tensor(layer_input_size, gate_size))
            w_hh = flow.nn.Parameter(flow.Tensor(real_hidden_size, gate_size))
            b_ih = flow.nn.Parameter(flow.Tensor(gate_size))
            b_hh = flow.nn.Parameter(flow.Tensor(gate_size))

            layer_params = ()
            if bias:
                layer_params = (w_ih, w_hh, b_ih, b_hh)
            else:
                layer_params = (w_ih, w_hh)

            suffix = "_reverse" if direction == 1 else ""
            param_names = ["weight_ih_l{}{}", "weight_hh_l{}{}"]
            if bias:
                param_names += ["bias_ih_l{}{}", "bias_hh_l{}{}"]
            param_names = [x.format(layer, suffix) for x in param_names]

            for name, param in zip(param_names, layer_params):
                setattr(self, name, param)

    self.reset_parameters()
def __init__(self, source_dim, output_dim, enable_output_proj=True, dropout=0.0):
    super(BasedAttention, self).__init__()
    self.enable_output_proj = enable_output_proj
    if self.enable_output_proj:
        self.output_proj = nn.Linear(source_dim, output_dim)
    self.dropout = nn.Dropout(dropout)
def __init__(
    self,
    n_heads,
    d_model,
    d_ff,
    memory_dim,
    slf_attn_dropout=0.0,
    src_attn_dropout=0.0,
    ffn_dropout=0.0,
    residual_dropout=0.1,
    normalize_before=False,
    concat_after=False,
    relative_positional=False,
    activation="relu",
):
    super(TransformerDecoderLayer, self).__init__()

    self.relative_positional = relative_positional
    if self.relative_positional:
        self.slf_attn = MultiHeadedSelfAttentionWithRelPos(
            n_heads, d_model, slf_attn_dropout
        )
    else:
        self.slf_attn = MultiHeadedSelfAttention(n_heads, d_model, slf_attn_dropout)

    self.src_attn = MultiHeadedCrossAttention(
        n_heads, d_model, memory_dim, src_attn_dropout
    )
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, ffn_dropout, activation)

    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.norm3 = nn.LayerNorm(d_model)

    self.dropout1 = nn.Dropout(residual_dropout)
    self.dropout2 = nn.Dropout(residual_dropout)
    self.dropout3 = nn.Dropout(residual_dropout)

    self.normalize_before = normalize_before
    self.concat_after = concat_after
    if self.concat_after:
        self.concat_linear1 = nn.Linear(d_model * 2, d_model)
        self.concat_linear2 = nn.Linear(d_model * 2, d_model)
def __init__(
    self,
    dim,
    window_size,
    num_heads,
    qkv_bias=True,
    qk_scale=None,
    attn_drop=0.0,
    proj_drop=0.0,
):
    super().__init__()
    self.dim = dim
    self.window_size = window_size  # Wh, Ww
    self.num_heads = num_heads
    head_dim = dim // num_heads
    self.scale = qk_scale or head_dim ** -0.5

    # define a parameter table of relative position bias
    # Author zzk: we add trunc normal here!
    self.relative_position_bias_table = nn.Parameter(
        flow.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
    )  # 2*Wh-1 * 2*Ww-1, nH
    self.relative_position_bias_table.trunc_normal_(std=0.02)

    # get pair-wise relative position index for each token inside the window
    coords_h = flow.arange(self.window_size[0])
    coords_w = flow.arange(self.window_size[1])
    coords = flow.stack(flow.meshgrid(*[coords_h, coords_w]))  # 2, Wh, Ww
    coords_flatten = flow.flatten(coords, 1)  # 2, Wh*Ww
    relative_coords = (
        coords_flatten[:, :, None] - coords_flatten[:, None, :]
    )  # 2, Wh*Ww, Wh*Ww
    relative_coords = relative_coords.permute(1, 2, 0)  # Wh*Ww, Wh*Ww, 2
    relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
    relative_coords[:, :, 1] += self.window_size[1] - 1
    relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
    relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
    self.register_buffer("relative_position_index", relative_position_index)

    self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
    self.attn_drop = nn.Dropout(attn_drop)
    self.proj = nn.Linear(dim, dim)
    self.proj_drop = nn.Dropout(proj_drop)

    self.softmax = nn.Softmax(dim=-1)
def __init__(self, input_dim, hidden_dim, output_dim, batch_size):
    super(lstm_ser, self).__init__()
    self.classifier = nn.Sequential(
        LSTM(input_dim, hidden_dim, batch_size),
        nn.Dropout(0.5),
        nn.Linear(hidden_dim, 32),
        nn.ReLU(),
        nn.Linear(32, output_dim),
    )
def __init__(self, num_patches, emb_dim, dropout_rate=0.1):
    super(PositionEmbs, self).__init__()
    self.pos_embedding = nn.Parameter(
        flow.tensor(np.random.randn(1, num_patches + 1, emb_dim), dtype=flow.float32)
    )
    if dropout_rate > 0:
        self.dropout = nn.Dropout(dropout_rate)
    else:
        self.dropout = None
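# Illustrative sketch (not in the original source): a forward pass consistent with the
# PositionEmbs constructor above. It assumes x already includes the class token, i.e.
# x has shape (batch, num_patches + 1, emb_dim), matching self.pos_embedding.
def forward(self, x):
    out = x + self.pos_embedding
    if self.dropout is not None:
        out = self.dropout(out)
    return out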
def __init__(
    self, features: nn.Module, num_classes: int = 1000, init_weights: bool = True
) -> None:
    super(VGG, self).__init__()
    self.features = features
    self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
    self.classifier = nn.Sequential(
        nn.Linear(512 * 7 * 7, 4096),
        nn.ReLU(True),
        nn.Dropout(),
        nn.Linear(4096, 4096),
        nn.ReLU(True),
        nn.Dropout(),
        nn.Linear(4096, num_classes),
    )
    if init_weights:
        self._initialize_weights()
def __init__(self, hidden_size, intermediate_size, layer_norm_eps=1e-5, dropout=0):
    super(BertOutput, self).__init__()
    self.dense = nn.Linear(intermediate_size, hidden_size)
    self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
    self.dropout = nn.Dropout(dropout)
def __init__(self, d_model, d_ff, dropout, activation="relu"):
    super(PositionwiseFeedForward, self).__init__()
    self.activation = activation
    assert activation in ["relu", "gelu", "glu", "tanh", "swish"]

    self.w_1 = nn.Linear(d_model, d_ff * 2 if activation == "glu" else d_ff)
    self.w_2 = nn.Linear(d_ff, d_model)
    self.dropout = nn.Dropout(dropout)
def __init__(self, config):
    super(GPT2Model, self).__init__()
    self.embed_dim = config.hidden_size

    self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
    self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)

    self.drop = nn.Dropout(config.embd_pdrop)
    self.h = nn.ModuleList(
        [GPT2Block(config) for _ in range(config.num_hidden_layers)]
    )
    self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
def __init__(
    self,
    hidden_size: int,
    intermediate_size: int,
    hidden_dropout_prob: float = 0.1,
    hidden_act: str = "relu",
) -> None:
    super().__init__()
    self.hidden_act = hidden_act
    self.intermediate = nn.Linear(hidden_size, intermediate_size)
    self.output = nn.Linear(intermediate_size, hidden_size)
    self.dropout = nn.Dropout(hidden_dropout_prob)
def __init__(self, config: Callable[..., None]) -> None:
    super().__init__()
    self.token_embeddings = nn.Embedding(
        config.vocab_size, config.hidden_size, padding_idx=0
    )
    self.position_embeddings = nn.Embedding(
        config.max_position_embeddings, config.hidden_size
    )
    self.token_type_embeddings = nn.Embedding(
        config.type_vocab_size, config.hidden_size
    )
    self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-12)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
def __init__(self, cfgs, num_classes=1000, width=1.0, dropout=0.2):
    super(GhostNet, self).__init__()
    # setting of inverted residual blocks
    self.cfgs = cfgs
    self.dropout = dropout

    # building first layer
    output_channel = _make_divisible(16 * width, 4)
    self.conv_stem = nn.Conv2d(3, output_channel, 3, 2, 1, bias=False)
    self.bn1 = nn.BatchNorm2d(output_channel)
    self.act1 = nn.ReLU(inplace=True)
    input_channel = output_channel

    # building inverted residual blocks
    stages = []
    block = GhostBottleneck
    for cfg in self.cfgs:
        layers = []
        for k, exp_size, c, se_ratio, s in cfg:
            output_channel = _make_divisible(c * width, 4)
            hidden_channel = _make_divisible(exp_size * width, 4)
            layers.append(
                block(
                    input_channel,
                    hidden_channel,
                    output_channel,
                    k,
                    s,
                    se_ratio=se_ratio,
                )
            )
            input_channel = output_channel
        stages.append(nn.Sequential(*layers))

    output_channel = _make_divisible(exp_size * width, 4)
    stages.append(nn.Sequential(ConvBnAct(input_channel, output_channel, 1)))
    input_channel = output_channel

    self.blocks = nn.Sequential(*stages)

    # building last several layers
    output_channel = 1280
    self.global_pool = nn.AdaptiveAvgPool2d((1, 1))
    self.conv_head = nn.Conv2d(input_channel, output_channel, 1, 1, 0, bias=True)
    self.act2 = nn.ReLU(inplace=True)
    self.classifier = nn.Linear(output_channel, num_classes)
    self.dropout = nn.Dropout(p=self.dropout)
def __init__(
    self,
    input_size,
    in_channel,
    out_channel,
    kernel_size,
    stride,
    dropout=0.1,
    batch_norm=False,
    residual=False,
    act_func_type="relu",
):
    super(Conv2dLayer, self).__init__()

    self.input_size = input_size
    self.in_channel = in_channel
    self.out_channel = out_channel
    self.batch_norm = batch_norm
    self.kernel_size = kernel_size
    self.stride = stride
    self.padding = (
        0,
        kernel_size // 2 if isinstance(self.kernel_size, int) else kernel_size[1] // 2,
    )
    self.residual = residual
    self.act_func_type = act_func_type

    self.conv_layer = nn.Conv2d(
        in_channels=in_channel,
        out_channels=out_channel,
        kernel_size=self.kernel_size,
        stride=self.stride,
        padding=self.padding,
    )
    self.output_size = cal_width_dim_2d(
        input_size,
        self.kernel_size if isinstance(self.kernel_size, int) else self.kernel_size[1],
        self.stride if isinstance(self.stride, int) else self.stride[1],
        padding=self.padding if isinstance(self.padding, int) else self.padding[1],
    )

    if self.batch_norm:
        self.norm = nn.BatchNorm2d(out_channel)

    self.dropout = nn.Dropout(dropout)
def __init__(self, d_model, dropout=0.1, max_len=5000):
    super(PositionalEncoding, self).__init__()
    self.dropout = nn.Dropout(p=dropout)

    pe = flow.zeros((max_len, d_model))
    position = flow.arange(0, max_len, dtype=flow.float).unsqueeze(1)
    div_term = flow.exp(
        flow.arange(0, d_model, 2).to(flow.float) * (-math.log(10000.0) / d_model)
    ).unsqueeze(0)
    pe[:, 0::2] = flow.sin(position * div_term)
    pe[:, 1::2] = flow.cos(position * div_term)
    pe = pe.unsqueeze(0).transpose(0, 1)
    self.pe = flow.nn.Parameter(pe, requires_grad=False)
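# Illustrative sketch (not in the original source): the usual forward pass for this kind
# of sinusoidal PositionalEncoding. Because pe was transposed to (max_len, 1, d_model),
# x is assumed to have shape (seq_len, batch, d_model).
def forward(self, x):
    x = x + self.pe[: x.size(0), :]
    return self.dropout(x)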
def __init__(
    self,
    sos_id,
    eos_id,
    n_tgt_vocab,
    d_word_vec,
    n_layers,
    n_head,
    d_k,
    d_v,
    d_model,
    d_inner,
    dropout=0.1,
    tgt_emb_prj_weight_sharing=True,
    pe_maxlen=5000,
):
    super(Decoder, self).__init__()
    # parameters
    self.sos_id = sos_id
    self.eos_id = eos_id
    self.n_tgt_vocab = n_tgt_vocab
    self.d_word_vec = d_word_vec
    self.n_layers = n_layers
    self.n_head = n_head
    self.d_k = d_k
    self.d_v = d_v
    self.d_model = d_model
    self.d_inner = d_inner
    self.dropout = dropout
    self.tgt_emb_prj_weight_sharing = tgt_emb_prj_weight_sharing
    self.pe_maxlen = pe_maxlen

    self.tgt_word_emb = nn.Embedding(n_tgt_vocab, d_word_vec)
    self.positional_encoding = PositionalEncoding(d_model, max_len=pe_maxlen)
    self.dropout = nn.Dropout(dropout)
    self.layer_stack = nn.ModuleList(
        [
            DecoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)
        ]
    )
    self.tgt_word_prj = nn.Linear(d_model, n_tgt_vocab, bias=False)
    nn.init.xavier_normal_(self.tgt_word_prj.weight)

    if tgt_emb_prj_weight_sharing:
        # Share the weight matrix between target word embedding & the final logit dense layer
        self.tgt_word_prj.weight = self.tgt_word_emb.weight
        self.x_logit_scale = d_model ** 0.5
    else:
        self.x_logit_scale = 1.0
def build_conv_block(self, dim, padding_type, norm_layer, use_dropout, use_bias):
    """Construct a convolutional block.

    Parameters:
        dim (int)           -- the number of channels in the conv layer.
        padding_type (str)  -- the name of padding layer: reflect | replicate | zero
        norm_layer          -- normalization layer
        use_dropout (bool)  -- if use dropout layers.
        use_bias (bool)     -- if the conv layer uses bias or not

    Returns a conv block (with a conv layer, a normalization layer, and a non-linearity layer (ReLU))
    """
    conv_block = []
    p = 0
    if padding_type == "reflect":
        conv_block += [nn.ReflectionPad2d(1)]
    elif padding_type == "replicate":
        conv_block += [nn.ReplicationPad2d(1)]
    elif padding_type == "zero":
        p = 1
    else:
        raise NotImplementedError("padding [%s] is not implemented" % padding_type)

    conv_block += [
        nn.Conv2d(dim, dim, kernel_size=3, padding=p, bias=use_bias),
        norm_layer(dim),
        nn.ReLU(True),
    ]
    if use_dropout:
        conv_block += [nn.Dropout(0.5)]

    p = 0
    if padding_type == "reflect":
        conv_block += [nn.ReflectionPad2d(1)]
    elif padding_type == "replicate":
        conv_block += [nn.ReplicationPad2d(1)]
    elif padding_type == "zero":
        p = 1
    else:
        raise NotImplementedError("padding [%s] is not implemented" % padding_type)

    conv_block += [
        nn.Conv2d(dim, dim, kernel_size=3, padding=p, bias=use_bias),
        norm_layer(dim),
    ]

    return nn.Sequential(*conv_block)
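# Illustrative usage (not in the original source): building one ResNet-style block with
# reflection padding, batch norm, dropout, and no conv bias; the argument values below
# are arbitrary examples.
# conv_block = self.build_conv_block(
#     dim=256,
#     padding_type="reflect",
#     norm_layer=nn.BatchNorm2d,
#     use_dropout=True,
#     use_bias=False,
# )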
def __init__(self, in_dim, heads=8, dropout_rate=0.1):
    super(SelfAttention, self).__init__()
    self.heads = heads
    self.head_dim = in_dim // heads
    self.scale = self.head_dim ** 0.5

    self.query = nn.Linear(in_dim, self.heads * self.head_dim)
    self.key = nn.Linear(in_dim, self.heads * self.head_dim)
    self.value = nn.Linear(in_dim, self.heads * self.head_dim)
    self.out = nn.Linear(self.heads * self.head_dim, in_dim)

    if dropout_rate > 0:
        self.dropout = nn.Dropout(dropout_rate)
    else:
        self.dropout = None
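# Illustrative sketch (not in the original source): a multi-head self-attention forward
# pass consistent with the SelfAttention constructor above. It assumes x has shape
# (batch, seq_len, in_dim); note that self.scale is head_dim ** 0.5, so scores are
# divided by it rather than multiplied.
def forward(self, x):
    b, n, _ = x.shape
    # project and split into heads: (batch, heads, seq_len, head_dim)
    q = self.query(x).view(b, n, self.heads, self.head_dim).transpose(1, 2)
    k = self.key(x).view(b, n, self.heads, self.head_dim).transpose(1, 2)
    v = self.value(x).view(b, n, self.heads, self.head_dim).transpose(1, 2)

    # scaled dot-product attention
    attn = (q @ k.transpose(-2, -1)) / self.scale
    attn = attn.softmax(dim=-1)
    if self.dropout is not None:
        attn = self.dropout(attn)

    # merge heads and project back to in_dim
    out = (attn @ v).transpose(1, 2).reshape(b, n, self.heads * self.head_dim)
    return self.out(out)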
def __init__(self, config):
    super(GPT2Attention, self).__init__()

    max_positions = config.max_position_embeddings
    self.register_buffer(
        "bias",
        flow.tril(
            flow.ones((max_positions, max_positions), dtype=flow.int8)
        ).view(1, 1, max_positions, max_positions),
    )
    self.register_buffer("masked_bias", flow.tensor(-1e4))

    self.embed_dim = config.hidden_size
    self.num_heads = config.num_attention_heads
    assert self.embed_dim % self.num_heads == 0
    self.head_dim = self.embed_dim // self.num_heads
    self.scale_attn_weights = config.scale_attn_weights

    self.c_attn = Conv1D(self.embed_dim * 3, self.embed_dim)
    self.c_proj = Conv1D(self.embed_dim, self.embed_dim)

    self.attn_dropout = nn.Dropout(config.attn_pdrop)
    self.resid_dropout = nn.Dropout(config.resid_pdrop)
def __init__(
    self,
    in_features,
    hidden_features=None,
    out_features=None,
    act_layer=nn.GELU,
    drop=0.0,
):
    super().__init__()
    out_features = out_features or in_features
    hidden_features = hidden_features or in_features
    self.fc1 = nn.Linear(in_features, hidden_features)
    self.act = act_layer()
    self.fc2 = nn.Linear(hidden_features, out_features)
    self.drop = nn.Dropout(drop)
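# Illustrative sketch (not in the original source): the conventional forward pass for
# this timm-style Mlp block (fc1 -> activation -> dropout -> fc2 -> dropout).
def forward(self, x):
    x = self.fc1(x)
    x = self.act(x)
    x = self.drop(x)
    x = self.fc2(x)
    x = self.drop(x)
    return x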