Example #1
 def __init__(
     self,
     in_channels=1,
     out_channels=32,
     input_dim=312,
     hidden_dim=32,
     output_dim=10,
 ):
     super(cnn1d_ser, self).__init__()
     self.classifier = nn.Sequential(
         nn.Conv1d(in_channels, out_channels, 5, stride=1, padding=2),
         nn.BatchNorm1d(out_channels),
         nn.ReLU(),
         nn.Dropout(0.5),
         nn.Conv1d(out_channels, out_channels, 5, stride=1, padding=2),
         nn.BatchNorm1d(out_channels),
         nn.ReLU(),
         nn.Dropout(0.5),
         nn.Flatten(),
         nn.Linear(input_dim * out_channels, hidden_dim),
         nn.BatchNorm1d(hidden_dim),
         nn.ReLU(),
         nn.Dropout(0.5),
         nn.Linear(hidden_dim, output_dim),
     )
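A quick, self-contained shape check for this classifier (my assumption: inputs are shaped (batch, in_channels, input_dim), using the oneflow-style nn API the other examples here rely on). Both Conv1d layers use kernel 5, stride 1, padding 2, so the length 312 is preserved and Flatten feeds input_dim * out_channels = 312 * 32 features to the first Linear:

import oneflow as flow
import oneflow.nn as nn

# Hypothetical shape check, not part of the original project.
stem = nn.Sequential(
    nn.Conv1d(1, 32, 5, stride=1, padding=2),  # length preserved: padding == (5 - 1) // 2
    nn.Flatten(),
)
x = flow.randn(2, 1, 312)      # (batch, in_channels, input_dim)
print(stem(x).shape)           # (2, 9984) == (2, 312 * 32)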
Example #2
 def __init__(self, num_classes: int = 1000) -> None:
     super(QuantizationAlexNet, self).__init__()
     self.features = nn.Sequential(
         nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
         nn.ReLU(inplace=True),
         nn.MaxPool2d(kernel_size=3, stride=2),
         nn.Conv2d(64, 192, kernel_size=5, padding=2),
         nn.ReLU(inplace=True),
         nn.MaxPool2d(kernel_size=3, stride=2),
         nn.Conv2d(192, 384, kernel_size=3, padding=1),
         nn.ReLU(inplace=True),
         nn.Conv2d(384, 256, kernel_size=3, padding=1),
         nn.ReLU(inplace=True),
         nn.Conv2d(256, 256, kernel_size=3, padding=1),
         nn.ReLU(inplace=True),
         nn.MaxPool2d(kernel_size=3, stride=2),
     )
     self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
     self.classifier = nn.Sequential(
         nn.Dropout(),
         nn.Linear(256 * 6 * 6, 4096),
         nn.ReLU(inplace=True),
         nn.Dropout(),
         nn.Linear(4096, 4096),
         nn.ReLU(inplace=True),
         nn.Linear(4096, num_classes),
     )
Example #3
 def __init__(self, num_classes=10):
     super(AlexNet, self).__init__()
     self.features = nn.Sequential(
         nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=2),
         nn.ReLU(inplace=True),
         nn.MaxPool2d(kernel_size=2),
         nn.Conv2d(64, 192, kernel_size=3, padding=2),
         nn.ReLU(inplace=True),
         nn.MaxPool2d(kernel_size=2),
         nn.Conv2d(192, 384, kernel_size=3, padding=1),
         nn.ReLU(inplace=True),
         nn.Conv2d(384, 256, kernel_size=3, padding=1),
         nn.ReLU(inplace=True),
         nn.Conv2d(256, 256, kernel_size=3, padding=1),
         nn.ReLU(inplace=True),
         nn.MaxPool2d(kernel_size=3, stride=2),
     )
     self.fc_layers = nn.Sequential(
         nn.Dropout(0.6),
         nn.Linear(4096, 2048),
         nn.ReLU(inplace=True),
         nn.Dropout(0.6),
         nn.Linear(2048, 2048),
         nn.ReLU(inplace=True),
         nn.Linear(2048, num_classes),
     )
Example #4
    def __init__(self, in_dim, mlp_dim, out_dim, dropout_rate=0.1):
        super(MlpBlock, self).__init__()

        # init layers
        self.fc1 = nn.Linear(in_dim, mlp_dim)
        self.fc2 = nn.Linear(mlp_dim, out_dim)
        self.act = nn.GELU()
        if dropout_rate > 0.0:
            self.dropout1 = nn.Dropout(dropout_rate)
            self.dropout2 = nn.Dropout(dropout_rate)
        else:
            self.dropout1 = None
            self.dropout2 = None
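For context, a self-contained sketch of how this optional-dropout MLP block is typically driven in forward() (the forward itself is my assumption; it is not shown in the example):

import oneflow as flow
import oneflow.nn as nn

class MlpBlockSketch(nn.Module):
    # Hypothetical mirror of the MlpBlock above, with an assumed forward().
    def __init__(self, in_dim, mlp_dim, out_dim, dropout_rate=0.1):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, mlp_dim)
        self.fc2 = nn.Linear(mlp_dim, out_dim)
        self.act = nn.GELU()
        self.dropout1 = nn.Dropout(dropout_rate) if dropout_rate > 0 else None
        self.dropout2 = nn.Dropout(dropout_rate) if dropout_rate > 0 else None

    def forward(self, x):
        x = self.act(self.fc1(x))
        if self.dropout1 is not None:
            x = self.dropout1(x)
        x = self.fc2(x)
        if self.dropout2 is not None:
            x = self.dropout2(x)
        return x

y = MlpBlockSketch(64, 256, 64)(flow.randn(8, 64))   # -> (8, 64)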
Example #5
    def __init__(
        self,
        max_position_embeddings,
        hidden_size,
        nheads,
        dropout=0,
        position_embedding_type="absolute",
        is_decoder=False,
    ):
        super(BertSelfAttention, self).__init__()
        if hidden_size % nheads != 0:
            raise ValueError(
                f"The hidden size ({hidden_size}) is not a multiple of the number of attention "
                f"heads ({nheads})")

        self.num_attention_heads = nheads
        self.attention_head_size = int(hidden_size / nheads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(hidden_size, self.all_head_size)
        self.key = nn.Linear(hidden_size, self.all_head_size)
        self.value = nn.Linear(hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(dropout)
        self.position_embedding_type = position_embedding_type
        if self.position_embedding_type in ("relative_key", "relative_key_query"):
            self.max_position_embeddings = max_position_embeddings
            self.distance_embedding = nn.Embedding(
                2 * max_position_embeddings - 1, self.attention_head_size
            )

        self.is_decoder = is_decoder
Example #6
    def __init__(
        self,
        spatial_feature_size=7,
        dropout_ratio=0.8,
        num_classes=101,
        with_avg_pool=False,
        temporal_feature_size=1,
        in_channels=2048,
        init_std=0.01,
        fcn_testing=False,
    ):

        super(ClsHead, self).__init__()

        self.with_avg_pool = with_avg_pool
        self.dropout_ratio = dropout_ratio
        self.in_channels = in_channels
        self.temporal_feature_size = temporal_feature_size
        self.spatial_feature_size = spatial_feature_size
        self.init_std = init_std
        self.fcn_testing = fcn_testing
        self.num_classes = num_classes

        if self.dropout_ratio != 0:
            self.dropout = nn.Dropout(p=self.dropout_ratio)
        else:
            self.dropout = None
        if self.with_avg_pool:
            self.avg_pool = nn.AvgPool3d(
                (temporal_feature_size, spatial_feature_size,
                 spatial_feature_size))

        self.fc_cls = nn.Linear(in_channels, num_classes)
        self.new_cls = None
Example #7
 def __init__(
     self,
     word_emb_dim,
     vocab_size,
     dim_channel,
     kernel_wins,
     dropout_rate,
     num_class,
     max_seq_len,
     training=True,
 ):
     super(textCNN, self).__init__()
     self.embed = nn.Embedding(vocab_size, word_emb_dim)
     self.convs = nn.ModuleList([
         nn.Conv2d(1, dim_channel, (w, word_emb_dim)) for w in kernel_wins
     ])
     self.maxpool = nn.ModuleList([
         nn.MaxPool2d((max_seq_len - w + 1, 1), stride=1)
         for w in kernel_wins
     ])
     # Dropout layer
     self.dropout = nn.Dropout(dropout_rate)
     self.training = training
     # FC layer
     self.fc = nn.Linear(len(kernel_wins) * dim_channel, num_class)
Example #8
 def __init__(
     self,
     c_in,
     c_cond,
     c_h,
     c_out,
     kernel_size,
     n_conv_blocks,
     upsample,
     act,
     sn,
     dropout_rate,
 ):
     super(Decoder, self).__init__()
     self.n_conv_blocks = n_conv_blocks
     self.upsample = upsample
     self.act = get_act(act)
     f = lambda x: x
     self.in_conv_layer = f(nn.Conv1d(c_in, c_h, kernel_size=1))
     self.first_conv_layers = nn.ModuleList([
         f(nn.Conv1d(c_h, c_h, kernel_size=kernel_size))
         for _ in range(n_conv_blocks)
     ])
     self.second_conv_layers = nn.ModuleList([
         f(nn.Conv1d(c_h, c_h * up, kernel_size=kernel_size))
         for _, up in zip(range(n_conv_blocks), self.upsample)
     ])
     self.norm_layer = nn.InstanceNorm1d(c_h, affine=False)
     self.conv_affine_layers = nn.ModuleList(
         [f(nn.Linear(c_cond, c_h * 2)) for _ in range(n_conv_blocks * 2)])
     self.out_conv_layer = f(nn.Conv1d(c_h, c_out, kernel_size=1))
     self.dropout_layer = nn.Dropout(p=dropout_rate)
Example #9
 def __init__(self, intermediate_size, config):
     super().__init__()
     embed_dim = config.hidden_size
     self.c_fc = Conv1D(intermediate_size, embed_dim)
     self.c_proj = Conv1D(embed_dim, intermediate_size)
     self.act = gelu
     self.dropout = nn.Dropout(config.resid_pdrop)
Example #10
    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        self.w_qs = nn.Linear(d_model, n_head * d_k)
        self.w_ks = nn.Linear(d_model, n_head * d_k)
        self.w_vs = nn.Linear(d_model, n_head * d_v)
        nn.init.normal_(self.w_qs.weight,
                        mean=0,
                        std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_ks.weight,
                        mean=0,
                        std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_vs.weight,
                        mean=0,
                        std=np.sqrt(2.0 / (d_model + d_v)))

        self.attention = ScaledDotProductAttention(
            temperature=np.power(d_k, 0.5), attn_dropout=dropout
        )
        self.layer_norm = nn.LayerNorm(d_model)

        self.fc = nn.Linear(n_head * d_v, d_model)
        nn.init.xavier_normal_(self.fc.weight)

        self.dropout = nn.Dropout(dropout)
Example #11
File: rnn.py  Project: zzk0/oneflow
    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        num_layers: int = 1,
        bias: bool = True,
        batch_first: bool = False,
        dropout: float = 0,
        bidirectional: bool = False,
    ):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bias = bias
        self.batch_first = batch_first
        self.dropout = dropout
        self.bidirectional = bidirectional
        num_directions = 2 if bidirectional else 1
        gate_size = 3 * hidden_size
        self.drop = nn.Dropout(self.dropout)

        for layer in range(num_layers):
            for direction in range(num_directions):

                real_hidden_size = hidden_size
                layer_input_size = (
                    input_size if layer == 0 else real_hidden_size * num_directions
                )

                # TODO: Modify after adding the stride attribute
                # w_ih = flow.nn.Parameter(flow.Tensor(gate_size, layer_input_size))
                # w_hh = flow.nn.Parameter(flow.Tensor(gate_size, real_hidden_size))
                # b_ih = flow.nn.Parameter(flow.Tensor(gate_size))
                # b_hh = flow.nn.Parameter(flow.Tensor(gate_size))

                w_ih = flow.nn.Parameter(flow.Tensor(layer_input_size, gate_size))
                w_hh = flow.nn.Parameter(flow.Tensor(real_hidden_size, gate_size))
                b_ih = flow.nn.Parameter(flow.Tensor(gate_size))
                b_hh = flow.nn.Parameter(flow.Tensor(gate_size))

                if bias:
                    layer_params = (w_ih, w_hh, b_ih, b_hh)
                else:
                    layer_params = (w_ih, w_hh)

                suffix = "_reverse" if direction == 1 else ""
                param_names = ["weight_ih_l{}{}", "weight_hh_l{}{}"]
                if bias:
                    param_names += ["bias_ih_l{}{}", "bias_hh_l{}{}"]
                param_names = [x.format(layer, suffix) for x in param_names]

                for name, param in zip(param_names, layer_params):
                    setattr(self, name, param)

        self.reset_parameters()
Example #12
    def __init__(self, source_dim, output_dim, enable_output_proj=True, dropout=0.0):
        super(BasedAttention, self).__init__()

        self.enable_output_proj = enable_output_proj
        if self.enable_output_proj:
            self.output_proj = nn.Linear(source_dim, output_dim)

        self.dropout = nn.Dropout(dropout)
Example #13
    def __init__(
        self,
        n_heads,
        d_model,
        d_ff,
        memory_dim,
        slf_attn_dropout=0.0,
        src_attn_dropout=0.0,
        ffn_dropout=0.0,
        residual_dropout=0.1,
        normalize_before=False,
        concat_after=False,
        relative_positional=False,
        activation="relu",
    ):
        super(TransformerDecoderLayer, self).__init__()

        self.relative_positional = relative_positional

        if self.relative_positional:
            self.slf_attn = MultiHeadedSelfAttentionWithRelPos(
                n_heads, d_model, slf_attn_dropout
            )
        else:
            self.slf_attn = MultiHeadedSelfAttention(n_heads, d_model, slf_attn_dropout)
        self.src_attn = MultiHeadedCrossAttention(
            n_heads, d_model, memory_dim, src_attn_dropout
        )
        self.feed_forward = PositionwiseFeedForward(
            d_model, d_ff, ffn_dropout, activation
        )

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

        self.dropout1 = nn.Dropout(residual_dropout)
        self.dropout2 = nn.Dropout(residual_dropout)
        self.dropout3 = nn.Dropout(residual_dropout)

        self.normalize_before = normalize_before
        self.concat_after = concat_after

        if self.concat_after:
            self.concat_linear1 = nn.Linear(d_model * 2, d_model)
            self.concat_linear2 = nn.Linear(d_model * 2, d_model)
Example #14
    def __init__(
        self,
        dim,
        window_size,
        num_heads,
        qkv_bias=True,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
    ):

        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        # define a parameter table of relative position bias
        # Author zzk: we add trunc normal here!
        self.relative_position_bias_table = nn.Parameter(
            flow.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
        )  # (2*Wh-1) * (2*Ww-1), nH
        self.relative_position_bias_table.trunc_normal_(std=0.02)

        # get pair-wise relative position index for each token inside the window
        coords_h = flow.arange(self.window_size[0])
        coords_w = flow.arange(self.window_size[1])
        coords = flow.stack(flow.meshgrid(coords_h, coords_w))  # 2, Wh, Ww
        coords_flatten = flow.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0)  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.softmax = nn.Softmax(dim=-1)
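A tiny standalone sanity check (mine, not the project's) of the relative-position-index construction above, for a 2x2 window; the result is a (Wh*Ww, Wh*Ww) table whose entries index rows of the (2*Wh-1)*(2*Ww-1) bias table:

import oneflow as flow

# Hypothetical check: wh, ww stand in for window_size[0], window_size[1].
wh, ww = 2, 2
coords = flow.stack(flow.meshgrid(flow.arange(wh), flow.arange(ww)))   # (2, Wh, Ww)
coords = flow.flatten(coords, 1)                                       # (2, Wh*Ww)
rel = (coords[:, :, None] - coords[:, None, :]).permute(1, 2, 0)       # (Wh*Ww, Wh*Ww, 2)
rel[:, :, 0] += wh - 1                                                 # shift to start from 0
rel[:, :, 1] += ww - 1
rel[:, :, 0] *= 2 * ww - 1
print(rel.sum(-1))                                                     # (4, 4), values in [0, 8]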
Example #15
 def __init__(self, input_dim, hidden_dim, output_dim, batch_size):
     super(lstm_ser, self).__init__()
     self.classifier = nn.Sequential(
         LSTM(input_dim, hidden_dim, batch_size),
         nn.Dropout(0.5),
         nn.Linear(hidden_dim, 32),
         nn.ReLU(),
         nn.Linear(32, output_dim),
     )
Example #16
 def __init__(self, num_patches, emb_dim, dropout_rate=0.1):
     super(PositionEmbs, self).__init__()
     self.pos_embedding = nn.Parameter(
         flow.tensor(np.random.randn(1, num_patches + 1, emb_dim),
                     dtype=flow.float32))
     if dropout_rate > 0:
         self.dropout = nn.Dropout(dropout_rate)
     else:
         self.dropout = None
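A minimal sketch of how this position-embedding pattern is usually applied (the forward is my assumption): broadcast-add the (1, num_patches + 1, emb_dim) table to the token sequence, then apply the dropout if it exists:

import numpy as np
import oneflow as flow
import oneflow.nn as nn

# Hypothetical usage; the real module would use self.pos_embedding and self.dropout.
num_patches, emb_dim = 16, 32
pos = nn.Parameter(flow.tensor(np.random.randn(1, num_patches + 1, emb_dim), dtype=flow.float32))
drop = nn.Dropout(0.1)

tokens = flow.randn(4, num_patches + 1, emb_dim)   # (batch, 1 + num_patches, emb_dim)
out = drop(tokens + pos)                           # same shape as tokens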
Example #17
 def __init__(self,
              features: nn.Module,
              num_classes: int = 1000,
              init_weights: bool = True) -> None:
     super(VGG, self).__init__()
     self.features = features
     self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
     self.classifier = nn.Sequential(
         nn.Linear(512 * 7 * 7, 4096),
         nn.ReLU(True),
         nn.Dropout(),
         nn.Linear(4096, 4096),
         nn.ReLU(True),
         nn.Dropout(),
         nn.Linear(4096, num_classes),
     )
     if init_weights:
         self._initialize_weights()
Example #18
 def __init__(self,
              hidden_size,
              intermediate_size,
              layer_norm_eps=1e-5,
              dropout=0):
     super(BertOutput, self).__init__()
     self.dense = nn.Linear(intermediate_size, hidden_size)
     self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
     self.dropout = nn.Dropout(dropout)
Example #19
    def __init__(self, d_model, d_ff, dropout, activation="relu"):
        super(PositionwiseFeedForward, self).__init__()
        self.activation = activation

        assert activation in ["relu", "gelu", "glu", "tanh", "swish"]

        self.w_1 = nn.Linear(d_model,
                             d_ff * 2 if activation == "glu" else d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
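The d_ff * 2 sizing only matters for the "glu" activation; a hypothetical sketch of that branch (my reading, not the project's forward): w_1 doubles the width so its output can be split into a value half and a gate half:

import oneflow as flow
import oneflow.nn as nn

# Hypothetical GLU branch of PositionwiseFeedForward; names and dropout placement are assumptions.
d_model, d_ff = 8, 16
w_1, w_2 = nn.Linear(d_model, d_ff * 2), nn.Linear(d_ff, d_model)
x = flow.randn(4, d_model)
h = w_1(x)
h = h[:, :d_ff] * flow.sigmoid(h[:, d_ff:])   # GLU: value * sigmoid(gate)
y = w_2(nn.Dropout(0.1)(h))                   # -> (4, d_model)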
Example #20
    def __init__(self, config):
        super(GPT2Model, self).__init__()
        self.embed_dim = config.hidden_size

        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)

        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList(
            [GPT2Block(config) for _ in range(config.num_hidden_layers)])
        self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
Example #21
 def __init__(
     self,
     hidden_size: int,
     intermediate_size: int,
     hidden_dropout_prob: float = 0.1,
     hidden_act: str = "relu",
 ) -> None:
     super().__init__()
     self.hidden_act = hidden_act
     self.intermediate = nn.Linear(hidden_size, intermediate_size)
     self.output = nn.Linear(intermediate_size, hidden_size)
     self.dropout = nn.Dropout(hidden_dropout_prob)
Example #22
    def __init__(self, config: Callable[..., None]) -> None:
        super().__init__()
        self.token_embeddings = nn.Embedding(config.vocab_size,
                                             config.hidden_size,
                                             padding_idx=0)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                                config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
                                                  config.hidden_size)

        self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
Example #23
    def __init__(self, cfgs, num_classes=1000, width=1.0, dropout=0.2):
        super(GhostNet, self).__init__()
        # setting of inverted residual blocks
        self.cfgs = cfgs
        self.dropout = dropout

        # building first layer
        output_channel = _make_divisible(16 * width, 4)
        self.conv_stem = nn.Conv2d(3, output_channel, 3, 2, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(output_channel)
        self.act1 = nn.ReLU(inplace=True)
        input_channel = output_channel

        # building inverted residual blocks
        stages = []
        block = GhostBottleneck
        for cfg in self.cfgs:
            layers = []
            for k, exp_size, c, se_ratio, s in cfg:
                output_channel = _make_divisible(c * width, 4)
                hidden_channel = _make_divisible(exp_size * width, 4)
                layers.append(
                    block(
                        input_channel,
                        hidden_channel,
                        output_channel,
                        k,
                        s,
                        se_ratio=se_ratio,
                    ))
                input_channel = output_channel
            stages.append(nn.Sequential(*layers))

        output_channel = _make_divisible(exp_size * width, 4)
        stages.append(
            nn.Sequential(ConvBnAct(input_channel, output_channel, 1)))
        input_channel = output_channel

        self.blocks = nn.Sequential(*stages)

        # building last several layers
        output_channel = 1280
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.conv_head = nn.Conv2d(input_channel, output_channel, 1, 1, 0, bias=True)
        self.act2 = nn.ReLU(inplace=True)
        self.classifier = nn.Linear(output_channel, num_classes)
        self.dropout = nn.Dropout(p=self.dropout)
Example #24
    def __init__(
        self,
        input_size,
        in_channel,
        out_channel,
        kernel_size,
        stride,
        dropout=0.1,
        batch_norm=False,
        residual=False,
        act_func_type="relu",
    ):
        super(Conv2dLayer, self).__init__()

        self.input_size = input_size
        self.in_channel = in_channel
        self.out_channel = out_channel

        self.batch_norm = batch_norm
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = (
            0,
            kernel_size // 2 if isinstance(self.kernel_size, int) else kernel_size[1] // 2,
        )

        self.residual = residual

        self.act_func_type = act_func_type

        self.conv_layer = nn.Conv2d(
            in_channels=in_channel,
            out_channels=out_channel,
            kernel_size=self.kernel_size,
            stride=self.stride,
            padding=self.padding,
        )

        self.output_size = cal_width_dim_2d(
            input_size,
            self.kernel_size if isinstance(self.kernel_size, int) else self.kernel_size[1],
            self.stride if isinstance(self.stride, int) else self.stride[1],
            padding=self.padding if isinstance(self.padding, int) else self.padding[1],
        )

        if self.batch_norm:
            self.norm = nn.BatchNorm2d(out_channel)

        self.dropout = nn.Dropout(dropout)
Example #25
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = flow.zeros((max_len, d_model))
        position = flow.arange(0, max_len, dtype=flow.float).unsqueeze(1)
        div_term = flow.exp(
            flow.arange(0, d_model, 2).to(flow.float) * (-math.log(10000.0) / d_model)
        ).unsqueeze(0)
        pe[:, 0::2] = flow.sin(position * div_term)
        pe[:, 1::2] = flow.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.pe = flow.nn.Parameter(pe, requires_grad=False)
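A hypothetical forward for this module (not shown in the example): pe ends up shaped (max_len, 1, d_model) after the transpose, so inputs are expected as (seq_len, batch, d_model) and the table is sliced to the sequence length before dropout:

import oneflow as flow
import oneflow.nn as nn

# Stand-in tensors; the real module would use self.pe and self.dropout.
d_model = 16
pe = flow.randn(5000, 1, d_model)                # placeholder for the sinusoidal table built above
x = flow.randn(20, 4, d_model)                   # (seq_len, batch, d_model)
out = nn.Dropout(0.1)(x + pe[: x.shape[0], :])   # positions broadcast over the batch dim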
Example #26
    def __init__(
        self,
        sos_id,
        eos_id,
        n_tgt_vocab,
        d_word_vec,
        n_layers,
        n_head,
        d_k,
        d_v,
        d_model,
        d_inner,
        dropout=0.1,
        tgt_emb_prj_weight_sharing=True,
        pe_maxlen=5000,
    ):
        super(Decoder, self).__init__()
        # parameters
        self.sos_id = sos_id
        self.eos_id = eos_id
        self.n_tgt_vocab = n_tgt_vocab
        self.d_word_vec = d_word_vec
        self.n_layers = n_layers
        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v
        self.d_model = d_model
        self.d_inner = d_inner
        self.dropout = dropout
        self.tgt_emb_prj_weight_sharing = tgt_emb_prj_weight_sharing
        self.pe_maxlen = pe_maxlen

        self.tgt_word_emb = nn.Embedding(n_tgt_vocab, d_word_vec)
        self.positional_encoding = PositionalEncoding(d_model,
                                                      max_len=pe_maxlen)
        self.dropout = nn.Dropout(dropout)

        self.layer_stack = nn.ModuleList([
            DecoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)
        ])

        self.tgt_word_prj = nn.Linear(d_model, n_tgt_vocab, bias=False)
        nn.init.xavier_normal_(self.tgt_word_prj.weight)

        if tgt_emb_prj_weight_sharing:
            # Share the weight matrix between target word embedding & the final logit dense layer
            self.tgt_word_prj.weight = self.tgt_word_emb.weight
            self.x_logit_scale = d_model**0.5
        else:
            self.x_logit_scale = 1.0
Example #27
    def build_conv_block(self, dim, padding_type, norm_layer, use_dropout,
                         use_bias):
        """Construct a convolutional block.

        Parameters:
            dim (int)           -- the number of channels in the conv layer.
            padding_type (str)  -- the name of padding layer: reflect | replicate | zero
            norm_layer          -- normalization layer
            use_dropout (bool)  -- whether to use dropout layers
            use_bias (bool)     -- whether the conv layer uses bias

        Returns a conv block (with a conv layer, a normalization layer, and a non-linearity layer (ReLU))
        """
        conv_block = []
        p = 0
        if padding_type == "reflect":
            conv_block += [nn.ReflectionPad2d(1)]
        elif padding_type == "replicate":
            conv_block += [nn.ReplicationPad2d(1)]
        elif padding_type == "zero":
            p = 1
        else:
            raise NotImplementedError("padding [%s] is not implemented" %
                                      padding_type)

        conv_block += [
            nn.Conv2d(dim, dim, kernel_size=3, padding=p, bias=use_bias),
            norm_layer(dim),
            nn.ReLU(True),
        ]
        if use_dropout:
            conv_block += [nn.Dropout(0.5)]

        p = 0
        if padding_type == "reflect":
            conv_block += [nn.ReflectionPad2d(1)]
        elif padding_type == "replicate":
            conv_block += [nn.ReplicationPad2d(1)]
        elif padding_type == "zero":
            p = 1
        else:
            raise NotImplementedError("padding [%s] is not implemented" %
                                      padding_type)
        conv_block += [
            nn.Conv2d(dim, dim, kernel_size=3, padding=p, bias=use_bias),
            norm_layer(dim),
        ]

        return nn.Sequential(*conv_block)
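For one concrete configuration, a self-contained equivalent of what this builder returns (my instantiation, assuming dim=64, padding_type="reflect", norm_layer=nn.BatchNorm2d, use_dropout=True, use_bias=False): explicit reflection padding, so the Conv2d layers use padding=0 and the spatial size is preserved:

import oneflow as flow
import oneflow.nn as nn

# Hypothetical expansion of build_conv_block for the configuration named above.
block = nn.Sequential(
    nn.ReflectionPad2d(1),
    nn.Conv2d(64, 64, kernel_size=3, padding=0, bias=False),
    nn.BatchNorm2d(64),
    nn.ReLU(True),
    nn.Dropout(0.5),
    nn.ReflectionPad2d(1),
    nn.Conv2d(64, 64, kernel_size=3, padding=0, bias=False),
    nn.BatchNorm2d(64),
)
print(block(flow.randn(1, 64, 32, 32)).shape)   # (1, 64, 32, 32)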
Example #28
    def __init__(self, in_dim, heads=8, dropout_rate=0.1):
        super(SelfAttention, self).__init__()
        self.heads = heads
        self.head_dim = in_dim // heads
        self.scale = self.head_dim**0.5

        self.query = nn.Linear(in_dim, self.heads * self.head_dim)
        self.key = nn.Linear(in_dim, self.heads * self.head_dim)
        self.value = nn.Linear(in_dim, self.heads * self.head_dim)
        self.out = nn.Linear(self.heads * self.head_dim, in_dim)

        if dropout_rate > 0:
            self.dropout = nn.Dropout(dropout_rate)
        else:
            self.dropout = None
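A minimal, self-contained sketch of the forward pass this __init__ suggests (the reshapes, scaling, and softmax placement are my assumptions, not the project's code):

import oneflow as flow
import oneflow.nn as nn

def self_attention(x, query, key, value, out, heads, head_dim, dropout=None):
    # x: (batch, seq, heads * head_dim); the projections are nn.Linear modules as in the example.
    b, n, _ = x.shape

    def split(t):  # (b, n, heads * head_dim) -> (b, heads, n, head_dim)
        return t.reshape(b, n, heads, head_dim).permute(0, 2, 1, 3)

    q, k, v = split(query(x)), split(key(x)), split(value(x))
    attn = flow.softmax(flow.matmul(q, k.permute(0, 1, 3, 2)) / (head_dim ** 0.5), dim=-1)
    if dropout is not None:
        attn = dropout(attn)
    ctx = flow.matmul(attn, v).permute(0, 2, 1, 3).reshape(b, n, heads * head_dim)
    return out(ctx)

lin = lambda: nn.Linear(64, 64)
y = self_attention(flow.randn(2, 10, 64), lin(), lin(), lin(), lin(),
                   heads=8, head_dim=8, dropout=nn.Dropout(0.1))   # -> (2, 10, 64)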
Example #29
    def __init__(self, config):
        super(GPT2Attention, self).__init__()
        max_positions = config.max_position_embeddings

        self.register_buffer(
            "bias",
            flow.tril(flow.ones((max_positions, max_positions), dtype=flow.int8)).view(
                1, 1, max_positions, max_positions
            ),
        )
        self.register_buffer("masked_bias", flow.tensor(-1e4))

        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        assert self.embed_dim % self.num_heads == 0
        self.head_dim = self.embed_dim // self.num_heads
        self.scale_attn_weights = config.scale_attn_weights

        self.c_attn = Conv1D(self.embed_dim * 3, self.embed_dim)
        self.c_proj = Conv1D(self.embed_dim, self.embed_dim)

        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
Example #30
 def __init__(
     self,
     in_features,
     hidden_features=None,
     out_features=None,
     act_layer=nn.GELU,
     drop=0.0,
 ):
     super().__init__()
     out_features = out_features or in_features
     hidden_features = hidden_features or in_features
     self.fc1 = nn.Linear(in_features, hidden_features)
     self.act = act_layer()
     self.fc2 = nn.Linear(hidden_features, out_features)
     self.drop = nn.Dropout(drop)