Example #1
 def __init__(
     self,
     num_classes: int,  # number of classes
     max_length: int = 120,  # maximum allowed length of the sequence to be processed
     hidden_dim: int = 1024,  # dimension of the RNN's hidden state vector
     sos_id: int = 1,  # id of the start-of-sentence token
     eos_id: int = 2,  # id of the end-of-sentence token
     attn_mechanism: str = 'multi-head',  # type of attention mechanism
     num_heads: int = 4,  # number of attention heads
     num_layers: int = 2,  # number of RNN layers
     rnn_type: str = 'lstm',  # type of RNN cell
     dropout_p: float = 0.3,  # dropout probability
     device: str = 'cuda',  # device - 'cuda' or 'cpu'
 ) -> None:
     super(LanguageDecoderRNN, self).__init__(
         hidden_dim, hidden_dim, num_layers, rnn_type, dropout_p, False, device)
     self.num_classes = num_classes
     self.num_heads = num_heads
     self.max_length = max_length
     self.eos_id = eos_id
     self.sos_id = sos_id
     self.attn_mechanism = attn_mechanism.lower()
     self.embedding = nn.Embedding(num_classes, hidden_dim)
     self.input_dropout = nn.Dropout(dropout_p)
     self.attention = AddNorm(MultiHeadAttention(hidden_dim), hidden_dim)
     self.projection = AddNorm(Linear(hidden_dim, hidden_dim, bias=True),
                               hidden_dim)
     self.generator = Linear(hidden_dim, num_classes, bias=False)
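
The `AddNorm` wrapper used above is not defined anywhere in these snippets. A minimal sketch of a residual add-and-norm wrapper that would be consistent with how it is called here follows; the tuple handling is an assumption (attention modules often return a context plus attention weights), and KoSpeech's own implementation may differ:

    import torch.nn as nn

    class AddNorm(nn.Module):
        """Hypothetical sketch: run a sublayer, add the residual, then layer-normalize."""
        def __init__(self, sublayer: nn.Module, d_model: int) -> None:
            super().__init__()
            self.sublayer = sublayer
            self.layer_norm = nn.LayerNorm(d_model)

        def forward(self, *args):
            residual = args[0]
            output = self.sublayer(*args)
            if isinstance(output, tuple):  # e.g. (context, attention_weights)
                return self.layer_norm(output[0] + residual), output[1]
            return self.layer_norm(output + residual)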
Example #2
    def __init__(
            self,
            dim: int = 512,
            num_heads: int = 16,
            dropout_p: float = 0.1,
    ) -> None:
        super(RelativeMultiHeadAttention, self).__init__()
        assert dim % num_heads == 0, "dim % num_heads should be zero."

        self.dim = dim
        self.d_head = int(dim / num_heads)
        self.num_heads = num_heads
        self.sqrt_dim = math.sqrt(dim)

        self.query_proj = Linear(dim, dim)
        self.key_proj = Linear(dim, dim)
        self.value_proj = Linear(dim, dim)
        self.pos_proj = Linear(dim, dim, bias=False)

        self.dropout = nn.Dropout(p=dropout_p)
        self.u_bias = nn.Parameter(torch.Tensor(self.num_heads, self.d_head))
        self.v_bias = nn.Parameter(torch.Tensor(self.num_heads, self.d_head))
        torch.nn.init.xavier_uniform_(self.u_bias)
        torch.nn.init.xavier_uniform_(self.v_bias)

        self.out_proj = Linear(dim, dim)
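
Because `dim` must divide evenly by `num_heads`, each head works on `d_head = dim // num_heads` features. A small standalone sketch of the reshape that this projection layout implies (not the module's own forward, which is not shown here):

    import torch

    batch, seq_len, dim, num_heads = 2, 10, 512, 16
    d_head = dim // num_heads  # 32 features per head

    x = torch.randn(batch, seq_len, dim)  # e.g. the output of query_proj
    # Split the flat feature axis into heads: (batch, num_heads, seq_len, d_head)
    heads = x.view(batch, seq_len, num_heads, d_head).transpose(1, 2)
    print(heads.shape)  # torch.Size([2, 16, 10, 32])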
Example #3
    def __init__(self,
                 num_classes: int,                    # number of classes
                 max_length: int = 120,               # maximum allowed length of the sequence to be processed
                 hidden_dim: int = 1024,              # dimension of the RNN's hidden state vector
                 sos_id: int = 1,                     # id of the start-of-sentence token
                 eos_id: int = 2,                     # id of the end-of-sentence token
                 attn_mechanism: str = 'multi-head',  # type of attention mechanism
                 num_heads: int = 4,                  # number of attention heads
                 num_layers: int = 2,                 # number of RNN layers
                 rnn_type: str = 'lstm',              # type of RNN cell
                 dropout_p: float = 0.3,              # dropout probability
                 device: str = 'cuda') -> None:       # device - 'cuda' or 'cpu'
        super(SpeechDecoderRNN, self).__init__(hidden_dim, hidden_dim, num_layers, rnn_type, dropout_p, False, device)
        self.num_classes = num_classes
        self.num_heads = num_heads
        self.max_length = max_length
        self.eos_id = eos_id
        self.sos_id = sos_id
        self.attn_mechanism = attn_mechanism.lower()
        self.embedding = nn.Embedding(num_classes, hidden_dim)
        self.input_dropout = nn.Dropout(dropout_p)

        if self.attn_mechanism == 'loc':
            self.attention = AddNorm(LocationAwareAttention(hidden_dim, smoothing=True), hidden_dim)
        elif self.attn_mechanism == 'multi-head':
            self.attention = AddNorm(MultiHeadAttention(hidden_dim, num_heads), hidden_dim)
        elif self.attn_mechanism == 'additive':
            self.attention = AddNorm(AdditiveAttention(hidden_dim), hidden_dim)
        elif self.attn_mechanism == 'scaled-dot':
            self.attention = AddNorm(ScaledDotProductAttention(hidden_dim), hidden_dim)
        else:
            raise ValueError("Unsupported attention: {0}".format(attn_mechanism))

        self.projection = AddNorm(Linear(hidden_dim, hidden_dim, bias=True), hidden_dim)
        self.generator = Linear(hidden_dim, num_classes, bias=False)
Example #4
    def __init__(self,
                 d_model: int = 512,
                 d_ff: int = 2048,
                 dropout_p: float = 0.3,
                 ffnet_style: str = 'ff') -> None:
        super(PositionWiseFeedForwardNet, self).__init__()
        self.ffnet_style = ffnet_style.lower()
        if self.ffnet_style == 'ff':
            self.feed_forward = nn.Sequential(
                Linear(d_model, d_ff),
                nn.Dropout(dropout_p),
                nn.ReLU(),
                Linear(d_ff, d_model),
                nn.Dropout(dropout_p),
            )

        elif self.ffnet_style == 'conv':
            self.conv1 = nn.Conv1d(in_channels=d_model,
                                   out_channels=d_ff,
                                   kernel_size=1)
            self.relu = nn.ReLU()
            self.conv2 = nn.Conv1d(in_channels=d_ff,
                                   out_channels=d_model,
                                   kernel_size=1)

        else:
            raise ValueError("Unsupported ffnet_style: {0}".format(self.ffnet_style))
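
For the 'conv' branch, note that `nn.Conv1d` expects input shaped (batch, channels, time), so a caller holding (batch, time, d_model) tensors needs transposes around the two convolutions. A standalone sketch of that data flow (the module's forward is not shown in these snippets):

    import torch

    x = torch.randn(4, 100, 512)                      # (batch, time, d_model)
    conv1 = torch.nn.Conv1d(512, 2048, kernel_size=1)
    conv2 = torch.nn.Conv1d(2048, 512, kernel_size=1)
    y = conv2(torch.relu(conv1(x.transpose(1, 2)))).transpose(1, 2)
    print(y.shape)                                    # torch.Size([4, 100, 512])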
Example #5
 def __init__(self, dim: int = 1024, attn_dim: int = 1024, smoothing: bool = False) -> None:
     super(LocationAwareAttention, self).__init__()
     self.location_conv = nn.Conv1d(in_channels=1, out_channels=attn_dim, kernel_size=3, padding=1)
     self.query_proj = Linear(dim, attn_dim, bias=False)
     self.value_proj = Linear(dim, attn_dim, bias=False)
     self.bias = nn.Parameter(torch.rand(attn_dim).uniform_(-0.1, 0.1))
     self.fc = Linear(attn_dim, 1, bias=True)
     self.smoothing = smoothing
Example #6
    def __init__(
        self,
        num_classes: int,
        max_length: int = 150,
        hidden_state_dim: int = 1024,
        pad_id: int = 0,
        sos_id: int = 1,
        eos_id: int = 2,
        attn_mechanism: str = 'multi-head',
        num_heads: int = 4,
        num_layers: int = 2,
        rnn_type: str = 'lstm',
        dropout_p: float = 0.3,
    ) -> None:
        super(DecoderRNN, self).__init__()
        self.hidden_state_dim = hidden_state_dim
        self.num_classes = num_classes
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.max_length = max_length
        self.eos_id = eos_id
        self.sos_id = sos_id
        self.pad_id = pad_id
        self.attn_mechanism = attn_mechanism.lower()
        self.embedding = nn.Embedding(num_classes, hidden_state_dim)
        self.input_dropout = nn.Dropout(dropout_p)
        rnn_cell = self.supported_rnns[rnn_type.lower()]
        self.rnn = rnn_cell(
            input_size=hidden_state_dim,
            hidden_size=hidden_state_dim,
            num_layers=num_layers,
            bias=True,
            batch_first=True,
            dropout=dropout_p,
            bidirectional=False,
        )

        if self.attn_mechanism == 'loc':
            self.attention = LocationAwareAttention(hidden_state_dim,
                                                    attn_dim=hidden_state_dim,
                                                    smoothing=False)
        elif self.attn_mechanism == 'multi-head':
            self.attention = MultiHeadAttention(hidden_state_dim,
                                                num_heads=num_heads)
        elif self.attn_mechanism == 'additive':
            self.attention = AdditiveAttention(hidden_state_dim)
        elif self.attn_mechanism == 'scaled-dot':
            self.attention = ScaledDotProductAttention(dim=hidden_state_dim)
        else:
            raise ValueError(
                "Unsupported attention: {0}".format(attn_mechanism))

        self.fc = nn.Sequential(
            Linear(hidden_state_dim << 1, hidden_state_dim),
            nn.Tanh(),
            View(shape=(-1, self.hidden_state_dim), contiguous=True),
            Linear(hidden_state_dim, num_classes),
        )
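
`hidden_state_dim << 1` is simply `2 * hidden_state_dim`, so the first Linear in `self.fc` expects a feature vector twice the hidden size. That suggests the RNN output and the attention context are concatenated before this stack; since the forward pass is not shown here, the following is only a shape sketch under that assumption:

    import torch

    hidden_state_dim = 1024
    rnn_output = torch.randn(4, 1, hidden_state_dim)     # decoder step output (assumed)
    context = torch.randn(4, 1, hidden_state_dim)        # attention context (assumed)
    fc_input = torch.cat((rnn_output, context), dim=-1)  # last dim == hidden_state_dim << 1
    print(fc_input.shape)  # torch.Size([4, 1, 2048])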
Example #7
    def __init__(self, dim: int = 512, num_heads: int = 8) -> None:
        super(MultiHeadAttention, self).__init__()

        assert dim % num_heads == 0, "dim % num_heads should be zero."

        self.d_head = int(dim / num_heads)
        self.num_heads = num_heads
        self.query_proj = Linear(dim, self.d_head * num_heads)
        self.key_proj = Linear(dim, self.d_head * num_heads)
        self.value_proj = Linear(dim, self.d_head * num_heads)
        self.scaled_dot_attn = ScaledDotProductAttention(self.d_head, scale=True)
Example #8
    def __init__(self, d_model: int = 512, num_heads: int = 8) -> None:
        super(MultiHeadAttention, self).__init__()

        assert d_model % num_heads == 0, "d_model % num_heads should be zero."

        self.d_head = int(d_model / num_heads)
        self.num_heads = num_heads
        self.query_proj = Linear(d_model, self.d_head * num_heads)
        self.key_proj = Linear(d_model, self.d_head * num_heads)
        self.value_proj = Linear(d_model, self.d_head * num_heads)
        self.sqrt_dim = np.sqrt(d_model)
Example #9
    def __init__(self, d_model: int = 512, num_heads: int = 8) -> None:
        super(MultiHeadAttention, self).__init__()

        assert d_model % num_heads == 0, "d_model % num_heads should be zero."

        self.d_head = int(d_model / num_heads)
        self.num_heads = num_heads
        self.scaled_dot_attn = ScaledDotProductAttention(d_model)
        self.query_proj = Linear(d_model, self.d_head * num_heads)
        self.key_proj = Linear(d_model, self.d_head * num_heads)
        self.value_proj = Linear(d_model, self.d_head * num_heads)
Example #10
 def __init__(self, d_model: int = 512, smoothing: bool = True) -> None:
     super(LocationAwareAttention, self).__init__()
     self.d_model = d_model
     self.conv1d = nn.Conv1d(in_channels=1,
                             out_channels=d_model,
                             kernel_size=3,
                             padding=1)
     self.query_proj = Linear(d_model, d_model, bias=False)
     self.value_proj = Linear(d_model, d_model, bias=False)
     self.bias = nn.Parameter(torch.rand(d_model).uniform_(-0.1, 0.1))
     self.score_proj = Linear(d_model, 1, bias=True)
     self.smoothing = smoothing
Example #11
 def __init__(self,
              d_model: int = 512,
              d_ff: int = 2048,
              dropout_p: float = 0.3) -> None:
     super(PositionwiseFeedForward, self).__init__()
     self.feed_forward = nn.Sequential(
         Linear(d_model, d_ff),
         nn.Dropout(dropout_p),
         nn.ReLU(),
         Linear(d_ff, d_model),
         nn.Dropout(dropout_p),
     )
Example #12
 def __init__(self,
              encoder_dim: int = 512,
              expansion_factor: int = 4,
              dropout_p: float = 0.1,
              device: torch.device = 'cuda') -> None:
     super(FeedForwardModule, self).__init__()
     self.device = device
     self.sequential = nn.Sequential(
         LayerNorm(encoder_dim),
         Linear(encoder_dim, encoder_dim * expansion_factor, bias=True),
         Swish(),
         nn.Dropout(p=dropout_p),
         Linear(encoder_dim * expansion_factor, encoder_dim, bias=True),
         nn.Dropout(p=dropout_p),
     )
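
`Swish` is referenced above but not defined in these snippets. A minimal sketch of the usual Swish activation, x * sigmoid(x), is below; the project's own class may differ in detail:

    import torch
    import torch.nn as nn

    class Swish(nn.Module):
        """Swish activation: x * sigmoid(x) (sketch; the project's own definition is not shown)."""
        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return x * x.sigmoid()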
Example #13
 def __init__(
     self,
     encoder: TransducerEncoder,
     decoder: TransducerDecoder,
     d_model: int,
     num_classes: int,
 ) -> None:
     super(TransducerModel, self).__init__()
     self.encoder = encoder
     self.decoder = decoder
     self.fc = nn.Sequential(
         Linear(d_model << 1, d_model),
         nn.Tanh(),
         Linear(d_model, num_classes, bias=False),
     )
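
Here again `d_model << 1` means the joint network's first Linear takes 2 * d_model features. In a typical transducer joint, encoder and decoder outputs are broadcast over each other's time axes and concatenated on the feature axis; this is an assumption about this model's forward (not shown), sketched below for the shapes only:

    import torch

    d_model = 512
    enc = torch.randn(4, 120, d_model)  # (batch, T, d_model) encoder output
    dec = torch.randn(4, 30, d_model)   # (batch, U, d_model) prediction-network output
    enc = enc.unsqueeze(2).expand(-1, -1, dec.size(1), -1)  # (4, 120, 30, 512)
    dec = dec.unsqueeze(1).expand(-1, enc.size(1), -1, -1)  # (4, 120, 30, 512)
    joint = torch.cat((enc, dec), dim=-1)                   # last dim == d_model << 1
    print(joint.shape)  # torch.Size([4, 120, 30, 1024])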
Example #14
 def __init__(
         self,
         input_dim: int,  # dimension of feature vector
         extractor: str = 'vgg',  # convolutional extractor
         d_model: int = 512,  # dimension of model
         d_ff: int = 2048,  # dimension of feed forward network
         num_layers: int = 6,  # number of encoder layers
         num_heads: int = 8,  # number of attention heads
         dropout_p: float = 0.3,  # probability of dropout
         joint_ctc_attention: bool = False,  # use CTC Loss & Cross Entropy Joint Learning
         num_classes: int = None,  # number of classes
 ) -> None:
     super(TransformerEncoder, self).__init__(
         input_dim=input_dim,
         extractor=extractor,
         d_model=d_model,
         num_classes=num_classes,
         dropout_p=dropout_p,
         joint_ctc_attention=joint_ctc_attention,
     )
     self.d_model = d_model
     self.num_layers = num_layers
     self.num_heads = num_heads
     self.input_proj = Linear(self.conv_output_dim, d_model)
     self.input_layer_norm = LayerNorm(d_model)
     self.input_dropout = nn.Dropout(p=dropout_p)
     self.positional_encoding = PositionalEncoding(d_model)
     self.layers = nn.ModuleList([
         TransformerEncoderLayer(
             d_model=d_model,
             num_heads=num_heads,
             d_ff=d_ff,
             dropout_p=dropout_p,
         ) for _ in range(num_layers)
     ])
Example #15
    def __init__(self,
                 num_classes: int,
                 d_model: int = 512,
                 input_dim: int = 80,
                 pad_id: int = 0,
                 eos_id: int = 2,
                 d_ff: int = 2048,
                 num_heads: int = 8,
                 num_encoder_layers: int = 6,
                 num_decoder_layers: int = 6,
                 dropout_p: float = 0.3,
                 ffnet_style: str = 'ff') -> None:
        super(Transformer, self).__init__()

        assert d_model % num_heads == 0, "d_model % num_heads should be zero."

        self.eos_id = eos_id
        self.pad_id = pad_id
        self.encoder = TransformerEncoder(d_model, input_dim, d_ff,
                                          num_encoder_layers, num_heads,
                                          ffnet_style, dropout_p, pad_id)
        self.decoder = TransformerDecoder(num_classes, d_model, d_ff,
                                          num_decoder_layers, num_heads,
                                          ffnet_style, dropout_p, pad_id)
        self.generator = Linear(d_model, num_classes)
Example #16
 def __init__(
     self,
     input_dim: int,
     hidden_state_dim: int,
     output_dim: int,
     num_layers: int,
     rnn_type: str = 'lstm',
     dropout_p: float = 0.2,
     bidirectional: bool = True,
 ):
     super(EncoderRNNT, self).__init__()
     self.hidden_state_dim = hidden_state_dim
     rnn_cell = self.supported_rnns[rnn_type.lower()]
     self.rnn = rnn_cell(
         input_size=input_dim,
         hidden_size=hidden_state_dim,
         num_layers=num_layers,
         bias=True,
         batch_first=True,
         dropout=dropout_p,
         bidirectional=bidirectional,
     )
     self.out_proj = Linear(
         hidden_state_dim << 1 if bidirectional else hidden_state_dim,
         output_dim)
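
The projection input is `hidden_state_dim << 1` when `bidirectional=True` because PyTorch RNNs concatenate the forward and backward directions on the feature axis. A quick check:

    import torch

    rnn = torch.nn.LSTM(input_size=80, hidden_size=320, num_layers=1,
                        batch_first=True, bidirectional=True)
    outputs, _ = rnn(torch.randn(2, 50, 80))
    print(outputs.shape)  # torch.Size([2, 50, 640]) == hidden_size << 1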
Example #17
    def __init__(
        self,
        input_dim: int,
        extractor: str = 'vgg',
        d_model: int = None,
        num_classes: int = None,
        dropout_p: float = None,
        activation: str = 'hardtanh',
        joint_ctc_attention: bool = False,
    ) -> None:
        super(BaseEncoder, self).__init__()
        if joint_ctc_attention:
            assert num_classes, "If `joint_ctc_attention` is True, `num_classes` should not be None"
            assert dropout_p, "If `joint_ctc_attention` is True, `dropout_p` should not be None"
            assert d_model, "If `joint_ctc_attention` is True, `d_model` should not be None"

        if extractor is not None:
            extractor = self.supported_extractors[extractor.lower()]
            self.conv = extractor(input_dim=input_dim, activation=activation)

        self.conv_output_dim = self.conv.get_output_dim()
        self.num_classes = num_classes
        self.joint_ctc_attention = joint_ctc_attention

        if self.joint_ctc_attention:
            self.fc = nn.Sequential(
                nn.BatchNorm1d(d_model),
                Transpose(shape=(1, 2)),
                nn.Dropout(dropout_p),
                Linear(d_model, num_classes, bias=False),
            )
Example #18
    def __init__(
        self,
        input_dim: int,
        num_classes: int,
        rnn_type='gru',
        num_rnn_layers: int = 5,
        rnn_hidden_dim: int = 512,
        dropout_p: float = 0.1,
        bidirectional: bool = True,
        activation: str = 'hardtanh',
        device: torch.device = 'cuda',
    ):
        super(DeepSpeech2, self).__init__()
        self.device = device
        self.conv = DeepSpeech2Extractor(input_dim, activation=activation)
        self.rnn_layers = nn.ModuleList()
        rnn_output_size = rnn_hidden_dim << 1 if bidirectional else rnn_hidden_dim

        for idx in range(num_rnn_layers):
            self.rnn_layers.append(
                BNReluRNN(
                    input_size=self.conv.get_output_dim()
                    if idx == 0 else rnn_output_size,
                    hidden_state_dim=rnn_hidden_dim,
                    rnn_type=rnn_type,
                    bidirectional=bidirectional,
                    dropout_p=dropout_p,
                ))

        self.fc = nn.Sequential(
            LayerNorm(rnn_output_size),
            Linear(rnn_output_size, num_classes, bias=False),
        )
Example #19
File: model.py Project: pjhool/KoSpeech
 def __init__(
         self,
         d_model: int = 512,  # dimension of model
         input_dim: int = 80,  # dimension of feature vector
         d_ff: int = 2048,  # dimension of feed forward network
         num_layers: int = 6,  # number of encoder layers
         num_heads: int = 8,  # number of attention heads
         ffnet_style: str = 'ff',  # style of feed forward network [ff, conv]
         dropout_p: float = 0.3,  # probability of dropout
         pad_id: int = 0,  # identification of pad token
 ) -> None:
     super(SpeechTransformerEncoder, self).__init__()
     self.d_model = d_model
     self.num_layers = num_layers
     self.num_heads = num_heads
     self.pad_id = pad_id
     self.input_proj = Linear(input_dim, d_model)
     self.input_norm = LayerNorm(d_model)
     self.input_dropout = nn.Dropout(p=dropout_p)
     self.positional_encoding = PositionalEncoding(d_model)
     self.layers = nn.ModuleList([
         SpeechTransformerEncoderLayer(d_model, num_heads, d_ff, dropout_p,
                                       ffnet_style)
         for _ in range(num_layers)
     ])
Example #20
 def __init__(
     self,
     num_classes: int,
     hidden_state_dim: int,
     output_dim: int,
     num_layers: int,
     rnn_type: str = 'lstm',
     sos_id: int = 1,
     eos_id: int = 2,
     dropout_p: float = 0.2,
 ):
     super(DecoderRNNT, self).__init__()
     self.hidden_state_dim = hidden_state_dim
     self.sos_id = sos_id
     self.eos_id = eos_id
     self.embedding = nn.Embedding(num_classes, hidden_state_dim)
     rnn_cell = self.supported_rnns[rnn_type.lower()]
     self.rnn = rnn_cell(
         input_size=hidden_state_dim,
         hidden_size=hidden_state_dim,
         num_layers=num_layers,
         bias=True,
         batch_first=True,
         dropout=dropout_p,
         bidirectional=False,
     )
     self.out_proj = Linear(hidden_state_dim, output_dim)
Example #21
File: model.py Project: Rhcsky/KoSpeech
 def __init__(
     self,
     encoder: TransducerEncoder,
     decoder: TransducerDecoder,
     d_model: int,
     num_classes: int,
 ) -> None:
     super(TransducerModel, self).__init__()
     self.encoder = encoder
     self.decoder = decoder
     self.fc = Linear(d_model << 1, num_classes, bias=False)
Example #22
 def __init__(self, d_model: int = 512, input_dim: int = 80, d_ff: int = 2048,
              num_layers: int = 6, num_heads: int = 8, ffnet_style: str = 'ff',
              dropout_p: float = 0.3, pad_id: int = 0) -> None:
     super(TransformerEncoder, self).__init__()
     self.d_model = d_model
     self.num_layers = num_layers
     self.num_heads = num_heads
     self.pad_id = pad_id
     self.input_proj = Linear(input_dim, d_model)
     self.input_layer_norm = LayerNorm(d_model)
     self.input_dropout = nn.Dropout(p=dropout_p)
     self.pos_encoding = PositionalEncoding(d_model)
     self.layers = nn.ModuleList(
         [TransformerEncoderLayer(d_model, num_heads, d_ff, dropout_p, ffnet_style) for _ in range(num_layers)]
     )
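
`PositionalEncoding` is used by several of these encoders but never defined in the snippets. Below is a sketch of a standard sinusoidal positional encoding consistent with the `PositionalEncoding(d_model)` call; the forward signature (returning the first `length` position vectors) is an assumption, and the project's own class may differ:

    import math
    import torch
    import torch.nn as nn

    class PositionalEncoding(nn.Module):
        """Sketch of the classic sinusoidal positional encoding (Vaswani et al.)."""
        def __init__(self, d_model: int = 512, max_len: int = 5000) -> None:
            super().__init__()
            pe = torch.zeros(max_len, d_model)
            position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
            div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
            pe[:, 0::2] = torch.sin(position * div_term)
            pe[:, 1::2] = torch.cos(position * div_term)
            self.register_buffer('pe', pe.unsqueeze(0))

        def forward(self, length: int) -> torch.Tensor:
            # Return the first `length` positional vectors: (1, length, d_model).
            return self.pe[:, :length]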
Example #23
    def __init__(
        self,
        input_size: int,  # size of input
        num_classes: int,  # number of classes
        rnn_type='gru',  # type of RNN cell
        num_rnn_layers: int = 5,  # number of RNN layers
        rnn_hidden_dim: int = 512,  # dimension of RNN's hidden state
        dropout_p: float = 0.1,  # dropout probability
        bidirectional: bool = True,  # if True, becomes a bidirectional rnn
        activation: str = 'hardtanh',  # type of activation function
        device: torch.device = 'cuda'  # device - 'cuda' or 'cpu'
    ):
        super(DeepSpeech2, self).__init__()
        self.rnn_layers = nn.ModuleList()  # ModuleList so the stacked RNN layers register as submodules
        self.device = device

        input_size = int(math.floor(input_size + 2 * 20 - 41) / 2 + 1)
        input_size = int(math.floor(input_size + 2 * 10 - 21) / 2 + 1)
        input_size <<= 5
        rnn_output_size = rnn_hidden_dim << 1 if bidirectional else rnn_hidden_dim

        self.conv = DeepSpeech2Extractor(activation, mask_conv=True)

        for idx in range(num_rnn_layers):
            self.rnn_layers.append(
                BNReluRNN(
                    input_size=input_size if idx == 0 else rnn_output_size,
                    hidden_dim=rnn_hidden_dim,
                    rnn_type=rnn_type,
                    bidirectional=bidirectional,
                    dropout_p=dropout_p,
                    device=device))

        self.fc = nn.Sequential(
            Linear(rnn_output_size, rnn_hidden_dim), nn.ReLU(),
            Linear(rnn_hidden_dim, num_classes, bias=False))
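
The three lines that rewrite `input_size` track the frequency axis through two strided convolutions (kernel heights 41 and 21, frequency stride 2, paddings 20 and 10) and then multiply by 32 output channels (`<<= 5`); these constants match the 'ds2' extractor shown in Example #29. Worked through for the common 80-dimensional feature case:

    import math

    input_size = 80
    input_size = int(math.floor(input_size + 2 * 20 - 41) / 2 + 1)  # -> 40
    input_size = int(math.floor(input_size + 2 * 10 - 21) / 2 + 1)  # -> 20
    input_size <<= 5                                                # x32 channels -> 640
    print(input_size)  # 640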
Example #24
    def __init__(
            self,
            input_size: int,  # size of input
            num_classes: int,  # number of classes
            hidden_dim: int = 512,  # dimension of RNN's hidden state
            device: str = 'cuda',  # device - 'cuda' or 'cpu'
            dropout_p: float = 0.3,  # dropout probability
            num_layers: int = 3,  # number of RNN layers
            bidirectional: bool = True,  # if True, becomes a bidirectional encoder
            rnn_type: str = 'lstm',  # type of RNN cell
            extractor: str = 'vgg',  # type of CNN extractor
            activation: str = 'hardtanh',  # type of activation function
            mask_conv: bool = False,  # whether to apply masked convolution
            joint_ctc_attention: bool = False,  # use CTC Loss & Cross Entropy Joint Learning
    ) -> None:
        self.mask_conv = mask_conv
        self.extractor = extractor.lower()
        self.joint_ctc_attention = joint_ctc_attention

        if self.extractor == 'vgg':
            input_size = (input_size - 1) << 5 if input_size % 2 else input_size << 5
            super(Listener, self).__init__(
                input_size, hidden_dim, num_layers, rnn_type, dropout_p, bidirectional, device)
            self.conv = VGGExtractor(activation, mask_conv)

        elif self.extractor == 'ds2':
            input_size = int(math.floor(input_size + 2 * 20 - 41) / 2 + 1)
            input_size = int(math.floor(input_size + 2 * 10 - 21) / 2 + 1)
            input_size <<= 6
            super(Listener, self).__init__(
                input_size, hidden_dim, num_layers, rnn_type, dropout_p, bidirectional, device)
            self.conv = DeepSpeech2Extractor(activation, mask_conv)

        else:
            raise ValueError("Unsupported Extractor : {0}".format(extractor))

        if self.joint_ctc_attention:
            assert self.mask_conv, "if joint_ctc_attention is enabled, mask_conv should be True"
            self.fc = nn.Sequential(
                nn.BatchNorm1d(self.hidden_dim << 1), Transpose(shape=(1, 2)),
                nn.Dropout(dropout_p),
                Linear(self.hidden_dim << 1, num_classes, bias=False))
Example #25
    def __init__(self,
                 num_classes: int,                      # number of classes
                 d_model: int = 512,                    # dimension of model
                 input_dim: int = 80,                   # dimension of input
                 pad_id: int = 0,                       # identification of <PAD_token>
                 eos_id: int = 2,                       # identification of <EOS_token>
                 d_ff: int = 2048,                      # dimension of feed forward network
                 num_heads: int = 8,                    # number of attention heads
                 num_encoder_layers: int = 6,           # number of encoder layers
                 num_decoder_layers: int = 6,           # number of decoder layers
                 dropout_p: float = 0.3,                # dropout probability
                 ffnet_style: str = 'ff') -> None:      # feed forward network style 'ff' or 'conv'
        super(Transformer, self).__init__()

        assert d_model % num_heads == 0, "d_model % num_heads should be zero."

        self.eos_id = eos_id
        self.pad_id = pad_id
        self.encoder = TransformerEncoder(d_model, input_dim, d_ff,  num_encoder_layers, num_heads, ffnet_style, dropout_p, pad_id)
        self.decoder = TransformerDecoder(num_classes, d_model, d_ff, num_decoder_layers, num_heads, ffnet_style, dropout_p, pad_id)
        self.generator = Linear(d_model, num_classes)
Example #26
 def __init__(
     self,
     input_dim: int = 80,
     encoder_dim: int = 512,
     num_layers: int = 17,
     num_attention_heads: int = 8,
     feed_forward_expansion_factor: int = 4,
     conv_expansion_factor: int = 2,
     input_dropout_p: float = 0.1,
     feed_forward_dropout_p: float = 0.1,
     attention_dropout_p: float = 0.1,
     conv_dropout_p: float = 0.1,
     conv_kernel_size: int = 31,
     half_step_residual: bool = True,
     device: torch.device = 'cuda',
 ):
     super(ConformerEncoder, self).__init__()
     self.conv_subsample = Conv2dSubsampling(input_dim,
                                             in_channels=1,
                                             out_channels=encoder_dim)
     self.input_projection = nn.Sequential(
         Linear(self.conv_subsample.get_output_dim(), encoder_dim),
         nn.Dropout(p=input_dropout_p),
     )
     self.layers = nn.ModuleList([
         ConformerBlock(
             encoder_dim=encoder_dim,
             num_attention_heads=num_attention_heads,
             feed_forward_expansion_factor=feed_forward_expansion_factor,
             conv_expansion_factor=conv_expansion_factor,
             feed_forward_dropout_p=feed_forward_dropout_p,
             attention_dropout_p=attention_dropout_p,
             conv_dropout_p=conv_dropout_p,
             conv_kernel_size=conv_kernel_size,
             half_step_residual=half_step_residual,
             device=device,
         ).to(device) for _ in range(num_layers)
     ])
Example #27
 def __init__(
     self,
     vocab_size: int,  # size of vocab
     hidden_dim: int = 512,  # dimension of RNN's hidden state
     device: str = 'cuda',  # device - 'cuda' or 'cpu'
     dropout_p: float = 0.3,  # dropout probability
     num_layers: int = 3,  # number of RNN layers
     bidirectional: bool = True,  # if True, becomes a bidirectional encoder
     rnn_type: str = 'lstm'  # type of RNN cell
 ) -> None:
     super(SpellingCorrectorEncoder, self).__init__()
     self.embedding = nn.Embedding(vocab_size, hidden_dim)
     self.layers = nn.ModuleList([
         SpellingCorrectorEncoderLayer(hidden_dim=hidden_dim,
                                       device=device,
                                       dropout_p=dropout_p,
                                       num_layers=1,
                                       bidirectional=bidirectional,
                                       rnn_type=rnn_type)
         for _ in range(num_layers)
     ])
     self.dropout = nn.Dropout(p=dropout_p)
     self.fc = Linear(hidden_dim, hidden_dim, bias=True)
Example #28
    def __init__(
            self,
            num_classes: int,  # number of classes
            d_model: int = 512,  # dimension of model
            d_ff: int = 512,  # dimension of feed forward network
            num_layers: int = 6,  # number of decoder layers
            num_heads: int = 8,  # number of attention heads
            dropout_p: float = 0.3,  # probability of dropout
            pad_id: int = 0,  # identification of pad token
            sos_id: int = 1,  # identification of start of sentence token
            eos_id: int = 2,  # identification of end of sentence token
            max_length: int = 400,  # max length of decoding
    ) -> None:
        super(TransformerDecoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.max_length = max_length
        self.pad_id = pad_id
        self.sos_id = sos_id
        self.eos_id = eos_id

        self.embedding = Embedding(num_classes, pad_id, d_model)
        self.positional_encoding = PositionalEncoding(d_model)
        self.input_dropout = nn.Dropout(p=dropout_p)
        self.layers = nn.ModuleList([
            TransformerDecoderLayer(
                d_model=d_model,
                num_heads=num_heads,
                d_ff=d_ff,
                dropout_p=dropout_p,
            ) for _ in range(num_layers)
        ])
        self.fc = nn.Sequential(
            nn.LayerNorm(d_model),
            Linear(d_model, num_classes, bias=False),
        )
Example #29
File: model.py Project: pjhool/KoSpeech
    def __init__(
        self,
        num_classes: int,  # number of classes
        d_model: int = 512,  # dimension of model
        input_dim: int = 80,  # dimension of input
        pad_id: int = 0,  # identification of <PAD_token>
        eos_id: int = 2,  # identification of <EOS_token>
        d_ff: int = 2048,  # dimension of feed forward network
        num_heads: int = 8,  # number of attention heads
        num_encoder_layers: int = 6,  # number of encoder layers
        num_decoder_layers: int = 6,  # number of decoder layers
        dropout_p: float = 0.3,  # dropout probability
        ffnet_style: str = 'ff',  # feed forward network style 'ff' or 'conv'
        extractor: str = 'vgg'  # CNN extractor [vgg, ds2]
    ) -> None:
        super(SpeechTransformer, self).__init__()

        assert d_model % num_heads == 0, "d_model % num_heads should be zero."

        if extractor.lower() == 'vgg':
            input_dim = (input_dim - 1) << 5 if input_dim % 2 else input_dim << 5
            self.conv = nn.Sequential(
                nn.Conv2d(1,
                          64,
                          kernel_size=3,
                          stride=1,
                          padding=1,
                          bias=False), nn.BatchNorm2d(num_features=64),
                nn.Hardtanh(0, 20, inplace=True),
                nn.Conv2d(64,
                          64,
                          kernel_size=3,
                          stride=1,
                          padding=1,
                          bias=False), nn.BatchNorm2d(num_features=64),
                nn.Hardtanh(0, 20, inplace=True), nn.MaxPool2d(2, stride=2),
                nn.Conv2d(64,
                          128,
                          kernel_size=3,
                          stride=1,
                          padding=1,
                          bias=False), nn.BatchNorm2d(num_features=128),
                nn.Hardtanh(0, 20, inplace=True),
                nn.Conv2d(128,
                          128,
                          kernel_size=3,
                          stride=1,
                          padding=1,
                          bias=False), nn.BatchNorm2d(num_features=128),
                nn.Hardtanh(0, 20, inplace=True), nn.MaxPool2d(2, stride=2))

        elif extractor.lower() == 'ds2':
            input_dim = int(math.floor(input_dim + 2 * 20 - 41) / 2 + 1)
            input_dim = int(math.floor(input_dim + 2 * 10 - 21) / 2 + 1)
            input_dim <<= 5
            self.conv = nn.Sequential(
                nn.Conv2d(1,
                          32,
                          kernel_size=(41, 11),
                          stride=(2, 2),
                          padding=(20, 5),
                          bias=False),
                nn.BatchNorm2d(32),
                nn.Hardtanh(0, 20, inplace=True),
                nn.Conv2d(32,
                          32,
                          kernel_size=(21, 11),
                          stride=(2, 1),
                          padding=(10, 5),
                          bias=False),
                nn.BatchNorm2d(32),
                nn.Hardtanh(0, 20, inplace=True),
            )

        else:
            raise ValueError("Unsupported Extractor : {0}".format(extractor))

        self.encoder = SpeechTransformerEncoder(d_model=d_model,
                                                input_dim=input_dim,
                                                d_ff=d_ff,
                                                num_layers=num_encoder_layers,
                                                num_heads=num_heads,
                                                ffnet_style=ffnet_style,
                                                dropout_p=dropout_p,
                                                pad_id=pad_id)
        self.decoder = SpeechTransformerDecoder(num_classes=num_classes,
                                                d_model=d_model,
                                                d_ff=d_ff,
                                                num_layers=num_decoder_layers,
                                                num_heads=num_heads,
                                                ffnet_style=ffnet_style,
                                                dropout_p=dropout_p,
                                                pad_id=pad_id,
                                                eos_id=eos_id)

        self.eos_id = eos_id
        self.pad_id = pad_id
        self.generator = Linear(d_model, num_classes)
Example #30
 def __init__(self, d_model: int) -> None:
     super(AdditiveAttention, self).__init__()
     self.query_proj = Linear(d_model, d_model, bias=False)
     self.key_proj = Linear(d_model, d_model, bias=False)
     self.bias = nn.Parameter(torch.rand(d_model).uniform_(-0.1, 0.1))
     self.score_proj = Linear(d_model, 1)
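
The layer layout implies the usual Bahdanau-style additive score, score = score_proj(tanh(query_proj(query) + key_proj(key) + bias)). The forward pass itself is not shown, so the following is only a shape sketch under that assumption (it also assumes the snippet's `Linear` behaves like `nn.Linear`):

    import torch

    attn = AdditiveAttention(d_model=512)
    query = torch.randn(2, 1, 512)   # one decoder step
    key = torch.randn(2, 50, 512)    # encoder outputs
    # Bahdanau-style additive score over the 50 encoder positions.
    score = attn.score_proj(torch.tanh(attn.query_proj(query) + attn.key_proj(key) + attn.bias))
    print(score.shape)  # torch.Size([2, 50, 1])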