def __init__(
        self,
        input_dim: int,
        extractor: str = 'vgg',
        d_model: int = None,
        num_classes: int = None,
        dropout_p: float = None,
        activation: str = 'hardtanh',
        joint_ctc_attention: bool = False,
) -> None:
    super(BaseEncoder, self).__init__()

    if joint_ctc_attention:
        assert num_classes, "If `joint_ctc_attention` is True, `num_classes` should not be None"
        assert dropout_p, "If `joint_ctc_attention` is True, `dropout_p` should not be None"
        assert d_model, "If `joint_ctc_attention` is True, `d_model` should not be None"

    if extractor is not None:
        extractor = self.supported_extractors[extractor.lower()]
        self.conv = extractor(input_dim=input_dim, activation=activation)
        self.conv_output_dim = self.conv.get_output_dim()

    self.num_classes = num_classes
    self.joint_ctc_attention = joint_ctc_attention

    if self.joint_ctc_attention:
        # projection head producing per-frame logits for the auxiliary CTC loss
        self.fc = nn.Sequential(
            nn.BatchNorm1d(d_model),
            Transpose(shape=(1, 2)),
            nn.Dropout(dropout_p),
            Linear(d_model, num_classes, bias=False),
        )
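# Example (illustrative, not part of the repository): the joint-CTC head above
# expects channel-first input -- nn.BatchNorm1d(d_model) normalizes a
# (batch, d_model, time) tensor, then Transpose(shape=(1, 2)) swaps back to
# (batch, time, d_model) so the final projection yields per-frame logits.
# A minimal shape sketch using plain torch.nn stand-ins for the repository's
# Transpose and Linear helpers; d_model / num_classes values are arbitrary.
import torch
import torch.nn as nn

d_model, num_classes, dropout_p = 512, 2000, 0.3

class Transpose(nn.Module):
    """Stand-in for the repository's Transpose module: swaps two dimensions."""
    def __init__(self, shape):
        super().__init__()
        self.shape = shape

    def forward(self, x):
        return x.transpose(*self.shape)

fc = nn.Sequential(
    nn.BatchNorm1d(d_model),                      # expects (batch, d_model, time)
    Transpose(shape=(1, 2)),                      # -> (batch, time, d_model)
    nn.Dropout(dropout_p),
    nn.Linear(d_model, num_classes, bias=False),  # -> (batch, time, num_classes)
)

encoder_outputs = torch.randn(4, 100, d_model)    # (batch, time, d_model)
logits = fc(encoder_outputs.transpose(1, 2))      # per-frame CTC logits
print(logits.shape)                               # torch.Size([4, 100, 2000])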
def __init__(
        self,
        input_size: int,                    # size of input
        num_classes: int,                   # number of classes
        hidden_dim: int = 512,              # dimension of RNN's hidden state
        device: str = 'cuda',               # device - 'cuda' or 'cpu'
        dropout_p: float = 0.3,             # dropout probability
        num_layers: int = 3,                # number of RNN layers
        bidirectional: bool = True,         # if True, becomes a bidirectional encoder
        rnn_type: str = 'lstm',             # type of RNN cell
        extractor: str = 'vgg',             # type of CNN extractor
        activation: str = 'hardtanh',       # type of activation function
        mask_conv: bool = False,            # flag indicating whether to apply mask convolution
        joint_ctc_attention: bool = False,  # use CTC loss & cross-entropy joint learning
) -> None:
    self.mask_conv = mask_conv
    self.extractor = extractor.lower()
    self.joint_ctc_attention = joint_ctc_attention

    if self.extractor == 'vgg':
        input_size = (input_size - 1) << 5 if input_size % 2 else input_size << 5
        super(Listener, self).__init__(input_size, hidden_dim, num_layers, rnn_type, dropout_p, bidirectional, device)
        self.conv = VGGExtractor(activation, mask_conv)

    elif self.extractor == 'ds2':
        input_size = int(math.floor(input_size + 2 * 20 - 41) / 2 + 1)
        input_size = int(math.floor(input_size + 2 * 10 - 21) / 2 + 1)
        input_size <<= 6
        super(Listener, self).__init__(input_size, hidden_dim, num_layers, rnn_type, dropout_p, bidirectional, device)
        self.conv = DeepSpeech2Extractor(activation, mask_conv)

    else:
        raise ValueError("Unsupported Extractor : {0}".format(extractor))

    if self.joint_ctc_attention:
        assert self.mask_conv, "If `joint_ctc_attention` is True, `mask_conv` should be True"
        self.fc = nn.Sequential(
            nn.BatchNorm1d(self.hidden_dim << 1),
            Transpose(shape=(1, 2)),
            nn.Dropout(dropout_p),
            Linear(self.hidden_dim << 1, num_classes, bias=False),
        )
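# Worked example (illustrative): the bit-shifts above fold the CNN extractor's
# output channels into the flattened frequency dimension that the RNN consumes.
# A self-contained sketch of the same arithmetic, assuming 80-dimensional
# input features (the helper name is hypothetical, not a repository API):
import math

def listener_rnn_input_size(input_size: int, extractor: str) -> int:
    # mirrors the arithmetic in Listener.__init__ above
    if extractor == 'vgg':
        # two 2x pooling stages on the frequency axis, folded channels -> * 32
        return (input_size - 1) << 5 if input_size % 2 else input_size << 5
    if extractor == 'ds2':
        # two strided convs (kernel 41/pad 20, then kernel 21/pad 10), * 64
        input_size = int(math.floor(input_size + 2 * 20 - 41) / 2 + 1)
        input_size = int(math.floor(input_size + 2 * 10 - 21) / 2 + 1)
        return input_size << 6
    raise ValueError("Unsupported Extractor : {0}".format(extractor))

print(listener_rnn_input_size(80, 'vgg'))  # 2560 (= 80 * 32)
print(listener_rnn_input_size(80, 'ds2'))  # 1280 (= 20 * 64)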
def __init__(
        self,
        in_channels: int,
        kernel_size: int = 31,
        expansion_factor: int = 2,
        dropout_p: float = 0.1,
        device: torch.device = 'cuda',
) -> None:
    super(ConformerConvModule, self).__init__()
    assert (kernel_size - 1) % 2 == 0, "kernel_size should be an odd number for 'SAME' padding"
    assert expansion_factor == 2, "Currently, only supports expansion_factor 2"

    self.device = device
    self.sequential = nn.Sequential(
        LayerNorm(in_channels),
        Transpose(shape=(1, 2)),
        PointwiseConv1d(in_channels, in_channels * expansion_factor, stride=1, padding=0, bias=True),
        GLU(dim=1),
        DepthwiseConv1d(in_channels, in_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2),
        nn.BatchNorm1d(in_channels),
        Swish(),
        PointwiseConv1d(in_channels, in_channels, stride=1, padding=0, bias=True),
        nn.Dropout(p=dropout_p),
    )
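# Example (illustrative): the module above follows the Conformer convolution
# recipe -- pointwise expansion to 2x channels, a GLU gate that halves them
# back, a depthwise conv with 'SAME' padding, batch norm, Swish, and a final
# pointwise projection. A minimal equivalent built from plain torch.nn layers
# (nn.GLU and nn.SiLU stand in for the repository's GLU and Swish; a pointwise
# conv is just Conv1d with kernel_size=1, a depthwise conv uses groups=in_channels):
import torch
import torch.nn as nn

in_channels, kernel_size, dropout_p = 144, 31, 0.1

conv_module = nn.Sequential(
    nn.Conv1d(in_channels, in_channels * 2, kernel_size=1, bias=True),  # pointwise expansion (2x)
    nn.GLU(dim=1),                                                      # gate halves channels back
    nn.Conv1d(in_channels, in_channels, kernel_size,
              padding=(kernel_size - 1) // 2, groups=in_channels),      # depthwise, length-preserving
    nn.BatchNorm1d(in_channels),
    nn.SiLU(),                                                          # Swish(x) = x * sigmoid(x)
    nn.Conv1d(in_channels, in_channels, kernel_size=1, bias=True),      # pointwise projection
    nn.Dropout(p=dropout_p),
)

x = torch.randn(4, 200, in_channels)             # (batch, time, channels)
x = nn.LayerNorm(in_channels)(x)                 # pre-norm on the channel-last layout
y = conv_module(x.transpose(1, 2)).transpose(1, 2)
print(y.shape)                                   # torch.Size([4, 200, 144]) -- time length preserved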
def __init__(
        self,
        num_classes: int,                   # number of classes
        d_model: int = 512,                 # dimension of model
        input_dim: int = 80,                # dimension of input
        pad_id: int = 0,                    # identification of <PAD_token>
        sos_id: int = 1,                    # identification of <SOS_token>
        eos_id: int = 2,                    # identification of <EOS_token>
        d_ff: int = 2048,                   # dimension of feed forward network
        num_heads: int = 8,                 # number of attention heads
        num_encoder_layers: int = 6,        # number of encoder layers
        num_decoder_layers: int = 6,        # number of decoder layers
        dropout_p: float = 0.3,             # dropout probability
        ffnet_style: str = 'ff',            # feed forward network style: 'ff' or 'conv'
        extractor: str = 'vgg',             # CNN extractor: 'vgg' or 'ds2'
        joint_ctc_attention: bool = False,  # flag indicating whether to apply joint CTC attention
        max_length: int = 400,              # maximum allowed length of a processed sequence
) -> None:
    super(SpeechTransformer, self).__init__()
    assert d_model % num_heads == 0, "d_model % num_heads should be zero."

    self.num_classes = num_classes
    self.extractor = extractor
    self.joint_ctc_attention = joint_ctc_attention
    self.sos_id = sos_id
    self.eos_id = eos_id
    self.pad_id = pad_id
    self.max_length = max_length

    if self.extractor == 'vgg':
        input_dim = (input_dim - 1) << 5 if input_dim % 2 else input_dim << 5
        self.conv = VGGExtractor(mask_conv=False)

    elif self.extractor == 'ds2':
        input_dim = int(math.floor(input_dim + 2 * 20 - 41) / 2 + 1)
        input_dim = int(math.floor(input_dim + 2 * 10 - 21) / 2 + 1)
        input_dim <<= 6
        self.conv = DeepSpeech2Extractor(mask_conv=False)

    else:
        raise ValueError("Unsupported Extractor : {0}".format(extractor))

    self.encoder = SpeechTransformerEncoder(
        d_model=d_model,
        input_dim=input_dim,
        d_ff=d_ff,
        num_layers=num_encoder_layers,
        num_heads=num_heads,
        ffnet_style=ffnet_style,
        dropout_p=dropout_p,
        pad_id=pad_id,
    )

    if self.joint_ctc_attention:
        # projection head producing per-frame logits for the auxiliary CTC loss
        self.encoder_fc = nn.Sequential(
            nn.BatchNorm1d(d_model),
            Transpose(shape=(1, 2)),
            nn.Dropout(dropout_p),
            Linear(d_model, num_classes, bias=False),
        )

    self.decoder = SpeechTransformerDecoder(
        num_classes=num_classes,
        d_model=d_model,
        d_ff=d_ff,
        num_layers=num_decoder_layers,
        num_heads=num_heads,
        ffnet_style=ffnet_style,
        dropout_p=dropout_p,
        pad_id=pad_id,
        eos_id=eos_id,
    )
    self.decoder_fc = Linear(d_model, num_classes)
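# Sketch (illustrative, NOT the repository's training loop): with
# joint_ctc_attention enabled, the model exposes two outputs -- per-frame CTC
# logits from encoder_fc and per-token logits from the decoder -- which a
# training loop would typically blend. The ctc_weight value, function name,
# and tensor shapes here are assumptions for illustration only.
import torch
import torch.nn as nn

ctc_weight = 0.3                                     # illustrative blend factor
ctc_criterion = nn.CTCLoss(blank=0, zero_infinity=True)
ce_criterion = nn.CrossEntropyLoss(ignore_index=0)   # pad_id = 0

def joint_loss(encoder_log_probs, output_lengths, decoder_logits, targets, target_lengths):
    # encoder_log_probs: (time, batch, num_classes) log-softmax of encoder_fc output
    # decoder_logits:    (batch, seq_len, num_classes) output of decoder_fc
    ctc_loss = ctc_criterion(encoder_log_probs, targets, output_lengths, target_lengths)
    ce_loss = ce_criterion(decoder_logits.reshape(-1, decoder_logits.size(-1)), targets.reshape(-1))
    return ctc_weight * ctc_loss + (1.0 - ctc_weight) * ce_loss

# toy shapes: batch 4, 120 encoder frames, 20 target tokens, 2000 classes
B, T, U, C = 4, 120, 20, 2000
log_probs = torch.randn(T, B, C).log_softmax(dim=-1)
dec_logits = torch.randn(B, U, C)
targets = torch.randint(1, C, (B, U))                # labels exclude blank=0
out_lens = torch.full((B,), T, dtype=torch.long)
tgt_lens = torch.full((B,), U, dtype=torch.long)
print(joint_loss(log_probs, out_lens, dec_logits, targets, tgt_lens))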