def __init__(
    self,
    encoder,
    in_features,
    num_classes,
    n_fft,
    hop_length,
    sample_rate,
    n_mels,
    fmin,
    fmax,
    dropout_rate=0.1,
    freeze_spectrogram_parameters=True,
    freeze_logmel_parameters=True,
    use_spec_augmentation=True,
    time_drop_width=64,
    time_stripes_num=2,
    freq_drop_width=8,
    freq_stripes_num=2,
    spec_augmentation_method=None,
    apply_mixup=False,
    apply_spec_shuffle=False,
    spec_shuffle_prob=0,
    use_gru_layer=False,
    apply_tta=False,
    apply_encoder=False,
    **params,
):
    """Build the spectrogram front end, a Conformer stack and the classifier head.

    Args:
        encoder: Not referenced by the constructor code shown here.
        in_features: Not referenced by the constructor code shown here.
        num_classes: Output width of the final ``nn.Linear`` layer.
        n_fft: Passed to ``Spectrogram`` (as both ``n_fft`` and
            ``win_length``) and to ``LogmelFilterBank``.
        hop_length: Passed to ``Spectrogram``.
        sample_rate: Passed to ``LogmelFilterBank`` as ``sr``.
        n_mels: Number of mel bins; also used as the Conformer ``dim`` and
            as the input width of the final ``nn.Linear`` layer.
        fmin: Passed to ``LogmelFilterBank``.
        fmax: Passed to ``LogmelFilterBank``.
        dropout_rate: Used for the Conformer attention / feed-forward / conv
            dropouts and for the dropout in front of the classifier.
        freeze_spectrogram_parameters: Forwarded to ``Spectrogram`` as
            ``freeze_parameters``.
        freeze_logmel_parameters: Forwarded to ``LogmelFilterBank`` as
            ``freeze_parameters``.
        use_spec_augmentation: When falsy, ``self.spec_augmenter`` is ``None``.
        time_drop_width: Forwarded to the spec augmenter.
        time_stripes_num: Forwarded to the spec augmenter.
        freq_drop_width: Forwarded to the spec augmenter.
        freq_stripes_num: Forwarded to the spec augmenter.
        spec_augmentation_method: ``None`` selects ``SpecAugmentation``; any
            other value selects ``SpecAugmentationPlusPlus`` and is passed
            to it as ``method``.
        apply_mixup: Stored on the instance.
        apply_spec_shuffle: Stored on the instance.
        spec_shuffle_prob: Stored on the instance.
        use_gru_layer: Stored on the instance.
        apply_tta: Stored on the instance.
        apply_encoder: Not referenced by the constructor code shown here.
        **params: Extra keyword arguments; accepted and ignored here.
    """
    super().__init__()

    # Plain configuration values kept on the instance.
    self.n_mels = n_mels
    self.dropout_rate = dropout_rate
    self.apply_mixup = apply_mixup
    self.apply_spec_shuffle = apply_spec_shuffle
    self.spec_shuffle_prob = spec_shuffle_prob
    self.use_gru_layer = use_gru_layer
    self.apply_tta = apply_tta

    # Spectrogram extractor
    spectrogram_config = dict(
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=n_fft,
        window="hann",
        center=True,
        pad_mode="reflect",
        freeze_parameters=freeze_spectrogram_parameters,
    )
    self.spectrogram_extractor = Spectrogram(**spectrogram_config)

    # Logmel feature extractor (constructed with is_log=False)
    logmel_config = dict(
        sr=sample_rate,
        n_fft=n_fft,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
        ref=1.0,
        amin=1e-10,
        top_db=None,
        freeze_parameters=freeze_logmel_parameters,
        is_log=False,
    )
    self.logmel_extractor = LogmelFilterBank(**logmel_config)

    # Spec augmenter: None when disabled, otherwise chosen by whether a
    # method was requested.
    augmenter = None
    if use_spec_augmentation:
        stripe_config = dict(
            time_drop_width=time_drop_width,
            time_stripes_num=time_stripes_num,
            freq_drop_width=freq_drop_width,
            freq_stripes_num=freq_stripes_num,
        )
        if spec_augmentation_method is None:
            augmenter = SpecAugmentation(**stripe_config)
        else:
            augmenter = SpecAugmentationPlusPlus(
                method=spec_augmentation_method,
                **stripe_config,
            )
    self.spec_augmenter = augmenter

    # Encoder: three identically configured Conformer blocks applied in sequence.
    conformer_config = dict(
        dim=n_mels,
        dim_head=64,
        heads=8,
        ff_mult=4,
        conv_expansion_factor=2,
        conv_kernel_size=31,
        attn_dropout=dropout_rate,
        ff_dropout=dropout_rate,
        conv_dropout=dropout_rate,
    )
    self.conformer = nn.Sequential(
        *(ConformerBlock(**conformer_config) for _ in range(3))
    )

    # Classifier head: dropout followed by a linear projection to the classes.
    dropout_layer = nn.Dropout(dropout_rate)
    projection = nn.Linear(n_mels, num_classes)
    self.fc = nn.Sequential(dropout_layer, projection)
def __init__(self, args, num_mels, num_classes):
    """Build the convolutional stack, GRU, multi-head block and output layers.

    Args:
        args: Mapping whose entries are installed as instance attributes.
            This constructor reads ``n_conv_layers``, ``n_pool_layers``,
            ``transfo_head``, ``embedding_size``, ``kernel_size``,
            ``batch_norm``, ``dropout_transfo`` and ``pooling`` from it.
        num_mels: Number of input frequency bins; also the channel count of
            ``bn0`` and the basis for the frequency drop width of the
            spec augmenter (``num_mels // 8``).
        num_classes: Output size of the final linear layer(s).

    Raises:
        AssertionError: If ``n_conv_layers`` is not a multiple of
            ``n_pool_layers``.
    """
    super(TALNetV3NoMeta, self).__init__()
    # Install every entry of ``args`` directly as an instance attribute.
    self.__dict__.update(args)
    assert self.n_conv_layers % self.n_pool_layers == 0

    n_freq_bins = num_mels
    self.input_n_freq_bins = n_freq_bins
    self.output_size = num_classes
    self.n_head = self.transfo_head
    self.d_k = 128
    self.d_v = 128

    # Conv: two parallel stacks are built per layer index. Each layer is kept
    # in a plain list and also set as a numbered attribute ("conv1",
    # "conv_v21", ...).
    self.conv = []
    self.conv_v2 = []
    # True division on purpose: this value and everything derived from it
    # may be floats, exactly as the layer constructors below receive them.
    pool_interval = self.n_conv_layers / self.n_pool_layers
    in_channels = 1
    for layer_idx in range(self.n_conv_layers):
        layer_no = layer_idx + 1
        if layer_no % pool_interval == 0:
            # This layer has pooling: halve the frequency resolution first.
            n_freq_bins = n_freq_bins / 2
            out_channels = self.embedding_size / n_freq_bins
            if layer_idx < pool_interval * 2:
                stride = (2, 2)
            else:
                stride = (1, 2)
        else:
            out_channels = self.embedding_size * 2 / n_freq_bins
            stride = None

        conv_block = ConvBlock(
            in_channels,
            out_channels,
            self.kernel_size,
            batch_norm=self.batch_norm,
            pool_stride=stride,
        )
        self.conv.append(conv_block)
        setattr(self, f"conv{layer_no}", conv_block)

        kernel = int(self.kernel_size)
        conv_block_v2 = ConvBlockTALNet2(
            int(in_channels),
            int(out_channels),
            (kernel, kernel),
            norm="GN",
            pool_stride=stride,
            pool_strat="max",
            activation="mish",
        )
        self.conv_v2.append(conv_block_v2)
        setattr(self, f"conv_v2{layer_no}", conv_block_v2)

        in_channels = out_channels

    # Spec augmenter
    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64,
        time_stripes_num=2,
        freq_drop_width=num_mels // 8,
        freq_stripes_num=2,
    )
    self.bn0 = nn.BatchNorm2d(num_mels)

    # Temporal modelling (GRU + multi-head block)
    gru_input_size = int(self.embedding_size)
    gru_hidden_size = int(self.embedding_size / 2)
    self.gru = nn.GRU(
        gru_input_size,
        gru_hidden_size,
        1,
        batch_first=True,
        bidirectional=True,
    )
    self.multihead_v2 = MultiHead(
        self.n_head,
        self.embedding_size,
        self.d_k,
        self.d_v,
        self.dropout_transfo,
    )

    # FC
    fc_in_features = self.embedding_size * 2
    self.fc_prob = nn.Linear(fc_in_features, self.output_size)
    if self.pooling == "att":
        self.fc_att = nn.Linear(fc_in_features, self.output_size)

    # Better initialization: orthogonal GRU weights and zero GRU biases,
    # visiting forward ih, forward hh, reverse ih, reverse hh in that order.
    for direction in ("", "_reverse"):
        for gate in ("ih", "hh"):
            weight = getattr(self.gru, f"weight_{gate}_l0{direction}")
            bias = getattr(self.gru, f"bias_{gate}_l0{direction}")
            nn.init.orthogonal_(weight)
            nn.init.constant_(bias, 0)

    nn.init.xavier_uniform_(self.fc_prob.weight)
    nn.init.constant_(self.fc_prob.bias, 0)
    if self.pooling == "att":
        nn.init.xavier_uniform_(self.fc_att.weight)
        nn.init.constant_(self.fc_att.bias, 0)
    if self.pooling == "auto":
        self.autopool = AutoPool(self.output_size)
def __init__(
    self,
    encoder,
    in_features,
    num_classes,
    n_fft,
    hop_length,
    sample_rate,
    n_mels,
    fmin,
    fmax,
    dropout_rate=0.5,
    freeze_spectrogram_parameters=True,
    freeze_logmel_parameters=True,
    use_spec_augmentation=True,
    time_drop_width=64,
    time_stripes_num=2,
    freq_drop_width=8,
    freq_stripes_num=2,
    spec_augmentation_method=None,
    apply_mixup=False,
    apply_spec_shuffle=False,
    spec_shuffle_prob=0,
    use_gru_layer=False,
    apply_tta=False,
    use_loudness=False,
    use_spectral_centroid=False,
    apply_delta_spectrum=False,
    apply_time_freq_encoding=False,
    min_db=120,
    apply_pcen=False,
    freeze_pcen_parameters=False,
    use_multisample_dropout=False,
    multisample_dropout=0.5,
    num_multisample_dropout=5,
    pooling_kernel_size=3,
    **params,
):
    """Build the spectrogram front end, optional feature extractors and encoder.

    Args:
        encoder: Module stored unchanged as ``self.encoder``.
        in_features: Not referenced in the part of the constructor shown here.
        num_classes: Not referenced in the part of the constructor shown here.
        n_fft: Passed to ``Spectrogram`` (as both ``n_fft`` and
            ``win_length``), to ``LogmelFilterBank`` and to ``Loudness``.
        hop_length: Passed to ``Spectrogram``.
        sample_rate: Passed as ``sr`` to ``LogmelFilterBank`` and ``Loudness``.
        n_mels: Number of mel bins for ``LogmelFilterBank``; stored on the
            instance.
        fmin: Passed to ``LogmelFilterBank``.
        fmax: Passed to ``LogmelFilterBank``.
        dropout_rate: Stored on the instance.
        freeze_spectrogram_parameters: Forwarded to ``Spectrogram`` as
            ``freeze_parameters``.
        freeze_logmel_parameters: Forwarded to ``LogmelFilterBank`` as
            ``freeze_parameters``.
        use_spec_augmentation: When falsy, ``self.spec_augmenter`` is ``None``.
        time_drop_width: Forwarded to the spec augmenter.
        time_stripes_num: Forwarded to the spec augmenter.
        freq_drop_width: Forwarded to the spec augmenter.
        freq_stripes_num: Forwarded to the spec augmenter.
        spec_augmentation_method: ``None`` selects ``SpecAugmentation``; any
            other value selects ``SpecAugmentationPlusPlus`` and is passed
            to it as ``method``.
        apply_mixup: Stored on the instance.
        apply_spec_shuffle: Stored on the instance.
        spec_shuffle_prob: Stored on the instance.
        use_gru_layer: Stored on the instance.
        apply_tta: Stored on the instance.
        use_loudness: When truthy, creates ``loudness_bn`` and
            ``loudness_extractor``.
        use_spectral_centroid: When truthy, creates ``spectral_centroid_bn``.
        apply_delta_spectrum: Stored on the instance.
        apply_time_freq_encoding: Stored on the instance.
        min_db: Passed to ``Loudness``.
        apply_pcen: When truthy, creates ``pcen_transform``.
        freeze_pcen_parameters: When truthy, ``PCENTransform`` is built with
            ``trainable=False``; otherwise with ``trainable=True``.
        use_multisample_dropout: When truthy, creates ``big_dropout``.
        multisample_dropout: Drop probability of ``big_dropout``.
        num_multisample_dropout: Stored on the instance.
        pooling_kernel_size: Stored on the instance.
        **params: Extra keyword arguments; accepted and ignored here.
    """
    super().__init__()

    # Plain configuration values kept on the instance.
    self.n_mels = n_mels
    self.dropout_rate = dropout_rate
    self.apply_mixup = apply_mixup
    self.apply_spec_shuffle = apply_spec_shuffle
    self.spec_shuffle_prob = spec_shuffle_prob
    self.use_gru_layer = use_gru_layer
    self.apply_tta = apply_tta
    self.use_loudness = use_loudness
    self.use_spectral_centroid = use_spectral_centroid
    self.apply_delta_spectrum = apply_delta_spectrum
    self.apply_time_freq_encoding = apply_time_freq_encoding
    self.apply_pcen = apply_pcen
    self.use_multisample_dropout = use_multisample_dropout
    self.num_multisample_dropout = num_multisample_dropout
    self.pooling_kernel_size = pooling_kernel_size

    # Spectrogram extractor
    self.spectrogram_extractor = Spectrogram(
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=n_fft,
        window="hann",
        center=True,
        pad_mode="reflect",
        freeze_parameters=freeze_spectrogram_parameters,
    )

    # Logmel feature extractor (constructed with is_log=False); a separate
    # AmplitudeToDB transform is created alongside it.
    self.logmel_extractor = LogmelFilterBank(
        sr=sample_rate,
        n_fft=n_fft,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
        ref=1.0,
        amin=1e-10,
        top_db=None,
        freeze_parameters=freeze_logmel_parameters,
        is_log=False,
    )
    self.power_to_db = torchaudio.transforms.AmplitudeToDB()

    # Spec augmenter: None when disabled, otherwise chosen by whether a
    # method was requested.
    self.spec_augmenter = None
    if use_spec_augmentation:
        if spec_augmentation_method is None:
            self.spec_augmenter = SpecAugmentation(
                time_drop_width=time_drop_width,
                time_stripes_num=time_stripes_num,
                freq_drop_width=freq_drop_width,
                freq_stripes_num=freq_stripes_num,
            )
        else:
            self.spec_augmenter = SpecAugmentationPlusPlus(
                time_drop_width=time_drop_width,
                time_stripes_num=time_stripes_num,
                freq_drop_width=freq_drop_width,
                freq_stripes_num=freq_stripes_num,
                method=spec_augmentation_method,
            )

    if self.use_loudness:
        self.loudness_bn = nn.BatchNorm1d(1)
        self.loudness_extractor = Loudness(
            sr=sample_rate,
            n_fft=n_fft,
            min_db=min_db,
        )

    if self.use_spectral_centroid:
        self.spectral_centroid_bn = nn.BatchNorm1d(1)

    if self.apply_pcen:
        # Logical negation, not bitwise "~": on a bool, ~False == -1 and
        # ~True == -2, both truthy, so "~" could never disable training.
        self.pcen_transform = PCENTransform(
            trainable=not freeze_pcen_parameters,
        )

    # The encoder is stored as passed in; no layers are stripped here.
    self.encoder = encoder

    if self.use_multisample_dropout:
        self.big_dropout = nn.Dropout(p=multisample_dropout)