# Example #1
    def __init__(
        self,
        encoder,
        in_features,
        num_classes,
        n_fft,
        hop_length,
        sample_rate,
        n_mels,
        fmin,
        fmax,
        dropout_rate=0.1,
        freeze_spectrogram_parameters=True,
        freeze_logmel_parameters=True,
        use_spec_augmentation=True,
        time_drop_width=64,
        time_stripes_num=2,
        freq_drop_width=8,
        freq_stripes_num=2,
        spec_augmentation_method=None,
        apply_mixup=False,
        apply_spec_shuffle=False,
        spec_shuffle_prob=0,
        use_gru_layer=False,
        apply_tta=False,
        apply_encoder=False,
        **params,
    ):
        """Build the waveform -> spectrogram -> conformer -> linear classifier.

        The pipeline constructed here is: STFT spectrogram extraction,
        log-mel filterbank, optional SpecAugment-style masking, a stack of
        three ConformerBlocks operating on the mel dimension, and a final
        dropout + linear head producing `num_classes` logits.

        NOTE(review): `encoder`, `in_features`, `apply_encoder` and `**params`
        are accepted but not used in this constructor — presumably consumed
        by a sibling class sharing this signature; confirm before removing.
        """
        super().__init__()

        # Flags and sizes consulted later (presumably in forward()).
        self.n_mels = n_mels
        self.dropout_rate = dropout_rate
        self.apply_mixup = apply_mixup
        self.apply_spec_shuffle = apply_spec_shuffle
        self.spec_shuffle_prob = spec_shuffle_prob
        self.use_gru_layer = use_gru_layer
        self.apply_tta = apply_tta

        # Waveform -> power spectrogram (window length tied to n_fft).
        self.spectrogram_extractor = Spectrogram(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=n_fft,
            window="hann",
            center=True,
            pad_mode="reflect",
            freeze_parameters=freeze_spectrogram_parameters,
        )

        # Spectrogram -> mel filterbank features (is_log=False: kept linear).
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=n_fft,
            n_mels=n_mels,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=freeze_logmel_parameters,
            is_log=False,
        )

        # Optional SpecAugment: the "PlusPlus" variant is selected only when
        # an explicit method name is supplied.
        self.spec_augmenter = None
        if use_spec_augmentation:
            if spec_augmentation_method is None:
                self.spec_augmenter = SpecAugmentation(
                    time_drop_width=time_drop_width,
                    time_stripes_num=time_stripes_num,
                    freq_drop_width=freq_drop_width,
                    freq_stripes_num=freq_stripes_num,
                )
            else:
                self.spec_augmenter = SpecAugmentationPlusPlus(
                    time_drop_width=time_drop_width,
                    time_stripes_num=time_stripes_num,
                    freq_drop_width=freq_drop_width,
                    freq_stripes_num=freq_stripes_num,
                    method=spec_augmentation_method,
                )

        # Three stacked conformer blocks over the mel dimension.
        conformer_blocks = [
            ConformerBlock(
                dim=n_mels,
                dim_head=64,
                heads=8,
                ff_mult=4,
                conv_expansion_factor=2,
                conv_kernel_size=31,
                attn_dropout=dropout_rate,
                ff_dropout=dropout_rate,
                conv_dropout=dropout_rate,
            )
            for _ in range(3)
        ]
        self.conformer = nn.Sequential(*conformer_blocks)

        # Classification head: dropout then a single linear projection.
        self.fc = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(n_mels, num_classes),
        )
# Example #2
    def __init__(self, args, num_mels, num_classes):
        """Build TALNetV3 (no-metadata variant): conv stack + GRU/transformer head.

        Args:
            args: mapping of hyperparameters; installed wholesale onto `self`
                (must provide n_conv_layers, n_pool_layers, embedding_size,
                kernel_size, batch_norm, transfo_head, dropout_transfo,
                pooling, ...).
            num_mels: number of input mel frequency bins.
            num_classes: number of output classes.

        FIX: the original used Python-2 style true division (`/`) so
        `pool_interval`, `n_freq_bins` and the per-layer `n_output` channel
        counts all became floats — the v1 ConvBlock received float channel
        counts (the v2 branch worked around it with int() casts) and the
        pooling test compared against a float modulus. Floor division (`//`)
        restores the intended integer arithmetic; the leading assert
        guarantees `pool_interval` is exact.
        """
        super(TALNetV3NoMeta, self).__init__()
        self.__dict__.update(args)  # Install all args into self
        assert self.n_conv_layers % self.n_pool_layers == 0
        self.input_n_freq_bins = n_freq_bins = num_mels
        self.output_size = num_classes
        self.n_head = self.transfo_head
        self.d_k = self.d_v = 128

        # ---- Conv stack (v1 and v2 variants built in parallel) ----
        self.conv = []
        self.conv_v2 = []
        # Integer layer count between pooling layers (exact per the assert).
        pool_interval = self.n_conv_layers // self.n_pool_layers
        n_input = 1
        for i in range(self.n_conv_layers):
            if (i + 1) % pool_interval == 0:  # this layer has pooling
                n_freq_bins //= 2  # frequency axis halved by pooling
                n_output = self.embedding_size // n_freq_bins
                # Pool both axes early, only frequency afterwards.
                pool_stride = (2, 2) if i < pool_interval * 2 else (1, 2)
            else:
                n_output = self.embedding_size * 2 // n_freq_bins
                pool_stride = None
            layer = ConvBlock(n_input, n_output, self.kernel_size, batch_norm=self.batch_norm, pool_stride=pool_stride,)
            self.conv.append(layer)
            # Lists are plain Python lists; __setattr__ registers each layer
            # as a submodule so parameters are tracked.
            self.__setattr__("conv" + str(i + 1), layer)
            layer_v2 = ConvBlockTALNet2(
                int(n_input),
                int(n_output),
                (int(self.kernel_size), int(self.kernel_size)),
                norm="GN",
                pool_stride=pool_stride,
                pool_strat="max",
                activation="mish",
            )
            self.conv_v2.append(layer_v2)

            self.__setattr__("conv_v2" + str(i + 1), layer_v2)
            n_input = n_output

        # ---- Spec augmenter (freq mask width scales with mel count) ----
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64, time_stripes_num=2, freq_drop_width=num_mels//8, freq_stripes_num=2,
        )
        self.bn0 = nn.BatchNorm2d(num_mels)

        # ---- Temporal modelling (Transformer head + bidirectional GRU) ----
        # Hidden size is embedding_size/2 per direction -> embedding_size total.
        self.gru = nn.GRU(
            int(self.embedding_size), int(self.embedding_size) // 2, 1, batch_first=True, bidirectional=True,
        )
        self.multihead_v2 = MultiHead(self.n_head, self.embedding_size, self.d_k, self.d_v, self.dropout_transfo)

        # ---- Output heads ----
        # self.att_block = AttBlock(n_in=(self.embedding_size * 2 + self.meta_emb * self.num_meta), n_out=self.output_size, activation='sigmoid')
        self.fc_prob = nn.Linear(self.embedding_size * 2, self.output_size)
        if self.pooling == "att":
            self.fc_att = nn.Linear(self.embedding_size * 2, self.output_size,)

        # ---- Initialization: orthogonal GRU weights, zero biases,
        # Xavier-uniform linear heads ----
        nn.init.orthogonal_(self.gru.weight_ih_l0)
        nn.init.constant_(self.gru.bias_ih_l0, 0)
        nn.init.orthogonal_(self.gru.weight_hh_l0)
        nn.init.constant_(self.gru.bias_hh_l0, 0)
        nn.init.orthogonal_(self.gru.weight_ih_l0_reverse)
        nn.init.constant_(self.gru.bias_ih_l0_reverse, 0)
        nn.init.orthogonal_(self.gru.weight_hh_l0_reverse)
        nn.init.constant_(self.gru.bias_hh_l0_reverse, 0)
        nn.init.xavier_uniform_(self.fc_prob.weight)
        nn.init.constant_(self.fc_prob.bias, 0)
        if self.pooling == "att":
            nn.init.xavier_uniform_(self.fc_att.weight)
            nn.init.constant_(self.fc_att.bias, 0)
        if self.pooling == "auto":
            self.autopool = AutoPool(self.output_size)
# Example #3
    def __init__(
        self,
        encoder,
        in_features,
        num_classes,
        n_fft,
        hop_length,
        sample_rate,
        n_mels,
        fmin,
        fmax,
        dropout_rate=0.5,
        freeze_spectrogram_parameters=True,
        freeze_logmel_parameters=True,
        use_spec_augmentation=True,
        time_drop_width=64,
        time_stripes_num=2,
        freq_drop_width=8,
        freq_stripes_num=2,
        spec_augmentation_method=None,
        apply_mixup=False,
        apply_spec_shuffle=False,
        spec_shuffle_prob=0,
        use_gru_layer=False,
        apply_tta=False,
        use_loudness=False,
        use_spectral_centroid=False,
        apply_delta_spectrum=False,
        apply_time_freq_encoding=False,
        min_db=120,
        apply_pcen=False,
        freeze_pcen_parameters=False,
        use_multisample_dropout=False,
        multisample_dropout=0.5,
        num_multisample_dropout=5,
        pooling_kernel_size=3,
        **params,
    ):
        """Build the spectrogram front-end + backbone encoder classifier.

        Pipeline: STFT spectrogram, linear mel filterbank, amplitude->dB,
        optional SpecAugment, optional auxiliary features (loudness,
        spectral centroid, PCEN), then a user-supplied `encoder` backbone.

        FIX: the PCEN branch previously passed
        `trainable=~freeze_pcen_parameters` — bitwise NOT on a bool yields
        -2 or -1, both truthy, so PCEN parameters were ALWAYS trainable
        regardless of the flag. Replaced with logical `not`.

        NOTE(review): `in_features`, `apply_delta_spectrum`/-`time_freq_encoding`
        flags are stored or ignored here and presumably consumed in forward();
        `**params` absorbs extra config keys.
        """
        super().__init__()

        # Flags/sizes consulted later (presumably in forward()).
        self.n_mels = n_mels
        self.dropout_rate = dropout_rate
        self.apply_mixup = apply_mixup
        self.apply_spec_shuffle = apply_spec_shuffle
        self.spec_shuffle_prob = spec_shuffle_prob
        self.use_gru_layer = use_gru_layer
        self.apply_tta = apply_tta
        self.use_loudness = use_loudness
        self.use_spectral_centroid = use_spectral_centroid
        self.apply_delta_spectrum = apply_delta_spectrum
        self.apply_time_freq_encoding = apply_time_freq_encoding
        self.apply_pcen = apply_pcen
        self.use_multisample_dropout = use_multisample_dropout
        self.num_multisample_dropout = num_multisample_dropout
        self.pooling_kernel_size = pooling_kernel_size

        # Waveform -> power spectrogram (window length tied to n_fft).
        self.spectrogram_extractor = Spectrogram(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=n_fft,
            window="hann",
            center=True,
            pad_mode="reflect",
            freeze_parameters=freeze_spectrogram_parameters,
        )

        # Spectrogram -> mel features (is_log=False: dB scaling done below).
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=n_fft,
            n_mels=n_mels,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=freeze_logmel_parameters,
            is_log=False,
        )

        # Amplitude/power -> decibel conversion applied to the linear mels.
        self.power_to_db = torchaudio.transforms.AmplitudeToDB()

        # Optional SpecAugment; the "PlusPlus" variant only when a method
        # name is explicitly provided.
        self.spec_augmenter = None
        if use_spec_augmentation and (spec_augmentation_method is None):
            self.spec_augmenter = SpecAugmentation(
                time_drop_width=time_drop_width,
                time_stripes_num=time_stripes_num,
                freq_drop_width=freq_drop_width,
                freq_stripes_num=freq_stripes_num,
            )
        elif use_spec_augmentation and (spec_augmentation_method is not None):
            self.spec_augmenter = SpecAugmentationPlusPlus(
                time_drop_width=time_drop_width,
                time_stripes_num=time_stripes_num,
                freq_drop_width=freq_drop_width,
                freq_stripes_num=freq_stripes_num,
                method=spec_augmentation_method,
            )

        # Optional per-clip loudness feature with its own batch norm.
        if self.use_loudness:
            self.loudness_bn = nn.BatchNorm1d(1)
            self.loudness_extractor = Loudness(
                sr=sample_rate,
                n_fft=n_fft,
                min_db=min_db,
            )

        if self.use_spectral_centroid:
            self.spectral_centroid_bn = nn.BatchNorm1d(1)

        # Optional PCEN; trainable exactly when parameters are NOT frozen.
        # (Was `~freeze_pcen_parameters`: bitwise NOT, always truthy.)
        if self.apply_pcen:
            self.pcen_transform = PCENTransform(
                trainable=not freeze_pcen_parameters, )

        # Backbone kept whole (head stripping left to callers).
        # layers = list(encoder.children())[:-2]
        # self.encoder = nn.Sequential(*layers)
        self.encoder = encoder

        # Multi-sample dropout: one shared dropout module, applied
        # `num_multisample_dropout` times in forward() presumably.
        if self.use_multisample_dropout:
            self.big_dropout = nn.Dropout(p=multisample_dropout)