Пример #1
0
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
                 fmax, classes_num):
        """Build the encoder from a log-mel front end and frozen checkpoints.

        Args:
            sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
            window_size: Used as ``n_fft`` and ``win_length``.
            hop_size: Forwarded to ``Spectrogram`` as ``hop_length``.
            mel_bins: Forwarded as ``n_mels``; also sizes ``bn0``.
            fmin: Forwarded to ``LogmelFilterBank``.
            fmax: Forwarded to ``LogmelFilterBank``.
            classes_num: Output size handed to ``AttBlock``.

        Note:
            ``bn0``, ``fc1`` and ``fe`` are restored from the hard-coded
            relative paths ``pretrained_bn0_b0``, ``pretrained_fc1_b0_fold_0``
            and ``pretrained_fe_b0_fold_0`` and then frozen.
        """
        super(Encoder_B0_Pretrained, self).__init__()

        def _load_frozen(module, checkpoint_path):
            """Load ``checkpoint_path`` into ``module`` and freeze it."""
            # map_location='cpu' lets a checkpoint that was saved from a GPU
            # be read on a CPU-only host; load_state_dict then copies the
            # values into the module's own parameters.
            # NOTE(review): torch.load unpickles the file -- only use trusted
            # checkpoints here.
            state = torch.load(checkpoint_path, map_location='cpu')
            module.load_state_dict(state)
            for param in module.parameters():
                param.requires_grad = False
            return module

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size,
            hop_length=hop_size,
            win_length=window_size,
            window='hann',
            center=True,
            pad_mode='reflect',
            freeze_parameters=True,
        )

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True,
        )

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=4,
            freq_stripes_num=2,
        )

        self.bn0 = _load_frozen(nn.BatchNorm2d(mel_bins), 'pretrained_bn0_b0')

        backbone_dim = 1280
        hidden_dim = 2048

        self.fc1 = _load_frozen(
            nn.Linear(backbone_dim, hidden_dim, bias=True),
            'pretrained_fc1_b0_fold_0')

        self.att_block = AttBlock(hidden_dim, classes_num)

        # Backbone: tf_efficientnet_b0_ns without its last two children.
        # (A resnest50d_4s2x40d backbone was previously tried here.)
        backbone = timm.models.tf_efficientnet_b0_ns(pretrained=False)
        self.fe = _load_frozen(
            nn.Sequential(*list(backbone.children())[:-2]),
            'pretrained_fe_b0_fold_0')
Пример #2
0
    def __init__(self, model_name, n_out):
        """Wrap a pretrained single-channel timm model with a new linear head.

        Args:
            model_name: Name passed to ``timm.create_model``.
            n_out: Output size of ``net_classifier``.

        Raises:
            ValueError: If the created model exposes neither a ``classifier``
                nor an ``fc`` attribute to read ``in_features`` from.
        """
        super(AudioClassifier, self).__init__()

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=80,
                                               time_stripes_num=2,
                                               freq_drop_width=16,
                                               freq_stripes_num=2)
        self.net = timm.create_model(model_name, pretrained=True, in_chans=1)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout1 = nn.Dropout(0.3)
        self.dropout2 = nn.Dropout(0.3)

        # The head attribute name differs between timm architectures.
        if hasattr(self.net, "classifier"):
            n_features = self.net.classifier.in_features
        elif hasattr(self.net, "fc"):
            n_features = self.net.fc.in_features
        else:
            # Fail with a clear message instead of an unbound-local error on
            # the next line.
            raise ValueError(
                "Unsupported model {!r}: expected a 'classifier' or 'fc' "
                "head to infer the feature size".format(model_name))
        self.net_classifier = nn.Linear(n_features, n_out)
        self.init_weight()

        # Note: kornia's random crop expects sizes given as (h, w).
        self.transform = nn.Sequential(
            K.RandomHorizontalFlip(p=0.1),
            # K.GaussianBlur(7, p=0.5),
            # K.RandomCrop((round(IMAGE_HEIGHT*0.7), round(IMAGE_WIDTH*0.7)),p=0.3)
        )
Пример #3
0
    def __init__(self, encoder, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
        """Assemble the log-mel front end, the chosen encoder and an attention head.

        Args:
            encoder: Key into ``encoder_params``; its ``"init_op"`` entry
                builds the backbone and ``"features"`` sizes ``fc1``'s input.
            sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
            window_size: Used as ``n_fft`` and ``win_length``.
            hop_size: Forwarded to ``Spectrogram`` as ``hop_length``.
            mel_bins: Forwarded as ``n_mels``; also sizes ``bn0``.
            fmin: Forwarded to ``LogmelFilterBank``.
            fmax: Forwarded to ``LogmelFilterBank``.
            classes_num: Output size handed to ``AttBlock``.
        """
        super().__init__()

        self.interpolate_ratio = 30  # Downsampled ratio

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size,
            hop_length=hop_size,
            win_length=window_size,
            window='hann',
            center=True,
            pad_mode='reflect',
            freeze_parameters=True,
        )

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True,
        )

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2,
        )

        # Model encoder, followed by the projection and attention head
        encoder_cfg = encoder_params[encoder]
        self.encoder = encoder_cfg["init_op"]()
        self.fc1 = nn.Linear(encoder_cfg["features"], 1024, bias=True)
        self.att_block = AttBlock(1024, classes_num, activation="sigmoid")
        self.bn0 = nn.BatchNorm2d(mel_bins)
        self.init_weight()
Пример #4
0
    def __init__(self,
                 base_model_name: str,
                 pretrained=False,
                 num_classes=24,
                 in_channels=1):
        """Build a timm-backed encoder with a linear layer and attention head.

        Args:
            base_model_name: Name passed to ``timm.create_model``.
            pretrained: Forwarded to ``timm.create_model``.
            num_classes: Output size handed to ``AttBlockV2``.
            in_channels: Forwarded to ``timm.create_model`` as ``in_chans``.
        """
        super().__init__()

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64 // 2,
            time_stripes_num=2,
            freq_drop_width=8 // 2,
            freq_stripes_num=2,
        )

        self.bn0 = nn.BatchNorm2d(CFG.n_mels)

        base_model = timm.create_model(
            base_model_name, pretrained=pretrained, in_chans=in_channels)
        # Keep every child of the base model except the last two.
        self.encoder = nn.Sequential(*list(base_model.children())[:-2])

        # The feature width is read from whichever head the model exposes.
        head = base_model.fc if hasattr(base_model, "fc") else base_model.classifier
        in_features = head.in_features

        self.fc1 = nn.Linear(in_features, in_features, bias=True)
        self.att_block = AttBlockV2(
            in_features, num_classes, activation="sigmoid")

        self.init_weight()
Пример #5
0
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
                 fmax, classes_num):
        """Create the log-mel front end, six conv blocks and two linear layers.

        Args:
            sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
            window_size: Used as ``n_fft`` and ``win_length``.
            hop_size: Forwarded to ``Spectrogram`` as ``hop_length``.
            mel_bins: Forwarded to ``LogmelFilterBank`` as ``n_mels``.
            fmin: Forwarded to ``LogmelFilterBank``.
            fmax: Forwarded to ``LogmelFilterBank``.
            classes_num: Output size of ``fc_audioset``.
        """
        super(Cnn14, self).__init__()

        self.dataset_mean = 0.0
        self.dataset_std = 1.0

        # Spectrogram extractor
        stft_kwargs = dict(n_fft=window_size, hop_length=hop_size,
                           win_length=window_size, window="hann",
                           center=True, pad_mode="reflect",
                           freeze_parameters=True)
        self.spectrogram_extractor = Spectrogram(**stft_kwargs)

        # Logmel feature extractor
        logmel_kwargs = dict(sr=sample_rate, n_fft=window_size,
                             n_mels=mel_bins, fmin=fmin, fmax=fmax,
                             ref=1.0, amin=1e-10, top_db=None,
                             freeze_parameters=True)
        self.logmel_extractor = LogmelFilterBank(**logmel_kwargs)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=32,
                                               time_stripes_num=2,
                                               freq_drop_width=8,
                                               freq_stripes_num=2)

        # NOTE(review): 64 is hard-coded rather than taken from mel_bins --
        # confirm callers always pass mel_bins == 64.
        self.bn0 = nn.BatchNorm2d(64)

        # conv_block1 .. conv_block6, each taking the previous block's width.
        widths = (1, 64, 128, 256, 512, 1024, 2048)
        for idx, (n_in, n_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, "conv_block{}".format(idx),
                    ConvBlock(in_channels=n_in, out_channels=n_out))

        self.fc1 = nn.Linear(2048, 2048, bias=True)
        self.fc_audioset = nn.Linear(2048, classes_num, bias=True)

        self.init_weight()
Пример #6
0
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
                 fmax, classes_num):
        """Build the front end, attention layers and EfficientNet-B0 backbone.

        Args:
            sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
            window_size: Used as ``n_fft`` and ``win_length``.
            hop_size: Forwarded to ``Spectrogram`` as ``hop_length``.
            mel_bins: Forwarded as ``n_mels``; also sizes ``bn0``.
            fmin: Forwarded to ``LogmelFilterBank``.
            fmax: Forwarded to ``LogmelFilterBank``.
            classes_num: Output size handed to ``AttBlock``.
        """
        super(Encoder_Transformer, self).__init__()

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size,
            hop_length=hop_size,
            win_length=window_size,
            window='hann',
            center=True,
            pad_mode='reflect',
            freeze_parameters=True,
        )

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True,
        )

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=4,
            freq_stripes_num=2,
        )

        self.bn0 = nn.BatchNorm2d(mel_bins)

        backbone_dim = 1280
        model_dim = 512
        head_count = 4

        self.fc1 = nn.Linear(backbone_dim, model_dim, bias=True)
        self.multihead = ResidualAttentionBlock(model_dim, head_count)
        self.att_block = AttBlock(model_dim, classes_num)

        # Backbone: pretrained tf_efficientnet_b0_ns without its last two
        # children. (A resnest50d_4s2x40d backbone was previously tried here.)
        backbone = timm.models.tf_efficientnet_b0_ns(pretrained=True)
        self.fe = nn.Sequential(*list(backbone.children())[:-2])

        self.pos_emb = nn.Embedding(32, model_dim)
Пример #7
0
 def __init__(self):
     """Create the spec augmenter and four conv blocks (1 -> 64 -> ... -> 512)."""
     super(CNNEncoder2, self).__init__()
     self.spec_augmenter = SpecAugmentation(
         time_drop_width=64,
         time_stripes_num=4,
         freq_drop_width=8,
         freq_stripes_num=2,
     )
     # conv_block1 .. conv_block4, each taking the previous block's width.
     widths = (1, 64, 128, 256, 512)
     for idx, (n_in, n_out) in enumerate(zip(widths, widths[1:]), start=1):
         setattr(self, 'conv_block{}'.format(idx),
                 ConvBlock(in_channels=n_in, out_channels=n_out))
Пример #8
0
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
                 fmax, classes_num):
        """Build the front end, four conv blocks, a bidirectional GRU and an attention head.

        Args:
            sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
            window_size: Used as ``n_fft`` and ``win_length``.
            hop_size: Forwarded to ``Spectrogram`` as ``hop_length``.
            mel_bins: Forwarded to ``LogmelFilterBank`` as ``n_mels``.
            fmin: Forwarded to ``LogmelFilterBank``.
            fmax: Forwarded to ``LogmelFilterBank``.
            classes_num: Accepted but not used in this constructor.
        """
        super(Cnn_9layers_Gru_FrameAtt, self).__init__()

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size,
            hop_length=hop_size,
            win_length=window_size,
            window='hann',
            center=True,
            pad_mode='reflect',
            freeze_parameters=True,
        )

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True,
        )

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2,
        )

        self.bn0 = nn.BatchNorm2d(64)

        # conv_block1 .. conv_block4, each taking the previous block's width.
        widths = (1, 64, 128, 256, 512)
        for idx, (n_in, n_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, 'conv_block{}'.format(idx),
                    ConvBlock(in_channels=n_in, out_channels=n_out))

        # Bidirectional, so the two 256-wide directions give 512 features.
        self.gru = nn.GRU(
            input_size=512,
            hidden_size=256,
            num_layers=1,
            bias=True,
            batch_first=True,
            bidirectional=True,
        )

        # NOTE(review): n_out is fixed at 17 and classes_num is ignored --
        # confirm this is intentional.
        self.att_block = AttBlock(n_in=512, n_out=17, activation='sigmoid')

        self.init_weights()
Пример #9
0
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
                 fmax, classes_num):
        """Build the front end, four conv blocks, a multi-head layer and an attention head.

        Args:
            sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
            window_size: Used as ``n_fft`` and ``win_length``.
            hop_size: Forwarded to ``Spectrogram`` as ``hop_length``.
            mel_bins: Forwarded to ``LogmelFilterBank`` as ``n_mels``.
            fmin: Forwarded to ``LogmelFilterBank``.
            fmax: Forwarded to ``LogmelFilterBank``.
            classes_num: Accepted but not used in this constructor.
        """
        super(Cnn_9layers_Transformer_FrameAtt, self).__init__()

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size,
            hop_length=hop_size,
            win_length=window_size,
            window='hann',
            center=True,
            pad_mode='reflect',
            freeze_parameters=True,
        )

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True,
        )

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2,
        )

        self.bn0 = nn.BatchNorm2d(64)

        # conv_block1 .. conv_block4, each taking the previous block's width.
        widths = (1, 64, 128, 256, 512)
        for idx, (n_in, n_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, 'conv_block{}'.format(idx),
                    ConvBlock(in_channels=n_in, out_channels=n_out))

        # Positional arguments, in order: n_head, n_hid, d_k, d_v, dropout.
        self.multihead = MultiHead(8, 512, 64, 64, 0.2)

        # NOTE(review): n_out is fixed at 17 and classes_num is ignored --
        # confirm this is intentional.
        self.att_block = AttBlock(n_in=512, n_out=17, activation='sigmoid')

        self.init_weights()
Пример #10
0
    def __init__(self, sample_rate: int, window_size: int, hop_size: int,
                 mel_bins: int, fmin: int, fmax: int, classes_num: int):
        """Build the log-mel front end, six conv blocks and an attention head.

        Args:
            sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
            window_size: Used as ``n_fft`` and ``win_length``.
            hop_size: Forwarded to ``Spectrogram`` as ``hop_length``.
            mel_bins: Forwarded as ``n_mels``; also sizes ``bn0``.
            fmin: Forwarded to ``LogmelFilterBank``.
            fmax: Forwarded to ``LogmelFilterBank``.
            classes_num: Output size handed to ``AttBlock``.
        """
        super().__init__()

        self.interpolate_ratio = 32  # Downsampled ratio

        # Spectrogram extractor
        stft_kwargs = dict(n_fft=window_size, hop_length=hop_size,
                           win_length=window_size, window='hann',
                           center=True, pad_mode='reflect',
                           freeze_parameters=True)
        self.spectrogram_extractor = Spectrogram(**stft_kwargs)

        # Logmel feature extractor
        logmel_kwargs = dict(sr=sample_rate, n_fft=window_size,
                             n_mels=mel_bins, fmin=fmin, fmax=fmax,
                             ref=1.0, amin=1e-10, top_db=None,
                             freeze_parameters=True)
        self.logmel_extractor = LogmelFilterBank(**logmel_kwargs)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64,
                                               time_stripes_num=2,
                                               freq_drop_width=8,
                                               freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(mel_bins)

        # conv_block1 .. conv_block6, each taking the previous block's width.
        widths = (1, 64, 128, 256, 512, 1024, 2048)
        for idx, (n_in, n_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, 'conv_block{}'.format(idx),
                    ConvBlock(in_channels=n_in, out_channels=n_out))

        self.fc1 = nn.Linear(2048, 2048, bias=True)
        self.att_block = AttBlock(2048, classes_num, activation='sigmoid')

        self.init_weight()
Пример #11
0
    def __init__(self,
                 sample_rate: int,
                 window_size: int,
                 hop_size: int,
                 mel_bins: int,
                 fmin: int,
                 fmax: int,
                 classes_num: int,
                 apply_aug: bool,
                 top_db=None):
        """Build the front end, attention head and DenseNet-121 feature stack.

        Args:
            sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
            window_size: Used as ``n_fft`` and ``win_length``.
            hop_size: Forwarded to ``Spectrogram`` as ``hop_length``.
            mel_bins: Forwarded as ``n_mels``; also sizes ``bn0``.
            fmin: Forwarded to ``LogmelFilterBank``.
            fmax: Forwarded to ``LogmelFilterBank``.
            classes_num: Output size handed to ``AttBlockV2``.
            apply_aug: Stored on the instance as ``self.apply_aug``.
            top_db: Forwarded to ``LogmelFilterBank``.
        """
        super().__init__()

        self.interpolate_ratio = 32  # Downsampled ratio
        self.apply_aug = apply_aug

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size,
            hop_length=hop_size,
            win_length=window_size,
            window='hann',
            center=True,
            pad_mode='reflect',
            freeze_parameters=True,
        )

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=top_db,
            freeze_parameters=True,
        )

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2,
        )

        self.bn0 = nn.BatchNorm2d(mel_bins)

        feature_dim = 1024
        self.fc1 = nn.Linear(feature_dim, feature_dim, bias=True)
        self.att_block = AttBlockV2(
            feature_dim, classes_num, activation='sigmoid')

        # Only the ``features`` sub-module of the pretrained model is kept.
        self.densenet_features = models.densenet121(pretrained=True).features

        self.init_weight()
Пример #12
0
    def __init__(self,
                 base_model_name: str,
                 pretrained=False,
                 num_classes=24,
                 in_channels=1):
        """Build a CFG-driven log-mel front end on top of a timm encoder.

        Args:
            base_model_name: Name passed to ``timm.create_model``.
            pretrained: Forwarded to ``timm.create_model``.
            num_classes: Output size handed to ``AttBlockV2``.
            in_channels: Forwarded to ``timm.create_model`` as ``in_chans``.
        """
        super().__init__()

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=CFG.n_fft,
            hop_length=CFG.hop_length,
            win_length=CFG.n_fft,
            window="hann",
            center=True,
            pad_mode="reflect",
            freeze_parameters=True,
        )

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=CFG.sample_rate,
            n_fft=CFG.n_fft,
            n_mels=CFG.n_mels,
            fmin=CFG.fmin,
            fmax=CFG.fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True,
        )

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2,
        )

        self.bn0 = nn.BatchNorm2d(CFG.n_mels)

        base_model = timm.create_model(
            base_model_name, pretrained=pretrained, in_chans=in_channels)
        # Keep every child of the base model except the last two.
        self.encoder = nn.Sequential(*list(base_model.children())[:-2])

        # The feature width is read from whichever head the model exposes.
        head = base_model.fc if hasattr(base_model, "fc") else base_model.classifier
        in_features = head.in_features

        self.fc1 = nn.Linear(in_features, in_features, bias=True)
        self.att_block = AttBlockV2(
            in_features, num_classes, activation="sigmoid")

        self.init_weight()
Пример #13
0
    def specAug(self, x):
        '''
        Augment ``x`` with a freshly constructed SpecAugmentation module.

        A new augmenter (time_drop_width=32, time_stripes_num=2,
        freq_drop_width=32, freq_stripes_num=4) is built on every call and
        its output for ``x`` is returned.

        args
        x: input data; the original author documented it as "ndarray or
           torch tensor" and the result as a torch tensor.
           NOTE(review): confirm that SpecAugmentation accepts ndarrays.
        '''
        return SpecAugmentation(
            time_drop_width=32,
            time_stripes_num=2,
            freq_drop_width=32,
            freq_stripes_num=4,
        )(x)
Пример #14
0
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
                 fmax, classes_num):
        """Build the log-mel front end, four conv blocks and a linear classifier.

        Args:
            sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
            window_size: Used as ``n_fft`` and ``win_length``.
            hop_size: Forwarded to ``Spectrogram`` as ``hop_length``.
            mel_bins: Forwarded to ``LogmelFilterBank`` as ``n_mels``.
            fmin: Forwarded to ``LogmelFilterBank``.
            fmax: Forwarded to ``LogmelFilterBank``.
            classes_num: Output size of ``fc``.
        """
        super(Cnn_9layers_FrameAvg, self).__init__()

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size,
            hop_length=hop_size,
            win_length=window_size,
            window='hann',
            center=True,
            pad_mode='reflect',
            freeze_parameters=True,
        )

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True,
        )

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2,
        )

        # NOTE(review): 64 is hard-coded rather than taken from mel_bins --
        # confirm callers always pass mel_bins == 64.
        self.bn0 = nn.BatchNorm2d(64)

        # conv_block1 .. conv_block4, each taking the previous block's width.
        widths = (1, 64, 128, 256, 512)
        for idx, (n_in, n_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, 'conv_block{}'.format(idx),
                    ConvBlock(in_channels=n_in, out_channels=n_out))

        self.fc = nn.Linear(512, classes_num, bias=True)

        self.init_weights()
Пример #15
0
    def __init__(self):
        """Build a fixed-configuration ResNeSt-50 + GRU + attention model.

        All settings are hard-coded: 32000 Hz audio, 128 mel bins between
        500 and 12500, 398 output classes. The backbone is fetched through
        ``torch.hub.load`` with ``pretrained=False``.
        """
        super(Tmodel, self).__init__()

        spec_height = 128
        spec_width = 256
        sample_rate = 32000
        clip_seconds = 5
        # sample rate * duration / (spec width - 1), truncated == 627
        hop_length = int(sample_rate * clip_seconds / (spec_width - 1))
        n_classes = 398
        self.interpolate_ratio = 8

        self.spectrogram_extractor = Spectrogram(
            n_fft=2048, hop_length=hop_length, freeze_parameters=True)

        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_mels=spec_height,
            fmin=500,
            fmax=12500,
            freeze_parameters=True,
        )

        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2,
        )

        self.bn0 = nn.BatchNorm2d(128)

        resnest = torch.hub.load(
            'zhanghang1989/ResNeSt', 'resnest50', pretrained=False)
        # Keep every child of the hub model except the last two.
        self.encoder = nn.Sequential(*list(resnest.children())[:-2])

        # Bidirectional, so the two 1024-wide directions give 2048 features.
        self.gru = nn.GRU(
            input_size=2048,
            hidden_size=1024,
            num_layers=1,
            bias=True,
            batch_first=True,
            bidirectional=True,
        )

        self.att_block = AttBlockV2(2048, n_classes, activation='sigmoid')
        self.init_weights()
Пример #16
0
    def __init__(self, classes_num):
        """Create the spec augmenter, six conv blocks and two linear layers.

        No spectrogram or log-mel extractor is constructed here; the code
        that used to build them was disabled in the original.

        Args:
            classes_num: Output size of ``fc2``.
        """
        super(Pre_Cnn14, self).__init__()

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2,
        )

        self.bn0 = nn.BatchNorm2d(64)

        # conv_block1 .. conv_block6, each taking the previous block's width.
        widths = (1, 64, 128, 256, 512, 1024, 2048)
        for idx, (n_in, n_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, 'conv_block{}'.format(idx),
                    ConvBlock(in_channels=n_in, out_channels=n_out))

        self.fc1 = nn.Linear(2048, 2048, bias=True)
        self.fc2 = nn.Linear(2048, classes_num, bias=True)

        self.init_weight()
Пример #17
0
    def __init__(self, encoder, sample_rate, window_size, hop_size, mel_bins,
                 fmin, fmax, classes_num):
        """Build the log-mel front end, the chosen encoder and a pooled linear head.

        Args:
            encoder: Key into ``encoder_params``; its ``"init_op"`` entry
                builds the backbone and ``'features'`` sizes ``fc``'s input.
            sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
            window_size: Used as ``n_fft`` and ``win_length``.
            hop_size: Forwarded to ``Spectrogram`` as ``hop_length``.
            mel_bins: Forwarded to ``LogmelFilterBank`` as ``n_mels``.
            fmin: Forwarded to ``LogmelFilterBank``.
            fmax: Forwarded to ``LogmelFilterBank``.
            classes_num: Output size of ``fc``.
        """
        super().__init__()

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size,
            hop_length=hop_size,
            win_length=window_size,
            window='hann',
            center=True,
            pad_mode='reflect',
            freeze_parameters=True,
        )

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True,
        )

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2,
        )

        encoder_cfg = encoder_params[encoder]
        self.encoder = encoder_cfg["init_op"]()
        # Average pooling only; a max-pool variant was disabled in the original.
        self.avg_pool = AdaptiveAvgPool2d((1, 1))
        self.dropout = Dropout(0.3)
        self.fc = Linear(encoder_cfg['features'], classes_num)
Пример #18
0
    def __init__(self, classes_num):
        """Create the spec augmenter, four conv blocks and two linear layers.

        Args:
            classes_num: Output size of ``fc_audioset``.
        """
        super(Cnn10, self).__init__()

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=24,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2,
        )

        self.bn0 = nn.BatchNorm2d(64)

        # conv_block1 .. conv_block4, each taking the previous block's width.
        widths = (1, 64, 128, 256, 512)
        for idx, (n_in, n_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, 'conv_block{}'.format(idx),
                    ConvBlock(in_channels=n_in, out_channels=n_out))

        self.fc1 = nn.Linear(512, 512, bias=True)
        self.fc_audioset = nn.Linear(512, classes_num, bias=True)

        self.init_weight()
Пример #19
0
    def __init__(self, encoder, sample_rate, window_size, hop_size, 
                 mel_bins, fmin, fmax, classes_num):
        """Build the log-mel front end, the chosen encoder and an attention head.

        Args:
            encoder: Key into ``encoder_params``; its ``"init_op"`` entry
                builds the backbone.
            sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
            window_size: Used as ``n_fft`` and ``win_length``.
            hop_size: Forwarded to ``Spectrogram`` as ``hop_length``.
            mel_bins: Forwarded as ``n_mels``; also sizes ``batch_norm``.
            fmin: Forwarded to ``LogmelFilterBank``.
            fmax: Forwarded to ``LogmelFilterBank``.
            classes_num: Output size handed to ``AttentionHead``.
        """
        super().__init__()

        self.interpolate_ratio = 29  # Downsampled ratio

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size,
            hop_length=hop_size,
            win_length=window_size,
            window='hann',
            center=True,
            pad_mode='reflect',
            freeze_parameters=True,
        )

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True,
        )

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2,
        )

        self.batch_norm = nn.BatchNorm2d(mel_bins)
        self.encoder = encoder_params[encoder]["init_op"]()

        # The original carried several disabled experiments that replaced the
        # encoder's final layer or added an extra linear layer and average
        # pool; none of them are active.
        self.dropout = Dropout(0.3)
        # NOTE(review): the head input width is fixed at 1000 -- presumably
        # the encoder's output size; confirm against encoder_params.
        self.att_head = AttentionHead(1000, classes_num, activation='sigmoid')
        self.init_weight()
Пример #20
0
    def __init__(self, input_size, classes_num=10, activation="softmax"):
        """Create the TFNet layers.

        Args:
            input_size: Forwarded unchanged to every ``TFBlock``.
            classes_num: Output width of the final linear layer.
            activation: Stored on the instance as ``self.activation``.
        """
        super(TFNet, self).__init__()

        self.activation = activation

        # tfblock1..tfblock4 with channel widths 1 -> 64 -> 128 -> 256 -> 512,
        # registered in ascending order.
        widths = (1, 64, 128, 256, 512)
        for position in range(1, len(widths)):
            stage = TFBlock(in_channels=widths[position - 1],
                            out_channels=widths[position],
                            input_size=input_size)
            setattr(self, "tfblock" + str(position), stage)

        self.fc = nn.Linear(512, classes_num, bias=True)

        # Spectrogram augmentation module.
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=24,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2,
        )
Пример #21
0
    def __init__(
        self,
        encoder,
        in_features,
        num_classes,
        n_fft,
        hop_length,
        sample_rate,
        n_mels,
        fmin,
        fmax,
        dropout_rate=0.5,
        freeze_spectrogram_parameters=True,
        freeze_logmel_parameters=True,
        use_spec_augmentation=True,
        time_drop_width=64,
        time_stripes_num=2,
        freq_drop_width=8,
        freq_stripes_num=2,
        spec_augmentation_method=None,
        apply_mixup=False,
        apply_spec_shuffle=False,
        spec_shuffle_prob=0,
        use_gru_layer=False,
        apply_tta=False,
        use_loudness=False,
        use_spectral_centroid=False,
        apply_delta_spectrum=False,
        apply_time_freq_encoding=False,
        min_db=120,
        apply_pcen=False,
        freeze_pcen_parameters=False,
        use_multisample_dropout=False,
        multisample_dropout=0.5,
        num_multisample_dropout=5,
        pooling_kernel_size=3,
        **params,
    ):
        """Set up the feature extractors, optional extras and the encoder.

        Args:
            encoder: Module stored as-is on ``self.encoder``.
            in_features: Not used in the visible part of this constructor.
            num_classes: Not used in the visible part of this constructor.
            n_fft: FFT size; also used as the spectrogram ``win_length``.
            hop_length: Hop length of the spectrogram extractor.
            sample_rate: Sample rate given to the mel filter bank and to
                ``Loudness``.
            n_mels: Number of mel bins.
            fmin: Lower frequency bound of the mel filter bank.
            fmax: Upper frequency bound of the mel filter bank.
            dropout_rate: Stored on the instance.
            freeze_spectrogram_parameters: ``freeze_parameters`` flag of the
                spectrogram extractor.
            freeze_logmel_parameters: ``freeze_parameters`` flag of the mel
                filter bank.
            use_spec_augmentation: When falsy, ``self.spec_augmenter`` stays
                ``None``.
            time_drop_width, time_stripes_num, freq_drop_width,
            freq_stripes_num: Forwarded to the chosen augmenter.
            spec_augmentation_method: ``None`` selects ``SpecAugmentation``;
                any other value selects ``SpecAugmentationPlusPlus`` and is
                passed to it as ``method``.
            apply_mixup, apply_spec_shuffle, spec_shuffle_prob, use_gru_layer,
            apply_tta, apply_delta_spectrum, apply_time_freq_encoding,
            num_multisample_dropout, pooling_kernel_size: Stored on the
                instance under the same names.
            use_loudness: When truthy, creates ``loudness_bn`` and
                ``loudness_extractor``.
            use_spectral_centroid: When truthy, creates
                ``spectral_centroid_bn``.
            min_db: Forwarded to ``Loudness``.
            apply_pcen: When truthy, creates ``pcen_transform``.
            freeze_pcen_parameters: The PCEN transform is created with
                ``trainable=not freeze_pcen_parameters``.
            use_multisample_dropout: When truthy, creates ``big_dropout``.
            multisample_dropout: Dropout probability of ``big_dropout``.
            **params: Extra keyword arguments are accepted and ignored here.
        """
        super().__init__()
        self.n_mels = n_mels
        self.dropout_rate = dropout_rate
        self.apply_mixup = apply_mixup
        self.apply_spec_shuffle = apply_spec_shuffle
        self.spec_shuffle_prob = spec_shuffle_prob
        self.use_gru_layer = use_gru_layer
        self.apply_tta = apply_tta
        self.use_loudness = use_loudness
        self.use_spectral_centroid = use_spectral_centroid
        self.apply_delta_spectrum = apply_delta_spectrum
        self.apply_time_freq_encoding = apply_time_freq_encoding
        self.apply_pcen = apply_pcen
        self.use_multisample_dropout = use_multisample_dropout
        self.num_multisample_dropout = num_multisample_dropout
        self.pooling_kernel_size = pooling_kernel_size

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=n_fft,
            window="hann",
            center=True,
            pad_mode="reflect",
            freeze_parameters=freeze_spectrogram_parameters,
        )

        # Mel filter bank. is_log=False is passed here and a separate
        # AmplitudeToDB transform is kept in self.power_to_db.
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=n_fft,
            n_mels=n_mels,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=freeze_logmel_parameters,
            is_log=False,
        )

        self.power_to_db = torchaudio.transforms.AmplitudeToDB()

        # Spec augmenter: None when disabled, otherwise the plain or the
        # "PlusPlus" variant depending on spec_augmentation_method.
        self.spec_augmenter = None
        if use_spec_augmentation and (spec_augmentation_method is None):
            self.spec_augmenter = SpecAugmentation(
                time_drop_width=time_drop_width,
                time_stripes_num=time_stripes_num,
                freq_drop_width=freq_drop_width,
                freq_stripes_num=freq_stripes_num,
            )
        elif use_spec_augmentation and (spec_augmentation_method is not None):
            self.spec_augmenter = SpecAugmentationPlusPlus(
                time_drop_width=time_drop_width,
                time_stripes_num=time_stripes_num,
                freq_drop_width=freq_drop_width,
                freq_stripes_num=freq_stripes_num,
                method=spec_augmentation_method,
            )

        if self.use_loudness:
            self.loudness_bn = nn.BatchNorm1d(1)
            self.loudness_extractor = Loudness(
                sr=sample_rate,
                n_fft=n_fft,
                min_db=min_db,
            )

        if self.use_spectral_centroid:
            self.spectral_centroid_bn = nn.BatchNorm1d(1)

        if self.apply_pcen:
            # Logical negation, not bitwise: ``~False == -1`` and
            # ``~True == -2`` are both truthy, so ``~flag`` could never
            # disable training.
            self.pcen_transform = PCENTransform(
                trainable=not freeze_pcen_parameters, )

        # The encoder is used whole; no layers are stripped from it here.
        self.encoder = encoder

        if self.use_multisample_dropout:
            self.big_dropout = nn.Dropout(p=multisample_dropout)
Пример #22
0
    def __init__(self, args, num_mels, num_meta, num_classes):
        """Build the TALNetV3 layers from a hyper-parameter namespace.

        Args:
            args: Object whose ``__dict__`` is copied wholesale onto the
                instance. The code below reads ``n_conv_layers``,
                ``n_pool_layers``, ``transfo_head``, ``nb_meta_emb``,
                ``embedding_size``, ``kernel_size``, ``batch_norm``,
                ``dropout_transfo`` and ``pooling`` from ``self`` without
                setting them, so they are presumably supplied by ``args``
                (verify against the caller).
            num_mels: Initial number of frequency bins.
            num_meta: Stored as ``self.num_meta``; sizes the metadata branch.
            num_classes: Stored as ``self.output_size``.
        """
        super(TALNetV3, self).__init__()
        # Install all args into self. This writes straight into __dict__, so
        # any same-named instance attribute is silently overwritten.
        self.__dict__.update(args.__dict__)
        # The conv layers must split evenly into pooling groups.
        assert self.n_conv_layers % self.n_pool_layers == 0
        self.input_n_freq_bins = n_freq_bins = num_mels
        self.output_size = num_classes
        self.num_meta = num_meta
        self.n_head = self.transfo_head
        self.d_k = self.d_v = 128
        self.meta_emb = self.nb_meta_emb
        # Conv stacks. Two parallel lists are built: `conv` (ConvBlock) and
        # `conv_v2` (ConvBlockTALNet). They are plain Python lists; each layer
        # is additionally set as an attribute inside the loop.
        self.conv = []
        self.conv_v2 = []
        # NOTE(review): `/` is true division on Python 3, so pool_interval,
        # n_freq_bins and n_output become floats from here on.
        pool_interval = self.n_conv_layers / self.n_pool_layers
        n_input = 1
        for i in range(self.n_conv_layers):
            if (i + 1) % pool_interval == 0:  # this layer has pooling
                n_freq_bins /= 2
                n_output = self.embedding_size / n_freq_bins
                # The first two pooled layers use stride (2, 2); later ones
                # use (1, 2).
                pool_stride = (2, 2) if i < pool_interval * 2 else (1, 2)
            else:
                n_output = self.embedding_size * 2 / n_freq_bins
                pool_stride = None
            # NOTE(review): n_output is passed here without int(), unlike the
            # ConvBlockTALNet call below. Confirm ConvBlock accepts a float
            # channel count.
            layer = ConvBlock(n_input,
                              n_output,
                              self.kernel_size,
                              batch_norm=self.batch_norm,
                              pool_stride=pool_stride)
            self.conv.append(layer)
            # Attribute names are 'conv1', 'conv2', ...
            self.__setattr__('conv' + str(i + 1), layer)
            layer_v2 = ConvBlockTALNet(
                int(n_input),
                int(n_output), (int(self.kernel_size), int(self.kernel_size)),
                norm='GN',
                pool_stride=pool_stride,
                pool_strat='max',
                activation='mish')
            self.conv_v2.append(layer_v2)

            # Attribute names are 'conv_v21', 'conv_v22', ... (no separator
            # between the 'conv_v2' prefix and the index).
            self.__setattr__('conv_v2' + str(i + 1), layer_v2)
            n_input = n_output
        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64,
                                               time_stripes_num=2,
                                               freq_drop_width=8,
                                               freq_stripes_num=2)
        # The feature count is hard-coded to 64 rather than taken from num_mels.
        self.bn0 = nn.BatchNorm2d(64)
        # Metadata embedding
        self.t2v = Time2Vec(self.num_meta, self.meta_emb)
        # Temporal modelling: multi-head blocks and a bidirectional GRU
        self.multihead_meta = MultiHead(self.n_head, self.num_meta, self.d_k,
                                        self.d_v, self.dropout_transfo)
        # Hidden size is embedding_size / 2 per direction, bidirectional.
        self.gru = nn.GRU(int(self.embedding_size),
                          int(self.embedding_size / 2),
                          1,
                          batch_first=True,
                          bidirectional=True)
        self.multihead_v2 = MultiHead(self.n_head, self.embedding_size,
                                      self.d_k, self.d_v, self.dropout_transfo)
        # Fully connected output layers. Input width is
        # embedding_size * 2 + meta_emb * num_meta.
        # self.att_block = AttBlock(n_in=(self.embedding_size * 2 + self.meta_emb * self.num_meta), n_out=self.output_size, activation='sigmoid')
        self.fc_prob = nn.Linear(
            self.embedding_size * 2 + self.meta_emb * self.num_meta,
            self.output_size)
        # fc_att only exists when attention pooling is selected.
        if self.pooling == 'att':
            self.fc_att = nn.Linear(
                self.embedding_size * 2 + self.meta_emb * self.num_meta,
                self.output_size)

        # Better initialization: orthogonal GRU weights with zero biases,
        # Xavier-uniform linear weights with zero biases.
        nn.init.orthogonal_(self.gru.weight_ih_l0)
        nn.init.constant_(self.gru.bias_ih_l0, 0)
        nn.init.orthogonal_(self.gru.weight_hh_l0)
        nn.init.constant_(self.gru.bias_hh_l0, 0)
        nn.init.orthogonal_(self.gru.weight_ih_l0_reverse)
        nn.init.constant_(self.gru.bias_ih_l0_reverse, 0)
        nn.init.orthogonal_(self.gru.weight_hh_l0_reverse)
        nn.init.constant_(self.gru.bias_hh_l0_reverse, 0)
        nn.init.xavier_uniform_(self.fc_prob.weight)
        nn.init.constant_(self.fc_prob.bias, 0)
        if self.pooling == 'att':
            nn.init.xavier_uniform_(self.fc_att.weight)
            nn.init.constant_(self.fc_att.bias, 0)
        # autopool only exists when 'auto' pooling is selected.
        if self.pooling == 'auto':
            self.autopool = AutoPool(self.output_size)
Пример #23
0
    def __init__(
        self,
        encoder,
        in_features,
        num_classes,
        n_fft,
        hop_length,
        sample_rate,
        n_mels,
        fmin,
        fmax,
        dropout_rate=0.1,
        freeze_spectrogram_parameters=True,
        freeze_logmel_parameters=True,
        use_spec_augmentation=True,
        time_drop_width=64,
        time_stripes_num=2,
        freq_drop_width=8,
        freq_stripes_num=2,
        spec_augmentation_method=None,
        apply_mixup=False,
        apply_spec_shuffle=False,
        spec_shuffle_prob=0,
        use_gru_layer=False,
        apply_tta=False,
        apply_encoder=False,
        **params,
    ):
        """Set up the feature extractors, a 3-block conformer and the head.

        Args:
            encoder: Accepted but not used by this constructor.
            in_features: Accepted but not used by this constructor.
            num_classes: Output width of the final linear layer.
            n_fft: FFT size; also used as the spectrogram ``win_length``.
            hop_length: Hop length of the spectrogram extractor.
            sample_rate: Sample rate given to the mel filter bank.
            n_mels: Number of mel bins; also the conformer ``dim`` and the
                input width of the final linear layer.
            fmin: Lower frequency bound of the mel filter bank.
            fmax: Upper frequency bound of the mel filter bank.
            dropout_rate: Used for every conformer dropout and for the head.
            freeze_spectrogram_parameters: ``freeze_parameters`` flag of the
                spectrogram extractor.
            freeze_logmel_parameters: ``freeze_parameters`` flag of the mel
                filter bank.
            use_spec_augmentation: When falsy, ``self.spec_augmenter`` stays
                ``None``.
            time_drop_width, time_stripes_num, freq_drop_width,
            freq_stripes_num: Forwarded to the chosen augmenter.
            spec_augmentation_method: ``None`` selects ``SpecAugmentation``;
                any other value selects ``SpecAugmentationPlusPlus``.
            apply_mixup, apply_spec_shuffle, spec_shuffle_prob, use_gru_layer,
            apply_tta: Stored on the instance under the same names.
            apply_encoder: Accepted but not used by this constructor.
            **params: Extra keyword arguments are accepted and ignored.
        """
        super().__init__()

        # Plain configuration flags kept for later use.
        self.n_mels = n_mels
        self.dropout_rate = dropout_rate
        self.apply_mixup = apply_mixup
        self.apply_spec_shuffle = apply_spec_shuffle
        self.spec_shuffle_prob = spec_shuffle_prob
        self.use_gru_layer = use_gru_layer
        self.apply_tta = apply_tta

        # Waveform -> spectrogram.
        self.spectrogram_extractor = Spectrogram(
            n_fft=n_fft, hop_length=hop_length, win_length=n_fft,
            window="hann", center=True, pad_mode="reflect",
            freeze_parameters=freeze_spectrogram_parameters,
        )

        # Spectrogram -> mel filter bank (is_log=False is passed).
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate, n_fft=n_fft, n_mels=n_mels,
            fmin=fmin, fmax=fmax,
            ref=1.0, amin=1e-10, top_db=None,
            freeze_parameters=freeze_logmel_parameters,
            is_log=False,
        )

        # Augmenter: None when disabled, otherwise chosen by method.
        self.spec_augmenter = None
        if use_spec_augmentation:
            stripe_kwargs = dict(
                time_drop_width=time_drop_width,
                time_stripes_num=time_stripes_num,
                freq_drop_width=freq_drop_width,
                freq_stripes_num=freq_stripes_num,
            )
            if spec_augmentation_method is None:
                self.spec_augmenter = SpecAugmentation(**stripe_kwargs)
            else:
                self.spec_augmenter = SpecAugmentationPlusPlus(
                    method=spec_augmentation_method, **stripe_kwargs)

        # Three identically configured conformer blocks applied in sequence.
        block_kwargs = dict(
            dim=n_mels,
            dim_head=64,
            heads=8,
            ff_mult=4,
            conv_expansion_factor=2,
            conv_kernel_size=31,
            attn_dropout=dropout_rate,
            ff_dropout=dropout_rate,
            conv_dropout=dropout_rate,
        )
        conformer_blocks = []
        for _ in range(3):
            conformer_blocks.append(ConformerBlock(**block_kwargs))
        self.conformer = nn.Sequential(*conformer_blocks)

        # Classification head: dropout followed by a linear projection.
        head_dropout = nn.Dropout(dropout_rate)
        head_linear = nn.Linear(n_mels, num_classes)
        self.fc = nn.Sequential(head_dropout, head_linear)