示例#1
0
def my_cnn14(n_fft, n_mels, n_classes=100, hop_size=320, fmin=160, fmax=10300):
    """Build a Cnn14 and swap in a task-specific front end and output layer.

    A Cnn14 is created with a fixed configuration (32 kHz, 1024-sample
    window, hop 320, 64 mel bins, 50-14000 Hz, 527 classes). Afterwards its
    ``fc_audioset``, ``spectrogram_extractor``, ``logmel_extractor`` and
    ``bn0`` attributes are replaced by new modules sized from the arguments.

    Args:
        n_fft: FFT size, also used as the window length of the new STFT.
        n_mels: Number of mel bins of the new filter bank and of ``bn0``.
        n_classes: Output size of the new ``fc_audioset`` layer.
        hop_size: Hop length of the new STFT.
        fmin: Lower frequency bound of the new mel filter bank.
        fmax: Upper frequency bound of the new mel filter bank.

    Returns:
        The modified Cnn14 instance.
    """
    model = Cnn14(
        sample_rate=32000,
        window_size=1024,
        hop_size=320,
        mel_bins=64,
        fmin=50,
        fmax=14000,
        classes_num=527,
    )

    # New output layer for the target label set.
    model.fc_audioset = nn.Linear(2048, n_classes, bias=True)
    init_layer(model.fc_audioset)

    # New STFT / log-mel front end built from the caller's settings; the
    # sample rate comes from the module-level SAMPLE_RATE constant.
    model.spectrogram_extractor = Spectrogram(
        n_fft=n_fft, hop_length=hop_size, win_length=n_fft)
    model.logmel_extractor = LogmelFilterBank(
        sr=SAMPLE_RATE, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)

    # Input batch norm must match the new number of mel bins.
    model.bn0 = nn.BatchNorm2d(n_mels)
    init_bn(model.bn0)

    return model
    def __init__(self, backbone, sample_rate, window_size, hop_size, mel_bins,
                 fmin, fmax, classes_num):
        """Wrap a backbone with a log-mel front end for a new classification task.

        Builds the STFT, log-mel and SpecAugment modules, stores ``backbone``
        and replaces its ``classifier`` attribute with a new linear layer that
        outputs ``classes_num`` values.
        """
        super(AudioClassifierHub, self).__init__()

        # Batch norm over 3 channels.
        self.bn = nn.BatchNorm2d(3)

        # STFT front end (Hann window, centred frames, reflect padding).
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size,
            hop_length=hop_size,
            win_length=window_size,
            window='hann',
            center=True,
            pad_mode='reflect',
            freeze_parameters=True,
        )

        # Mel filter bank followed by log scaling.
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True,
        )

        # SpecAugment masking on time and frequency axes.
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2,
        )

        self.backbone = backbone

        # Swap the backbone's classifier for one sized to this task.
        in_feat = backbone.classifier.in_features
        self.backbone.classifier = nn.Linear(in_feat, classes_num)
示例#3
0
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
                 fmax, classes_num):
        """Build an EfficientNet-B0 encoder whose bn0, fc1 and feature extractor are loaded from disk and frozen.

        The state dicts are read from the relative paths ``pretrained_bn0_b0``,
        ``pretrained_fc1_b0_fold_0`` and ``pretrained_fe_b0_fold_0``. They are
        loaded with ``map_location='cpu'`` so that checkpoints saved from a
        CUDA device can also be restored on a CPU-only host;
        ``load_state_dict`` then copies the values into the module parameters.

        Args:
            sample_rate: Audio sample rate passed to the mel filter bank.
            window_size: FFT size and window length of the STFT.
            hop_size: Hop length of the STFT.
            mel_bins: Number of mel bins (also the size of ``bn0``).
            fmin: Lower frequency bound of the mel filter bank.
            fmax: Upper frequency bound of the mel filter bank.
            classes_num: Output size of the attention block.
        """
        super(Encoder_B0_Pretrained, self).__init__()

        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(n_fft=window_size,
                                                 hop_length=hop_size,
                                                 win_length=window_size,
                                                 window=window,
                                                 center=center,
                                                 pad_mode=pad_mode,
                                                 freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate,
                                                 n_fft=window_size,
                                                 n_mels=mel_bins,
                                                 fmin=fmin,
                                                 fmax=fmax,
                                                 ref=ref,
                                                 amin=amin,
                                                 top_db=top_db,
                                                 freeze_parameters=True)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64,
                                               time_stripes_num=2,
                                               freq_drop_width=4,
                                               freq_stripes_num=2)

        # Input batch norm: restore saved statistics and freeze.
        self.bn0 = nn.BatchNorm2d(mel_bins)
        self.bn0.load_state_dict(
            torch.load('pretrained_bn0_b0', map_location='cpu'))
        for p in self.bn0.parameters():
            p.requires_grad = False

        fe = 1280
        fe_features = 2048

        # Projection layer: restore saved weights and freeze.
        self.fc1 = nn.Linear(fe, fe_features, bias=True)
        self.fc1.load_state_dict(
            torch.load('pretrained_fc1_b0_fold_0', map_location='cpu'))
        for p in self.fc1.parameters():
            p.requires_grad = False

        # The attention block is the only part left trainable here.
        self.att_block = AttBlock(fe_features, classes_num)

        # Alternative backbone (disabled):
        #         self.fe = timm.models.resnest50d_4s2x40d(pretrained=True)
        # pretrained=False because the weights come from the local checkpoint.
        self.fe = timm.models.tf_efficientnet_b0_ns(pretrained=False)
        # Drop the last two child modules of the timm model.
        self.fe = nn.Sequential(*list(self.fe.children())[:-2])

        self.fe.load_state_dict(
            torch.load('pretrained_fe_b0_fold_0', map_location='cpu'))
        for p in self.fe.parameters():
            p.requires_grad = False
示例#4
0
    def __init__(self,
                 df: pd.DataFrame,
                 datadir: Path,
                 img_size=224,
                 waveform_transforms=None,
                 period=5,
                 mode=None):
        """Store the dataset settings and build the log-mel front end from CFG.

        Args:
            df: Table describing the samples.
            datadir: Directory containing the data.
            img_size: Stored as ``self.img_size``.
            waveform_transforms: Stored as ``self.waveform_transforms``.
            period: Stored as ``self.period``.
            mode: Stored as ``self.mode``.
        """
        self.df, self.datadir = df, datadir
        self.img_size = img_size
        self.waveform_transforms = waveform_transforms
        self.period, self.mode = period, mode

        # STFT module configured from the global CFG.
        stft_kwargs = dict(
            n_fft=CFG.n_fft,
            hop_length=CFG.hop_length,
            win_length=CFG.n_fft,
            window="hann",
            center=True,
            pad_mode="reflect",
            freeze_parameters=True,
        )
        self.spectrogram_extractor = Spectrogram(**stft_kwargs)

        # Log-mel filter bank configured from the global CFG.
        logmel_kwargs = dict(
            sr=CFG.sample_rate,
            n_fft=CFG.n_fft,
            n_mels=CFG.n_mels,
            fmin=CFG.fmin,
            fmax=CFG.fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True,
        )
        self.logmel_extractor = LogmelFilterBank(**logmel_kwargs)
示例#5
0
    def __init__(self, encoder, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
        """Build a log-mel front end, a registry-selected encoder and an attention head.

        ``encoder`` is a key into ``encoder_params``; that entry supplies the
        encoder factory (``"init_op"``) and its feature width (``"features"``).
        """
        super().__init__()

        self.interpolate_ratio = 30  # Downsampled ratio

        # STFT front end.
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size,
            hop_length=hop_size,
            win_length=window_size,
            window='hann',
            center=True,
            pad_mode='reflect',
            freeze_parameters=True,
        )

        # Log-mel filter bank.
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True,
        )

        # SpecAugment masking.
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2,
        )

        # Encoder, projection, attention head and input batch norm.
        encoder_spec = encoder_params[encoder]
        self.encoder = encoder_spec["init_op"]()
        self.fc1 = nn.Linear(encoder_spec["features"], 1024, bias=True)
        self.att_block = AttBlock(1024, classes_num, activation="sigmoid")
        self.bn0 = nn.BatchNorm2d(mel_bins)

        self.init_weight()
示例#6
0
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
                 fmax, classes_num):
        """Build the Cnn14 network: log-mel front end, six conv blocks, two linear layers.

        ``bn0`` is sized from ``mel_bins`` rather than a fixed 64, in line
        with the other models that size ``bn0`` from the mel-bin count. With
        ``mel_bins=64`` the module is identical to the previous one.

        Args:
            sample_rate: Audio sample rate passed to the mel filter bank.
            window_size: FFT size and window length of the STFT.
            hop_size: Hop length of the STFT.
            mel_bins: Number of mel bins (also the size of ``bn0``).
            fmin: Lower frequency bound of the mel filter bank.
            fmax: Upper frequency bound of the mel filter bank.
            classes_num: Output size of ``fc_audioset``.
        """
        super(Cnn14, self).__init__()

        window = "hann"
        center = True
        pad_mode = "reflect"
        ref = 1.0
        amin = 1e-10
        top_db = None
        # Normalisation statistics; identity values by default.
        self.dataset_mean = 0.0
        self.dataset_std = 1.0
        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size,
            hop_length=hop_size,
            win_length=window_size,
            window=window,
            center=center,
            pad_mode=pad_mode,
            freeze_parameters=True,
        )

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=fmin,
            fmax=fmax,
            ref=ref,
            amin=amin,
            top_db=top_db,
            freeze_parameters=True,
        )

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=32,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2,
        )

        # Presumably applied over the mel axis in forward() — TODO confirm.
        self.bn0 = nn.BatchNorm2d(mel_bins)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)

        self.fc1 = nn.Linear(2048, 2048, bias=True)
        self.fc_audioset = nn.Linear(2048, classes_num, bias=True)

        self.init_weight()
示例#7
0
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
                 fmax, classes_num):
        """Build an EfficientNet-B0 encoder followed by a residual attention block.

        Creates the log-mel front end, a projection from 1280 to 512 features,
        a 4-head ``ResidualAttentionBlock``, an attention head with
        ``classes_num`` outputs, the timm feature extractor and a 32-entry
        positional embedding.
        """
        super(Encoder_Transformer, self).__init__()

        # STFT front end.
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size,
            hop_length=hop_size,
            win_length=window_size,
            window='hann',
            center=True,
            pad_mode='reflect',
            freeze_parameters=True,
        )

        # Log-mel filter bank.
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True,
        )

        # SpecAugment masking.
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=4,
            freq_stripes_num=2,
        )

        self.bn0 = nn.BatchNorm2d(mel_bins)

        backbone_width = 1280
        d_model = 512
        n_head = 4

        self.fc1 = nn.Linear(backbone_width, d_model, bias=True)
        self.multihead = ResidualAttentionBlock(d_model, n_head)
        self.att_block = AttBlock(d_model, classes_num)

        # Alternative backbone (disabled):
        #         self.fe = timm.models.resnest50d_4s2x40d(pretrained=True)
        # Keep every child of the timm model except the last two.
        effnet = timm.models.tf_efficientnet_b0_ns(pretrained=True)
        self.fe = nn.Sequential(*list(effnet.children())[:-2])

        self.pos_emb = nn.Embedding(32, d_model)
示例#8
0
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
                 fmax, classes_num):
        """Build a 4-block CNN followed by a bidirectional GRU and an attention head.

        ``bn0`` is sized from ``mel_bins`` rather than a fixed 64, in line
        with the other models that size ``bn0`` from the mel-bin count. With
        ``mel_bins=64`` the module is identical to the previous one.

        Args:
            sample_rate: Audio sample rate passed to the mel filter bank.
            window_size: FFT size and window length of the STFT.
            hop_size: Hop length of the STFT.
            mel_bins: Number of mel bins (also the size of ``bn0``).
            fmin: Lower frequency bound of the mel filter bank.
            fmax: Upper frequency bound of the mel filter bank.
            classes_num: Accepted for interface parity; not used in this
                constructor (see the note on ``att_block``).
        """
        super(Cnn_9layers_Gru_FrameAtt, self).__init__()

        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(n_fft=window_size,
                                                 hop_length=hop_size,
                                                 win_length=window_size,
                                                 window=window,
                                                 center=center,
                                                 pad_mode=pad_mode,
                                                 freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate,
                                                 n_fft=window_size,
                                                 n_mels=mel_bins,
                                                 fmin=fmin,
                                                 fmax=fmax,
                                                 ref=ref,
                                                 amin=amin,
                                                 top_db=top_db,
                                                 freeze_parameters=True)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64,
                                               time_stripes_num=2,
                                               freq_drop_width=8,
                                               freq_stripes_num=2)

        # Presumably applied over the mel axis in forward() — TODO confirm.
        self.bn0 = nn.BatchNorm2d(mel_bins)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)

        # Bidirectional with hidden_size=256, so the output width is 512.
        self.gru = nn.GRU(input_size=512,
                          hidden_size=256,
                          num_layers=1,
                          bias=True,
                          batch_first=True,
                          bidirectional=True)

        # NOTE(review): n_out is fixed at 17 and ignores classes_num —
        # confirm this is intended before changing it (checkpoints may
        # depend on the 17-way head).
        self.att_block = AttBlock(n_in=512, n_out=17, activation='sigmoid')

        self.init_weights()
示例#9
0
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
                 fmax, classes_num):
        """Build a 4-block CNN followed by multi-head attention and an attention head.

        ``bn0`` is sized from ``mel_bins`` rather than a fixed 64, in line
        with the other models that size ``bn0`` from the mel-bin count. With
        ``mel_bins=64`` the module is identical to the previous one.

        Args:
            sample_rate: Audio sample rate passed to the mel filter bank.
            window_size: FFT size and window length of the STFT.
            hop_size: Hop length of the STFT.
            mel_bins: Number of mel bins (also the size of ``bn0``).
            fmin: Lower frequency bound of the mel filter bank.
            fmax: Upper frequency bound of the mel filter bank.
            classes_num: Accepted for interface parity; not used in this
                constructor (see the note on ``att_block``).
        """
        super(Cnn_9layers_Transformer_FrameAtt, self).__init__()

        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(n_fft=window_size,
                                                 hop_length=hop_size,
                                                 win_length=window_size,
                                                 window=window,
                                                 center=center,
                                                 pad_mode=pad_mode,
                                                 freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate,
                                                 n_fft=window_size,
                                                 n_mels=mel_bins,
                                                 fmin=fmin,
                                                 fmax=fmax,
                                                 ref=ref,
                                                 amin=amin,
                                                 top_db=top_db,
                                                 freeze_parameters=True)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64,
                                               time_stripes_num=2,
                                               freq_drop_width=8,
                                               freq_stripes_num=2)

        # Presumably applied over the mel axis in forward() — TODO confirm.
        self.bn0 = nn.BatchNorm2d(mel_bins)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)

        # Multi-head attention settings.
        n_head = 8
        n_hid = 512
        d_k = 64
        d_v = 64
        dropout = 0.2
        self.multihead = MultiHead(n_head, n_hid, d_k, d_v, dropout)

        # NOTE(review): n_out is fixed at 17 and ignores classes_num —
        # confirm this is intended before changing it (checkpoints may
        # depend on the 17-way head).
        self.att_block = AttBlock(n_in=512, n_out=17, activation='sigmoid')

        self.init_weights()
示例#10
0
    def __init__(self,
                 sample_rate: int,
                 window_size: int,
                 hop_size: int,
                 mel_bins: int,
                 fmin: int,
                 fmax: int,
                 classes_num: int,
                 apply_aug: bool,
                 top_db=None):
        """Build a DenseNet-121 based model with a log-mel front end and attention head.

        Args:
            sample_rate: Audio sample rate passed to the mel filter bank.
            window_size: FFT size and window length of the STFT.
            hop_size: Hop length of the STFT.
            mel_bins: Number of mel bins (also the size of ``bn0``).
            fmin: Lower frequency bound of the mel filter bank.
            fmax: Upper frequency bound of the mel filter bank.
            classes_num: Output size of the attention block.
            apply_aug: Stored as ``self.apply_aug``.
            top_db: Forwarded to ``LogmelFilterBank``.
        """
        super().__init__()

        self.interpolate_ratio = 32  # Downsampled ratio
        self.apply_aug = apply_aug

        # STFT front end.
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size,
            hop_length=hop_size,
            win_length=window_size,
            window='hann',
            center=True,
            pad_mode='reflect',
            freeze_parameters=True,
        )

        # Log-mel filter bank.
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=top_db,
            freeze_parameters=True,
        )

        # SpecAugment masking.
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2,
        )

        self.bn0 = nn.BatchNorm2d(mel_bins)

        feature_dim = 1024
        self.fc1 = nn.Linear(feature_dim, feature_dim, bias=True)
        self.att_block = AttBlockV2(feature_dim, classes_num, activation='sigmoid')

        # Convolutional trunk of a pretrained torchvision DenseNet-121.
        self.densenet_features = models.densenet121(pretrained=True).features

        self.init_weight()
示例#11
0
    def __init__(self, sample_rate: int, window_size: int, hop_size: int,
                 mel_bins: int, fmin: int, fmax: int, classes_num: int):
        """Build a six-block CNN with a log-mel front end and an attention head.

        Args:
            sample_rate: Audio sample rate passed to the mel filter bank.
            window_size: FFT size and window length of the STFT.
            hop_size: Hop length of the STFT.
            mel_bins: Number of mel bins (also the size of ``bn0``).
            fmin: Lower frequency bound of the mel filter bank.
            fmax: Upper frequency bound of the mel filter bank.
            classes_num: Output size of the attention block.
        """
        super().__init__()

        self.interpolate_ratio = 32  # Downsampled ratio

        # STFT front end.
        stft_kwargs = dict(
            n_fft=window_size,
            hop_length=hop_size,
            win_length=window_size,
            window='hann',
            center=True,
            pad_mode='reflect',
            freeze_parameters=True,
        )
        self.spectrogram_extractor = Spectrogram(**stft_kwargs)

        # Log-mel filter bank.
        logmel_kwargs = dict(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True,
        )
        self.logmel_extractor = LogmelFilterBank(**logmel_kwargs)

        # SpecAugment masking.
        self.spec_augmenter = SpecAugmentation(time_drop_width=64,
                                               time_stripes_num=2,
                                               freq_drop_width=8,
                                               freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(mel_bins)

        # Channel width doubles with every block: 1 -> 64 -> ... -> 2048.
        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)

        self.fc1 = nn.Linear(2048, 2048, bias=True)
        self.att_block = AttBlock(2048, classes_num, activation='sigmoid')

        self.init_weight()
示例#12
0
    def __init__(self,
                 base_model_name: str,
                 pretrained=False,
                 num_classes=24,
                 in_channels=1):
        """Build a timm-backed model with a CFG-driven log-mel front end and attention head.

        Args:
            base_model_name: Model name passed to ``timm.create_model``.
            pretrained: Forwarded to ``timm.create_model``.
            num_classes: Output size of the attention block.
            in_channels: Forwarded to ``timm.create_model`` as ``in_chans``.
        """
        super().__init__()

        # STFT front end, configured from the global CFG.
        self.spectrogram_extractor = Spectrogram(
            n_fft=CFG.n_fft,
            hop_length=CFG.hop_length,
            win_length=CFG.n_fft,
            window="hann",
            center=True,
            pad_mode="reflect",
            freeze_parameters=True,
        )

        # Log-mel filter bank, configured from the global CFG.
        self.logmel_extractor = LogmelFilterBank(
            sr=CFG.sample_rate,
            n_fft=CFG.n_fft,
            n_mels=CFG.n_mels,
            fmin=CFG.fmin,
            fmax=CFG.fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True,
        )

        # SpecAugment masking.
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2,
        )

        self.bn0 = nn.BatchNorm2d(CFG.n_mels)

        base_model = timm.create_model(
            base_model_name, pretrained=pretrained, in_chans=in_channels)
        # Keep every child of the timm model except the last two.
        self.encoder = nn.Sequential(*list(base_model.children())[:-2])

        # The final layer is named `fc` on some models and `classifier` on
        # others; read the feature width from whichever exists.
        head = base_model.fc if hasattr(base_model, "fc") else base_model.classifier
        in_features = head.in_features

        self.fc1 = nn.Linear(in_features, in_features, bias=True)
        self.att_block = AttBlockV2(
            in_features, num_classes, activation="sigmoid")

        self.init_weight()
def get_valid_all_clip_result(fold: int):
    """Run the fold's checkpoint over its validation clips and return the predictions.

    Reads ``folds.csv`` and ``species_fmin_fmax.csv`` from ``OUTPUT_DIR``,
    restores ``fold-{fold}.bin`` into an ``AudioClassifier``, builds the
    log-mel front end and calls ``test_epoch`` on the rows whose ``kfold``
    equals ``fold``.

    Changes from the earlier version:
      * the checkpoint is loaded with ``map_location=device`` so weights
        saved on another device (e.g. CUDA) load on the current one;
      * the loader batch size is clamped to at least 1, because
        ``CFG.batch_size // 32`` is 0 for batch sizes below 32 and
        ``DataLoader`` rejects a non-positive batch size.

    Args:
        fold: Index of the validation fold / checkpoint to evaluate.

    Returns:
        A DataFrame with columns ``recording_id``, ``kfold`` and
        ``s0`` .. ``s23`` holding the values returned by ``test_epoch``.
    """
    # Load data; only rows flagged istp == 1 are kept.
    train_df = pd.read_csv(OUTPUT_DIR / "folds.csv")
    train_df = train_df[train_df["istp"] == 1].reset_index(drop=True)
    species_fmin_fmax = pd.read_csv(OUTPUT_DIR / "species_fmin_fmax.csv")
    f_min_mels = torch.tensor(species_fmin_fmax["f_min_mel"].values, dtype=torch.int)
    f_max_mels = torch.tensor(species_fmin_fmax["f_max_mel"].values, dtype=torch.int)

    # Load model
    model = AudioClassifier(CFG.model_param["encoder"], CFG.model_param["classes_num"])
    state_dict = torch.load(OUTPUT_DIR / f'fold-{fold}.bin', map_location=device)
    model.load_state_dict(state_dict)
    model = model.to(device)
    # NOTE(review): eval mode is not set here; presumably test_epoch
    # handles it — confirm.

    # Validation split for this fold.
    valid_fold = train_df[train_df.kfold == fold].reset_index(drop=True)
    test_dataset = TestDataset(
        df=valid_fold,
        period=CFG.period,
        transforms=None,
        data_path="../input/train",
    )
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        # Smaller batch than training; never below 1.
        batch_size=max(1, CFG.batch_size // 32),
        shuffle=False,
        drop_last=False,
        num_workers=CFG.num_workers
    )

    # Log-mel front end, built outside the model and moved to the device.
    window = 'hann'
    center = True
    pad_mode = 'reflect'
    ref = 1.0
    amin = 1e-10
    top_db = None
    spectrogram_extractor = Spectrogram(n_fft=WINDOW_SIZE, hop_length=HOP_SIZE,
                                        win_length=WINDOW_SIZE, window=window,
                                        center=center, pad_mode=pad_mode,
                                        freeze_parameters=True).to(device)
    logmel_extractor = LogmelFilterBank(sr=SR, n_fft=WINDOW_SIZE,
                                        n_mels=N_MELS, fmin=FMIN, fmax=FMAX,
                                        ref=ref, amin=amin, top_db=top_db,
                                        freeze_parameters=True).to(device)

    test_pred, ids = test_epoch(model, spectrogram_extractor, logmel_extractor, test_loader,
                                f_min_mels, f_max_mels, device, resize=True)

    test_pred_df = pd.DataFrame({
        "recording_id": valid_fold.recording_id.values
    })
    test_pred_df["kfold"] = fold

    # Create the 24 score columns first, then fill them in one assignment.
    pred_cols = [f"s{i}" for i in range(24)]
    for col in pred_cols:
        test_pred_df[col] = 0
    test_pred_df[pred_cols] = test_pred
    return test_pred_df
示例#14
0
文件: models.py 项目: streamride/SED
    def __init__(self, n_classes, n_fft=1024, hop_length=256, n_mels=128, sr=22050, fc_output=1024):
        """Build an EfficientNet-B3 based model with a log-mel front end and attention head.

        Args:
            n_classes: Output size of the attention block.
            n_fft: FFT size and window length of the STFT.
            hop_length: Hop length of the STFT.
            n_mels: Number of mel bins (also the size of ``bn0``).
            sr: Audio sample rate passed to the mel filter bank.
            fc_output: Output width of the linear layer feeding the attention block.
        """
        super().__init__()

        # Log-mel front end.
        self.spectrogram = Spectrogram(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=n_fft,
        )
        self.logmel_extractor = LogmelFilterBank(
            sr=sr,
            n_fft=n_fft,
            n_mels=n_mels,
        )

        # Pretrained EfficientNet-B3 taking single-channel input.
        self.cnn = EfficientNet.from_pretrained('efficientnet-b3', in_channels=1)

        # Projection from 1536 features to fc_output, then the attention head.
        self.fc = nn.Linear(1536, fc_output)
        self.att_block = AttentionBlock(fc_output, n_classes)

        self.bn0 = nn.BatchNorm2d(n_mels)
示例#15
0
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
                 fmax, classes_num):
        """Build a 4-block CNN with a log-mel front end and a linear output layer.

        ``bn0`` is sized from ``mel_bins`` rather than a fixed 64, in line
        with the other models that size ``bn0`` from the mel-bin count. With
        ``mel_bins=64`` the module is identical to the previous one.

        Args:
            sample_rate: Audio sample rate passed to the mel filter bank.
            window_size: FFT size and window length of the STFT.
            hop_size: Hop length of the STFT.
            mel_bins: Number of mel bins (also the size of ``bn0``).
            fmin: Lower frequency bound of the mel filter bank.
            fmax: Upper frequency bound of the mel filter bank.
            classes_num: Output size of the final linear layer.
        """
        super(Cnn_9layers_FrameAvg, self).__init__()

        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(n_fft=window_size,
                                                 hop_length=hop_size,
                                                 win_length=window_size,
                                                 window=window,
                                                 center=center,
                                                 pad_mode=pad_mode,
                                                 freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate,
                                                 n_fft=window_size,
                                                 n_mels=mel_bins,
                                                 fmin=fmin,
                                                 fmax=fmax,
                                                 ref=ref,
                                                 amin=amin,
                                                 top_db=top_db,
                                                 freeze_parameters=True)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64,
                                               time_stripes_num=2,
                                               freq_drop_width=8,
                                               freq_stripes_num=2)

        # Presumably applied over the mel axis in forward() — TODO confirm.
        self.bn0 = nn.BatchNorm2d(mel_bins)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)

        self.fc = nn.Linear(512, classes_num, bias=True)

        self.init_weights()
示例#16
0
    def __init__(self):
        """Build a ResNeSt-50 + bidirectional GRU model with fixed audio settings.

        All hyper-parameters are hard-coded: 32 kHz audio, 128 mel bins,
        500-12500 Hz, n_fft 2048 and 398 output classes.
        """
        super(Tmodel, self).__init__()

        sample_rate = 32000
        n_mels = 128
        spec_width = 256
        # sample rate * duration / (spec width - 1), truncated == 627
        hop_length = int(sample_rate * 5 / (spec_width - 1))
        classes_num = 398
        self.interpolate_ratio = 8

        # STFT front end; window settings are left at the library defaults.
        self.spectrogram_extractor = Spectrogram(
            n_fft=2048,
            hop_length=hop_length,
            freeze_parameters=True,
        )

        # Log-mel filter bank; n_fft is left at the library default.
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_mels=n_mels,
            fmin=500,
            fmax=12500,
            freeze_parameters=True,
        )

        # SpecAugment masking.
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2,
        )

        self.bn0 = nn.BatchNorm2d(n_mels)

        # ResNeSt-50 from torch.hub without pretrained weights; every child
        # except the last two is kept as the encoder.
        resnest = torch.hub.load(
            'zhanghang1989/ResNeSt', 'resnest50', pretrained=False)
        self.encoder = nn.Sequential(*list(resnest.children())[:-2])

        # Bidirectional with hidden_size=1024, so the output width is 2048.
        self.gru = nn.GRU(
            input_size=2048,
            hidden_size=1024,
            num_layers=1,
            bias=True,
            batch_first=True,
            bidirectional=True,
        )

        self.att_block = AttBlockV2(2048, classes_num, activation='sigmoid')
        self.init_weights()
    def __init__(self, frames_per_second, classes_num):
        """Build four acoustic sub-models plus GRU heads for onset and frame outputs.

        Audio settings are fixed: 16 kHz sample rate, 2048-sample window,
        229 mel bins, 30 Hz to Nyquist. The hop size is derived from
        ``frames_per_second``.

        Args:
            frames_per_second: Frame rate of the spectrogram.
            classes_num: Output size of each sub-model and of the final
                linear layers.
        """
        super(Regress_onset_offset_frame_velocity_CRNN, self).__init__()

        sample_rate = 16000
        window_size = 2048
        hop_size = sample_rate // frames_per_second
        mel_bins = 229

        midfeat = 1792
        momentum = 0.01

        # STFT front end.
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size,
            hop_length=hop_size,
            win_length=window_size,
            window='hann',
            center=True,
            pad_mode='reflect',
            freeze_parameters=True,
        )

        # Log-mel filter bank covering 30 Hz up to Nyquist.
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=30,
            fmax=sample_rate // 2,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True,
        )

        # NOTE(review): the second positional parameter of nn.BatchNorm2d is
        # `eps`, so the value named `momentum` is applied as eps here and the
        # layer keeps its default momentum. Written as a keyword to make that
        # explicit; behaviour is unchanged because existing checkpoints may
        # depend on it.
        self.bn0 = nn.BatchNorm2d(mel_bins, eps=momentum)

        # One acoustic model per regression / classification target.
        self.frame_model = AcousticModelCRnn8Dropout(classes_num, midfeat, momentum)
        self.reg_onset_model = AcousticModelCRnn8Dropout(classes_num, midfeat, momentum)
        self.reg_offset_model = AcousticModelCRnn8Dropout(classes_num, midfeat, momentum)
        self.velocity_model = AcousticModelCRnn8Dropout(classes_num, midfeat, momentum)

        # Onset head: GRU over 88 * 2 input features, then a linear layer.
        self.reg_onset_gru = nn.GRU(
            input_size=88 * 2,
            hidden_size=256,
            num_layers=1,
            bias=True,
            batch_first=True,
            dropout=0.,
            bidirectional=True,
        )
        self.reg_onset_fc = nn.Linear(512, classes_num, bias=True)

        # Frame head: GRU over 88 * 3 input features, then a linear layer.
        self.frame_gru = nn.GRU(
            input_size=88 * 3,
            hidden_size=256,
            num_layers=1,
            bias=True,
            batch_first=True,
            dropout=0.,
            bidirectional=True,
        )
        self.frame_fc = nn.Linear(512, classes_num, bias=True)

        self.init_weight()
示例#18
0
    def __init__(self, encoder, sample_rate, window_size, hop_size, mel_bins,
                 fmin, fmax, classes_num):
        """Build a registry-selected encoder with average pooling and a linear classifier.

        ``encoder`` is a key into ``encoder_params``; that entry supplies the
        encoder factory (``"init_op"``) and its feature width (``'features'``).
        """
        super().__init__()

        # STFT front end.
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size,
            hop_length=hop_size,
            win_length=window_size,
            window='hann',
            center=True,
            pad_mode='reflect',
            freeze_parameters=True,
        )

        # Log-mel filter bank.
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True,
        )

        # SpecAugment masking.
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2,
        )

        encoder_spec = encoder_params[encoder]
        self.encoder = encoder_spec["init_op"]()
        self.avg_pool = AdaptiveAvgPool2d((1, 1))
        # Max pooling alternative (disabled):
        #self.max_pool = AdaptiveMaxPool2d((1, 1))
        self.dropout = Dropout(0.3)
        self.fc = Linear(encoder_spec['features'], classes_num)
示例#19
0
    def __init__(self, encoder, sample_rate, window_size, hop_size,
                 mel_bins, fmin, fmax, classes_num):
        """Build the log-mel front end, the encoder and the attention head.

        Args:
            encoder: Key into ``encoder_params``; the entry's ``"init_op"``
                factory is called to create the encoder.
            sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
            window_size: Used as ``n_fft`` and ``win_length`` of the
                spectrogram and as ``n_fft`` of the mel filter bank.
            hop_size: Forwarded to ``Spectrogram`` as ``hop_length``.
            mel_bins: Number of mel bands; also the feature count of the
                batch-norm layer.
            fmin: Forwarded to ``LogmelFilterBank`` as ``fmin``.
            fmax: Forwarded to ``LogmelFilterBank`` as ``fmax``.
            classes_num: Second argument of ``AttentionHead``.
        """
        super().__init__()

        # Fixed ratio stored for later use (the original comment calls it
        # the "downsampled ratio"; where it is consumed is not visible here).
        self.interpolate_ratio = 29

        # Spectrogram extractor
        stft_kwargs = dict(
            n_fft=window_size,
            hop_length=hop_size,
            win_length=window_size,
            window='hann',
            center=True,
            pad_mode='reflect',
            freeze_parameters=True,
        )
        self.spectrogram_extractor = Spectrogram(**stft_kwargs)

        # Logmel feature extractor
        logmel_kwargs = dict(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True,
        )
        self.logmel_extractor = LogmelFilterBank(**logmel_kwargs)

        # Spec augmenter
        augment_kwargs = dict(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2,
        )
        self.spec_augmenter = SpecAugmentation(**augment_kwargs)

        self.batch_norm = nn.BatchNorm2d(mel_bins)
        self.encoder = encoder_params[encoder]["init_op"]()

        self.dropout = Dropout(0.3)
        # The head's input size is hard-coded to 1000 -- presumably the width
        # of the encoder's output; TODO confirm against the chosen encoder.
        self.att_head = AttentionHead(1000, classes_num, activation='sigmoid')
        self.init_weight()
    def __init__(self, frames_per_second, classes_num):
        """Build the pedal regression network with a fixed log-mel front end.

        Args:
            frames_per_second: Divides the fixed 16000 sample rate (integer
                division) to obtain the spectrogram hop length.
            classes_num: Accepted but not referenced in this constructor;
                every pedal sub-model is created with ``1`` as its first
                argument.
        """
        super(Regress_pedal_CRNN, self).__init__()

        # Fixed audio front-end settings.
        sample_rate = 16000
        n_fft = 2048
        mel_bins = 229
        hop = sample_rate // frames_per_second

        # Shared settings for the acoustic sub-models and the batch norm.
        midfeat = 1792
        momentum = 0.01

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=n_fft,
            hop_length=hop,
            win_length=n_fft,
            window='hann',
            center=True,
            pad_mode='reflect',
            freeze_parameters=True,
        )

        # Logmel feature extractor (fmin fixed at 30, fmax at half the
        # sample rate).
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=n_fft,
            n_mels=mel_bins,
            fmin=30,
            fmax=sample_rate // 2,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True,
        )

        # NOTE(review): if `nn` is torch.nn, BatchNorm2d's second positional
        # parameter is `eps`, so `momentum` lands in the eps slot here. The
        # call is kept as-is because changing it would alter the numerical
        # behaviour of existing checkpoints -- confirm the intent.
        self.bn0 = nn.BatchNorm2d(mel_bins, momentum)

        # One acoustic sub-model per pedal target, registered in the same
        # order as before: onset, offset, frame.
        for attr_name in ('reg_pedal_onset_model',
                          'reg_pedal_offset_model',
                          'reg_pedal_frame_model'):
            setattr(self, attr_name,
                    AcousticModelCRnn8Dropout(1, midfeat, momentum))

        self.init_weight()
示例#21
0
def train_loop(fold):
    """Train and validate one cross-validation fold, saving the best weights.

    Reads ``folds.csv`` from ``OUTPUT_DIR``, trains on the rows whose
    ``kfold`` value differs from ``fold`` and validates on the rows where it
    matches. After each epoch the losses and metrics are logged, and the
    model's ``state_dict`` is written to ``OUTPUT_DIR / f'fold-{fold}.bin'``
    whenever the validation F1 score improves on the best seen so far.

    Args:
        fold: Value of the ``kfold`` column that selects the validation rows.

    Returns:
        None. Results are reported through ``LOGGER`` and the saved file.
    """
    LOGGER.info(f"========== fold: {fold} training ==========")
    train_df = pd.read_csv(OUTPUT_DIR / 'folds.csv')
    if CFG.debug:
        # Cap the sample at the frame length: DataFrame.sample raises when
        # asked for more rows than exist (sampling without replacement).
        train_df = train_df.sample(n=min(1000, len(train_df)),
                                   random_state=42)
    train_fold = train_df[train_df.kfold != fold]
    valid_fold = train_df[train_df.kfold == fold]

    # Only the training split is restricted to these columns.
    columns = [
        "recording_id", "species_id", "t_min", "f_min", "t_max", "f_max",
        "istp", "f_min_mel", "f_max_mel", "kfold"
    ]
    train_fold = train_fold[columns]
    print(f"train fold before concat: {train_fold.shape}")
    train_dataset = AudioDataset(
        df=train_fold,
        period=CFG.period,
        time=CFG.duration,
        transforms=augmenter,
        data_path="../input/train",
    )
    valid_dataset = ValidDataset(df=valid_fold,
                                 period=CFG.period,
                                 transforms=None,
                                 data_path="../input/train")
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=CFG.batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=CFG.num_workers,
    )
    # Validation uses a quarter of the training batch size; clamp to 1 so a
    # training batch size below 4 does not produce an invalid size of 0.
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=max(1, CFG.batch_size // 4),
        shuffle=False,
        drop_last=False,
        num_workers=CFG.num_workers,
    )

    # Feature extractors live outside the model and are handed to the
    # train/valid epoch functions explicitly.
    spectrogram_extractor = Spectrogram(n_fft=WINDOW_SIZE,
                                        hop_length=HOP_SIZE,
                                        win_length=WINDOW_SIZE,
                                        window='hann',
                                        center=True,
                                        pad_mode='reflect',
                                        freeze_parameters=True).to(device)
    logmel_extractor = LogmelFilterBank(sr=SR,
                                        n_fft=WINDOW_SIZE,
                                        n_mels=N_MELS,
                                        fmin=FMIN,
                                        fmax=FMAX,
                                        ref=1.0,
                                        amin=1e-10,
                                        top_db=None,
                                        freeze_parameters=True).to(device)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(optimizer):
        """Return the scheduler named by ``CFG.scheduler``.

        Currently unused: the loop below runs with the linear-warmup
        scheduler. Kept as the alternative the per-epoch ``isinstance``
        checks are written for.

        Raises:
            ValueError: If ``CFG.scheduler`` is not one of the supported
                names (previously this surfaced as an UnboundLocalError).
        """
        if CFG.scheduler == 'ReduceLROnPlateau':
            # NOTE(review): CFG.min_lr is passed as `eps`; in torch's
            # ReduceLROnPlateau the lower bound is `min_lr`, while `eps` is
            # the minimal decay applied -- confirm which one is intended.
            return ReduceLROnPlateau(optimizer,
                                     mode='min',
                                     factor=CFG.factor,
                                     patience=CFG.patience,
                                     verbose=True,
                                     eps=CFG.min_lr)
        if CFG.scheduler == 'CosineAnnealingLR':
            return CosineAnnealingLR(optimizer,
                                     T_max=CFG.T_max,
                                     eta_min=CFG.min_lr,
                                     last_epoch=-1)
        if CFG.scheduler == 'CosineAnnealingWarmRestarts':
            return CosineAnnealingWarmRestarts(optimizer,
                                               T_0=CFG.T_0,
                                               T_mult=1,
                                               eta_min=CFG.min_lr,
                                               last_epoch=-1)
        raise ValueError(f"Unsupported scheduler: {CFG.scheduler!r}")

    # ====================================================
    # model & optimizer
    # ====================================================
    model = AudioClassifier(CFG.model_param["encoder"],
                            CFG.model_param["classes_num"])
    model = model.to(device)

    # AdamW with linear warmup over the first 10% of all training steps.
    optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.lr)
    num_train_steps = int(len(train_loader) * CFG.epochs)
    num_warmup_steps = int(0.1 * CFG.epochs * len(train_loader))
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_steps)

    criterion = BCEFocalLoss()

    best_score = -np.inf

    for epoch in range(CFG.epochs):

        # Mixup is only active during the first CFG.mixup_epochs epochs.
        p_mixup = CFG.p_mixup if epoch < CFG.mixup_epochs else 0.

        start_time = time.time()

        # train
        train_avg, train_loss = train_epoch(model,
                                            spectrogram_extractor,
                                            logmel_extractor,
                                            train_loader,
                                            criterion,
                                            optimizer,
                                            scheduler,
                                            epoch,
                                            device,
                                            p_mixup,
                                            spec_aug=True)

        # valid
        valid_avg, valid_loss = valid_epoch(model, spectrogram_extractor,
                                            logmel_extractor, valid_loader,
                                            criterion, epoch, device)

        # Per-epoch stepping applies only to the schedulers produced by
        # get_scheduler. The warmup scheduler is handed to train_epoch,
        # which presumably steps it per batch -- TODO confirm.
        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(valid_loss)
        elif isinstance(scheduler,
                        (CosineAnnealingLR, CosineAnnealingWarmRestarts)):
            scheduler.step()

        elapsed = time.time() - start_time

        LOGGER.info(
            f'Epoch {epoch+1} - avg_train_loss: {train_loss:.5f}  avg_val_loss: {valid_loss:.5f}  time: {elapsed:.0f}s'
        )
        LOGGER.info(
            f"Epoch {epoch+1} - train_LWLRAP:{train_avg['lwlrap']:0.5f}  valid_LWLRAP:{valid_avg['lwlrap']:0.5f}"
        )
        LOGGER.info(
            f"Epoch {epoch+1} - train_F1:{train_avg['f1score']:0.5f}  valid_F1:{valid_avg['f1score']:0.5f}"
        )

        # Checkpoint on validation F1 improvement.
        if valid_avg['f1score'] > best_score:
            LOGGER.info(
                f">>>>>>>> Model Improved From {best_score} ----> {valid_avg['f1score']}"
            )
            torch.save(model.state_dict(), OUTPUT_DIR / f'fold-{fold}.bin')
            best_score = valid_avg['f1score']
示例#22
0
    def __init__(
        self,
        encoder,
        in_features,
        num_classes,
        n_fft,
        hop_length,
        sample_rate,
        n_mels,
        fmin,
        fmax,
        dropout_rate=0.5,
        freeze_spectrogram_parameters=True,
        freeze_logmel_parameters=True,
        use_spec_augmentation=True,
        time_drop_width=64,
        time_stripes_num=2,
        freq_drop_width=8,
        freq_stripes_num=2,
        spec_augmentation_method=None,
        apply_mixup=False,
        apply_spec_shuffle=False,
        spec_shuffle_prob=0,
        use_gru_layer=False,
        apply_tta=False,
        use_loudness=False,
        use_spectral_centroid=False,
        apply_delta_spectrum=False,
        apply_time_freq_encoding=False,
        min_db=120,
        apply_pcen=False,
        freeze_pcen_parameters=False,
        use_multisample_dropout=False,
        multisample_dropout=0.5,
        num_multisample_dropout=5,
        pooling_kernel_size=3,
        **params,
    ):
        """Set up the feature extractors, optional extras and the encoder.

        Args:
            encoder: Module stored unchanged as ``self.encoder``.
            in_features: Not referenced in this constructor.
            num_classes: Not referenced in this constructor.
            n_fft: ``n_fft`` and ``win_length`` of the spectrogram; also
                ``n_fft`` of the mel filter bank and the loudness extractor.
            hop_length: Spectrogram hop length.
            sample_rate: ``sr`` of the mel filter bank and the loudness
                extractor.
            n_mels: Number of mel bands; also stored on the instance.
            fmin: Forwarded to ``LogmelFilterBank`` as ``fmin``.
            fmax: Forwarded to ``LogmelFilterBank`` as ``fmax``.
            dropout_rate: Stored on the instance.
            freeze_spectrogram_parameters: ``freeze_parameters`` of the
                spectrogram extractor.
            freeze_logmel_parameters: ``freeze_parameters`` of the mel
                filter bank.
            use_spec_augmentation: When falsy, ``self.spec_augmenter`` stays
                ``None``.
            time_drop_width: Forwarded to the spec augmenter.
            time_stripes_num: Forwarded to the spec augmenter.
            freq_drop_width: Forwarded to the spec augmenter.
            freq_stripes_num: Forwarded to the spec augmenter.
            spec_augmentation_method: ``None`` selects ``SpecAugmentation``;
                any other value selects ``SpecAugmentationPlusPlus`` and is
                passed to it as ``method``.
            apply_mixup: Stored on the instance.
            apply_spec_shuffle: Stored on the instance.
            spec_shuffle_prob: Stored on the instance.
            use_gru_layer: Stored on the instance.
            apply_tta: Stored on the instance.
            use_loudness: Stored; when truthy a ``Loudness`` extractor and a
                one-feature ``BatchNorm1d`` are created.
            use_spectral_centroid: Stored; when truthy a one-feature
                ``BatchNorm1d`` is created.
            apply_delta_spectrum: Stored on the instance.
            apply_time_freq_encoding: Stored on the instance.
            min_db: Forwarded to ``Loudness``.
            apply_pcen: Stored; when truthy a ``PCENTransform`` is created.
            freeze_pcen_parameters: The PCEN transform is created with
                ``trainable`` set to the logical negation of this flag.
            use_multisample_dropout: Stored; when truthy ``self.big_dropout``
                is created.
            multisample_dropout: Probability ``p`` of ``self.big_dropout``.
            num_multisample_dropout: Stored on the instance.
            pooling_kernel_size: Stored on the instance.
            **params: Extra keyword arguments; accepted and ignored here.
        """
        super().__init__()
        self.n_mels = n_mels
        self.dropout_rate = dropout_rate
        self.apply_mixup = apply_mixup
        self.apply_spec_shuffle = apply_spec_shuffle
        self.spec_shuffle_prob = spec_shuffle_prob
        self.use_gru_layer = use_gru_layer
        self.apply_tta = apply_tta
        self.use_loudness = use_loudness
        self.use_spectral_centroid = use_spectral_centroid
        self.apply_delta_spectrum = apply_delta_spectrum
        self.apply_time_freq_encoding = apply_time_freq_encoding
        self.apply_pcen = apply_pcen
        self.use_multisample_dropout = use_multisample_dropout
        self.num_multisample_dropout = num_multisample_dropout
        self.pooling_kernel_size = pooling_kernel_size

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=n_fft,
            window="hann",
            center=True,
            pad_mode="reflect",
            freeze_parameters=freeze_spectrogram_parameters,
        )

        # Mel filter bank, created with is_log=False; a separate
        # AmplitudeToDB transform is kept below (presumably applied later in
        # the forward pass, which is not visible here).
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=n_fft,
            n_mels=n_mels,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=freeze_logmel_parameters,
            is_log=False,
        )

        self.power_to_db = torchaudio.transforms.AmplitudeToDB()

        # Spec augmenter: None when disabled, otherwise the plain or the
        # "plus plus" variant depending on whether a method was supplied.
        self.spec_augmenter = None
        if use_spec_augmentation:
            if spec_augmentation_method is None:
                self.spec_augmenter = SpecAugmentation(
                    time_drop_width=time_drop_width,
                    time_stripes_num=time_stripes_num,
                    freq_drop_width=freq_drop_width,
                    freq_stripes_num=freq_stripes_num,
                )
            else:
                self.spec_augmenter = SpecAugmentationPlusPlus(
                    time_drop_width=time_drop_width,
                    time_stripes_num=time_stripes_num,
                    freq_drop_width=freq_drop_width,
                    freq_stripes_num=freq_stripes_num,
                    method=spec_augmentation_method,
                )

        if self.use_loudness:
            self.loudness_bn = nn.BatchNorm1d(1)
            self.loudness_extractor = Loudness(
                sr=sample_rate,
                n_fft=n_fft,
                min_db=min_db,
            )

        if self.use_spectral_centroid:
            self.spectral_centroid_bn = nn.BatchNorm1d(1)

        if self.apply_pcen:
            # Logical negation, not bitwise `~`: on a Python bool `~False`
            # is -1 and `~True` is -2, both truthy, so the freeze flag used
            # to have no effect.
            self.pcen_transform = PCENTransform(
                trainable=not freeze_pcen_parameters, )

        self.encoder = encoder

        if self.use_multisample_dropout:
            self.big_dropout = nn.Dropout(p=multisample_dropout)
示例#23
0
    def __init__(
        self,
        encoder,
        in_features,
        num_classes,
        n_fft,
        hop_length,
        sample_rate,
        n_mels,
        fmin,
        fmax,
        dropout_rate=0.1,
        freeze_spectrogram_parameters=True,
        freeze_logmel_parameters=True,
        use_spec_augmentation=True,
        time_drop_width=64,
        time_stripes_num=2,
        freq_drop_width=8,
        freq_stripes_num=2,
        spec_augmentation_method=None,
        apply_mixup=False,
        apply_spec_shuffle=False,
        spec_shuffle_prob=0,
        use_gru_layer=False,
        apply_tta=False,
        apply_encoder=False,
        **params,
    ):
        """Set up the feature extractors, a conformer stack and the classifier.

        ``encoder``, ``in_features``, ``apply_encoder`` and ``**params`` are
        accepted but not referenced in the part of the constructor shown
        here.

        Args:
            num_classes: Output size of the final linear layer.
            n_fft: ``n_fft`` and ``win_length`` of the spectrogram and
                ``n_fft`` of the mel filter bank.
            hop_length: Spectrogram hop length.
            sample_rate: ``sr`` of the mel filter bank.
            n_mels: Number of mel bands; also the ``dim`` of every conformer
                block and the input size of the final linear layer.
            fmin: Forwarded to ``LogmelFilterBank`` as ``fmin``.
            fmax: Forwarded to ``LogmelFilterBank`` as ``fmax``.
            dropout_rate: Stored; used for the conformer blocks' attention,
                feed-forward and convolution dropout and for the dropout in
                front of the final linear layer.
            freeze_spectrogram_parameters: ``freeze_parameters`` of the
                spectrogram extractor.
            freeze_logmel_parameters: ``freeze_parameters`` of the mel
                filter bank.
            use_spec_augmentation: When falsy, ``self.spec_augmenter`` stays
                ``None``.
            time_drop_width: Forwarded to the spec augmenter.
            time_stripes_num: Forwarded to the spec augmenter.
            freq_drop_width: Forwarded to the spec augmenter.
            freq_stripes_num: Forwarded to the spec augmenter.
            spec_augmentation_method: ``None`` selects ``SpecAugmentation``;
                any other value selects ``SpecAugmentationPlusPlus``.
            apply_mixup: Stored on the instance.
            apply_spec_shuffle: Stored on the instance.
            spec_shuffle_prob: Stored on the instance.
            use_gru_layer: Stored on the instance.
            apply_tta: Stored on the instance.
        """
        super().__init__()

        # Plain configuration values kept on the instance.
        self.n_mels = n_mels
        self.dropout_rate = dropout_rate
        self.apply_mixup = apply_mixup
        self.apply_spec_shuffle = apply_spec_shuffle
        self.spec_shuffle_prob = spec_shuffle_prob
        self.use_gru_layer = use_gru_layer
        self.apply_tta = apply_tta

        # Spectrogram extractor
        stft_kwargs = dict(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=n_fft,
            window="hann",
            center=True,
            pad_mode="reflect",
            freeze_parameters=freeze_spectrogram_parameters,
        )
        self.spectrogram_extractor = Spectrogram(**stft_kwargs)

        # Mel filter bank, created with is_log=False.
        mel_kwargs = dict(
            sr=sample_rate,
            n_fft=n_fft,
            n_mels=n_mels,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=freeze_logmel_parameters,
            is_log=False,
        )
        self.logmel_extractor = LogmelFilterBank(**mel_kwargs)

        # Spec augmenter: None when disabled, otherwise the variant is
        # chosen by whether an augmentation method was supplied.
        self.spec_augmenter = None
        if use_spec_augmentation:
            stripe_kwargs = dict(
                time_drop_width=time_drop_width,
                time_stripes_num=time_stripes_num,
                freq_drop_width=freq_drop_width,
                freq_stripes_num=freq_stripes_num,
            )
            if spec_augmentation_method is None:
                self.spec_augmenter = SpecAugmentation(**stripe_kwargs)
            else:
                self.spec_augmenter = SpecAugmentationPlusPlus(
                    method=spec_augmentation_method, **stripe_kwargs)

        # Three identically configured conformer blocks applied in sequence.
        block_kwargs = dict(
            dim=n_mels,
            dim_head=64,
            heads=8,
            ff_mult=4,
            conv_expansion_factor=2,
            conv_kernel_size=31,
            attn_dropout=dropout_rate,
            ff_dropout=dropout_rate,
            conv_dropout=dropout_rate,
        )
        conformer_blocks = []
        for _ in range(3):
            conformer_blocks.append(ConformerBlock(**block_kwargs))
        self.conformer = nn.Sequential(*conformer_blocks)

        # Classification head: dropout followed by a linear projection.
        head_dropout = nn.Dropout(dropout_rate)
        head_linear = nn.Linear(n_mels, num_classes)
        self.fc = nn.Sequential(head_dropout, head_linear)