def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num): super(Cnn_9layers_FrameAtt, self).__init__() window = 'hann' center = True pad_mode = 'reflect' ref = 1.0 amin = 1e-10 top_db = None # Spectrogram extractor self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, win_length=window_size, window=window, center=center, pad_mode=pad_mode, freeze_parameters=True) # Logmel feature extractor self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, freeze_parameters=True) # Spec augmenter self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, freq_drop_width=8, freq_stripes_num=2) self.bn0 = nn.BatchNorm2d(64) self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) self.att_block = AttBlock(n_in=512, n_out=17, activation='sigmoid') self.init_weights()
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num): super(Cnn_9layers_Gru_FrameAvg, self).__init__() window = 'hann' center = True pad_mode = 'reflect' ref = 1.0 amin = 1e-10 top_db = None # Spectrogram extractor self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, win_length=window_size, window=window, center=center, pad_mode=pad_mode, freeze_parameters=True) # Logmel feature extractor self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, freeze_parameters=True) # Spec augmenter self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, freq_drop_width=8, freq_stripes_num=2) self.bn0 = nn.BatchNorm2d(64) self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) self.gru = nn.GRU(input_size=512, hidden_size=256, num_layers=1, bias=True, batch_first=True, bidirectional=True) self.fc = nn.Linear(512, classes_num, bias=True) self.init_weights()
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num): super(Cnn_9layers_Transformer_FrameAvg, self).__init__() window = 'hann' center = True pad_mode = 'reflect' ref = 1.0 amin = 1e-10 top_db = None # Spectrogram extractor self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, win_length=window_size, window=window, center=center, pad_mode=pad_mode, freeze_parameters=True) # Logmel feature extractor self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, freeze_parameters=True) # Spec augmenter self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, freq_drop_width=8, freq_stripes_num=2) self.bn0 = nn.BatchNorm2d(64) self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) n_head = 8 n_hid = 512 d_k = 64 d_v = 64 dropout = 0.2 self.multihead = MultiHead(n_head, n_hid, d_k, d_v, dropout) self.fc = nn.Linear(512, classes_num, bias=True) self.init_weights()
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num): super(Cnn14, self).__init__() window = 'hann' center = True pad_mode = 'reflect' ref = 1.0 amin = 1e-10 top_db = None # Spectrogram extractor self.spectrogram_extractor_openmic = Spectrogram(n_fft=window_size, hop_length=hop_size, win_length=window_size, window=window, center=center, pad_mode=pad_mode, freeze_parameters=True) # Logmel feature extractor self.logmel_extractor_openmic = LogmelFilterBank(sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, freeze_parameters=True) self.bn0 = nn.BatchNorm2d(64) self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) # self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024) # self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048) # self.fc1 = nn.Linear(2048, 2048, bias=True) # self.fc_openmic = nn.Linear(2048, classes_num, bias=True) self.fc1 = nn.Linear(512, 512, bias=True) self.fc_monophonic = nn.Linear(512, classes_num, bias=True) self.init_weight()
n_fft=window_size, hop_length=hop_size, win_length=window_size, window=window, center=center, pad_mode=pad_mode, freeze_parameters=True, ) # Logmel feature extractor logmel_extractor_openmic = LogmelFilterBank( sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, freeze_parameters=True, ) def preprocess(x): # normalize raw numpy array then convert it to torch tensor x = x.astype(float) x /= np.abs(x).max() x = x.reshape(1, -1) x = torch.from_numpy(x).float() return x