def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
    """Assemble the EfficientNet-B0 encoder from pre-trained, frozen parts.

    ``bn0``, ``fc1`` and the ``fe`` backbone are restored from state-dict
    files given as relative paths and then have ``requires_grad`` switched
    off on all of their parameters. ``att_block`` is created fresh.

    Args:
        sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
        window_size: Used as ``n_fft`` / ``win_length`` of the spectrogram
            and as ``n_fft`` of the log-mel filter bank.
        hop_size: Forwarded to ``Spectrogram`` as ``hop_length``.
        mel_bins: ``n_mels`` of the filter bank and feature count of ``bn0``.
        fmin: Forwarded to ``LogmelFilterBank``.
        fmax: Forwarded to ``LogmelFilterBank``.
        classes_num: Output size handed to ``AttBlock``.
    """
    super(Encoder_B0_Pretrained, self).__init__()

    # Spectrogram extractor
    self.spectrogram_extractor = Spectrogram(
        n_fft=window_size, hop_length=hop_size, win_length=window_size,
        window='hann', center=True, pad_mode='reflect',
        freeze_parameters=True)

    # Logmel feature extractor
    self.logmel_extractor = LogmelFilterBank(
        sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin,
        fmax=fmax, ref=1.0, amin=1e-10, top_db=None, freeze_parameters=True)

    # Spec augmenter
    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64, time_stripes_num=2,
        freq_drop_width=4, freq_stripes_num=2)

    # Checkpoints are mapped to CPU so that files saved from a CUDA device
    # can still be read on a CPU-only host; load_state_dict copies the
    # values into the freshly built (CPU) parameters either way.
    self.bn0 = nn.BatchNorm2d(mel_bins)
    self.bn0.load_state_dict(
        torch.load('pretrained_bn0_b0', map_location='cpu'))
    for param in self.bn0.parameters():
        param.requires_grad = False

    backbone_dim = 1280
    hidden_dim = 2048
    self.fc1 = nn.Linear(backbone_dim, hidden_dim, bias=True)
    self.fc1.load_state_dict(
        torch.load('pretrained_fc1_b0_fold_0', map_location='cpu'))
    for param in self.fc1.parameters():
        param.requires_grad = False

    self.att_block = AttBlock(hidden_dim, classes_num)

    # Backbone: tf_efficientnet_b0_ns with its last two children removed;
    # weights come from the local checkpoint, hence pretrained=False.
    self.fe = timm.models.tf_efficientnet_b0_ns(pretrained=False)
    self.fe = nn.Sequential(*list(self.fe.children())[:-2])
    self.fe.load_state_dict(
        torch.load('pretrained_fe_b0_fold_0', map_location='cpu'))
    for param in self.fe.parameters():
        param.requires_grad = False
def __init__(self, model_name, n_out):
    """Build a single-channel timm classifier with a fresh linear head.

    Args:
        model_name: Name passed to ``timm.create_model`` (created with
            ``pretrained=True`` and ``in_chans=1``).
        n_out: Output size of ``net_classifier``.

    Raises:
        AttributeError: If the created model exposes neither a
            ``classifier`` nor an ``fc`` attribute, so the input width of
            the new head cannot be determined.
    """
    super(AudioClassifier, self).__init__()

    # Spec augmenter
    self.spec_augmenter = SpecAugmentation(
        time_drop_width=80, time_stripes_num=2,
        freq_drop_width=16, freq_stripes_num=2)

    self.net = timm.create_model(model_name, pretrained=True, in_chans=1)
    self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
    self.dropout1 = nn.Dropout(0.3)
    self.dropout2 = nn.Dropout(0.3)

    # The head's input width is read from whichever final layer the
    # backbone exposes; fail loudly instead of hitting an unbound local.
    if hasattr(self.net, "classifier"):
        n_features = self.net.classifier.in_features
    elif hasattr(self.net, "fc"):
        n_features = self.net.fc.in_features
    else:
        raise AttributeError(
            "model %r has neither a 'classifier' nor an 'fc' attribute"
            % (model_name,))

    self.net_classifier = nn.Linear(n_features, n_out)
    self.init_weight()

    # Note: kornia's random crop assumes (h, w) ordering.
    self.transform = nn.Sequential(
        K.RandomHorizontalFlip(p=0.1),
    )
def __init__(self, encoder, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
    """Build the audio front end, a named encoder and an attention head.

    Args:
        encoder: Key into ``encoder_params``; the entry's ``"init_op"``
            builds the backbone and ``"features"`` gives the input width
            of ``fc1``.
        sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
        window_size: ``n_fft`` / ``win_length`` of the spectrogram and
            ``n_fft`` of the filter bank.
        hop_size: ``hop_length`` of the spectrogram.
        mel_bins: ``n_mels`` of the filter bank and feature count of ``bn0``.
        fmin: Forwarded to ``LogmelFilterBank``.
        fmax: Forwarded to ``LogmelFilterBank``.
        classes_num: Output size handed to ``AttBlock``.
    """
    super().__init__()

    # Downsampled ratio
    self.interpolate_ratio = 30

    stft_kwargs = dict(
        n_fft=window_size, hop_length=hop_size, win_length=window_size,
        window='hann', center=True, pad_mode='reflect',
        freeze_parameters=True)
    self.spectrogram_extractor = Spectrogram(**stft_kwargs)

    mel_kwargs = dict(
        sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin,
        fmax=fmax, ref=1.0, amin=1e-10, top_db=None, freeze_parameters=True)
    self.logmel_extractor = LogmelFilterBank(**mel_kwargs)

    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64, time_stripes_num=2,
        freq_drop_width=8, freq_stripes_num=2)

    # Backbone plus projection / attention head
    encoder_cfg = encoder_params[encoder]
    self.encoder = encoder_cfg["init_op"]()
    self.fc1 = nn.Linear(encoder_cfg["features"], 1024, bias=True)
    self.att_block = AttBlock(1024, classes_num, activation="sigmoid")
    self.bn0 = nn.BatchNorm2d(mel_bins)
    self.init_weight()
def __init__(self, base_model_name: str, pretrained=False, num_classes=24, in_channels=1):
    """Wrap a timm backbone with a linear layer and an attention head.

    No spectrogram / log-mel extractor is created here.

    Args:
        base_model_name: Name passed to ``timm.create_model``.
        pretrained: Forwarded to ``timm.create_model``.
        num_classes: Output size handed to ``AttBlockV2``.
        in_channels: Forwarded to ``timm.create_model`` as ``in_chans``.
    """
    super().__init__()

    # Spec augmenter: the widths are half of 64 and 8 respectively.
    self.spec_augmenter = SpecAugmentation(
        time_drop_width=32, time_stripes_num=2,
        freq_drop_width=4, freq_stripes_num=2)

    self.bn0 = nn.BatchNorm2d(CFG.n_mels)

    backbone = timm.create_model(
        base_model_name, pretrained=pretrained, in_chans=in_channels)
    # Keep every child of the backbone except the last two.
    self.encoder = nn.Sequential(*list(backbone.children())[:-2])

    # Width of the removed head, read from ``fc`` when present and from
    # ``classifier`` otherwise.
    head = backbone.fc if hasattr(backbone, "fc") else backbone.classifier
    in_features = head.in_features

    self.fc1 = nn.Linear(in_features, in_features, bias=True)
    self.att_block = AttBlockV2(in_features, num_classes, activation="sigmoid")
    self.init_weight()
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
    """Create the front end, six conv blocks and two linear layers.

    Args:
        sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
        window_size: ``n_fft`` / ``win_length`` of the spectrogram and
            ``n_fft`` of the filter bank.
        hop_size: ``hop_length`` of the spectrogram.
        mel_bins: ``n_mels`` of the filter bank. ``bn0`` is built with a
            fixed 64 features regardless of this value.
        fmin: Forwarded to ``LogmelFilterBank``.
        fmax: Forwarded to ``LogmelFilterBank``.
        classes_num: Output size of ``fc_audioset``.
    """
    super(Cnn14, self).__init__()

    self.dataset_mean = 0.0
    self.dataset_std = 1.0

    # Spectrogram extractor
    self.spectrogram_extractor = Spectrogram(
        n_fft=window_size, hop_length=hop_size, win_length=window_size,
        window="hann", center=True, pad_mode="reflect",
        freeze_parameters=True)

    # Logmel feature extractor
    self.logmel_extractor = LogmelFilterBank(
        sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin,
        fmax=fmax, ref=1.0, amin=1e-10, top_db=None, freeze_parameters=True)

    # Spec augmenter
    self.spec_augmenter = SpecAugmentation(
        time_drop_width=32, time_stripes_num=2,
        freq_drop_width=8, freq_stripes_num=2)

    self.bn0 = nn.BatchNorm2d(64)

    # conv_block1 .. conv_block6, each taking the previous block's width.
    widths = (1, 64, 128, 256, 512, 1024, 2048)
    for idx, (c_in, c_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
        setattr(self, "conv_block%d" % idx,
                ConvBlock(in_channels=c_in, out_channels=c_out))

    self.fc1 = nn.Linear(2048, 2048, bias=True)
    self.fc_audioset = nn.Linear(2048, classes_num, bias=True)

    self.init_weight()
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
    """Build an EfficientNet-B0 backbone with a residual attention block.

    Args:
        sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
        window_size: ``n_fft`` / ``win_length`` of the spectrogram and
            ``n_fft`` of the filter bank.
        hop_size: ``hop_length`` of the spectrogram.
        mel_bins: ``n_mels`` of the filter bank and feature count of ``bn0``.
        fmin: Forwarded to ``LogmelFilterBank``.
        fmax: Forwarded to ``LogmelFilterBank``.
        classes_num: Output size handed to ``AttBlock``.
    """
    super(Encoder_Transformer, self).__init__()

    # Spectrogram extractor
    self.spectrogram_extractor = Spectrogram(
        n_fft=window_size, hop_length=hop_size, win_length=window_size,
        window='hann', center=True, pad_mode='reflect',
        freeze_parameters=True)

    # Logmel feature extractor
    self.logmel_extractor = LogmelFilterBank(
        sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin,
        fmax=fmax, ref=1.0, amin=1e-10, top_db=None, freeze_parameters=True)

    # Spec augmenter
    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64, time_stripes_num=2,
        freq_drop_width=4, freq_stripes_num=2)

    self.bn0 = nn.BatchNorm2d(mel_bins)

    backbone_dim = 1280
    d_model = 512
    num_heads = 4
    self.fc1 = nn.Linear(backbone_dim, d_model, bias=True)
    self.multihead = ResidualAttentionBlock(d_model, num_heads)
    self.att_block = AttBlock(d_model, classes_num)

    # Pre-trained tf_efficientnet_b0_ns with its last two children removed.
    backbone = timm.models.tf_efficientnet_b0_ns(pretrained=True)
    self.fe = nn.Sequential(*list(backbone.children())[:-2])

    # Embedding table with 32 entries of width d_model.
    self.pos_emb = nn.Embedding(32, d_model)
def __init__(self):
    """Create the spec augmenter and four conv blocks (1 -> 64 -> ... -> 512)."""
    super(CNNEncoder2, self).__init__()

    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64,
        time_stripes_num=4,
        freq_drop_width=8,
        freq_stripes_num=2,
    )

    # conv_block1 .. conv_block4, each taking the previous block's width.
    widths = (1, 64, 128, 256, 512)
    for idx, (c_in, c_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
        setattr(self, "conv_block%d" % idx,
                ConvBlock(in_channels=c_in, out_channels=c_out))
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
    """Build the front end, four conv blocks, a bi-GRU and an attention head.

    Args:
        sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
        window_size: ``n_fft`` / ``win_length`` of the spectrogram and
            ``n_fft`` of the filter bank.
        hop_size: ``hop_length`` of the spectrogram.
        mel_bins: ``n_mels`` of the filter bank. ``bn0`` is built with a
            fixed 64 features regardless of this value.
        fmin: Forwarded to ``LogmelFilterBank``.
        fmax: Forwarded to ``LogmelFilterBank``.
        classes_num: Output size of the attention head. Previously this
            argument was ignored in favour of a hard-coded 17.
    """
    super(Cnn_9layers_Gru_FrameAtt, self).__init__()

    # Spectrogram extractor
    self.spectrogram_extractor = Spectrogram(
        n_fft=window_size, hop_length=hop_size, win_length=window_size,
        window='hann', center=True, pad_mode='reflect',
        freeze_parameters=True)

    # Logmel feature extractor
    self.logmel_extractor = LogmelFilterBank(
        sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin,
        fmax=fmax, ref=1.0, amin=1e-10, top_db=None, freeze_parameters=True)

    # Spec augmenter
    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64, time_stripes_num=2,
        freq_drop_width=8, freq_stripes_num=2)

    self.bn0 = nn.BatchNorm2d(64)

    self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
    self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
    self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
    self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)

    # Bidirectional GRU: 2 * 256 hidden units match the 512-wide input of
    # the attention head below.
    self.gru = nn.GRU(input_size=512, hidden_size=256, num_layers=1,
                      bias=True, batch_first=True, bidirectional=True)

    # Size the head from the caller-supplied class count.
    self.att_block = AttBlock(n_in=512, n_out=classes_num, activation='sigmoid')

    self.init_weights()
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
    """Build the front end, four conv blocks, multi-head attention and a head.

    Args:
        sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
        window_size: ``n_fft`` / ``win_length`` of the spectrogram and
            ``n_fft`` of the filter bank.
        hop_size: ``hop_length`` of the spectrogram.
        mel_bins: ``n_mels`` of the filter bank. ``bn0`` is built with a
            fixed 64 features regardless of this value.
        fmin: Forwarded to ``LogmelFilterBank``.
        fmax: Forwarded to ``LogmelFilterBank``.
        classes_num: Output size of the attention head. Previously this
            argument was ignored in favour of a hard-coded 17.
    """
    super(Cnn_9layers_Transformer_FrameAtt, self).__init__()

    # Spectrogram extractor
    self.spectrogram_extractor = Spectrogram(
        n_fft=window_size, hop_length=hop_size, win_length=window_size,
        window='hann', center=True, pad_mode='reflect',
        freeze_parameters=True)

    # Logmel feature extractor
    self.logmel_extractor = LogmelFilterBank(
        sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin,
        fmax=fmax, ref=1.0, amin=1e-10, top_db=None, freeze_parameters=True)

    # Spec augmenter
    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64, time_stripes_num=2,
        freq_drop_width=8, freq_stripes_num=2)

    self.bn0 = nn.BatchNorm2d(64)

    self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
    self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
    self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
    self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)

    # Multi-head attention hyper-parameters
    n_head = 8
    n_hid = 512
    d_k = 64
    d_v = 64
    dropout = 0.2
    self.multihead = MultiHead(n_head, n_hid, d_k, d_v, dropout)

    # Size the head from the caller-supplied class count.
    self.att_block = AttBlock(n_in=512, n_out=classes_num, activation='sigmoid')

    self.init_weights()
def __init__(self, sample_rate: int, window_size: int, hop_size: int, mel_bins: int, fmin: int, fmax: int, classes_num: int):
    """Create the front end, six conv blocks, ``fc1`` and an attention head.

    Args:
        sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
        window_size: ``n_fft`` / ``win_length`` of the spectrogram and
            ``n_fft`` of the filter bank.
        hop_size: ``hop_length`` of the spectrogram.
        mel_bins: ``n_mels`` of the filter bank and feature count of ``bn0``.
        fmin: Forwarded to ``LogmelFilterBank``.
        fmax: Forwarded to ``LogmelFilterBank``.
        classes_num: Output size handed to ``AttBlock``.
    """
    super().__init__()

    # Downsampled ratio
    self.interpolate_ratio = 32

    stft_kwargs = dict(
        n_fft=window_size, hop_length=hop_size, win_length=window_size,
        window='hann', center=True, pad_mode='reflect',
        freeze_parameters=True)
    self.spectrogram_extractor = Spectrogram(**stft_kwargs)

    mel_kwargs = dict(
        sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin,
        fmax=fmax, ref=1.0, amin=1e-10, top_db=None, freeze_parameters=True)
    self.logmel_extractor = LogmelFilterBank(**mel_kwargs)

    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64, time_stripes_num=2,
        freq_drop_width=8, freq_stripes_num=2)

    self.bn0 = nn.BatchNorm2d(mel_bins)

    # conv_block1 .. conv_block6, each taking the previous block's width.
    widths = (1, 64, 128, 256, 512, 1024, 2048)
    for idx, (c_in, c_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
        setattr(self, 'conv_block%d' % idx,
                ConvBlock(in_channels=c_in, out_channels=c_out))

    self.fc1 = nn.Linear(2048, 2048, bias=True)
    self.att_block = AttBlock(2048, classes_num, activation='sigmoid')

    self.init_weight()
def __init__(self, sample_rate: int, window_size: int, hop_size: int, mel_bins: int, fmin: int, fmax: int, classes_num: int, apply_aug: bool, top_db=None):
    """Build the front end, a DenseNet121 feature stack and an attention head.

    Args:
        sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
        window_size: ``n_fft`` / ``win_length`` of the spectrogram and
            ``n_fft`` of the filter bank.
        hop_size: ``hop_length`` of the spectrogram.
        mel_bins: ``n_mels`` of the filter bank and feature count of ``bn0``.
        fmin: Forwarded to ``LogmelFilterBank``.
        fmax: Forwarded to ``LogmelFilterBank``.
        classes_num: Output size handed to ``AttBlockV2``.
        apply_aug: Stored as ``self.apply_aug``; not otherwise used here.
        top_db: Forwarded to ``LogmelFilterBank``.
    """
    super().__init__()

    # Downsampled ratio
    self.interpolate_ratio = 32
    self.apply_aug = apply_aug

    stft_kwargs = dict(
        n_fft=window_size, hop_length=hop_size, win_length=window_size,
        window='hann', center=True, pad_mode='reflect',
        freeze_parameters=True)
    self.spectrogram_extractor = Spectrogram(**stft_kwargs)

    mel_kwargs = dict(
        sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin,
        fmax=fmax, ref=1.0, amin=1e-10, top_db=top_db,
        freeze_parameters=True)
    self.logmel_extractor = LogmelFilterBank(**mel_kwargs)

    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64, time_stripes_num=2,
        freq_drop_width=8, freq_stripes_num=2)

    self.bn0 = nn.BatchNorm2d(mel_bins)

    self.fc1 = nn.Linear(1024, 1024, bias=True)
    self.att_block = AttBlockV2(1024, classes_num, activation='sigmoid')

    # Only the ``features`` part of the pre-trained DenseNet121 is kept.
    densenet = models.densenet121(pretrained=True)
    self.densenet_features = densenet.features

    self.init_weight()
def __init__(self, base_model_name: str, pretrained=False, num_classes=24, in_channels=1):
    """Build a CFG-driven audio front end around a timm backbone.

    All spectrogram / log-mel settings are read from ``CFG``.

    Args:
        base_model_name: Name passed to ``timm.create_model``.
        pretrained: Forwarded to ``timm.create_model``.
        num_classes: Output size handed to ``AttBlockV2``.
        in_channels: Forwarded to ``timm.create_model`` as ``in_chans``.
    """
    super().__init__()

    # Spectrogram extractor
    self.spectrogram_extractor = Spectrogram(
        n_fft=CFG.n_fft,
        hop_length=CFG.hop_length,
        win_length=CFG.n_fft,
        window="hann",
        center=True,
        pad_mode="reflect",
        freeze_parameters=True,
    )

    # Logmel feature extractor
    self.logmel_extractor = LogmelFilterBank(
        sr=CFG.sample_rate,
        n_fft=CFG.n_fft,
        n_mels=CFG.n_mels,
        fmin=CFG.fmin,
        fmax=CFG.fmax,
        ref=1.0,
        amin=1e-10,
        top_db=None,
        freeze_parameters=True,
    )

    # Spec augmenter
    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64,
        time_stripes_num=2,
        freq_drop_width=8,
        freq_stripes_num=2,
    )

    self.bn0 = nn.BatchNorm2d(CFG.n_mels)

    backbone = timm.create_model(
        base_model_name, pretrained=pretrained, in_chans=in_channels)
    # Keep every child of the backbone except the last two.
    self.encoder = nn.Sequential(*list(backbone.children())[:-2])

    # Width of the removed head, read from ``fc`` when present and from
    # ``classifier`` otherwise.
    head = backbone.fc if hasattr(backbone, "fc") else backbone.classifier
    in_features = head.in_features

    self.fc1 = nn.Linear(in_features, in_features, bias=True)
    self.att_block = AttBlockV2(in_features, num_classes, activation="sigmoid")

    self.init_weight()
def specAug(self, x):
    """Run ``x`` through a freshly constructed ``SpecAugmentation``.

    A new augmenter (time_drop_width=32, time_stripes_num=2,
    freq_drop_width=32, freq_stripes_num=4) is created on every call.

    Args:
        x: Input handed to the augmenter. The original note says it may be
            an ndarray or a torch tensor -- TODO confirm against
            ``SpecAugmentation``.

    Returns:
        Whatever the augmenter returns for ``x`` (a torch tensor according
        to the original note).
    """
    return SpecAugmentation(
        time_drop_width=32,
        time_stripes_num=2,
        freq_drop_width=32,
        freq_stripes_num=4,
    )(x)
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
    """Create the front end, four conv blocks and a linear classifier.

    Args:
        sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
        window_size: ``n_fft`` / ``win_length`` of the spectrogram and
            ``n_fft`` of the filter bank.
        hop_size: ``hop_length`` of the spectrogram.
        mel_bins: ``n_mels`` of the filter bank. ``bn0`` is built with a
            fixed 64 features regardless of this value.
        fmin: Forwarded to ``LogmelFilterBank``.
        fmax: Forwarded to ``LogmelFilterBank``.
        classes_num: Output size of ``fc``.
    """
    super(Cnn_9layers_FrameAvg, self).__init__()

    stft_kwargs = dict(
        n_fft=window_size, hop_length=hop_size, win_length=window_size,
        window='hann', center=True, pad_mode='reflect',
        freeze_parameters=True)
    self.spectrogram_extractor = Spectrogram(**stft_kwargs)

    mel_kwargs = dict(
        sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin,
        fmax=fmax, ref=1.0, amin=1e-10, top_db=None, freeze_parameters=True)
    self.logmel_extractor = LogmelFilterBank(**mel_kwargs)

    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64, time_stripes_num=2,
        freq_drop_width=8, freq_stripes_num=2)

    self.bn0 = nn.BatchNorm2d(64)

    # conv_block1 .. conv_block4, each taking the previous block's width.
    widths = (1, 64, 128, 256, 512)
    for idx, (c_in, c_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
        setattr(self, 'conv_block%d' % idx,
                ConvBlock(in_channels=c_in, out_channels=c_out))

    self.fc = nn.Linear(512, classes_num, bias=True)

    self.init_weights()
def __init__(self):
    """Build a ResNeSt50 model with a bi-GRU and attention head.

    All hyper-parameters are fixed inside the constructor: 128 mel bins,
    a spectrogram width of 256, a 500-12500 band and 398 classes.
    """
    super(Tmodel, self).__init__()

    spec_height = 128
    spec_width = 256
    # sample rate * duration / (spec width - 1), truncated: 627
    hop_length = int(32000 * 5 / (spec_width - 1))
    n_classes = 398

    self.interpolate_ratio = 8

    self.spectrogram_extractor = Spectrogram(
        n_fft=2048, hop_length=hop_length, freeze_parameters=True)
    self.logmel_extractor = LogmelFilterBank(
        sr=32000, n_mels=spec_height, fmin=500, fmax=12500,
        freeze_parameters=True)
    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64, time_stripes_num=2,
        freq_drop_width=8, freq_stripes_num=2)

    self.bn0 = nn.BatchNorm2d(128)

    # ResNeSt50 from torch.hub without pre-trained weights; every child
    # except the last two is kept.
    resnest = torch.hub.load(
        'zhanghang1989/ResNeSt', 'resnest50', pretrained=False)
    self.encoder = nn.Sequential(*list(resnest.children())[:-2])

    # Bidirectional GRU: 2 * 1024 hidden units match the 2048-wide input
    # of the attention head.
    self.gru = nn.GRU(
        input_size=2048, hidden_size=1024, num_layers=1,
        bias=True, batch_first=True, bidirectional=True)
    self.att_block = AttBlockV2(2048, n_classes, activation='sigmoid')

    self.init_weights()
def __init__(self, classes_num):
    """Create six conv blocks and two linear layers.

    No spectrogram or log-mel extractor is constructed here.

    Args:
        classes_num: Output size of ``fc2``.
    """
    super(Pre_Cnn14, self).__init__()

    # Spec augmenter
    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64,
        time_stripes_num=2,
        freq_drop_width=8,
        freq_stripes_num=2,
    )

    self.bn0 = nn.BatchNorm2d(64)

    # conv_block1 .. conv_block6, each taking the previous block's width.
    widths = (1, 64, 128, 256, 512, 1024, 2048)
    for idx, (c_in, c_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
        setattr(self, 'conv_block%d' % idx,
                ConvBlock(in_channels=c_in, out_channels=c_out))

    self.fc1 = nn.Linear(2048, 2048, bias=True)
    self.fc2 = nn.Linear(2048, classes_num, bias=True)

    self.init_weight()
def __init__(self, encoder, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
    """Build the front end, a named encoder and a pooled linear classifier.

    Args:
        encoder: Key into ``encoder_params``; the entry's ``"init_op"``
            builds the backbone and ``'features'`` gives the input width
            of ``fc``.
        sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
        window_size: ``n_fft`` / ``win_length`` of the spectrogram and
            ``n_fft`` of the filter bank.
        hop_size: ``hop_length`` of the spectrogram.
        mel_bins: ``n_mels`` of the filter bank.
        fmin: Forwarded to ``LogmelFilterBank``.
        fmax: Forwarded to ``LogmelFilterBank``.
        classes_num: Output size of ``fc``.
    """
    super().__init__()

    stft_kwargs = dict(
        n_fft=window_size, hop_length=hop_size, win_length=window_size,
        window='hann', center=True, pad_mode='reflect',
        freeze_parameters=True)
    self.spectrogram_extractor = Spectrogram(**stft_kwargs)

    mel_kwargs = dict(
        sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin,
        fmax=fmax, ref=1.0, amin=1e-10, top_db=None, freeze_parameters=True)
    self.logmel_extractor = LogmelFilterBank(**mel_kwargs)

    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64, time_stripes_num=2,
        freq_drop_width=8, freq_stripes_num=2)

    # Backbone followed by global average pooling, dropout and a linear head
    encoder_cfg = encoder_params[encoder]
    self.encoder = encoder_cfg["init_op"]()
    self.avg_pool = AdaptiveAvgPool2d((1, 1))
    self.dropout = Dropout(0.3)
    self.fc = Linear(encoder_cfg['features'], classes_num)
def __init__(self, classes_num):
    """Create four conv blocks and two linear layers.

    Args:
        classes_num: Output size of ``fc_audioset``.
    """
    super(Cnn10, self).__init__()

    # Spec augmenter
    augment_cfg = dict(
        time_drop_width=24, time_stripes_num=2,
        freq_drop_width=8, freq_stripes_num=2)
    self.spec_augmenter = SpecAugmentation(**augment_cfg)

    self.bn0 = nn.BatchNorm2d(64)

    # conv_block1 .. conv_block4, each taking the previous block's width.
    widths = (1, 64, 128, 256, 512)
    for idx, (c_in, c_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
        setattr(self, "conv_block%d" % idx,
                ConvBlock(in_channels=c_in, out_channels=c_out))

    self.fc1 = nn.Linear(512, 512, bias=True)
    self.fc_audioset = nn.Linear(512, classes_num, bias=True)

    self.init_weight()
def __init__(self, encoder, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
    """Build the front end, a named encoder and an ``AttentionHead``.

    Args:
        encoder: Key into ``encoder_params``; the entry's ``"init_op"``
            builds the backbone.
        sample_rate: Forwarded to ``LogmelFilterBank`` as ``sr``.
        window_size: ``n_fft`` / ``win_length`` of the spectrogram and
            ``n_fft`` of the filter bank.
        hop_size: ``hop_length`` of the spectrogram.
        mel_bins: ``n_mels`` of the filter bank and feature count of
            ``batch_norm``.
        fmin: Forwarded to ``LogmelFilterBank``.
        fmax: Forwarded to ``LogmelFilterBank``.
        classes_num: Output size handed to ``AttentionHead``.
    """
    super().__init__()

    self.interpolate_ratio = 29

    stft_kwargs = dict(
        n_fft=window_size, hop_length=hop_size, win_length=window_size,
        window='hann', center=True, pad_mode='reflect',
        freeze_parameters=True)
    self.spectrogram_extractor = Spectrogram(**stft_kwargs)

    mel_kwargs = dict(
        sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin,
        fmax=fmax, ref=1.0, amin=1e-10, top_db=None, freeze_parameters=True)
    self.logmel_extractor = LogmelFilterBank(**mel_kwargs)

    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64, time_stripes_num=2,
        freq_drop_width=8, freq_stripes_num=2)

    self.batch_norm = nn.BatchNorm2d(mel_bins)

    self.encoder = encoder_params[encoder]["init_op"]()
    self.dropout = Dropout(0.3)
    # NOTE(review): the head's input width is fixed at 1000, presumably the
    # encoder's output size -- confirm against the encoder definitions.
    self.att_head = AttentionHead(1000, classes_num, activation='sigmoid')

    self.init_weight()
def __init__(self, input_size, classes_num=10, activation="softmax"):
    """Create four ``TFBlock`` stages, a linear classifier and an augmenter.

    Args:
        input_size: Forwarded unchanged to every ``TFBlock``.
        classes_num: Output size of ``fc``.
        activation: Stored as ``self.activation``; not otherwise used here.
    """
    super(TFNet, self).__init__()
    self.activation = activation

    # tfblock1 .. tfblock4, each taking the previous block's width.
    widths = (1, 64, 128, 256, 512)
    for idx in range(1, len(widths)):
        stage = TFBlock(
            in_channels=widths[idx - 1],
            out_channels=widths[idx],
            input_size=input_size,
        )
        setattr(self, "tfblock%d" % idx, stage)

    self.fc = nn.Linear(512, classes_num, bias=True)

    self.spec_augmenter = SpecAugmentation(
        time_drop_width=24,
        time_stripes_num=2,
        freq_drop_width=8,
        freq_stripes_num=2,
    )
def __init__(
    self,
    encoder,
    in_features,
    num_classes,
    n_fft,
    hop_length,
    sample_rate,
    n_mels,
    fmin,
    fmax,
    dropout_rate=0.5,
    freeze_spectrogram_parameters=True,
    freeze_logmel_parameters=True,
    use_spec_augmentation=True,
    time_drop_width=64,
    time_stripes_num=2,
    freq_drop_width=8,
    freq_stripes_num=2,
    spec_augmentation_method=None,
    apply_mixup=False,
    apply_spec_shuffle=False,
    spec_shuffle_prob=0,
    use_gru_layer=False,
    apply_tta=False,
    use_loudness=False,
    use_spectral_centroid=False,
    apply_delta_spectrum=False,
    apply_time_freq_encoding=False,
    min_db=120,
    apply_pcen=False,
    freeze_pcen_parameters=False,
    use_multisample_dropout=False,
    multisample_dropout=0.5,
    num_multisample_dropout=5,
    pooling_kernel_size=3,
    **params,
):
    """Configure the audio front end and optional feature branches.

    Most flags are stored on ``self`` unchanged. The ones that build
    sub-modules in this constructor are described below.

    Args:
        encoder: Module stored as ``self.encoder`` as-is.
        in_features: Accepted for interface compatibility; not used here.
        num_classes: Accepted for interface compatibility; not used here.
        n_fft: ``n_fft`` / ``win_length`` of the spectrogram, ``n_fft`` of
            the filter bank and of the loudness extractor.
        hop_length: ``hop_length`` of the spectrogram.
        sample_rate: ``sr`` of the filter bank and loudness extractor.
        n_mels: ``n_mels`` of the filter bank; also stored on ``self``.
        fmin: Forwarded to ``LogmelFilterBank``.
        fmax: Forwarded to ``LogmelFilterBank``.
        dropout_rate: Stored on ``self``.
        freeze_spectrogram_parameters: ``freeze_parameters`` of the
            spectrogram extractor.
        freeze_logmel_parameters: ``freeze_parameters`` of the filter bank.
        use_spec_augmentation: When false, ``spec_augmenter`` stays ``None``.
        time_drop_width: Forwarded to the spec augmenter.
        time_stripes_num: Forwarded to the spec augmenter.
        freq_drop_width: Forwarded to the spec augmenter.
        freq_stripes_num: Forwarded to the spec augmenter.
        spec_augmentation_method: ``None`` selects ``SpecAugmentation``;
            any other value selects ``SpecAugmentationPlusPlus`` and is
            passed to it as ``method``.
        use_loudness: Adds ``loudness_bn`` and ``loudness_extractor``.
        use_spectral_centroid: Adds ``spectral_centroid_bn``.
        min_db: Forwarded to ``Loudness``.
        apply_pcen: Adds ``pcen_transform``.
        freeze_pcen_parameters: When true the PCEN transform is created
            with ``trainable=False``.
        use_multisample_dropout: Adds ``big_dropout``.
        multisample_dropout: Drop probability of ``big_dropout``.
        **params: Extra keyword arguments are accepted and ignored.
    """
    super().__init__()

    self.n_mels = n_mels
    self.dropout_rate = dropout_rate
    self.apply_mixup = apply_mixup
    self.apply_spec_shuffle = apply_spec_shuffle
    self.spec_shuffle_prob = spec_shuffle_prob
    self.use_gru_layer = use_gru_layer
    self.apply_tta = apply_tta
    self.use_loudness = use_loudness
    self.use_spectral_centroid = use_spectral_centroid
    self.apply_delta_spectrum = apply_delta_spectrum
    self.apply_time_freq_encoding = apply_time_freq_encoding
    self.apply_pcen = apply_pcen
    self.use_multisample_dropout = use_multisample_dropout
    self.num_multisample_dropout = num_multisample_dropout
    self.pooling_kernel_size = pooling_kernel_size

    # Spectrogram extractor
    self.spectrogram_extractor = Spectrogram(
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=n_fft,
        window="hann",
        center=True,
        pad_mode="reflect",
        freeze_parameters=freeze_spectrogram_parameters,
    )

    # Logmel feature extractor (is_log=False; a separate AmplitudeToDB
    # transform is created right below).
    self.logmel_extractor = LogmelFilterBank(
        sr=sample_rate,
        n_fft=n_fft,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
        ref=1.0,
        amin=1e-10,
        top_db=None,
        freeze_parameters=freeze_logmel_parameters,
        is_log=False,
    )
    self.power_to_db = torchaudio.transforms.AmplitudeToDB()

    # Spec augmenter
    self.spec_augmenter = None
    if use_spec_augmentation and (spec_augmentation_method is None):
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=time_drop_width,
            time_stripes_num=time_stripes_num,
            freq_drop_width=freq_drop_width,
            freq_stripes_num=freq_stripes_num,
        )
    elif use_spec_augmentation and (spec_augmentation_method is not None):
        self.spec_augmenter = SpecAugmentationPlusPlus(
            time_drop_width=time_drop_width,
            time_stripes_num=time_stripes_num,
            freq_drop_width=freq_drop_width,
            freq_stripes_num=freq_stripes_num,
            method=spec_augmentation_method,
        )

    if self.use_loudness:
        self.loudness_bn = nn.BatchNorm1d(1)
        self.loudness_extractor = Loudness(
            sr=sample_rate,
            n_fft=n_fft,
            min_db=min_db,
        )

    if self.use_spectral_centroid:
        self.spectral_centroid_bn = nn.BatchNorm1d(1)

    if self.apply_pcen:
        # Logical negation: ``~`` on a bool yields -1 / -2, which are both
        # truthy, so the freeze flag would never take effect.
        self.pcen_transform = PCENTransform(
            trainable=not freeze_pcen_parameters,
        )

    self.encoder = encoder

    if self.use_multisample_dropout:
        self.big_dropout = nn.Dropout(p=multisample_dropout)
def __init__(self, args, num_mels, num_meta, num_classes):
    """Build the TALNetV3 conv stacks, temporal layers and output heads.

    Every attribute of ``args`` is copied onto the module first, so the
    hyper-parameters read below (``n_conv_layers``, ``n_pool_layers``,
    ``embedding_size``, ``kernel_size``, ``batch_norm``, ``transfo_head``,
    ``nb_meta_emb``, ``dropout_transfo``, ``pooling``) come from ``args``.

    Args:
        args: Object whose ``__dict__`` is installed into ``self``.
        num_mels: Initial frequency-bin count; stored as
            ``input_n_freq_bins``.
        num_meta: Stored as ``self.num_meta``.
        num_classes: Stored as ``self.output_size``.
    """
    super(TALNetV3, self).__init__()
    # Install all args into self
    self.__dict__.update(args.__dict__)
    assert self.n_conv_layers % self.n_pool_layers == 0

    n_freq_bins = num_mels
    self.input_n_freq_bins = n_freq_bins
    self.output_size = num_classes
    self.num_meta = num_meta
    self.n_head = self.transfo_head
    self.d_k = self.d_v = 128
    self.meta_emb = self.nb_meta_emb

    # Conv: two parallel stacks are built, registered as conv<N> and
    # conv_v2<N> and also kept in plain Python lists.
    # NOTE(review): the divisions below are true divisions, so the channel
    # counts given to ConvBlock may be floats -- confirm ConvBlock copes.
    self.conv = []
    self.conv_v2 = []
    pool_interval = self.n_conv_layers / self.n_pool_layers
    n_input = 1
    for i in range(self.n_conv_layers):
        layer_no = i + 1
        has_pooling = layer_no % pool_interval == 0
        if has_pooling:
            n_freq_bins /= 2
            n_output = self.embedding_size / n_freq_bins
            if i < pool_interval * 2:
                pool_stride = (2, 2)
            else:
                pool_stride = (1, 2)
        else:
            n_output = self.embedding_size * 2 / n_freq_bins
            pool_stride = None

        layer = ConvBlock(
            n_input, n_output, self.kernel_size,
            batch_norm=self.batch_norm, pool_stride=pool_stride)
        self.conv.append(layer)
        setattr(self, 'conv{}'.format(layer_no), layer)

        kernel = (int(self.kernel_size), int(self.kernel_size))
        layer_v2 = ConvBlockTALNet(
            int(n_input), int(n_output), kernel,
            norm='GN', pool_stride=pool_stride,
            pool_strat='max', activation='mish')
        self.conv_v2.append(layer_v2)
        setattr(self, 'conv_v2{}'.format(layer_no), layer_v2)

        n_input = n_output

    # Spec augmenter
    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64, time_stripes_num=2,
        freq_drop_width=8, freq_stripes_num=2)
    self.bn0 = nn.BatchNorm2d(64)

    # Metadata + fc
    self.t2v = Time2Vec(self.num_meta, self.meta_emb)

    # Temp (Transfo + GRU)
    self.multihead_meta = MultiHead(
        self.n_head, self.num_meta, self.d_k, self.d_v, self.dropout_transfo)
    self.gru = nn.GRU(
        int(self.embedding_size), int(self.embedding_size / 2), 1,
        batch_first=True, bidirectional=True)
    self.multihead_v2 = MultiHead(
        self.n_head, self.embedding_size, self.d_k, self.d_v,
        self.dropout_transfo)

    # FC
    fc_in = self.embedding_size * 2 + self.meta_emb * self.num_meta
    self.fc_prob = nn.Linear(fc_in, self.output_size)
    if self.pooling == 'att':
        self.fc_att = nn.Linear(fc_in, self.output_size)

    # Better initialization: orthogonal GRU weights with zero biases,
    # visited in the order ih, hh, ih_reverse, hh_reverse.
    for direction in ('', '_reverse'):
        for gate in ('ih', 'hh'):
            suffix = '{}_l0{}'.format(gate, direction)
            nn.init.orthogonal_(getattr(self.gru, 'weight_' + suffix))
            nn.init.constant_(getattr(self.gru, 'bias_' + suffix), 0)

    nn.init.xavier_uniform_(self.fc_prob.weight)
    nn.init.constant_(self.fc_prob.bias, 0)
    if self.pooling == 'att':
        nn.init.xavier_uniform_(self.fc_att.weight)
        nn.init.constant_(self.fc_att.bias, 0)

    if self.pooling == 'auto':
        self.autopool = AutoPool(self.output_size)
def __init__(
    self,
    encoder,
    in_features,
    num_classes,
    n_fft,
    hop_length,
    sample_rate,
    n_mels,
    fmin,
    fmax,
    dropout_rate=0.1,
    freeze_spectrogram_parameters=True,
    freeze_logmel_parameters=True,
    use_spec_augmentation=True,
    time_drop_width=64,
    time_stripes_num=2,
    freq_drop_width=8,
    freq_stripes_num=2,
    spec_augmentation_method=None,
    apply_mixup=False,
    apply_spec_shuffle=False,
    spec_shuffle_prob=0,
    use_gru_layer=False,
    apply_tta=False,
    apply_encoder=False,
    **params,
):
    """Build the audio front end, three Conformer blocks and a linear head.

    Args:
        encoder: Accepted for interface compatibility; not used here.
        in_features: Accepted for interface compatibility; not used here.
        num_classes: Output size of the final linear layer in ``fc``.
        n_fft: ``n_fft`` / ``win_length`` of the spectrogram and ``n_fft``
            of the filter bank.
        hop_length: ``hop_length`` of the spectrogram.
        sample_rate: ``sr`` of the filter bank.
        n_mels: ``n_mels`` of the filter bank, ``dim`` of every Conformer
            block and input width of the linear head.
        fmin: Forwarded to ``LogmelFilterBank``.
        fmax: Forwarded to ``LogmelFilterBank``.
        dropout_rate: Used for all Conformer dropouts and the head dropout.
        freeze_spectrogram_parameters: ``freeze_parameters`` of the
            spectrogram extractor.
        freeze_logmel_parameters: ``freeze_parameters`` of the filter bank.
        use_spec_augmentation: When false, ``spec_augmenter`` stays ``None``.
        time_drop_width: Forwarded to the spec augmenter.
        time_stripes_num: Forwarded to the spec augmenter.
        freq_drop_width: Forwarded to the spec augmenter.
        freq_stripes_num: Forwarded to the spec augmenter.
        spec_augmentation_method: ``None`` selects ``SpecAugmentation``;
            any other value selects ``SpecAugmentationPlusPlus``.
        apply_encoder: Accepted for interface compatibility; not used here.
        **params: Extra keyword arguments are accepted and ignored.
    """
    super().__init__()

    self.n_mels = n_mels
    self.dropout_rate = dropout_rate
    self.apply_mixup = apply_mixup
    self.apply_spec_shuffle = apply_spec_shuffle
    self.spec_shuffle_prob = spec_shuffle_prob
    self.use_gru_layer = use_gru_layer
    self.apply_tta = apply_tta

    # Spectrogram extractor
    self.spectrogram_extractor = Spectrogram(
        n_fft=n_fft, hop_length=hop_length, win_length=n_fft,
        window="hann", center=True, pad_mode="reflect",
        freeze_parameters=freeze_spectrogram_parameters)

    # Logmel feature extractor
    self.logmel_extractor = LogmelFilterBank(
        sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax,
        ref=1.0, amin=1e-10, top_db=None,
        freeze_parameters=freeze_logmel_parameters, is_log=False)

    # Spec augmenter: None unless enabled; the method argument chooses
    # between the plain and the "PlusPlus" implementation.
    self.spec_augmenter = None
    stripe_cfg = dict(
        time_drop_width=time_drop_width,
        time_stripes_num=time_stripes_num,
        freq_drop_width=freq_drop_width,
        freq_stripes_num=freq_stripes_num,
    )
    if use_spec_augmentation and (spec_augmentation_method is None):
        self.spec_augmenter = SpecAugmentation(**stripe_cfg)
    elif use_spec_augmentation and (spec_augmentation_method is not None):
        self.spec_augmenter = SpecAugmentationPlusPlus(
            method=spec_augmentation_method, **stripe_cfg)

    # encoder: three identical Conformer blocks applied in sequence
    conformer_blocks = []
    for _ in range(3):
        conformer_blocks.append(
            ConformerBlock(
                dim=n_mels,
                dim_head=64,
                heads=8,
                ff_mult=4,
                conv_expansion_factor=2,
                conv_kernel_size=31,
                attn_dropout=dropout_rate,
                ff_dropout=dropout_rate,
                conv_dropout=dropout_rate,
            )
        )
    self.conformer = nn.Sequential(*conformer_blocks)

    self.fc = nn.Sequential(
        nn.Dropout(dropout_rate),
        nn.Linear(n_mels, num_classes),
    )