class RondomStretchMelSpectrogram(nn.Module):
    """Convert a waveform into a fixed-width mel spectrogram.

    Pipeline: complex STFT -> random time stretch (only when ``train`` is
    truthy) -> ``ComplexNorm(power=2.)`` -> mel filter bank ->
    ``AmplitudeToDB`` -> crop or zero-pad dimension 3 to ``TARGET_FRAMES``.

    NOTE(review): another class with this exact name is defined later in
    the file and shadows this one at import time -- confirm which one is
    meant to be used.

    Args:
        sample_rate: Forwarded to ``MelSpectrogram``.
        n_fft: FFT size used by the STFT, the time stretcher
            (``n_freq = n_fft // 2 + 1``) and the mel filter bank.
        top_db: Forwarded to ``AmplitudeToDB``.
        max_perc: Stretch rates are sampled from
            ``Uniform(1 - max_perc, 1 + max_perc)``.
    """

    # Width of dimension 3 of every tensor returned by ``forward``.
    TARGET_FRAMES = 157

    def __init__(self, sample_rate, n_fft, top_db, max_perc):
        super().__init__()
        self.time_stretch = TimeStretch(hop_length=None,
                                        n_freq=n_fft // 2 + 1)
        self.stft = Spectrogram(n_fft=n_fft, power=None)
        self.com_norm = ComplexNorm(power=2.)
        # Only this transform's ``mel_scale`` is used in ``forward``; the
        # STFT is computed separately so the stretch can act on the
        # complex spectrogram.
        self.mel_specgram = MelSpectrogram(sample_rate, n_fft=n_fft,
                                           f_max=8000)
        self.AtoDB = AmplitudeToDB(top_db=top_db)
        self.dist = Uniform(1. - max_perc, 1 + max_perc)

    def forward(self, x, train):
        """Return the spectrogram of ``x`` with dimension 3 fixed in size.

        Args:
            x: Input handed to the STFT transform.
            train: When truthy, a randomly sampled time stretch is applied.

        Returns:
            A 4-D tensor whose dimension 3 has exactly ``TARGET_FRAMES``
            entries, on the same device and with the same dtype as the
            intermediate spectrogram.
        """
        spec = self.stft(x)
        if train:
            rate = self.dist.sample().item()
            spec = self.time_stretch(spec, rate)
        spec = self.com_norm(spec)
        spec = self.mel_specgram.mel_scale(spec)
        spec = self.AtoDB(spec)
        return self._fit_frames(spec)

    def _fit_frames(self, spec):
        """Crop or zero-pad dimension 3 of ``spec`` to ``TARGET_FRAMES``."""
        target = self.TARGET_FRAMES
        frames = spec.size(3)
        if frames > target:
            return spec[:, :, :, :target]
        # ``new_zeros`` inherits dtype and device from ``spec``, so the
        # padding works on CPU as well as on CUDA (the previous
        # ``torch.cuda.FloatTensor`` call failed without a GPU).
        padding = spec.new_zeros(spec.size(0), spec.size(1), spec.size(2),
                                 target - frames)
        return torch.cat([spec, padding], dim=3)
class RondomStretchMelSpectrogram(nn.Module):
    """Convert a waveform into a fixed-width mel spectrogram with augmentation.

    Pipeline: random resample -> complex STFT -> (training only) random
    time stretch -> ``ComplexNorm(power=2.)`` -> (training only) frequency
    and time masking -> mel filter bank -> ``AmplitudeToDB`` -> crop or
    zero-pad dimension 3 to ``TARGET_FRAMES``.

    Args:
        sample_rate: Forwarded to ``MelSpectrogram`` and used as the
            source rate of every ``Resample`` transform.
        n_fft: FFT size used by the STFT, the time stretcher
            (``n_freq = n_fft // 2 + 1``) and the mel filter bank.
        top_db: Forwarded to ``AmplitudeToDB``.
        max_perc: Stretch rates are sampled from
            ``Uniform(1 - max_perc, 1 + max_perc)``.
    """

    # Width of dimension 3 of every tensor returned by ``forward``.
    TARGET_FRAMES = 157

    # Target-rate multipliers for the candidate ``Resample`` transforms.
    # The literal ``1`` is kept as an int so ``sample_rate * 1`` keeps the
    # type of ``sample_rate``.
    _RESAMPLE_FACTORS = (0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4)

    def __init__(self, sample_rate, n_fft, top_db, max_perc):
        super().__init__()
        self.time_stretch = TimeStretch(hop_length=None,
                                        n_freq=n_fft // 2 + 1)
        self.stft = Spectrogram(n_fft=n_fft, power=None)
        self.com_norm = ComplexNorm(power=2.)
        self.fm = FrequencyMasking(100)
        self.tm = TimeMasking(100)
        # Only this transform's ``mel_scale`` is used in ``forward``.
        self.mel_specgram = MelSpectrogram(sample_rate, n_fft=n_fft,
                                           f_max=8000)
        self.AtoDB = AmplitudeToDB(top_db=top_db)
        self.max_perc = max_perc
        self.sample_rate = sample_rate
        # NOTE(review): this is a plain Python list, so these transforms
        # are not registered as submodules (``.to()`` / ``state_dict()``
        # skip them). Left as-is to avoid changing checkpoint keys --
        # confirm whether ``nn.ModuleList`` is wanted.
        self.resamples = [
            Resample(sample_rate, sample_rate * factor)
            for factor in self._RESAMPLE_FACTORS
        ]

    def forward(self, x, train):
        """Return the spectrogram of ``x`` with dimension 3 fixed in size.

        Args:
            x: Input handed to the randomly chosen ``Resample`` transform.
            train: When truthy, time stretch and frequency/time masking
                are applied.

        Returns:
            A 4-D tensor whose dimension 3 has exactly ``TARGET_FRAMES``
            entries, on the same device and with the same dtype as the
            intermediate spectrogram.
        """
        # NOTE(review): the random resample runs even when ``train`` is
        # falsy, so evaluation output is non-deterministic -- confirm this
        # is intended.
        resample = random.choice(self.resamples)
        spec = self.stft(resample(x))
        if train:
            rate_dist = Uniform(1. - self.max_perc, 1 + self.max_perc)
            spec = self.time_stretch(spec, rate_dist.sample().item())
        spec = self.com_norm(spec)
        if train:
            spec = self.fm(spec, 0)
            spec = self.tm(spec, 0)
        spec = self.mel_specgram.mel_scale(spec)
        spec = self.AtoDB(spec)
        return self._fit_frames(spec)

    def _fit_frames(self, spec):
        """Crop or zero-pad dimension 3 of ``spec`` to ``TARGET_FRAMES``."""
        target = self.TARGET_FRAMES
        frames = spec.size(3)
        if frames > target:
            return spec[:, :, :, :target]
        # ``new_zeros`` inherits dtype and device from ``spec``, so the
        # padding works on CPU as well as on CUDA (the previous
        # ``torch.cuda.FloatTensor`` call failed without a GPU).
        padding = spec.new_zeros(spec.size(0), spec.size(1), spec.size(2),
                                 target - frames)
        return torch.cat([spec, padding], dim=3)
class MelspectrogramStretch(object):
    """Compute a standardized mel spectrogram and return it as a NumPy array.

    The transforms are built with fixed settings: 44100 Hz sample rate,
    2048-point FFT, hop length of 1024 for the mel transform and 128 mel
    bins.
    """

    def __init__(self):
        sample_rate = 44100
        num_mels = 128
        fft_length = 2048
        hop_length = fft_length // 2
        self.stft = Spectrogram(n_fft=fft_length, win_length=fft_length,
                                hop_length=None, pad=0, power=None,
                                normalized=False)
        # Only ``mel_scale`` of this transform is used in ``forward``.
        self.mst = MelSpectrogram(sample_rate=sample_rate, n_fft=fft_length,
                                  hop_length=hop_length, n_mels=num_mels)
        # Normalization (post-spectrogram processing)
        self.complex_norm = ComplexNorm(power=2.)

    def forward(self, data):
        """Run the audio transforms on ``data`` and build its mel spectrogram.

        Args:
            data: Passed unchanged to ``AudioTransforms().apply(data, None)``,
                which must return a 3-tuple whose first item is the signal
                tensor.

        Returns:
            A tuple ``(x, lengths)`` where ``x`` is a NumPy array holding the
            standardized mel spectrogram and ``lengths`` is a one-element
            list containing ``x.shape[3]``.
        """
        tsf = AudioTransforms()
        sig_t, _, _ = tsf.apply(data, None)

        # Add a leading batch dimension and make sure the signal is on CPU.
        # Per the original author's note the layout here is
        # (batch, time, channel) -- TODO confirm against AudioTransforms.
        signal = sig_t.unsqueeze(0).to("cpu")
        # -> (batch, channel, time)
        signal = signal.float().transpose(1, 2)

        # Example shapes recorded by the original author:
        #   after stft:         torch.Size([1, 1, 1025, 173, 2])
        #   after complex_norm: torch.Size([1, 1, 1025, 173])
        #   after mel_scale:    torch.Size([1, 1, 128, 173])
        x = self.stft(signal)
        x = self.complex_norm(x)
        x = self.mst.mel_scale(x)

        # Standardize each batch item independently over dims 1..3.
        non_batch_inds = [1, 2, 3]
        mean = x.mean(non_batch_inds, keepdim=True)
        std = x.std(non_batch_inds, keepdim=True)
        # A constant (e.g. silent) input has zero std; clamp it so the
        # division yields zeros instead of NaN/inf.
        std = std.clamp(min=torch.finfo(x.dtype).eps)
        x = (x - mean) / std

        x = x.to('cpu').detach().numpy().copy()
        lengths = [x.shape[3]]
        return x, lengths