def transform_audio(self, y):
    '''Compute multi-time-constant PCEN of the (log-) Mel Spectrogram

    The audio is prefixed with a time-reversed copy of itself before
    analysis, and the columns belonging to that prefix are sliced off
    again after PCEN. One PCEN layer is computed per time constant
    ``(hop_length / sr) * 2**i`` for ``i`` in ``range(self.n_t_constants)``.

    Parameters
    ----------
    y : np.ndarray
        The audio buffer

    Returns
    -------
    data : dict
        data['mag'] : np.ndarray
            The stacked PCEN layers (one per time constant) after
            ``self._index`` has been applied.
    '''
    mel_kwargs = dict(sr=self.sr, hop_length=self.hop_length,
                      n_fft=self.n_fft, n_mels=self.n_mels)

    # Frame count of the un-padded signal. PCEN returns an array with the
    # same shape as its input, so the Mel spectrogram alone is enough here;
    # no throw-away PCEN pass is needed.
    n_frames = melspectrogram(y=y, **mel_kwargs).shape[1]

    # double audio and reverse pad to prevent zero initial-energy assumption
    y = np.concatenate((y[::-1], y))

    S = melspectrogram(y=y, **mel_kwargs)
    if self.log:
        S = amplitude_to_db(S, ref=np.max)

    t_base = self.hop_length / self.sr  # tau, or hop length in time
    t_constants = t_base * np.array(
        [2**i for i in range(self.n_t_constants)])

    pcen_layers = []
    for T in t_constants:
        P = pcen(S, sr=self.sr, hop_length=self.hop_length,
                 time_constant=T)
        # Remove padded section.
        # NOTE(review): the original author marked this slice as the
        # "source of off-by-one error". As written it keeps n_frames - 1
        # columns; left unchanged pending confirmation of the intent.
        P = P[:, P.shape[1] - n_frames + 1:]
        pcen_layers.append(to_dtype(P, self.dtype))

    pcen_layers = to_dtype(np.asarray(pcen_layers), self.dtype)
    return {'mag': self._index(pcen_layers)}
# copied from mel spectrogram pump feature extractor
def to_pcen(self, gain=0.8, bias=10.0, power=0.25, time_constant=0.06):
    """ Build a new MelSpectrogram holding the PCEN of this one

    Argument descriptions come from
    https://librosa.org/doc/latest/generated/librosa.pcen.html?highlight=pcen#librosa-pcen

    Args:
        gain: The gain factor. Typical values should be slightly less
            than 1 [default: 0.8]
        bias: The bias point of the nonlinear compression [default: 10.0]
        power: The compression exponent. Typical values should be between
            0 and 0.5. Smaller values of power result in stronger
            compression. At the limit power=0, polynomial compression
            becomes logarithmic [default: 0.25]
        time_constant: The time constant for IIR filtering, measured in
            seconds [default: 0.06]

    Returns:
        A MelSpectrogram built from the per-channel energy normalized
        version of ``self.S``, carrying over this object's sample rate,
        hop length, fmin and fmax.
    """
    normalized = pcen(
        self.S,
        sr=self.sample_rate,
        hop_length=self.hop_length,
        gain=gain,
        bias=bias,
        power=power,
        time_constant=time_constant,
    )
    return MelSpectrogram(
        normalized,
        self.sample_rate,
        self.hop_length,
        self.fmin,
        self.fmax,
    )
def generate_spectrogram(wav, sample_rate, spec_opts):
    """Compute a (Mel) spectrogram, optionally PCEN-normalized.

    Args:
        wav: Audio samples passed to ``librosa.stft``.
        sample_rate: Sample rate forwarded to the Mel filterbank.
        spec_opts: Mapping of user options. ``n_fft``, ``hop_length``,
            ``window`` and ``n_mels`` fall back to ``DEFAULT_OPTS``;
            ``type`` defaults to ``"mel"``; a non-empty ``pcen`` entry
            enables PCEN and overrides ``DEFAULT_OPTS["pcen"]``.

    Returns:
        A ``(spec, opts)`` tuple: the float32 spectrogram (PCEN output if
        enabled) and the dict of options that were actually used.
    """
    opts = {
        key: spec_opts.get(key, DEFAULT_OPTS[key])
        for key in ("n_fft", "hop_length", "window")
    }
    spec = librosa.stft(wav, **opts)

    wants_mel = spec_opts.get("type", "mel") == "mel"
    if wants_mel:
        opts["n_mels"] = spec_opts.get("n_mels", DEFAULT_OPTS["n_mels"])
        opts["sr"] = sample_rate
        power_spec = np.abs(spec)**2
        spec = librosa.feature.melspectrogram(S=power_spec, **opts)

    # NOTE(review): when type is not "mel", `spec` is still the complex
    # STFT at this point and the cast keeps only its real part -- confirm
    # whether a magnitude/power spectrogram was intended.
    spec = spec.astype(np.float32)

    pcen_overrides = spec_opts.get("pcen", {})
    if pcen_overrides:
        pcen_opts = common_utils.deep_dict_update(DEFAULT_OPTS["pcen"],
                                                  pcen_overrides,
                                                  copy=True)
        opts["pcen"] = pcen_opts
        spec = librosa.pcen(spec * (2**31), **pcen_opts)

    return spec, opts
def test_pcen_ref():
    """Check PCEN edge cases with and without an explicit ``ref``."""
    srand()

    # Make a power spectrogram
    X = np.random.randn(100, 50)**2
    ones = np.ones_like(X)

    # Shared edge-case settings: gain=1, bias=0, power=1, b=1
    edge_case = dict(gain=1, bias=0, power=1, b=1, eps=1e-20)

    # Without a reference these settings should give all ones
    Y = librosa.pcen(X, **edge_case)
    assert np.allclose(Y, ones)

    # with ref=ones, we should get X / (eps + ones) == X
    Y2 = librosa.pcen(X, ref=ones, **edge_case)
    assert np.allclose(Y2, X)
# Computes a Mel spectrogram of `audio` with power=1 and, when `pathname` is
# given, saves it as float32 under `pathname` with '.wav' replaced by
# '_<sampling_rate>.npy'. PCEN code follows an early `return mag_spectrogram`.
# NOTE(review): the line breaks/indentation of this method are not visible
# here, so it cannot be told whether `return mag_spectrogram` sits inside the
# first `if pathname is not None:` or directly after it. In either reading the
# second `np.save(..., pcen_spectrogram)` can never execute, because the method
# has already returned whenever `pathname` is given -- confirm which result
# (magnitude Mel or PCEN) callers are meant to receive before restructuring.
def audio_to_pcen(self, audio, pathname=None): # use power=1 to get a magnitude spectrum instead of a power spectrum mag_spectrogram = librosa.feature.melspectrogram(audio, sr=self.sampling_rate, n_mels=self.n_mels, hop_length=self.hop_length, n_fft=self.n_fft, fmin=self.fmin, fmax=self.fmax, power=1) if pathname is not None: np.save(pathname.replace('.wav', '_{}.npy'.format(self.sampling_rate)), mag_spectrogram.astype(np.float32)) return mag_spectrogram # https://www.kaggle.com/c/freesound-audio-tagging-2019/discussion/91859#529792 pcen_spectrogram = librosa.pcen(mag_spectrogram, sr=self.sampling_rate, hop_length=self.hop_length, gain=0.5, bias=0.001, power=0.2, time_constant=0.4, eps=1e-9) pcen_spectrogram = pcen_spectrogram.astype(np.float32) if pathname is not None: np.save(pathname.replace('.wav', '_{}.npy'.format(self.sampling_rate)), pcen_spectrogram) return pcen_spectrogram
def test_pcen_ref():
    """Verify PCEN against closed-form results for degenerate settings."""
    srand()

    # Make a power spectrogram
    power_spec = np.random.randn(100, 50)**2
    expected_ones = np.ones_like(power_spec)

    # Edge case: gain=1, bias=0, power=1, b=1 => ones
    settings = {"gain": 1, "bias": 0, "power": 1, "b": 1, "eps": 1e-20}
    plain = librosa.pcen(power_spec, **settings)
    assert np.allclose(plain, expected_ones)

    # with ref=ones, we should get X / (eps + ones) == X
    with_ref = librosa.pcen(power_spec, ref=expected_ones, **settings)
    assert np.allclose(with_ref, power_spec)
def transform_audio(self, y):
    '''Compute the PCEN of the (log-) Mel Spectrogram

    Parameters
    ----------
    y : np.ndarray
        The audio buffer

    Returns
    -------
    data : dict
        data['mag'] : np.ndarray
            The PCEN magnitude, indexed with ``self.idx``.
            NOTE(review): the original docstring gives the shape as
            (n_frames, n_bins), but no transpose is applied here --
            confirm the axis order against callers.
    '''
    S = melspectrogram(y=y, sr=self.sr, hop_length=self.hop_length,
                       n_mels=self.n_mels, n_fft=self.n_fft)
    if self.log:
        S = amplitude_to_db(S, ref=np.max)

    # Use the extractor's own sample rate: a bare `sr` is not defined in
    # this method and raised NameError.
    P = pcen(S, sr=self.sr, hop_length=self.hop_length)
    P = to_dtype(P, self.dtype)
    return {'mag': P[self.idx]}
# copied from mel spectrogram pump feature extractor
def wav_to_h5(input_wav_dir):
    """Convert one audio file into an HDF5 file of spectrogram features.

    The file is loaded with librosa's default settings, turned into an
    80-band Mel spectrogram (power=1) and then either PCEN-normalized or
    converted with ``power_to_db``, depending on the module-level ``PCEN``
    flag. The result is written to
    ``workingfiles/spect/<parent dir name>/<file name>.h5`` with two
    datasets: ``features`` (transposed spectrogram) and ``times`` (evenly
    spaced time stamps from 0 to the clip duration, one per frame).

    Parameters
    ----------
    input_wav_dir : str
        Path of the audio file to convert (despite the name, a file path).
    """
    a, sr = librosa.load(input_wav_dir)

    # Audio is passed by keyword: librosa >= 0.10 rejects it positionally.
    mel = librosa.feature.melspectrogram(y=a,
                                         sr=sr,
                                         n_fft=1024,
                                         hop_length=315,
                                         n_mels=80,
                                         fmax=11000,
                                         power=1)

    # Plain truthiness so that `a_out` is always bound, whatever PCEN holds.
    if PCEN:
        a_out = librosa.pcen(mel * (2**31))
    else:
        a_out = librosa.power_to_db(mel, ref=np.max)

    # Use the sample rate the audio was actually loaded with.
    duration = librosa.get_duration(y=a, sr=sr)
    frames = a_out.shape[1]
    timeframe = np.linspace(0, duration, num=frames)

    dir_name = os.path.basename(os.path.dirname(input_wav_dir))
    name = os.path.basename(input_wav_dir)
    out_path = 'workingfiles/spect/{0}/{1}.h5'.format(dir_name, name)
    with h5py.File(out_path, 'w') as data_file:
        data_file.create_dataset('features', data=a_out.T, dtype='float32')
        data_file.create_dataset('times', data=timeframe, dtype='float32')
def raw_to_pcen(audio, sampling_rate, window_size, hop_length, n_freqs):
    """Turn a 1D array of audio samples into a PCEN spectrogram.

    Parameters might not be optimal...

    Parameters:
        audio: 1D numpy array containing the audio.
        sampling_rate: Sampling rate of audio.
        window_size: STFT window size.
        hop_length: Distance between successive STFT windows.
        n_freqs: Number of mel frequency bins.

    Returns:
        PCEN spectrogram, bins x time.
    """
    stft = librosa.stft(audio,
                        n_fft=window_size,
                        hop_length=hop_length,
                        center=True)
    magnitude = np.abs(stft)

    # Project the STFT magnitude onto the Mel scale.
    mel = librosa.feature.melspectrogram(S=magnitude,
                                         sr=sampling_rate,
                                         n_mels=n_freqs)

    return librosa.pcen(mel,
                        sr=sampling_rate,
                        hop_length=hop_length,
                        time_constant=0.285)
def __test(gain, bias, power, b, time_constant, eps, ms, S, Pexp):
    """Run PCEN with the given settings and check basic output invariants.

    Asserts that a warning mentioning "complex" is raised for complex
    input, that the output has the input's shape, is non-negative and
    finite, and -- when ``Pexp`` is given -- that it matches ``Pexp``.
    """
    warnings.resetwarnings()
    warnings.simplefilter('always')

    pcen_kwargs = dict(gain=gain, bias=bias, power=power,
                       time_constant=time_constant, eps=eps, b=b,
                       max_size=ms)

    with warnings.catch_warnings(record=True) as caught:
        P = librosa.pcen(S, **pcen_kwargs)

        is_complex = np.issubdtype(S.dtype, np.complexfloating)
        if is_complex:
            assert len(caught) > 0
            assert 'complex' in str(caught[0].message).lower()

    assert P.shape == S.shape
    assert np.all(P >= 0)
    assert np.all(np.isfinite(P))

    if Pexp is not None:
        assert np.allclose(P, Pexp)
def compute_pcen(audio, sr):
    """Compute a truncated, single-precision PCEN Mel spectrogram.

    All analysis parameters come from ``get_pcen_settings()``.

    Parameters
    ----------
    audio : np.ndarray
        Mono audio signal; validated with ``librosa.util.valid_audio``.
    sr : int
        Sample rate of ``audio``.

    Returns
    -------
    pcen : np.ndarray
        float32 PCEN array restricted to the first
        ``pcen_settings["top_freq_id"]`` Mel bands.
    """
    # Load settings.
    pcen_settings = get_pcen_settings()
    target_sr = pcen_settings["sr"]

    # Validate audio
    librosa.util.valid_audio(audio, mono=True)

    # Map to the range [-2**31, 2**31[
    audio = (audio * (2**31)).astype('float32')

    # Resample to the sample rate required by the settings. Rates are
    # passed by keyword: librosa >= 0.10 rejects them positionally.
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
        sr = target_sr

    # Compute Short-Term Fourier Transform (STFT).
    stft = librosa.stft(
        audio,
        n_fft=pcen_settings["n_fft"],
        win_length=pcen_settings["win_length"],
        hop_length=pcen_settings["hop_length"],
        window=pcen_settings["window"])

    # Compute squared magnitude coefficients.
    abs2_stft = (stft.real * stft.real) + (stft.imag * stft.imag)

    # Gather frequency bins according to the Mel scale.
    # NB: as of librosa v0.6.2, melspectrogram is type-instable and thus
    # returns 64-bit output even with a 32-bit input. Therefore, we need
    # to convert PCEN to single precision eventually. This might not be
    # necessary in the future, if the whole PCEN pipeline is kept type-stable.
    melspec = librosa.feature.melspectrogram(
        y=None,
        S=abs2_stft,
        sr=target_sr,
        n_fft=pcen_settings["n_fft"],
        n_mels=pcen_settings["n_mels"],
        htk=True,
        fmin=pcen_settings["fmin"],
        fmax=pcen_settings["fmax"])

    # Compute PCEN.
    pcen = librosa.pcen(
        melspec,
        sr=target_sr,
        hop_length=pcen_settings["hop_length"],
        gain=pcen_settings["pcen_norm_exponent"],
        bias=pcen_settings["pcen_delta"],
        power=pcen_settings["pcen_power"],
        time_constant=pcen_settings["pcen_time_constant"])

    # Convert to single floating-point precision.
    pcen = pcen.astype('float32')

    # Truncate spectrum (to the 2-10 kHz range, per the original comment).
    pcen = pcen[:pcen_settings["top_freq_id"], :]

    # Return.
    return pcen
def compute_pcen(audio, sr):
    """Compute a truncated, single-precision PCEN Mel spectrogram.

    All analysis parameters come from ``get_pcen_settings()``.

    Parameters
    ----------
    audio : np.ndarray
        Audio signal; it is scaled by 2**31 and cast to float32 before
        analysis (presumably expected in [-1, 1] -- confirm with callers).
    sr : int
        Sample rate of ``audio``.

    Returns
    -------
    pcen : np.ndarray
        float32 PCEN array restricted to the first
        ``pcen_settings["top_freq_id"]`` Mel bands.
    """
    # Load settings.
    pcen_settings = get_pcen_settings()
    target_sr = pcen_settings["sr"]

    # Map to the range [-2**31, 2**31[
    audio = (audio * (2**31)).astype('float32')

    # Resample to the sample rate required by the settings. Rates are
    # passed by keyword: librosa >= 0.10 rejects them positionally.
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
        sr = target_sr

    # Compute Short-Term Fourier Transform (STFT).
    stft = librosa.stft(
        audio,
        n_fft=pcen_settings["n_fft"],
        win_length=pcen_settings["win_length"],
        hop_length=pcen_settings["hop_length"],
        window=pcen_settings["window"])

    # Compute squared magnitude coefficients.
    abs2_stft = (stft.real * stft.real) + (stft.imag * stft.imag)

    # Gather frequency bins according to the Mel scale.
    # NB: as of librosa v0.6.2, melspectrogram is type-instable and thus
    # returns 64-bit output even with a 32-bit input. Therefore, we need
    # to convert PCEN to single precision eventually. This might not be
    # necessary in the future, if the whole PCEN pipeline is kept type-stable.
    melspec = librosa.feature.melspectrogram(
        y=None,
        S=abs2_stft,
        sr=target_sr,
        n_fft=pcen_settings["n_fft"],
        n_mels=pcen_settings["n_mels"],
        htk=True,
        fmin=pcen_settings["fmin"],
        fmax=pcen_settings["fmax"])

    # Compute PCEN.
    pcen = librosa.pcen(
        melspec,
        sr=target_sr,
        hop_length=pcen_settings["hop_length"],
        gain=pcen_settings["pcen_norm_exponent"],
        bias=pcen_settings["pcen_delta"],
        power=pcen_settings["pcen_power"],
        time_constant=pcen_settings["pcen_time_constant"])

    # Convert to single floating-point precision.
    pcen = pcen.astype('float32')

    # Truncate spectrum (to the 2-10 kHz range, per the original comment).
    pcen = pcen[:pcen_settings["top_freq_id"], :]

    # Return.
    return pcen
def __getitem__(self, idx: int):
    """Load a whole recording and return it as a stack of patch images.

    The recording at row ``idx`` of ``self.df`` is cut into
    ``CLIP_DURATION // self.duration`` consecutive patches. Each patch
    becomes a 3-channel image (log-Mel, PCEN, and ``power_to_db`` of the
    Mel spectrogram raised to 1.5), each channel normalized with
    ``normalize_melspec``; the image is resized with ``cv2.resize``, made
    channel-first and divided by 255.

    Side effect: overwrites ``fmin``, ``fmax`` and ``n_mels`` in
    ``self.melspectrogram_parameters`` according to
    ``self.frequency_range``.

    Returns:
        dict with ``"recording_id"`` and ``"image"`` (array stacking the
        per-patch images).
    """
    sample = self.df.loc[idx, :]
    flac_id = sample["recording_id"]

    y, sr = librosa.load(self.datadir / f"{flac_id}.{self.format}",
                         sr=self.sampling_rate,
                         mono=True,
                         res_type="kaiser_fast")
    if self.waveform_transforms:
        y = self.waveform_transforms(y).astype(np.float32)

    # Select the analysed frequency band.
    if self.frequency_range == "high":
        fmin, fmax = 6000, 16000
    else:
        fmin, fmax = 0, 6000
    self.melspectrogram_parameters["fmin"] = fmin
    self.melspectrogram_parameters["fmax"] = fmax
    self.melspectrogram_parameters["n_mels"] = 96

    images = []
    n_images = CLIP_DURATION // self.duration
    for i in range(n_images):
        y_patch = y[i * self.duration * sr:(i + 1) * self.duration * sr]

        # Audio is passed by keyword: librosa >= 0.10 rejects it
        # positionally.
        melspec = librosa.feature.melspectrogram(
            y=y_patch, sr=sr, **self.melspectrogram_parameters)
        pcen = librosa.pcen(melspec, sr=sr, **self.pcen_parameters)
        clean_mel = librosa.power_to_db(melspec**1.5)
        melspec = librosa.power_to_db(melspec)

        if self.spectrogram_transforms:
            melspec = self.spectrogram_transforms(image=melspec)["image"]
            pcen = self.spectrogram_transforms(image=pcen)["image"]
            clean_mel = self.spectrogram_transforms(
                image=clean_mel)["image"]

        norm_melspec = normalize_melspec(melspec)
        norm_pcen = normalize_melspec(pcen)
        norm_clean_mel = normalize_melspec(clean_mel)
        image = np.stack([norm_melspec, norm_pcen, norm_clean_mel],
                         axis=-1)

        # cv2.resize takes (width, height): scale the width by the same
        # factor that brings the height to img_size.
        height, width, _ = image.shape
        image = cv2.resize(
            image, (int(width * self.img_size / height), self.img_size))
        image = np.moveaxis(image, 2, 0)
        image = (image / 255.0).astype(np.float32)
        images.append(image)

    return {"recording_id": flac_id, "image": np.asarray(images)}
def __getitem__(self, idx_: int):
    """Return one fixed-length segment of a recording as a 3-channel image.

    ``idx_`` addresses segments: ``idx_ // n_chunk_per_clip`` selects the
    row of ``self.df`` and ``idx_ % n_chunk_per_clip`` the segment inside
    that recording, where ``n_chunk_per_clip = CLIP_DURATION //
    self.duration``. The segment becomes a channel-first image whose
    channels are log-Mel, PCEN and ``power_to_db`` of the Mel spectrogram
    raised to 1.5, each normalized with ``normalize_melspec``, then
    resized and divided by 255.

    Side effect: overwrites ``fmin``, ``fmax`` and ``n_mels`` in
    ``self.melspectrogram_parameters`` according to
    ``self.frequency_range``.

    Returns:
        dict with ``"recording_id"`` and ``"image"``.
    """
    n_chunk_per_clip = CLIP_DURATION // self.duration
    idx = idx_ // n_chunk_per_clip
    segment_id = idx_ % n_chunk_per_clip

    sample = self.df.loc[idx, :]
    flac_id = sample["recording_id"]

    offset = segment_id * self.duration
    y, sr = librosa.load(self.datadir / f"{flac_id}.wav",
                         sr=self.sampling_rate,
                         mono=True,
                         offset=offset,
                         duration=self.duration)
    if self.waveform_transforms:
        y = self.waveform_transforms(y).astype(np.float32)

    # Select the analysed frequency band.
    if self.frequency_range == "high":
        fmin, fmax = 6000, 16000
    else:
        fmin, fmax = 0, 6000
    self.melspectrogram_parameters["fmin"] = fmin
    self.melspectrogram_parameters["fmax"] = fmax
    self.melspectrogram_parameters["n_mels"] = 96

    # Audio is passed by keyword: librosa >= 0.10 rejects it positionally.
    melspec = librosa.feature.melspectrogram(
        y=y, sr=sr, **self.melspectrogram_parameters)
    pcen = librosa.pcen(melspec, sr=sr, **self.pcen_parameters)
    clean_mel = librosa.power_to_db(melspec**1.5)
    melspec = librosa.power_to_db(melspec)

    if self.spectrogram_transforms:
        melspec = self.spectrogram_transforms(image=melspec)["image"]
        pcen = self.spectrogram_transforms(image=pcen)["image"]
        clean_mel = self.spectrogram_transforms(image=clean_mel)["image"]

    norm_melspec = normalize_melspec(melspec)
    norm_pcen = normalize_melspec(pcen)
    norm_clean_mel = normalize_melspec(clean_mel)
    image = np.stack([norm_melspec, norm_pcen, norm_clean_mel], axis=-1)

    height, width, _ = image.shape
    if isinstance(self.img_size, int):
        # Single int: target height, width scaled by the same factor.
        image = cv2.resize(
            image, (int(width * self.img_size / height), self.img_size))
    else:
        # Otherwise img_size is handed to cv2.resize as the target size.
        image = cv2.resize(image, tuple(self.img_size))
    image = np.moveaxis(image, 2, 0)
    image = (image / 255.0).astype(np.float32)

    return {"recording_id": flac_id, "image": image}
def get_pcen(self, audio, sr, frame_step):
    """Compute PCEN features from 80-band filterbank energies.

    Filterbank energies come from ``fbank`` (25 ms windows, ``frame_step``
    seconds apart, 512-point FFT). They are converted to magnitudes,
    PCEN-normalized with the first frame as the initial filter state, and
    returned with the same axis order that ``fbank`` produced.
    """
    energies, _ = fbank(audio,
                        samplerate=sr,
                        winlen=0.025,
                        winstep=frame_step,
                        nfilt=80,
                        nfft=512)

    # Magnitude spectra (nfilt x nframe)
    magnitudes = np.sqrt(energies / 2).T

    # First frame, as a column, seeds the PCEN smoothing filter.
    initial_state = magnitudes[:, 0].reshape(-1, 1)

    hop = int(frame_step * sr)
    normalized = librosa.pcen(magnitudes,
                              sr=sr,
                              hop_length=hop,
                              zi=initial_state)
    return normalized.T
def __getitem__(self, idx: int):
    """Return one (possibly half-shifted) segment as a 3-channel image.

    Each recording contributes ``2 * n_chunk_per_clip - 1`` segments with
    ``n_chunk_per_clip = CLIP_DURATION // self.duration``: the first
    ``n_chunk_per_clip`` are back-to-back, the remaining ones are the same
    grid shifted by half a segment. The segment becomes a channel-first
    image whose channels are log-Mel, PCEN and ``power_to_db`` of the Mel
    spectrogram raised to 1.5, each normalized with ``normalize_melspec``,
    then resized and divided by 255.

    Returns:
        dict with ``"recording_id"`` and ``"image"``.
    """
    n_chunk_per_clip = CLIP_DURATION // self.duration
    segments_per_clip = n_chunk_per_clip * 2 - 1
    clip_idx = idx // segments_per_clip
    segment_id = idx % segments_per_clip

    sample = self.df.loc[clip_idx, :]
    flac_id = sample["recording_id"]

    if segment_id < n_chunk_per_clip:
        offset = segment_id * self.duration
    else:
        # Second pass over the clip, shifted by half a segment.
        offset = ((segment_id - n_chunk_per_clip) * self.duration +
                  self.duration / 2)

    y, sr = librosa.load(self.datadir / f"{flac_id}.flac",
                         sr=self.sampling_rate,
                         mono=True,
                         offset=offset,
                         duration=self.duration,
                         res_type="kaiser_fast")
    if self.waveform_transforms:
        y = self.waveform_transforms(y).astype(np.float32)

    # Audio is passed by keyword: librosa >= 0.10 rejects it positionally.
    melspec = librosa.feature.melspectrogram(
        y=y, sr=sr, **self.melspectrogram_parameters)
    pcen = librosa.pcen(melspec, sr=sr, **self.pcen_parameters)
    clean_mel = librosa.power_to_db(melspec**1.5)
    melspec = librosa.power_to_db(melspec)

    if self.spectrogram_transforms:
        melspec = self.spectrogram_transforms(image=melspec)["image"]
        pcen = self.spectrogram_transforms(image=pcen)["image"]
        clean_mel = self.spectrogram_transforms(image=clean_mel)["image"]

    norm_melspec = normalize_melspec(melspec)
    norm_pcen = normalize_melspec(pcen)
    norm_clean_mel = normalize_melspec(clean_mel)
    image = np.stack([norm_melspec, norm_pcen, norm_clean_mel], axis=-1)

    # cv2.resize takes (width, height): scale the width by the same factor
    # that brings the height to img_size.
    height, width, _ = image.shape
    image = cv2.resize(
        image, (int(width * self.img_size / height), self.img_size))
    image = np.moveaxis(image, 2, 0)
    image = (image / 255.0).astype(np.float32)

    return {"recording_id": flac_id, "image": image}
def create_spectrogram(self):
    """Build the spectrogram for ``self.audio`` according to the options.

    Steps: STFT, optional Mel projection (``scale == "Mel"``), magnitude,
    then either PCEN (returned immediately) or the optional chain
    noise removal -> float32 cast -> normalization -> dB conversion.

    Returns:
        The resulting spectrogram array.
    """
    hop_length = self.options["hop_length"]
    if hop_length is not None:
        hop_length = int(hop_length)

    # n_fft is passed by keyword: librosa >= 0.10 rejects it positionally.
    spectro = librosa.stft(
        self.audio.get_data(),
        n_fft=int(self["n_fft"]),
        hop_length=hop_length,
        window=self["window"],
    )

    if self["scale"] == "Mel":
        # NOTE(review): the Mel filterbank is applied to the complex STFT
        # (magnitude is taken afterwards) and no `sr` is passed, so
        # librosa's default sample rate is used -- confirm this is intended.
        spectro = librosa.feature.melspectrogram(
            S=spectro,
            n_mels=self.options._options.get("n_mels", 256),
        )

    spec = np.abs(spectro)

    if self["pcen"]:
        # TODO: test this!!!
        return librosa.pcen(spec * (2**31), bias=1, power=0.25)

    if self["remove_noise"]:
        # TODO: check SNR to remove noise?
        spec = self.remove_noise(
            spec,
            self["nr_N"],
            self["nr_hist_rel_size"],
            self["nr_window_smoothing"],
        )

    spec = spec.astype("float32")

    if self["normalize"]:
        spec = librosa.util.normalize(spec)

    if self["to_db"]:
        spec = librosa.amplitude_to_db(spec, ref=np.max)

    return spec
def __getitem__(self, idx_: int):
    """Return one fixed-length segment of a recording as a 3-channel image.

    ``idx_`` addresses segments: integer division by the number of chunks
    per clip selects the row of ``self.df``, the remainder selects the
    segment. Audio is read with torchaudio, resampled with
    ``self.resampler`` and converted with
    ``self.melspectrogram_converter``. The image channels are log-Mel,
    PCEN and ``power_to_db`` of the Mel spectrogram raised to 1.5, each
    normalized with ``normalize_melspec``.

    Returns:
        dict with ``"recording_id"`` and ``"image"`` (channel-first,
        float32, divided by 255).
    """
    chunks_per_clip = CLIP_DURATION // self.duration
    row_idx = idx_ // chunks_per_clip
    segment_id = idx_ % chunks_per_clip

    record = self.df.loc[row_idx, :]
    flac_id = record["recording_id"]

    # Segment start in seconds, converted to source-rate samples below.
    start_seconds = segment_id * self.duration
    waveform, _ = torchaudio.load(
        self.datadir / f"{flac_id}.flac",
        offset=int(start_seconds * DEFAULT_SR),
        num_frames=int(self.duration * DEFAULT_SR))
    first_channel = waveform[0]
    y = self.resampler(first_channel).numpy().astype(np.float32)

    if self.waveform_transforms:
        y = self.waveform_transforms(y).astype(np.float32)

    mel_power = self.melspectrogram_converter(torch.from_numpy(y)).numpy()
    pcen = librosa.pcen(mel_power,
                        sr=self.sampling_rate,
                        **self.pcen_parameters)
    clean_mel = librosa.power_to_db(mel_power**1.5)
    melspec = librosa.power_to_db(mel_power)

    augment = self.spectrogram_transforms
    if augment:
        melspec = augment(image=melspec)["image"]
        pcen = augment(image=pcen)["image"]
        clean_mel = augment(image=clean_mel)["image"]

    channels = [
        normalize_melspec(melspec),
        normalize_melspec(pcen),
        normalize_melspec(clean_mel),
    ]
    image = np.stack(channels, axis=-1)

    # cv2.resize takes (width, height).
    height, width, _ = image.shape
    target_size = (int(width * self.img_size / height), self.img_size)
    image = cv2.resize(image, target_size)
    image = np.moveaxis(image, 2, 0)
    image = (image / 255.0).astype(np.float32)

    return {"recording_id": flac_id, "image": image}
def __test(gain, bias, power, b, time_constant, eps, ms, S, Pexp):
    """Exercise ``librosa.pcen`` once and validate its output.

    Checks: a "complex" warning for complex-valued ``S``; output shape
    equal to the input shape; non-negative, finite values; and agreement
    with ``Pexp`` whenever an expected result is supplied.
    """
    warnings.resetwarnings()
    warnings.simplefilter('always')

    with warnings.catch_warnings(record=True) as recorded:
        P = librosa.pcen(S,
                         gain=gain,
                         bias=bias,
                         power=power,
                         time_constant=time_constant,
                         eps=eps,
                         b=b,
                         max_size=ms)

        complex_input = np.issubdtype(S.dtype, np.complexfloating)
        if complex_input:
            # Complex input must have triggered a warning about it.
            assert len(recorded) > 0
            first_message = str(recorded[0].message)
            assert 'complex' in first_message.lower()

    assert P.shape == S.shape
    assert np.all(P >= 0)
    assert np.all(np.isfinite(P))

    if Pexp is None:
        return
    assert np.allclose(P, Pexp)
def compute_mfccs(self, data):
    """Compute the feature selected by ``self.config["feature_type"]``.

    * ``"log_mel"``: log-Mel spectrogram stacked with the first and
      second deltas of the MFCCs.
    * ``"MFCC"``: the MFCCs themselves.
    * ``"PCEN"``: PCEN of a magnitude (power=1) Mel spectrogram.

    Parameters
    ----------
    data : np.ndarray
        Audio samples.

    Returns
    -------
    np.ndarray or None
        The float32 feature array, or ``None`` for any other feature type.
    """
    # Audio / sample rate are passed by keyword throughout: librosa >= 0.10
    # rejects them positionally.
    mfcc = librosa.feature.mfcc(y=data,
                                sr=self.sr,
                                n_mfcc=self.n_mels,
                                hop_length=self.hop_length)
    mfcc = np.array(mfcc, order="F").astype(np.float32)

    feature_type = self.config["feature_type"]

    if feature_type == "log_mel":
        mel_spec = librosa.feature.melspectrogram(
            y=data,
            sr=self.sr,  # 16000
            n_mels=self.n_mels,  # 40
            hop_length=self.hop_length,  # 160
            n_fft=self.n_fft,  # 480
            fmin=self.f_min,  # 20
            fmax=self.f_max  # 4000
        )
        mel_spec = np.array(mel_spec, order="F").astype(np.float32)
        log_mel = librosa.power_to_db(mel_spec)
        delta = librosa.feature.delta(mfcc)
        delta_delta = librosa.feature.delta(delta)
        # e.g. (120, 101) with the values noted above
        return np.vstack([log_mel, delta, delta_delta])

    if feature_type == "MFCC":
        return mfcc  # e.g. (40, 101)

    if feature_type == "PCEN":
        spec = librosa.feature.melspectrogram(y=data,
                                              sr=self.sr,
                                              power=1,
                                              n_mels=self.n_mels,
                                              hop_length=self.hop_length,
                                              n_fft=self.n_fft)
        # NOTE(review): hop_length is not forwarded to pcen, so librosa's
        # default hop is used for the smoothing time constant -- confirm
        # whether self.hop_length should be passed here.
        pcen = librosa.pcen(spec, sr=self.sr)  # e.g. (40, 101)
        return np.array(pcen, order="F").astype(np.float32)

    return None
def __getitem__(self, idx: int):
    """Return a randomly positioned window of a recording with its labels.

    A window of ``self.duration`` seconds is drawn at a random offset (on
    a 0.1 s grid) and turned into a channel-first image whose channels are
    log-Mel, PCEN and ``power_to_db`` of the Mel spectrogram raised to
    1.5. All true-positive events of the recording overlapping the window
    are looked up in ``self.tp`` and converted to clip-level ("weak") and
    frame-level ("strong") targets, for species and for song types.

    Returns:
        dict with ``"recording_id"``, ``"image"`` and ``"targets"``
        (``weak``, ``strong``, ``weak_songtype``, ``strong_songtype``).
    """
    sample = self.df.loc[idx, :]
    flac_id = sample["recording_id"]

    offset = np.random.choice(
        np.arange(0.0, CLIP_DURATION - self.duration, 0.1))
    y, sr = librosa.load(self.datadir / f"{flac_id}.wav",
                         sr=self.sampling_rate,
                         mono=True,
                         offset=offset,
                         duration=self.duration)
    if self.waveform_transforms:
        y = self.waveform_transforms(y).astype(np.float32)

    # Audio is passed by keyword: librosa >= 0.10 rejects it positionally.
    melspec = librosa.feature.melspectrogram(
        y=y, sr=sr, **self.melspectrogram_parameters)
    pcen = librosa.pcen(melspec, sr=sr, **self.pcen_parameters)
    clean_mel = librosa.power_to_db(melspec**1.5)
    melspec = librosa.power_to_db(melspec)

    if self.spectrogram_transforms:
        melspec = self.spectrogram_transforms(image=melspec)["image"]
        pcen = self.spectrogram_transforms(image=pcen)["image"]
        clean_mel = self.spectrogram_transforms(image=clean_mel)["image"]

    norm_melspec = normalize_melspec(melspec)
    norm_pcen = normalize_melspec(pcen)
    norm_clean_mel = normalize_melspec(clean_mel)
    image = np.stack([norm_melspec, norm_pcen, norm_clean_mel], axis=-1)

    # cv2.resize takes (width, height).
    height, width, _ = image.shape
    image = cv2.resize(
        image, (int(width * self.img_size / height), self.img_size))
    image = np.moveaxis(image, 2, 0)
    image = (image / 255.0).astype(np.float32)

    # Events overlapping [offset, tail].
    tail = offset + self.duration
    query_string = f"recording_id == '{flac_id}' & "
    query_string += f"t_min < {tail} & t_max > {offset}"
    all_tp_events = self.tp.query(query_string)

    label = np.zeros(N_CLASSES, dtype=np.float32)
    songtype_label = np.zeros(N_CLASSES + 2, dtype=np.float32)

    n_frames = image.shape[2]
    seconds_per_frame = self.duration / n_frames
    strong_label = np.zeros((n_frames, N_CLASSES), dtype=np.float32)
    songtype_strong_label = np.zeros((n_frames, N_CLASSES + 2),
                                     dtype=np.float32)

    for species_id in all_tp_events["species_id"].unique():
        label[int(species_id)] = 1.0
    for species_id_song_id in all_tp_events["species_id_song_id"].unique():
        songtype_label[CLASS_MAP[species_id_song_id]] = 1.0

    for _, row in all_tp_events.iterrows():
        species_id = int(row.species_id)
        songtype_id = CLASS_MAP[row.species_id_song_id]
        # An event may begin before the window (or end after it). Clamp
        # the frame range: a negative start would otherwise be read as an
        # index from the end of the axis and the event would be dropped.
        start_index = max(0, int((row.t_min - offset) / seconds_per_frame))
        end_index = min(n_frames,
                        int((row.t_max - offset) / seconds_per_frame))
        strong_label[start_index:end_index, species_id] = 1.0
        songtype_strong_label[start_index:end_index, songtype_id] = 1.0

    return {
        "recording_id": flac_id,
        "image": image,
        "targets": {
            "weak": label,
            "strong": strong_label,
            "weak_songtype": songtype_label,
            "strong_songtype": songtype_strong_label
        }
    }
def __getitem__(self, idx: int):
    """Return a window around an annotated event, alternating TP and FP.

    Even ``idx`` values pick row ``idx // 2`` of ``self.tp`` (true
    positives); odd values draw random rows from ``self.fp`` until one is
    found whose recording has no entry in ``self.tp``. A window of
    ``self.duration`` seconds is placed around the event (randomly, or
    centred when ``self.centering`` is set) and turned into a
    channel-first image whose channels are log-Mel, PCEN and
    ``power_to_db`` of the Mel spectrogram raised to 1.5. Targets are
    filled only for true-positive samples.

    Returns:
        dict with ``"recording_id"``, ``"image"``, ``"targets"``
        (``weak``, ``strong``) and ``"index"``.
    """
    sample_tp = idx % 2 == 0
    if sample_tp:
        idx = idx // 2
        sample = self.tp.loc[idx, :]
    else:
        # Rejection-sample a false positive from a recording that has no
        # true-positive annotation.
        while True:
            sample = self.fp.sample(1).reset_index(drop=True).loc[0]
            flac_id = sample["recording_id"]
            if len(self.tp.query(f"recording_id == '{flac_id}'")) == 0:
                break

    flac_id = sample["recording_id"]
    index = sample["index"]
    t_min = sample["t_min"]
    t_max = sample["t_max"]

    if not self.centering:
        offset = np.random.choice(
            np.arange(max(t_max - self.duration, 0), t_min, 0.1))
        offset = min(CLIP_DURATION - self.duration, offset)
    else:
        call_duration = t_max - t_min
        relative_offset = (self.duration - call_duration) / 2
        offset = min(max(0, t_min - relative_offset),
                     CLIP_DURATION - self.duration)

    y, sr = librosa.load(self.datadir / f"{flac_id}.flac",
                         sr=self.sampling_rate,
                         mono=True,
                         offset=offset,
                         duration=self.duration,
                         res_type="kaiser_fast")
    if self.waveform_transforms:
        y = self.waveform_transforms(y).astype(np.float32)

    # Audio is passed by keyword: librosa >= 0.10 rejects it positionally.
    melspec = librosa.feature.melspectrogram(
        y=y, sr=sr, **self.melspectrogram_parameters)
    pcen = librosa.pcen(melspec, sr=sr, **self.pcen_parameters)
    clean_mel = librosa.power_to_db(melspec**1.5)
    melspec = librosa.power_to_db(melspec)

    if self.spectrogram_transforms:
        melspec = self.spectrogram_transforms(image=melspec)["image"]
        pcen = self.spectrogram_transforms(image=pcen)["image"]
        clean_mel = self.spectrogram_transforms(image=clean_mel)["image"]

    norm_melspec = normalize_melspec(melspec)
    norm_pcen = normalize_melspec(pcen)
    norm_clean_mel = normalize_melspec(clean_mel)
    image = np.stack([norm_melspec, norm_pcen, norm_clean_mel], axis=-1)

    # cv2.resize takes (width, height).
    height, width, _ = image.shape
    image = cv2.resize(
        image, (int(width * self.img_size / height), self.img_size))
    image = np.moveaxis(image, 2, 0)
    image = (image / 255.0).astype(np.float32)

    label = np.zeros(N_CLASSES, dtype=np.float32)
    n_frames = image.shape[2]
    seconds_per_frame = self.duration / n_frames
    strong_label = np.zeros((n_frames, N_CLASSES), dtype=np.float32)

    # Events overlapping [offset, tail].
    tail = offset + self.duration
    query_string = f"recording_id == '{flac_id}' & "
    query_string += f"t_min < {tail} & t_max > {offset}"

    if sample_tp:
        all_tp_events = self.tp.query(query_string)

        for species_id in all_tp_events["species_id"].unique():
            label[int(species_id)] = 1.0

        for _, row in all_tp_events.iterrows():
            species_id = int(row.species_id)
            # An event may begin before the window (or end after it).
            # Clamp the frame range: a negative start would otherwise be
            # read as an index from the end of the axis and the event
            # would be dropped.
            start_index = max(
                0, int((row.t_min - offset) / seconds_per_frame))
            end_index = min(
                n_frames, int((row.t_max - offset) / seconds_per_frame))
            strong_label[start_index:end_index, species_id] = 1.0

    return {
        "recording_id": flac_id,
        "image": image,
        "targets": {
            "weak": label,
            "strong": strong_label
        },
        "index": index
    }
def maximum_pcen(S, **kwargs):
    """Return the maximum along axis 1 of the PCEN of ``|S|**2``.

    ``kwargs`` are forwarded unchanged to ``librosa.pcen``.
    """
    power_spec = np.abs(S)**2
    normalized = librosa.pcen(power_spec, **kwargs)
    peak = np.max(normalized, axis=1)
    return peak
def compute_pcen(audio, sr, input_format=True):
    """
    Computes PCEN (per-channel-energy normalization) for the given audio clip.

    Parameters
    ----------
    audio : np.ndarray [shape: (N,)]
        Audio array
    sr : int
        Sample rate
    input_format : bool [default: ``True``]
        If True, adds an additional channel dimension (of size 1) and ensures
        that a fixed number of PCEN frames (corresponding to
        ``get_pcen_settings()['n_hops']``) is returned. If number of frames is
        greater, the center frames are returned. If the number of frames is
        less, empty frames are padded.

    Returns
    -------
    pcen : np.ndarray [shape: (top_freq_id, n_hops, 1) or (top_freq_id, num_frames)]
        Per-channel energy normalization processed Mel spectrogram. If
        ``input_format=True``, will be in shape ``(top_freq_id, n_hops, 1)``.
        Otherwise it will be in shape ``(top_freq_id, num_frames)``, where
        ``num_frames`` is the number of PCEN frames for the entire audio clip.

    Raises
    ------
    BirdVoxClassifyError
        If ``audio`` has neither an integer nor a floating-point dtype.
    """
    # Load settings.
    pcen_settings = get_pcen_settings()
    target_sr = pcen_settings["sr"]

    # Standardize type to be float [-1, 1]
    dtype_kind = audio.dtype.kind
    if dtype_kind == 'i':
        int_info = np.iinfo(audio.dtype)
        max_val = max(int_info.max, -int_info.min)
        audio = audio.astype('float64') / max_val
    elif dtype_kind == 'f':
        audio = audio.astype('float64')
    else:
        err_msg = 'Invalid audio dtype: {}'
        raise BirdVoxClassifyError(err_msg.format(audio.dtype))

    # Map to the range [-2**31, 2**31]
    audio = (audio * (2**31)).astype('float32')

    # Resample to the sample rate required by the settings. Rates are
    # passed by keyword: librosa >= 0.10 rejects them positionally.
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)

    # Compute Short-Term Fourier Transform (STFT).
    stft = librosa.stft(audio,
                        n_fft=pcen_settings["n_fft"],
                        win_length=pcen_settings["win_length"],
                        hop_length=pcen_settings["hop_length"],
                        window=pcen_settings["window"])

    # Compute squared magnitude coefficients.
    abs2_stft = (stft.real * stft.real) + (stft.imag * stft.imag)

    # Gather frequency bins
    # NB: as of librosa v0.6.2, melspectrogram is type-instable and thus
    # returns 64-bit output even with a 32-bit input. Therefore, we need
    # to convert PCEN to single precision eventually. This might not be
    # necessary in the future, if the whole PCEN pipeline is kept type-stable.
    melspec = librosa.feature.melspectrogram(y=None,
                                             S=abs2_stft,
                                             sr=target_sr,
                                             n_fft=pcen_settings["n_fft"],
                                             n_mels=pcen_settings["n_mels"],
                                             htk=True,
                                             fmin=pcen_settings["fmin"],
                                             fmax=pcen_settings["fmax"])

    # Compute PCEN.
    pcen = librosa.pcen(
        melspec,
        sr=target_sr,
        hop_length=pcen_settings["hop_length"],
        gain=pcen_settings["pcen_norm_exponent"],
        bias=pcen_settings["pcen_delta"],
        power=pcen_settings["pcen_power"],
        time_constant=pcen_settings["pcen_time_constant"])

    # Convert to single floating-point precision.
    pcen = pcen.astype('float32')

    # Truncate spectrum (to the 2-10 kHz range, per the original comment).
    pcen = pcen[:pcen_settings["top_freq_id"], :]

    # Format for input to network
    if input_format:
        # Trim TFR in time to required number of hops.
        pcen_width = pcen.shape[1]
        n_hops = pcen_settings["n_hops"]
        if pcen_width >= n_hops:
            # Keep the n_hops centre columns.
            first_col = int((pcen_width - n_hops) / 2)
            last_col = int((pcen_width + n_hops) / 2)
            pcen = pcen[:, first_col:last_col]
        else:
            # Pad if not enough frames; any odd column goes to the right.
            pad_length = n_hops - pcen_width
            left_pad = pad_length // 2
            right_pad = pad_length - left_pad
            pcen = np.pad(pcen, [(0, 0), (left_pad, right_pad)],
                          mode='constant')

        # Add channel dimension
        pcen = pcen[:, :, np.newaxis]

    # Return.
    return pcen
def test_pcen_max1():
    """Call PCEN with max-filtering (``max_size=3``) on a 1-D input.

    NOTE(review): there is no assertion in the body; presumably an
    expected-failure marker lives outside this view -- confirm.
    """
    one_dimensional = np.arange(100)
    librosa.pcen(one_dimensional, max_size=3)
def test_pcen_max1():
    """Run PCEN with ``max_size=3`` on a 1-D ramp of 100 values.

    NOTE(review): the body asserts nothing; presumably the expected
    outcome is declared by a decorator not visible here -- confirm.
    """
    ramp = np.arange(100)
    filter_width = 3
    librosa.pcen(ramp, max_size=filter_width)
def __getitem__(self, idx: int):
    """Return a window around a true-positive event with its labels.

    Row ``idx`` of ``self.tp`` gives the event. A window of
    ``self.duration`` seconds is placed around it (randomly on a 0.1 s
    grid, or centred when ``self.centering`` is set; calls longer than the
    window are handled separately) and turned into a channel-first image
    whose channels are log-Mel, PCEN and ``power_to_db`` of the Mel
    spectrogram raised to 1.5. Only species listed in
    ``RANGE_SPECIES_MAP[self.frequency_range]`` are labelled.

    Side effect: overwrites ``fmin``, ``fmax`` and ``n_mels`` in
    ``self.melspectrogram_parameters`` according to
    ``self.frequency_range``.

    Returns:
        dict with ``"recording_id"``, ``"image"``, ``"targets"``
        (``weak``, ``strong``, ``weak_songtype`` -- the latter is returned
        all zeros) and ``"index"``.
    """
    sample = self.tp.loc[idx, :]
    index = sample["index"]
    flac_id = sample["recording_id"]
    t_min = sample["t_min"]
    t_max = sample["t_max"]

    call_duration = t_max - t_min
    max_offset = CLIP_DURATION - self.duration
    if not self.centering:
        if call_duration > self.duration:
            candidates = np.arange(max(t_min - call_duration / 2, 0),
                                   t_min + call_duration / 2, 0.1)
        else:
            candidates = np.arange(max(t_max - self.duration, 0), t_min,
                                   0.1)
        offset = min(max_offset, np.random.choice(candidates))
    else:
        if call_duration > self.duration:
            offset = (t_max + t_min) / 2 - self.duration / 2
        else:
            relative_offset = (self.duration - call_duration) / 2
            offset = min(max(0, t_min - relative_offset), max_offset)

    y, sr = librosa.load(self.datadir / f"{flac_id}.wav",
                         sr=self.sampling_rate,
                         mono=True,
                         offset=offset,
                         duration=self.duration)
    if self.waveform_transforms:
        y = self.waveform_transforms(y).astype(np.float32)

    # Select the analysed frequency band.
    if self.frequency_range == "high":
        fmin, fmax = 6000, 16000
    else:
        fmin, fmax = 0, 6000
    self.melspectrogram_parameters["fmin"] = fmin
    self.melspectrogram_parameters["fmax"] = fmax
    self.melspectrogram_parameters["n_mels"] = 96

    # Audio is passed by keyword: librosa >= 0.10 rejects it positionally.
    melspec = librosa.feature.melspectrogram(
        y=y, sr=sr, **self.melspectrogram_parameters)
    pcen = librosa.pcen(melspec, sr=sr, **self.pcen_parameters)
    clean_mel = librosa.power_to_db(melspec**1.5)
    melspec = librosa.power_to_db(melspec)

    if self.spectrogram_transforms:
        melspec = self.spectrogram_transforms(image=melspec)["image"]
        pcen = self.spectrogram_transforms(image=pcen)["image"]
        clean_mel = self.spectrogram_transforms(image=clean_mel)["image"]

    norm_melspec = normalize_melspec(melspec)
    norm_pcen = normalize_melspec(pcen)
    norm_clean_mel = normalize_melspec(clean_mel)
    image = np.stack([norm_melspec, norm_pcen, norm_clean_mel], axis=-1)

    height, width, _ = image.shape
    if isinstance(self.img_size, int):
        # Single int: target height, width scaled by the same factor.
        image = cv2.resize(
            image, (int(width * self.img_size / height), self.img_size))
    else:
        # Otherwise img_size is handed to cv2.resize as the target size.
        image = cv2.resize(image, tuple(self.img_size))
    image = np.moveaxis(image, 2, 0)
    image = (image / 255.0).astype(np.float32)

    # Events overlapping [offset, tail].
    tail = offset + self.duration
    query_string = f"recording_id == '{flac_id}' & "
    query_string += f"t_min < {tail} & t_max > {offset}"
    all_tp_events = self.tp.query(query_string)

    label = np.zeros(N_CLASSES, dtype=np.float32)
    songtype_label = np.zeros(N_CLASSES + 2, dtype=np.float32)

    n_frames = image.shape[2]
    seconds_per_frame = self.duration / n_frames
    strong_label = np.zeros((n_frames, N_CLASSES), dtype=np.float32)

    species_in_range = RANGE_SPECIES_MAP[self.frequency_range]

    for species_id in all_tp_events["species_id"].unique():
        if species_id in species_in_range:
            label[int(species_id)] = 1.0

    for _, row in all_tp_events.iterrows():
        species_id = row.species_id
        if species_id not in species_in_range:
            continue
        # An event may begin before the window (or end after it). Clamp
        # the frame range: a negative start would otherwise be read as an
        # index from the end of the axis and the event would be dropped.
        start_index = max(0, int((row.t_min - offset) / seconds_per_frame))
        end_index = min(n_frames,
                        int((row.t_max - offset) / seconds_per_frame))
        strong_label[start_index:end_index, int(species_id)] = 1.0

    return {
        "recording_id": flac_id,
        "image": image,
        "targets": {
            "weak": label,
            "strong": strong_label,
            "weak_songtype": songtype_label
        },
        "index": index
    }
# Gather frequency bins according to the Mel scale. melspec = librosa.feature.melspectrogram( y=None, S=abs2_stft, sr=pcen_settings["sr"], n_fft=pcen_settings["n_fft"], n_mels=pcen_settings["n_mels"], htk=True, fmin=pcen_settings["fmin"], fmax=pcen_settings["fmax"]) # Compute PCEN. pcen = librosa.pcen( melspec, sr=pcen_settings["sr"], hop_length=pcen_settings["hop_length"], gain=pcen_settings["pcen_norm_exponent"], bias=pcen_settings["pcen_delta"], power=pcen_settings["pcen_power"], time_constant=pcen_settings["pcen_time_constant"]) # Convert to single floating-point precision. pcen = pcen.astype('float32') # Truncate spectrum to range 2-10 kHz. pcen = pcen[:pcen_settings["top_freq_id"], :] # Save. lms_group[clip_name] = pcen # Print elapsed time. print(str(datetime.datetime.now()) + " Finish.")
def test_pcen_axes():
    """Check that ``librosa.pcen`` is consistent across axis choices and stacking.

    Two properties are asserted, each with and without max-filtering:

    * on a 2-D input, the default call, ``axis=-1`` and the transposed call
      with ``axis=0`` all agree;
    * on a 3-D stack, the result for each slice matches running PCEN on that
      slice alone (with ``max_axis=1`` when max-filtering is enabled).
    """
    srand()

    # Squared Gaussian noise: a non-negative stand-in for power spectrograms.
    X = np.random.randn(3, 100, 50) ** 2
    first = X[0]

    # Axis handling on a single slice, without and then with max-filtering.
    for extra in ({}, {"max_size": 3}):
        P1 = librosa.pcen(first, **extra)
        P1a = librosa.pcen(first, axis=-1, **extra)
        P2 = librosa.pcen(first.T, axis=0, **extra).T
        assert np.allclose(P1, P2)
        assert np.allclose(P1, P1a)

    # Stacked input must match slice-by-slice processing, again without and
    # then with max-filtering.
    cases = (
        ({}, {}),
        ({"max_size": 3}, {"max_size": 3, "max_axis": 1}),
    )
    for slice_kwargs, stack_kwargs in cases:
        per_slice = [librosa.pcen(layer, **slice_kwargs) for layer in X]
        Pa = librosa.pcen(X, **stack_kwargs)
        for expected, stacked in zip(per_slice, Pa):
            assert np.allclose(expected, stacked)
def test_pcen_axes_nomax():
    """Run max-filtered PCEN on a 3-D stack while leaving ``max_axis`` unset."""
    srand()

    # Squared Gaussian noise: a non-negative stand-in for power spectrograms.
    stack_shape = (3, 100, 50)
    X = np.square(np.random.randn(*stack_shape))

    # NOTE(review): the sibling test passes max_axis for 3-D input, so this
    # call is presumably expected to raise; no decorator or assertion is
    # visible here - confirm the test is marked as an expected failure.
    librosa.pcen(X, max_size=3)
def __getitem__(self, idx: int):
    """Build one training example: a 3-channel spectrogram image with labels.

    A window of ``self.duration`` seconds is cut around the annotated call in
    row ``idx`` of ``self.tp``. With probability ``self.mixup_prob`` the audio
    is blended with a window around a different, randomly drawn row.

    Parameters
    ----------
    idx : int
        Row label into ``self.tp``.

    Returns
    -------
    dict
        ``"recording_id"`` : id of the primary recording.
        ``"image"`` : float32 array, channel-first. The channels are the
        normalized log-mel, PCEN and log of ``melspec**1.5``, resized to
        height ``self.img_size`` and divided by 255.
        ``"targets"`` : ``{"weak": (N_CLASSES,), "strong": (n_frames,
        N_CLASSES)}`` float32 arrays.
        ``"index"`` : the ``"index"`` field of the primary row.
    """
    clip_span = self.duration
    latest_start = CLIP_DURATION - clip_span

    def _draw_offset(call_start, call_end):
        # Pick the start time (seconds) of a clip_span window around a call.
        call_length = call_end - call_start
        if call_length > clip_span:
            # Call longer than the window: start somewhere around its onset.
            low = max(call_start - call_length / 2, 0)
            high = call_start + call_length / 2
        else:
            # Call fits: any start that keeps the whole call inside.
            low = max(call_end - clip_span, 0)
            high = call_start
        candidates = np.arange(low, high, 0.1)
        # np.random.choice raises ValueError on an empty array (e.g. a call
        # starting at 0 s, or exactly clip_span long); fall back to ``low``.
        start = np.random.choice(candidates) if candidates.size else low
        return min(latest_start, start)

    def _load_clip(recording_id, start):
        # Load the window and apply the optional waveform transforms.
        audio, rate = librosa.load(
            self.datadir / f"{recording_id}{self.suffix}",
            sr=self.sampling_rate,
            mono=True,
            offset=start,
            duration=clip_span)
        if self.waveform_transforms:
            audio = self.waveform_transforms(audio).astype(np.float32)
        return audio, rate

    def _events_in_window(recording_id, start):
        # All annotated events of the recording that overlap the window.
        stop = start + clip_span
        query_string = f"recording_id == '{recording_id}' & "
        query_string += f"t_min < {stop} & t_max > {start}"
        return self.tp.query(query_string)

    sample = self.tp.loc[idx, :]
    index = sample["index"]
    flac_id = sample["recording_id"]

    offset = _draw_offset(sample["t_min"], sample["t_max"])
    y, sr = _load_clip(flac_id, offset)

    mixture = y
    use_mixup = False
    if np.random.rand() < self.mixup_prob:
        use_mixup = True
        # Redraw until the partner is a different annotation.
        # NOTE(review): this never terminates if every row of self.tp shares
        # the same "index" value (e.g. a single-row table).
        while True:
            mixup_sample = self.tp.sample(1).reset_index(drop=True).loc[0]
            if mixup_sample["index"] != index:
                break

        mixup_flac_id = mixup_sample["recording_id"]
        # Same offset rule as the primary clip, so partner calls longer than
        # the window no longer hit an empty candidate range.
        mixup_offset = _draw_offset(mixup_sample["t_min"],
                                    mixup_sample["t_max"])
        y_mixup, _ = _load_clip(mixup_flac_id, mixup_offset)
        y_mixup = librosa.util.normalize(y_mixup)

        lam = np.random.beta(self.mixup_alpha, self.mixup_alpha)
        mixture = lam * y + (1 - lam) * y_mixup

    # ``y=`` is passed by keyword: recent librosa releases reject a
    # positional audio argument. The spectrogram is computed once, from the
    # blended signal when mixup is active.
    melspec = librosa.feature.melspectrogram(
        y=mixture, sr=sr, **self.melspectrogram_parameters)

    pcen = librosa.pcen(melspec, sr=sr, **self.pcen_parameters)
    clean_mel = librosa.power_to_db(melspec**1.5)
    melspec = librosa.power_to_db(melspec)

    if self.spectrogram_transforms:
        melspec = self.spectrogram_transforms(image=melspec)["image"]
        pcen = self.spectrogram_transforms(image=pcen)["image"]
        clean_mel = self.spectrogram_transforms(image=clean_mel)["image"]

    image = np.stack(
        [normalize_melspec(melspec),
         normalize_melspec(pcen),
         normalize_melspec(clean_mel)],
        axis=-1)
    height, width, _ = image.shape
    # cv2.resize takes (width, height): scale to height img_size, keep aspect.
    image = cv2.resize(
        image, (int(width * self.img_size / height), self.img_size))
    image = np.moveaxis(image, 2, 0)
    image = (image / 255.0).astype(np.float32)

    all_tp_events = _events_in_window(flac_id, offset)
    if use_mixup:
        mixup_tp_events = _events_in_window(mixup_flac_id, mixup_offset)

    label = np.zeros(N_CLASSES, dtype=np.float32)
    n_frames = image.shape[2]
    seconds_per_frame = clip_span / n_frames
    strong_label = np.zeros((n_frames, N_CLASSES), dtype=np.float32)

    def _mark_events(events, window_start, weight):
        # Write ``weight`` into the weak and strong labels for each event.
        for species_id in events["species_id"].unique():
            label[int(species_id)] = weight
        for _, row in events.iterrows():
            first = int((row.t_min - window_start) / seconds_per_frame)
            last = int((row.t_max - window_start) / seconds_per_frame)
            # Clamp to the window: a negative start would otherwise be read
            # as an index from the end and silently drop the event.
            first = max(first, 0)
            last = min(last, n_frames)
            strong_label[first:last, int(row.species_id)] = weight

    soft_labels = self.float_label and use_mixup
    _mark_events(all_tp_events, offset, lam if soft_labels else 1.0)
    if use_mixup:
        _mark_events(mixup_tp_events, mixup_offset,
                     (1 - lam) if self.float_label else 1.0)

    return {
        "recording_id": flac_id,
        "image": image,
        "targets": {
            "weak": label,
            "strong": strong_label
        },
        "index": index
    }
#%%
import librosa
import librosa.display as display
import numpy
import IPython.display as ipd

#%%
old_audio = '/scratch/richardso21/mp3splt files/nigliq1/NIGLIQ_short_test/NIGLIQ1_20160607_203214_2960m_00s__2980m_00s_17m_40s__17m_50s.wav'

#%%
old, sr = librosa.load(old_audio)
ipd.Audio(old_audio)

#%%
# Log-scaled mel spectrogram of the original clip. The audio is passed as
# ``y=`` because recent librosa releases reject it positionally, and the
# frequency axis is drawn on the mel scale to match the data being plotted.
Old_db = librosa.power_to_db(librosa.feature.melspectrogram(y=old, sr=sr))
display.specshow(Old_db, sr=sr, y_axis='mel')

#%%
# PCEN of the same mel spectrogram, plotted on the same axis.
mel = librosa.feature.melspectrogram(y=old, sr=sr)
new = librosa.pcen(mel)
display.specshow(new, sr=sr, y_axis='mel')

#%%
import soundfile as sf

# NOTE(review): ``new`` is a 2-D PCEN spectrogram, not a waveform, so this
# writes the matrix as multi-channel sample data rather than audible sound.
# Left unchanged; confirm what this cell is meant to produce.
sf.write('pcen_test.wav', new, samplerate=sr)
ipd.Audio('pcen_test.wav')
#%%
def test_pcen_axes_nomax():
    """Run max-filtered PCEN on a 3-D stack while leaving ``max_axis`` unset."""
    srand()

    # Squared Gaussian noise: a non-negative stand-in for power spectrograms.
    stack_shape = (3, 100, 50)
    X = np.square(np.random.randn(*stack_shape))

    # NOTE(review): the sibling test passes max_axis for 3-D input, so this
    # call is presumably expected to raise; no decorator or assertion is
    # visible here - confirm the test is marked as an expected failure.
    librosa.pcen(X, max_size=3)
# Frequency-averaged PCEN values, one per frame, accumulated across blocks.
pcen_blocks = []

# zi=None lets the first pcen call initialize the filter delays to steady
# state; afterwards zi carries the filter state from block to block.
zi = None

for y_block in stream:
    # STFT of the block without padding (center=False).
    D = librosa.stft(y_block, n_fft=n_fft, hop_length=hop_length,
                     center=False)

    # PCEN on the magnitude spectrum, seeded with the delays returned by the
    # previous call; the final delays (return_zf=True) feed the next block.
    P, zi = librosa.pcen(np.abs(D), sr=sr, hop_length=hop_length,
                         zi=zi, return_zf=True)

    # Average over frequency and append one value per frame.
    pcen_blocks.extend(np.mean(P, axis=0))

# Cast to a numpy array for use downstream.
pcen_blocks = np.asarray(pcen_blocks)

#####################################################################
# For the sake of comparison, let's see how it would look had we
# run PCEN on the entire spectrum without block-wise processing.
# The file is loaded at the same rate the streaming pass used (sr), not a
# hard-coded 44100, so both passes see identically sampled audio.
y, sr = librosa.load(filename, sr=sr)

# Keep the same parameters as before
D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length, center=False)
def test_pcen_axes():
    """Check that ``librosa.pcen`` is consistent across axis choices and stacking.

    Two properties are asserted, each with and without max-filtering:

    * on a 2-D input, the default call, ``axis=-1`` and the transposed call
      with ``axis=0`` all agree;
    * on a 3-D stack, the result for each slice matches running PCEN on that
      slice alone (with ``max_axis=1`` when max-filtering is enabled).
    """
    srand()

    # Squared Gaussian noise: a non-negative stand-in for power spectrograms.
    X = np.random.randn(3, 100, 50) ** 2
    first = X[0]

    # Axis handling on a single slice, without and then with max-filtering.
    for extra in ({}, {"max_size": 3}):
        P1 = librosa.pcen(first, **extra)
        P1a = librosa.pcen(first, axis=-1, **extra)
        P2 = librosa.pcen(first.T, axis=0, **extra).T
        assert np.allclose(P1, P2)
        assert np.allclose(P1, P1a)

    # Stacked input must match slice-by-slice processing, again without and
    # then with max-filtering.
    cases = (
        ({}, {}),
        ({"max_size": 3}, {"max_size": 3, "max_axis": 1}),
    )
    for slice_kwargs, stack_kwargs in cases:
        per_slice = [librosa.pcen(layer, **slice_kwargs) for layer in X]
        Pa = librosa.pcen(X, **stack_kwargs)
        for expected, stacked in zip(per_slice, Pa):
            assert np.allclose(expected, stacked)
# Per-frame maxima of the PCEN output over frequency, accumulated across
# blocks.
pcen_blocks = []

# zi=None lets the first pcen call initialize the filter delays to steady
# state; afterwards zi carries the filter state from block to block.
zi = None

for y_block in stream:
    # STFT of the block without padding (center=False).
    D = librosa.stft(
        y_block,
        n_fft=n_fft,
        hop_length=hop_length,
        center=False,
    )

    # PCEN on the magnitude spectrum, seeded with the delays returned by the
    # previous call; the final delays (return_zf=True) feed the next block.
    P, zi = librosa.pcen(
        np.abs(D),
        sr=sr,
        hop_length=hop_length,
        zi=zi,
        return_zf=True,
    )

    # Take the maximum over frequency and append one value per frame.
    pcen_blocks.extend(P.max(axis=0))

# Downstream code expects a numpy array, not a list.
pcen_blocks = np.asarray(pcen_blocks)

#####################################################################
# Reference run for comparison: load the whole file at the same sampling
# rate, to be processed in one pass without block-wise streaming.
y, sr = librosa.load(filename, sr=sr)
def average_pcen(S, **kwargs):
    """Return the PCEN of ``|S|**2`` averaged along axis 1.

    Parameters
    ----------
    S : array-like
        Input spectrogram; its element-wise squared magnitude is what gets
        normalized. Presumably a (frequency, time) STFT - confirm at callers.
    **kwargs
        Forwarded unchanged to ``librosa.pcen``.

    Returns
    -------
    np.ndarray
        The PCEN output with axis 1 reduced by its mean.
    """
    power = np.square(np.abs(S))
    normalized = librosa.pcen(power, **kwargs)
    return normalized.mean(axis=1)