示例#1
0
    def transform_audio(self, y):
        '''Compute PCEN of the (log-) Mel spectrogram at several time constants.

        The signal is prefixed with a time-reversed copy of itself so that
        the PCEN smoothing filter is already "warmed up" when it reaches the
        real audio; the frames belonging to the prefix are then discarded.

        Parameters
        ----------
        y : np.ndarray
            The audio buffer

        Returns
        -------
        data : dict
            data['mag'] : np.ndarray
                ``self._index`` applied to the stacked PCEN layers, which are
                assembled here as (n_t_constants, n_mels, n_frames).
                NOTE(review): the final layout depends on ``self._index``,
                which is not visible from this block.
        '''
        # Number of frames of the un-padded signal.  PCEN returns an array
        # of the same shape as its input, so the Mel spectrogram alone is
        # enough to determine it.
        n_frames = melspectrogram(y=y,
                                  sr=self.sr,
                                  hop_length=self.hop_length,
                                  n_fft=self.n_fft,
                                  n_mels=self.n_mels).shape[1]

        # Double the audio with a reversed copy in front, to avoid the
        # zero initial-energy assumption of the smoothing filter.
        y = np.concatenate((y[::-1], y))

        S = melspectrogram(y=y,
                           sr=self.sr,
                           hop_length=self.hop_length,
                           n_fft=self.n_fft,
                           n_mels=self.n_mels)
        if self.log:
            S = amplitude_to_db(S, ref=np.max)

        # tau: duration of one hop in seconds; time constants are tau * 2**i
        t_base = self.hop_length / self.sr
        t_constants = t_base * np.array(
            [2**i for i in range(self.n_t_constants)])

        pcen_layers = []
        for T in t_constants:
            P = pcen(S,
                     sr=self.sr,
                     hop_length=self.hop_length,
                     time_constant=T)
            # Keep exactly the last n_frames columns (the real audio).  The
            # previous slice started one column too late and returned only
            # n_frames - 1 frames.
            P = P[:, -n_frames:]
            pcen_layers.append(to_dtype(P, self.dtype))

        pcen_layers = to_dtype(np.asarray(pcen_layers), self.dtype)

        # Same return convention as the Mel spectrogram feature extractor
        return {'mag': self._index(pcen_layers)}
示例#2
0
    def to_pcen(self, gain=0.8, bias=10.0, power=0.25, time_constant=0.06):
        """Build a new MelSpectrogram holding the PCEN of this one.

        Parameter descriptions follow the librosa documentation:
        https://librosa.org/doc/latest/generated/librosa.pcen.html

        Args:
            gain: Gain factor; typical values are slightly below 1
                [default: 0.8]
            bias: Bias point of the nonlinear compression [default: 10.0]
            power: Compression exponent, typically between 0 and 0.5.
                Smaller values compress more strongly; in the limit
                power=0 the polynomial compression becomes logarithmic
                [default: 0.25]
            time_constant: Time constant of the IIR smoothing filter, in
                seconds [default: 0.06]

        Returns:
            A MelSpectrogram whose data is the per-channel energy normalized
            version of ``self.S``; sample rate, hop length, fmin and fmax are
            carried over from this instance.
        """
        # Normalize first, then wrap the result with the unchanged metadata
        normalized = pcen(
            self.S,
            sr=self.sample_rate,
            hop_length=self.hop_length,
            gain=gain,
            bias=bias,
            power=power,
            time_constant=time_constant,
        )
        metadata = (self.sample_rate, self.hop_length, self.fmin, self.fmax)
        return MelSpectrogram(normalized, *metadata)
示例#3
0
def generate_spectrogram(wav, sample_rate, spec_opts):
    """Compute an STFT-based spectrogram, optionally Mel-scaled and PCEN-normalized.

    Options missing from ``spec_opts`` are taken from ``DEFAULT_OPTS``.

    Args:
        wav: Audio samples handed to ``librosa.stft``.
        sample_rate: Sampling rate, used for the Mel filterbank.
        spec_opts: Mapping of user options. Recognized keys: ``n_fft``,
            ``hop_length``, ``window``, ``type`` (defaults to ``"mel"``),
            ``n_mels`` and ``pcen`` (a mapping of PCEN overrides).

    Returns:
        Tuple ``(spec, opts)``: the spectrogram and the dict of options that
        were actually applied.
    """
    # STFT options: user value if present, otherwise the module default
    opts = {
        key: spec_opts.get(key, DEFAULT_OPTS[key])
        for key in ("n_fft", "hop_length", "window")
    }
    spec = librosa.stft(wav, **opts)

    if spec_opts.get("type", "mel") == "mel":
        opts["n_mels"] = spec_opts.get("n_mels", DEFAULT_OPTS["n_mels"])
        opts["sr"] = sample_rate
        power_spec = np.abs(spec)**2
        spec = librosa.feature.melspectrogram(S=power_spec, **opts)
        spec = spec.astype(np.float32)

    pcen_overrides = spec_opts.get("pcen", {})
    if not pcen_overrides:
        return spec, opts

    # Merge the overrides into a copy of the default PCEN settings
    pcen_opts = common_utils.deep_dict_update(DEFAULT_OPTS["pcen"],
                                              pcen_overrides,
                                              copy=True)
    opts["pcen"] = pcen_opts
    # The input is scaled by 2**31 before normalization
    scaled = spec * (2**31)
    spec = librosa.pcen(scaled, **pcen_opts)
    return spec, opts
示例#4
0
def test_pcen_ref():
    """Check ``librosa.pcen`` against known results in two degenerate settings."""
    srand()
    # Squared Gaussian noise serves as a (non-negative) power spectrogram
    power_spec = np.random.randn(100, 50)**2
    unit_ref = np.ones_like(power_spec)

    # Edge-case settings shared by both calls
    edge_params = dict(gain=1, bias=0, power=1, b=1, eps=1e-20)

    # Without an explicit reference the result should be all ones
    normalized = librosa.pcen(power_spec, **edge_params)
    assert np.allclose(normalized, unit_ref)

    # With ref=ones we should get X / (eps + ones) == X
    passthrough = librosa.pcen(power_spec, ref=unit_ref, **edge_params)
    assert np.allclose(passthrough, power_spec)
    def audio_to_pcen(self, audio, pathname=None):
        """Compute a Mel magnitude spectrogram and, unless caching, its PCEN.

        Parameters
        ----------
        audio : np.ndarray
            Audio samples passed to ``librosa.feature.melspectrogram``.
        pathname : str or None
            When given, the Mel magnitude spectrogram is saved (as float32)
            to ``pathname`` with ``.wav`` replaced by ``_<sampling_rate>.npy``
            and returned immediately, WITHOUT PCEN being applied.

        Returns
        -------
        np.ndarray
            The float32 PCEN spectrogram when ``pathname`` is None; otherwise
            the un-normalized Mel magnitude spectrogram.
        """
        # power=1 gives a magnitude spectrum instead of a power spectrum.
        # The audio is passed by keyword: positional use is rejected by
        # librosa >= 0.10, while ``y=`` works on every version.
        mag_spectrogram = librosa.feature.melspectrogram(
            y=audio,
            sr=self.sampling_rate,
            n_mels=self.n_mels,
            hop_length=self.hop_length,
            n_fft=self.n_fft,
            fmin=self.fmin,
            fmax=self.fmax,
            power=1)
        if pathname is not None:
            # NOTE(review): this early return means PCEN is never computed
            # (nor saved) when a pathname is supplied.  An unreachable second
            # save of the PCEN result to the same path has been removed;
            # confirm that caching the raw Mel magnitudes is the intent.
            np.save(pathname.replace('.wav', '_{}.npy'.format(self.sampling_rate)), mag_spectrogram.astype(np.float32))
            return mag_spectrogram

        # https://www.kaggle.com/c/freesound-audio-tagging-2019/discussion/91859#529792
        pcen_spectrogram = librosa.pcen(mag_spectrogram,
                                        sr=self.sampling_rate,
                                        hop_length=self.hop_length,
                                        gain=0.5,
                                        bias=0.001,
                                        power=0.2,
                                        time_constant=0.4,
                                        eps=1e-9)
        return pcen_spectrogram.astype(np.float32)
示例#6
0
def test_pcen_ref():
    """Verify ``librosa.pcen`` edge cases with and without a reference array."""
    srand()

    # Random non-negative input standing in for a power spectrogram
    spec = np.random.randn(100, 50)**2
    all_ones = np.ones_like(spec)

    def run_pcen(**extra):
        # gain=1, bias=0, power=1, b=1 are the edge-case settings under test
        return librosa.pcen(spec, gain=1, bias=0, power=1, b=1,
                            eps=1e-20, **extra)

    # No reference supplied: the output is expected to be all ones
    assert np.allclose(run_pcen(), all_ones)

    # ref=ones: we should get X / (eps + ones) == X
    assert np.allclose(run_pcen(ref=all_ones), spec)
示例#7
0
    def transform_audio(self, y):
        '''Compute the PCEN of the (log-) Mel spectrogram.

        Parameters
        ----------
        y : np.ndarray
            The audio buffer

        Returns
        -------
        data : dict
            data['mag'] : np.ndarray
                The PCEN magnitude, cast with ``to_dtype`` and indexed with
                ``self.idx``.
                NOTE(review): the original docstring gives the shape as
                (n_frames, n_bins); that depends on ``self.idx``, which is
                not visible from this block.
        '''
        S = melspectrogram(y=y,
                           sr=self.sr,
                           hop_length=self.hop_length,
                           n_mels=self.n_mels,
                           n_fft=self.n_fft)

        if self.log:
            S = amplitude_to_db(S, ref=np.max)

        # Use the instance sampling rate: the bare name ``sr`` is not
        # defined in this scope and raised NameError.
        P = pcen(S, sr=self.sr, hop_length=self.hop_length)

        P = to_dtype(P, self.dtype)

        # Same return convention as the Mel spectrogram feature extractor
        return {'mag': P[self.idx]}
def wav_to_h5(input_wav_dir):
    """Convert one audio file into an HDF5 file of spectrogram features.

    The file is loaded with ``librosa.load`` (default settings), turned into
    an 80-band Mel magnitude spectrogram, and then either PCEN-normalized or
    converted to decibels depending on the module-level ``PCEN`` flag.  The
    result is written to ``workingfiles/spect/<parent dir>/<file name>.h5``
    with two datasets: ``features`` (transposed spectrogram) and ``times``
    (evenly spaced timestamps from 0 to the clip duration).

    Parameters
    ----------
    input_wav_dir : str
        Path of the audio file (despite the name, a file, not a directory).
    """
    a, sr = librosa.load(input_wav_dir)

    # power=1 -> magnitude Mel spectrogram.  Audio passed by keyword, which
    # librosa >= 0.10 requires and older versions also accept.
    a_log = librosa.feature.melspectrogram(y=a,
                                           sr=sr,
                                           n_fft=1024,
                                           hop_length=315,
                                           n_mels=80,
                                           fmax=11000,
                                           power=1)
    # A plain truthiness test guarantees a_out is always bound; the former
    # `== True` / `== False` chain left it undefined for any other value.
    if PCEN:
        # NOTE(review): pcen runs with its default sr/hop_length here, not
        # the hop_length=315 used above -- confirm this is intended.
        a_out = librosa.pcen(a_log * (2**31))
    else:
        # NOTE(review): power_to_db is applied to a magnitude (power=1)
        # spectrogram -- confirm amplitude_to_db was not intended.
        a_out = librosa.power_to_db(a_log, ref=np.max)

    duration = librosa.get_duration(y=a, sr=sr)
    frames = a_out.shape[1]
    # One timestamp per frame, spread evenly over the clip
    timeframe = np.linspace(0, duration, num=frames)

    dir_name = os.path.basename(os.path.dirname(input_wav_dir))
    name = os.path.basename(input_wav_dir)

    with h5py.File('workingfiles/spect/{0}/{1}.h5'.format(dir_name, name),
                   'w') as data_file:
        data_file.create_dataset('features', data=a_out.T, dtype='float32')
        data_file.create_dataset('times', data=timeframe, dtype='float32')
示例#9
0
def raw_to_pcen(audio, sampling_rate, window_size, hop_length, n_freqs):
    """Turn a 1D array of audio samples into a PCEN Mel spectrogram.

    The parameter choices might not be optimal.

    Parameters:
        audio: 1D numpy array containing the audio.
        sampling_rate: Sampling rate of audio.
        window_size: STFT window size (passed as ``n_fft``).
        hop_length: Distance between successive STFT windows.
        n_freqs: Number of mel frequency bins.

    Returns:
        PCEN spectrogram, bins x time.
    """
    # Centered STFT; only the magnitude is kept
    complex_stft = librosa.stft(audio,
                                n_fft=window_size,
                                hop_length=hop_length,
                                center=True)
    stft_magnitude = np.abs(complex_stft)

    # Project the magnitudes onto the Mel scale
    mel_bands = librosa.feature.melspectrogram(S=stft_magnitude,
                                               sr=sampling_rate,
                                               n_mels=n_freqs)

    # Per-channel energy normalization with a fixed 0.285 s time constant
    return librosa.pcen(mel_bands,
                        sr=sampling_rate,
                        hop_length=hop_length,
                        time_constant=0.285)
示例#10
0
    def __test(gain, bias, power, b, time_constant, eps, ms, S, Pexp):
        """Run ``librosa.pcen`` on ``S`` and check its basic output contract.

        Verifies that a complex input triggers a warning mentioning
        "complex", that the output has the input's shape and is finite and
        non-negative, and (when ``Pexp`` is given) that it matches ``Pexp``.
        """
        pcen_kwargs = dict(gain=gain,
                           bias=bias,
                           power=power,
                           time_constant=time_constant,
                           eps=eps,
                           b=b,
                           max_size=ms)

        # Reset the filters so every warning is recorded
        warnings.resetwarnings()
        warnings.simplefilter('always')
        with warnings.catch_warnings(record=True) as caught:
            result = librosa.pcen(S, **pcen_kwargs)

            if np.issubdtype(S.dtype, np.complexfloating):
                # Complex input must have produced a warning about it
                assert len(caught) > 0
                first_message = str(caught[0].message)
                assert 'complex' in first_message.lower()

        assert result.shape == S.shape
        assert np.all(result >= 0)
        assert np.all(np.isfinite(result))

        if Pexp is None:
            return
        assert np.allclose(result, Pexp)
示例#11
0
def compute_pcen(audio, sr):
    """Compute a float32 PCEN Mel spectrogram using ``get_pcen_settings()``.

    Parameters
    ----------
    audio : np.ndarray
        Mono audio; validated with ``librosa.util.valid_audio``.
    sr : int
        Sampling rate of ``audio``. The audio is resampled when it differs
        from the rate in the PCEN settings.

    Returns
    -------
    np.ndarray
        PCEN spectrogram in single precision, restricted to the first
        ``top_freq_id`` Mel bands.
    """
    # Load settings.
    pcen_settings = get_pcen_settings()

    # Validate audio
    librosa.util.valid_audio(audio, mono=True)

    # Map to the range [-2**31, 2**31[
    audio = (audio * (2**31)).astype('float32')

    # Resample to the configured rate (the original comment states
    # 22,050 Hz).  The rates are passed by keyword: librosa >= 0.10 rejects
    # them positionally, and the keyword names are valid on older versions.
    if not sr == pcen_settings["sr"]:
        audio = librosa.resample(audio,
                                 orig_sr=sr,
                                 target_sr=pcen_settings["sr"])
        sr = pcen_settings["sr"]

    # Compute Short-Term Fourier Transform (STFT).
    stft = librosa.stft(
        audio,
        n_fft=pcen_settings["n_fft"],
        win_length=pcen_settings["win_length"],
        hop_length=pcen_settings["hop_length"],
        window=pcen_settings["window"])

    # Compute squared magnitude coefficients.
    abs2_stft = (stft.real*stft.real) + (stft.imag*stft.imag)

    # Gather frequency bins according to the Mel scale.
    # NB: as of librosa v0.6.2, melspectrogram is type-unstable and thus
    # returns 64-bit output even with a 32-bit input. Therefore, we need
    # to convert PCEN to single precision eventually. This might not be
    # necessary in the future, if the whole PCEN pipeline is kept type-stable.
    melspec = librosa.feature.melspectrogram(
        y=None,
        S=abs2_stft,
        sr=pcen_settings["sr"],
        n_fft=pcen_settings["n_fft"],
        n_mels=pcen_settings["n_mels"],
        htk=True,
        fmin=pcen_settings["fmin"],
        fmax=pcen_settings["fmax"])

    # Compute PCEN.
    pcen = librosa.pcen(
        melspec,
        sr=pcen_settings["sr"],
        hop_length=pcen_settings["hop_length"],
        gain=pcen_settings["pcen_norm_exponent"],
        bias=pcen_settings["pcen_delta"],
        power=pcen_settings["pcen_power"],
        time_constant=pcen_settings["pcen_time_constant"])

    # Convert to single floating-point precision.
    pcen = pcen.astype('float32')

    # Keep only the first top_freq_id Mel bands (per the original comment,
    # this truncates the spectrum to the 2-10 kHz range).
    pcen = pcen[:pcen_settings["top_freq_id"], :]

    return pcen
示例#12
0
def compute_pcen(audio, sr):
    """Compute a float32 PCEN Mel spectrogram using ``get_pcen_settings()``.

    Parameters
    ----------
    audio : np.ndarray
        Audio samples. They are scaled by 2**31 before analysis.
    sr : int
        Sampling rate of ``audio``. The audio is resampled when it differs
        from the rate in the PCEN settings.

    Returns
    -------
    np.ndarray
        PCEN spectrogram in single precision, restricted to the first
        ``top_freq_id`` Mel bands.
    """
    # Load settings.
    pcen_settings = get_pcen_settings()
    target_sr = pcen_settings["sr"]

    # Map to the range [-2**31, 2**31[
    audio = (audio * (2**31)).astype('float32')

    # Resample to the configured rate (the original comment states
    # 22,050 Hz).  Keyword arguments are required by librosa >= 0.10 and
    # are accepted by earlier versions as well.
    if not sr == target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
        sr = target_sr

    # Compute Short-Term Fourier Transform (STFT).
    stft = librosa.stft(
        audio,
        n_fft=pcen_settings["n_fft"],
        win_length=pcen_settings["win_length"],
        hop_length=pcen_settings["hop_length"],
        window=pcen_settings["window"])

    # Compute squared magnitude coefficients.
    abs2_stft = (stft.real*stft.real) + (stft.imag*stft.imag)

    # Gather frequency bins according to the Mel scale.
    # NB: as of librosa v0.6.2, melspectrogram is type-unstable and thus
    # returns 64-bit output even with a 32-bit input. Therefore, we need
    # to convert PCEN to single precision eventually. This might not be
    # necessary in the future, if the whole PCEN pipeline is kept type-stable.
    melspec = librosa.feature.melspectrogram(
        y=None,
        S=abs2_stft,
        sr=target_sr,
        n_fft=pcen_settings["n_fft"],
        n_mels=pcen_settings["n_mels"],
        htk=True,
        fmin=pcen_settings["fmin"],
        fmax=pcen_settings["fmax"])

    # Compute PCEN.
    pcen = librosa.pcen(
        melspec,
        sr=target_sr,
        hop_length=pcen_settings["hop_length"],
        gain=pcen_settings["pcen_norm_exponent"],
        bias=pcen_settings["pcen_delta"],
        power=pcen_settings["pcen_power"],
        time_constant=pcen_settings["pcen_time_constant"])

    # Convert to single floating-point precision.
    pcen = pcen.astype('float32')

    # Keep only the first top_freq_id Mel bands (per the original comment,
    # this truncates the spectrum to the 2-10 kHz range).
    pcen = pcen[:pcen_settings["top_freq_id"], :]

    return pcen
示例#13
0
    def __getitem__(self, idx: int):
        """Load a whole recording and return one image per fixed-length patch.

        The recording is cut into ``CLIP_DURATION // self.duration``
        consecutive patches. Each patch becomes a three-channel image made
        of the dB Mel spectrogram, its PCEN, and the dB of the Mel
        spectrogram raised to the power 1.5.

        Returns:
            dict with ``"recording_id"`` and ``"image"`` (array stacking the
            per-patch images, each channel-first and float32).
        """
        sample = self.df.loc[idx, :]
        flac_id = sample["recording_id"]

        y, sr = librosa.load(self.datadir / f"{flac_id}.{self.format}",
                             sr=self.sampling_rate,
                             mono=True,
                             res_type="kaiser_fast")
        if self.waveform_transforms:
            y = self.waveform_transforms(y).astype(np.float32)

        # Choose the analysed band; the values are stored on the shared
        # parameter dict (unchanged from the original behavior).
        if self.frequency_range == "high":
            self.melspectrogram_parameters["fmin"] = 6000
            self.melspectrogram_parameters["fmax"] = 16000
            self.melspectrogram_parameters["n_mels"] = 96
        else:
            self.melspectrogram_parameters["fmin"] = 0
            self.melspectrogram_parameters["fmax"] = 6000
            self.melspectrogram_parameters["n_mels"] = 96

        images = []
        n_images = CLIP_DURATION // self.duration
        for i in range(n_images):
            y_patch = y[i * self.duration * sr:(i + 1) * self.duration * sr]

            # Audio passed as ``y=``: positional audio is rejected by
            # librosa >= 0.10, the keyword works on every version.
            melspec = librosa.feature.melspectrogram(
                y=y_patch, sr=sr, **self.melspectrogram_parameters)
            # PCEN and the "clean" channel are derived from the power
            # spectrogram before it is converted to dB.
            pcen = librosa.pcen(melspec, sr=sr, **self.pcen_parameters)
            clean_mel = librosa.power_to_db(melspec**1.5)
            melspec = librosa.power_to_db(melspec)

            if self.spectrogram_transforms:
                melspec = self.spectrogram_transforms(image=melspec)["image"]
                pcen = self.spectrogram_transforms(image=pcen)["image"]
                clean_mel = self.spectrogram_transforms(
                    image=clean_mel)["image"]

            norm_melspec = normalize_melspec(melspec)
            norm_pcen = normalize_melspec(pcen)
            norm_clean_mel = normalize_melspec(clean_mel)
            image = np.stack([norm_melspec, norm_pcen, norm_clean_mel],
                             axis=-1)

            # Resize to height self.img_size, keeping the aspect ratio
            # (cv2.resize takes the target size as (width, height)).
            height, width, _ = image.shape
            image = cv2.resize(
                image, (int(width * self.img_size / height), self.img_size))
            image = np.moveaxis(image, 2, 0)  # channels first
            image = (image / 255.0).astype(np.float32)

            images.append(image)

        return {"recording_id": flac_id, "image": np.asarray(images)}
示例#14
0
    def __getitem__(self, idx_: int):
        """Return the three-channel spectrogram image of one clip segment.

        ``idx_`` enumerates (clip, segment) pairs: each clip contributes
        ``CLIP_DURATION // self.duration`` consecutive segments.

        Returns:
            dict with ``"recording_id"`` and ``"image"`` (channel-first
            float32 array: dB Mel, PCEN, dB of Mel**1.5).
        """
        n_chunk_per_clip = CLIP_DURATION // self.duration
        idx = idx_ // n_chunk_per_clip
        segment_id = idx_ % n_chunk_per_clip

        sample = self.df.loc[idx, :]
        flac_id = sample["recording_id"]

        # Only the requested segment is read from disk
        offset = segment_id * self.duration
        y, sr = librosa.load(self.datadir / f"{flac_id}.wav",
                             sr=self.sampling_rate,
                             mono=True,
                             offset=offset,
                             duration=self.duration)
        if self.waveform_transforms:
            y = self.waveform_transforms(y).astype(np.float32)

        # Choose the analysed band; the values are stored on the shared
        # parameter dict (unchanged from the original behavior).
        if self.frequency_range == "high":
            self.melspectrogram_parameters["fmin"] = 6000
            self.melspectrogram_parameters["fmax"] = 16000
            self.melspectrogram_parameters["n_mels"] = 96
        else:
            self.melspectrogram_parameters["fmin"] = 0
            self.melspectrogram_parameters["fmax"] = 6000
            self.melspectrogram_parameters["n_mels"] = 96

        # Audio passed as ``y=``: positional audio is rejected by
        # librosa >= 0.10, the keyword works on every version.
        melspec = librosa.feature.melspectrogram(
            y=y, sr=sr, **self.melspectrogram_parameters)
        # PCEN and the "clean" channel are derived from the power
        # spectrogram before it is converted to dB.
        pcen = librosa.pcen(melspec, sr=sr, **self.pcen_parameters)
        clean_mel = librosa.power_to_db(melspec**1.5)
        melspec = librosa.power_to_db(melspec)

        if self.spectrogram_transforms:
            melspec = self.spectrogram_transforms(image=melspec)["image"]
            pcen = self.spectrogram_transforms(image=pcen)["image"]
            clean_mel = self.spectrogram_transforms(image=clean_mel)["image"]

        norm_melspec = normalize_melspec(melspec)
        norm_pcen = normalize_melspec(pcen)
        norm_clean_mel = normalize_melspec(clean_mel)
        image = np.stack([norm_melspec, norm_pcen, norm_clean_mel], axis=-1)

        height, width, _ = image.shape
        if isinstance(self.img_size, int):
            # Integer size: fix the height, keep the aspect ratio
            image = cv2.resize(
                image, (int(width * self.img_size / height), self.img_size))
        else:
            # Otherwise img_size is handed to cv2.resize as the target size
            image = cv2.resize(image, tuple(self.img_size))
        image = np.moveaxis(image, 2, 0)  # channels first
        image = (image / 255.0).astype(np.float32)

        return {"recording_id": flac_id, "image": image}
示例#15
0
 def get_pcen(self, audio, sr, frame_step):
     """Compute PCEN features from 80-band filterbank energies.

     Args:
         audio: Audio samples handed to ``fbank``.
         sr: Sampling rate of ``audio``.
         frame_step: Step between frames, in seconds (``winstep``).

     Returns:
         PCEN features transposed back to frames x filters.
     """
     filterbank_energies, _ = fbank(audio,
                                    samplerate=sr,
                                    winlen=0.025,
                                    winstep=frame_step,
                                    nfilt=80,
                                    nfft=512)
     # Magnitude spectra, laid out as nfilt x nframe
     magnitudes = np.sqrt(filterbank_energies / 2).T
     # Seed the smoothing filter with the first frame, as a column vector
     initial_state = magnitudes[:, 0].reshape(-1, 1)
     hop_samples = int(frame_step * sr)
     normalized = librosa.pcen(magnitudes,
                               sr=sr,
                               hop_length=hop_samples,
                               zi=initial_state)
     return normalized.T
示例#16
0
    def __getitem__(self, idx: int):
        """Return the spectrogram image of one (possibly half-shifted) segment.

        Each clip yields ``2 * n_chunk_per_clip - 1`` segments: first the
        ``n_chunk_per_clip`` back-to-back segments, then the segments
        shifted by half a segment duration.

        Returns:
            dict with ``"recording_id"`` and ``"image"`` (channel-first
            float32 array: dB Mel, PCEN, dB of Mel**1.5).
        """
        n_chunk_per_clip = CLIP_DURATION // self.duration
        clip_idx = idx // (n_chunk_per_clip * 2 - 1)
        segment_id = idx % (n_chunk_per_clip * 2 - 1)

        sample = self.df.loc[clip_idx, :]
        flac_id = sample["recording_id"]

        if segment_id < n_chunk_per_clip:
            # Regular, non-overlapping grid
            offset = segment_id * self.duration
        else:
            # Second pass: same grid shifted by half a segment
            offset = (segment_id -
                      n_chunk_per_clip) * self.duration + self.duration / 2
        y, sr = librosa.load(self.datadir / f"{flac_id}.flac",
                             sr=self.sampling_rate,
                             mono=True,
                             offset=offset,
                             duration=self.duration,
                             res_type="kaiser_fast")
        if self.waveform_transforms:
            y = self.waveform_transforms(y).astype(np.float32)

        # Audio passed as ``y=``: positional audio is rejected by
        # librosa >= 0.10, the keyword works on every version.
        melspec = librosa.feature.melspectrogram(
            y=y, sr=sr, **self.melspectrogram_parameters)
        # PCEN and the "clean" channel are derived from the power
        # spectrogram before it is converted to dB.
        pcen = librosa.pcen(melspec, sr=sr, **self.pcen_parameters)
        clean_mel = librosa.power_to_db(melspec**1.5)
        melspec = librosa.power_to_db(melspec)

        if self.spectrogram_transforms:
            melspec = self.spectrogram_transforms(image=melspec)["image"]
            pcen = self.spectrogram_transforms(image=pcen)["image"]
            clean_mel = self.spectrogram_transforms(image=clean_mel)["image"]

        norm_melspec = normalize_melspec(melspec)
        norm_pcen = normalize_melspec(pcen)
        norm_clean_mel = normalize_melspec(clean_mel)
        image = np.stack([norm_melspec, norm_pcen, norm_clean_mel], axis=-1)

        # Resize to height self.img_size, keeping the aspect ratio
        height, width, _ = image.shape
        image = cv2.resize(
            image, (int(width * self.img_size / height), self.img_size))
        image = np.moveaxis(image, 2, 0)  # channels first
        image = (image / 255.0).astype(np.float32)

        return {"recording_id": flac_id, "image": image}
示例#17
0
    def create_spectrogram(self):
        """Build the spectrogram of ``self.audio`` according to the options.

        Steps: STFT, optional Mel projection (``scale == "Mel"``), magnitude,
        then either PCEN (returned immediately) or the optional chain
        noise removal -> normalization -> conversion to dB.

        Returns:
            np.ndarray: the resulting spectrogram.
        """
        hop_length = self.options["hop_length"]
        if hop_length is not None:
            hop_length = int(hop_length)

        # n_fft is passed by keyword: librosa >= 0.10 only accepts the
        # signal positionally; the keyword is valid on older versions too.
        spectro = librosa.stft(
            self.audio.get_data(),
            n_fft=int(self["n_fft"]),
            hop_length=hop_length,
            window=self["window"],
        )

        if self["scale"] == "Mel":
            # NOTE(review): the Mel filterbank is applied to the complex
            # STFT and the magnitude is taken afterwards; the usual order is
            # magnitude/power first. No sampling rate is passed either, so
            # librosa's default is used. Left unchanged -- confirm intent.
            spectro = librosa.feature.melspectrogram(
                S=spectro,
                n_mels=self.options._options.get("n_mels", 256),
            )

        spec = np.abs(spectro)

        if self["pcen"]:
            # TODO: test this!!!
            # PCEN short-circuits: noise removal, normalization and dB
            # conversion below are skipped.
            return librosa.pcen(spec * (2**31), bias=1, power=0.25)

        if self["remove_noise"]:
            # TODO: check SNR to remove noise?
            spec = self.remove_noise(
                spec,
                self["nr_N"],
                self["nr_hist_rel_size"],
                self["nr_window_smoothing"],
            )
            spec = spec.astype("float32")

        if self["normalize"]:
            spec = librosa.util.normalize(spec)

        if self["to_db"]:
            spec = librosa.amplitude_to_db(spec, ref=np.max)

        return spec
示例#18
0
    def __getitem__(self, idx_: int):
        """Return the three-channel spectrogram image of one clip segment.

        ``idx_`` enumerates (clip, segment) pairs, with
        ``CLIP_DURATION // self.duration`` segments per clip. Audio is read
        with torchaudio, resampled, and converted to a Mel spectrogram by
        ``self.melspectrogram_converter``.

        Returns:
            dict with ``"recording_id"`` and ``"image"`` (channel-first
            float32 array: dB Mel, PCEN, dB of Mel**1.5).
        """
        chunks_per_clip = CLIP_DURATION // self.duration
        clip_row, chunk_no = divmod(idx_, chunks_per_clip)

        flac_id = self.df.loc[clip_row, :]["recording_id"]

        # Read only the requested segment, addressed in source-rate samples
        start_sec = chunk_no * self.duration
        waveform, _ = torchaudio.load(
            self.datadir / f"{flac_id}.flac",
            offset=int(start_sec * DEFAULT_SR),
            num_frames=int(self.duration * DEFAULT_SR))
        # First channel only, resampled, as a float32 numpy array
        waveform = self.resampler(waveform[0]).numpy().astype(np.float32)
        if self.waveform_transforms:
            waveform = self.waveform_transforms(waveform).astype(np.float32)

        power_mel = self.melspectrogram_converter(
            torch.from_numpy(waveform)).numpy()

        # Channel order: dB Mel, PCEN, dB of Mel**1.5
        channels = [
            librosa.power_to_db(power_mel),
            librosa.pcen(power_mel,
                         sr=self.sampling_rate,
                         **self.pcen_parameters),
            librosa.power_to_db(power_mel**1.5),
        ]

        if self.spectrogram_transforms:
            channels = [
                self.spectrogram_transforms(image=channel)["image"]
                for channel in channels
            ]

        image = np.stack([normalize_melspec(channel) for channel in channels],
                         axis=-1)

        # Resize to height self.img_size, keeping the aspect ratio
        height, width, _ = image.shape
        target_size = (int(width * self.img_size / height), self.img_size)
        image = cv2.resize(image, target_size)
        image = np.moveaxis(image, 2, 0)  # channels first
        image = (image / 255.0).astype(np.float32)

        return {"recording_id": flac_id, "image": image}
示例#19
0
    def __test(gain, bias, power, b, time_constant, eps, ms, S, Pexp):
        """Exercise ``librosa.pcen`` with one parameter set and validate it.

        Checks: a warning mentioning "complex" for complex input, output
        shape equal to the input shape, finite non-negative values, and
        agreement with ``Pexp`` when one is supplied.
        """
        # Make sure no warning is filtered out before recording
        warnings.resetwarnings()
        warnings.simplefilter('always')

        with warnings.catch_warnings(record=True) as recorded:
            output = librosa.pcen(S,
                                  gain=gain,
                                  bias=bias,
                                  power=power,
                                  time_constant=time_constant,
                                  eps=eps,
                                  b=b,
                                  max_size=ms)

            input_is_complex = np.issubdtype(S.dtype, np.complexfloating)
            if input_is_complex:
                assert len(recorded) > 0
                assert 'complex' in str(recorded[0].message).lower()

        assert output.shape == S.shape
        assert np.all(output >= 0)
        assert np.all(np.isfinite(output))

        if Pexp is not None:
            assert np.allclose(output, Pexp)
示例#20
0
    def compute_mfccs(self, data):
        """Compute the feature selected by ``self.config["feature_type"]``.

        Supported types:

        * ``"log_mel"``: dB Mel spectrogram stacked vertically with the
          first- and second-order deltas of the MFCCs.
        * ``"MFCC"``: the MFCCs themselves.
        * ``"PCEN"``: PCEN of the Mel magnitude spectrogram.

        Args:
            data: Audio samples.

        Returns:
            np.ndarray of float32 features, or ``None`` for any other
            feature type (unchanged from the original behavior).
        """
        feature_type = self.config["feature_type"]

        # All librosa inputs are passed by keyword: librosa >= 0.10 rejects
        # them positionally, and the keyword names work on older versions.
        if feature_type == "PCEN":
            spec = librosa.feature.melspectrogram(y=data,
                                                  sr=self.sr,
                                                  power=1,
                                                  n_mels=self.n_mels,
                                                  hop_length=self.hop_length,
                                                  n_fft=self.n_fft)
            # NOTE(review): pcen uses its default hop_length here rather
            # than self.hop_length -- confirm this is intended.
            pcen = librosa.pcen(spec, sr=self.sr)
            return np.array(pcen, order="F").astype(np.float32)

        if feature_type not in ("log_mel", "MFCC"):
            # NOTE(review): unknown feature types silently yield None
            return None

        # MFCCs are only needed by the two remaining branches, so they are
        # no longer computed for PCEN.
        mfcc = librosa.feature.mfcc(y=data,
                                    sr=self.sr,
                                    n_mfcc=self.n_mels,
                                    hop_length=self.hop_length)
        mfcc = np.array(mfcc, order="F").astype(np.float32)

        if feature_type == "MFCC":
            return mfcc

        # feature_type == "log_mel"
        mel_spec = librosa.feature.melspectrogram(
            y=data,
            sr=self.sr,
            n_mels=self.n_mels,
            hop_length=self.hop_length,
            n_fft=self.n_fft,
            fmin=self.f_min,
            fmax=self.f_max,
        )
        mel_spec = np.array(mel_spec, order="F").astype(np.float32)

        log_mel = librosa.power_to_db(mel_spec)
        # The deltas come from the MFCCs, not from the log-Mel spectrogram
        delta = librosa.feature.delta(mfcc)
        delta_delta = librosa.feature.delta(delta)
        return np.vstack([log_mel, delta, delta_delta])
示例#21
0
    def __getitem__(self, idx: int):
        """Return a randomly positioned crop of a recording with its labels.

        A window of ``self.duration`` seconds is drawn at a random offset
        (0.1 s grid), converted to a three-channel spectrogram image, and
        labelled from the true-positive events (``self.tp``) that overlap it.

        Returns:
            dict with ``"recording_id"``, ``"image"`` (channel-first float32)
            and ``"targets"`` holding weak (clip-level) and strong
            (frame-level) labels for species and for song types.
        """
        sample = self.df.loc[idx, :]
        flac_id = sample["recording_id"]

        offset = np.random.choice(
            np.arange(0.0, CLIP_DURATION - self.duration, 0.1))
        y, sr = librosa.load(self.datadir / f"{flac_id}.wav",
                             sr=self.sampling_rate,
                             mono=True,
                             offset=offset,
                             duration=self.duration)
        if self.waveform_transforms:
            y = self.waveform_transforms(y).astype(np.float32)

        # Audio passed as ``y=``: positional audio is rejected by
        # librosa >= 0.10, the keyword works on every version.
        melspec = librosa.feature.melspectrogram(
            y=y, sr=sr, **self.melspectrogram_parameters)
        # PCEN and the "clean" channel are derived from the power
        # spectrogram before it is converted to dB.
        pcen = librosa.pcen(melspec, sr=sr, **self.pcen_parameters)
        clean_mel = librosa.power_to_db(melspec**1.5)
        melspec = librosa.power_to_db(melspec)

        if self.spectrogram_transforms:
            melspec = self.spectrogram_transforms(image=melspec)["image"]
            pcen = self.spectrogram_transforms(image=pcen)["image"]
            clean_mel = self.spectrogram_transforms(image=clean_mel)["image"]

        norm_melspec = normalize_melspec(melspec)
        norm_pcen = normalize_melspec(pcen)
        norm_clean_mel = normalize_melspec(clean_mel)
        image = np.stack([norm_melspec, norm_pcen, norm_clean_mel], axis=-1)

        # Resize to height self.img_size, keeping the aspect ratio
        height, width, _ = image.shape
        image = cv2.resize(
            image, (int(width * self.img_size / height), self.img_size))
        image = np.moveaxis(image, 2, 0)  # channels first
        image = (image / 255.0).astype(np.float32)

        # Events that overlap the window [offset, tail]
        tail = offset + self.duration
        query_string = f"recording_id == '{flac_id}' & "
        query_string += f"t_min < {tail} & t_max > {offset}"
        all_tp_events = self.tp.query(query_string)

        label = np.zeros(N_CLASSES, dtype=np.float32)
        songtype_label = np.zeros(N_CLASSES + 2, dtype=np.float32)

        # One strong-label row per image column
        n_frames = image.shape[2]
        seconds_per_frame = self.duration / n_frames
        strong_label = np.zeros((n_frames, N_CLASSES), dtype=np.float32)
        songtype_strong_label = np.zeros((n_frames, N_CLASSES + 2),
                                         dtype=np.float32)

        for species_id in all_tp_events["species_id"].unique():
            label[int(species_id)] = 1.0

        for species_id_song_id in all_tp_events["species_id_song_id"].unique():
            songtype_label[CLASS_MAP[species_id_song_id]] = 1.0

        for _, row in all_tp_events.iterrows():
            # Clamp to the window. An event starting before the crop used
            # to give a negative start index, which numpy treats as
            # "from the end" and which produced an empty slice, silently
            # dropping the event's frame-level labels.
            start_index = max(
                0, int((row.t_min - offset) / seconds_per_frame))
            end_index = min(
                n_frames, int((row.t_max - offset) / seconds_per_frame))

            # int() mirrors the weak-label loop above and guards against a
            # non-integer species id coming out of iterrows().
            strong_label[start_index:end_index, int(row.species_id)] = 1.0
            songtype_strong_label[start_index:end_index,
                                  CLASS_MAP[row.species_id_song_id]] = 1.0

        return {
            "recording_id": flac_id,
            "image": image,
            "targets": {
                "weak": label,
                "strong": strong_label,
                "weak_songtype": songtype_label,
                "strong_songtype": songtype_strong_label
            }
        }
示例#22
0
    def __getitem__(self, idx: int):
        """Return a crop around a true-positive or false-positive event.

        Even indices select row ``idx // 2`` of ``self.tp``. Odd indices
        draw a random row of ``self.fp`` whose recording has no entry in
        ``self.tp``. The crop is either centred on the event
        (``self.centering``) or placed at a random offset that contains it.

        Returns:
            dict with ``"recording_id"``, ``"image"`` (channel-first
            float32), ``"targets"`` (weak and strong species labels, all
            zeros for false-positive samples) and ``"index"``.
        """
        sample_tp = idx % 2 == 0
        if sample_tp:
            idx = idx // 2
            sample = self.tp.loc[idx, :]
        else:
            # Rejection-sample until the recording has no true positives
            while True:
                sample = self.fp.sample(1).reset_index(drop=True).loc[0]
                flac_id = sample["recording_id"]
                if len(self.tp.query(f"recording_id == '{flac_id}'")) == 0:
                    break

        flac_id = sample["recording_id"]
        index = sample["index"]

        t_min = sample["t_min"]
        t_max = sample["t_max"]

        if not self.centering:
            candidates = np.arange(max(t_max - self.duration, 0), t_min, 0.1)
            if candidates.size:
                offset = np.random.choice(candidates)
            else:
                # No valid start on the grid (event at t=0 or longer than
                # the crop): start at the event onset instead of letting
                # np.random.choice fail on an empty array.
                offset = t_min
            offset = min(CLIP_DURATION - self.duration, offset)
        else:
            # Centre the event in the crop, clipped to the clip boundaries
            call_duration = t_max - t_min
            relative_offset = (self.duration - call_duration) / 2
            offset = min(max(0, t_min - relative_offset),
                         CLIP_DURATION - self.duration)
        y, sr = librosa.load(self.datadir / f"{flac_id}.flac",
                             sr=self.sampling_rate,
                             mono=True,
                             offset=offset,
                             duration=self.duration,
                             res_type="kaiser_fast")
        if self.waveform_transforms:
            y = self.waveform_transforms(y).astype(np.float32)

        # Audio passed as ``y=``: positional audio is rejected by
        # librosa >= 0.10, the keyword works on every version.
        melspec = librosa.feature.melspectrogram(
            y=y, sr=sr, **self.melspectrogram_parameters)
        # PCEN and the "clean" channel are derived from the power
        # spectrogram before it is converted to dB.
        pcen = librosa.pcen(melspec, sr=sr, **self.pcen_parameters)
        clean_mel = librosa.power_to_db(melspec**1.5)
        melspec = librosa.power_to_db(melspec)

        if self.spectrogram_transforms:
            melspec = self.spectrogram_transforms(image=melspec)["image"]
            pcen = self.spectrogram_transforms(image=pcen)["image"]
            clean_mel = self.spectrogram_transforms(image=clean_mel)["image"]

        norm_melspec = normalize_melspec(melspec)
        norm_pcen = normalize_melspec(pcen)
        norm_clean_mel = normalize_melspec(clean_mel)
        image = np.stack([norm_melspec, norm_pcen, norm_clean_mel], axis=-1)

        # Resize to height self.img_size, keeping the aspect ratio
        height, width, _ = image.shape
        image = cv2.resize(
            image, (int(width * self.img_size / height), self.img_size))
        image = np.moveaxis(image, 2, 0)  # channels first
        image = (image / 255.0).astype(np.float32)

        label = np.zeros(N_CLASSES, dtype=np.float32)
        # One strong-label row per image column
        n_frames = image.shape[2]
        seconds_per_frame = self.duration / n_frames
        strong_label = np.zeros((n_frames, N_CLASSES), dtype=np.float32)

        # Events that overlap the window [offset, tail]
        tail = offset + self.duration
        query_string = f"recording_id == '{flac_id}' & "
        query_string += f"t_min < {tail} & t_max > {offset}"

        if sample_tp:
            all_tp_events = self.tp.query(query_string)

            for species_id in all_tp_events["species_id"].unique():
                label[int(species_id)] = 1.0

            for _, row in all_tp_events.iterrows():
                # Clamp to the window: a negative start index (event
                # beginning before the crop) would be interpreted by numpy
                # as "from the end" and drop the event's frame labels.
                start_index = max(
                    0, int((row.t_min - offset) / seconds_per_frame))
                end_index = min(
                    n_frames, int((row.t_max - offset) / seconds_per_frame))

                # int() mirrors the weak-label loop above
                strong_label[start_index:end_index,
                             int(row.species_id)] = 1.0

        return {
            "recording_id": flac_id,
            "image": image,
            "targets": {
                "weak": label,
                "strong": strong_label
            },
            "index": index
        }
示例#23
0
def maximum_pcen(S, **kwargs):
    """Return the maximum along axis 1 of the PCEN of ``|S|**2``.

    ``S`` is converted to squared magnitudes, handed to ``librosa.pcen``
    together with any extra keyword arguments, and the normalised result is
    reduced with a maximum over axis 1.
    """
    power = np.square(np.abs(S))
    normalized = librosa.pcen(power, **kwargs)
    return normalized.max(axis=1)
示例#24
0
def compute_pcen(audio, sr, input_format=True):
    """
    Computes PCEN (per-channel-energy normalization) for the given audio clip.

    Parameters
    ----------
    audio : np.ndarray [shape: (N,)]
        Audio array
    sr : int
        Sample rate
    input_format : bool [default: ``True``]
        If True, adds an additional channel dimension (of size 1) and ensures
        that a fixed number of PCEN frames (corresponding to
        ``get_pcen_settings()['n_hops']``) is returned. If the number of
        frames is greater, the center frames are returned. If the number of
        frames is less, zero-valued frames are padded on both sides.

    Returns
    -------
    pcen : np.ndarray [shape: (top_freq_id, n_hops, 1) or (top_freq_id, num_frames)]
        Per-channel energy normalization processed Mel spectrogram. If
        ``input_format=True``, will be in shape ``(top_freq_id, n_hops, 1)``.
        Otherwise it will be in shape ``(top_freq_id, num_frames)``, where
        ``num_frames`` is the number of PCEN frames for the entire audio clip.

    Raises
    ------
    BirdVoxClassifyError
        If ``audio`` has neither a signed-integer nor a floating-point dtype.

    """
    # Load settings.
    pcen_settings = get_pcen_settings()

    # Standardize to float64. Signed-integer input is scaled into [-1, 1] by
    # the largest magnitude its dtype can hold; floating-point input is only
    # cast (it is not rescaled here).
    if audio.dtype.kind == 'i':
        int_info = np.iinfo(audio.dtype)
        max_val = max(int_info.max, -int_info.min)
        audio = audio.astype('float64') / max_val
    elif audio.dtype.kind == 'f':
        audio = audio.astype('float64')
    else:
        err_msg = 'Invalid audio dtype: {}'
        raise BirdVoxClassifyError(err_msg.format(audio.dtype))

    # Map to the range [-2**31, 2**31]
    audio = (audio * (2**31)).astype('float32')

    # Resample to the sample rate required by the PCEN settings.
    # The rates are passed by keyword: recent librosa releases no longer
    # accept them positionally, and older releases accept the keywords too.
    target_sr = pcen_settings["sr"]
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)

    # Compute Short-Term Fourier Transform (STFT).
    stft = librosa.stft(audio,
                        n_fft=pcen_settings["n_fft"],
                        win_length=pcen_settings["win_length"],
                        hop_length=pcen_settings["hop_length"],
                        window=pcen_settings["window"])

    # Compute squared magnitude coefficients.
    abs2_stft = (stft.real * stft.real) + (stft.imag * stft.imag)

    # Gather frequency bins
    # NB: as of librosa v0.6.2, melspectrogram is type-instable and thus
    # returns 64-bit output even with a 32-bit input. Therefore, we need
    # to convert PCEN to single precision eventually. This might not be
    # necessary in the future, if the whole PCEN pipeline is kept type-stable.
    melspec = librosa.feature.melspectrogram(y=None,
                                             S=abs2_stft,
                                             sr=pcen_settings["sr"],
                                             n_fft=pcen_settings["n_fft"],
                                             n_mels=pcen_settings["n_mels"],
                                             htk=True,
                                             fmin=pcen_settings["fmin"],
                                             fmax=pcen_settings["fmax"])

    # Compute PCEN.
    pcen = librosa.pcen(melspec,
                        sr=pcen_settings["sr"],
                        hop_length=pcen_settings["hop_length"],
                        gain=pcen_settings["pcen_norm_exponent"],
                        bias=pcen_settings["pcen_delta"],
                        power=pcen_settings["pcen_power"],
                        time_constant=pcen_settings["pcen_time_constant"])

    # Convert to single floating-point precision.
    pcen = pcen.astype('float32')

    # Keep only the first ``top_freq_id`` mel bands.
    # NOTE(review): the original comment states this corresponds to the
    # 2-10 kHz range -- confirm against the values in get_pcen_settings().
    pcen = pcen[:pcen_settings["top_freq_id"], :]

    # Format for input to network
    if input_format:
        pcen_width = pcen.shape[1]
        n_hops = pcen_settings["n_hops"]
        if pcen_width >= n_hops:
            # Keep the n_hops centre frames (the extra frame of an odd
            # surplus is dropped from the right-hand side).
            first_col = (pcen_width - n_hops) // 2
            pcen = pcen[:, first_col:first_col + n_hops]
        else:
            # Pad with zeros if not enough frames; an odd deficit puts the
            # extra frame on the right-hand side.
            pad_length = n_hops - pcen_width
            left_pad = pad_length // 2
            right_pad = pad_length - left_pad
            pcen = np.pad(pcen, [(0, 0), (left_pad, right_pad)],
                          mode='constant')

        # Add channel dimension
        pcen = pcen[:, :, np.newaxis]

    # Return.
    return pcen
示例#25
0
def test_pcen_max1():
    """Max-filtering (``max_size > 1``) must be rejected for 1-D input.

    ``librosa.pcen`` cannot max-filter a 1-dimensional array and is
    documented to raise ``librosa.ParameterError`` in that case. The
    expectation is checked explicitly here so the test passes when the error
    is raised and fails when it is not.
    """
    try:
        librosa.pcen(np.arange(100), max_size=3)
    except librosa.ParameterError:
        return

    raise AssertionError(
        "librosa.pcen accepted max_size=3 on 1-dimensional input")
示例#26
0
def test_pcen_max1():
    """Check that 1-D input combined with ``max_size=3`` is refused.

    Max-filtering needs a second axis to filter across, so ``librosa.pcen``
    is documented to raise ``librosa.ParameterError`` for this call. The test
    succeeds only when that error is raised.
    """
    rejected = False
    try:
        librosa.pcen(np.arange(100), max_size=3)
    except librosa.ParameterError:
        rejected = True

    assert rejected, "expected ParameterError for 1-D input with max_size=3"
示例#27
0
    def __getitem__(self, idx: int):
        """Build one training example around the annotated event at ``idx``.

        A window of ``self.duration`` seconds is placed around the event
        (deterministically when ``self.centering`` is truthy, randomly
        otherwise) and loaded from ``<recording_id>.wav``. The clip is turned
        into a three-channel image built from the dB mel spectrogram, its
        PCEN, and the dB of the mel spectrogram raised to the power 1.5.
        Clip-level and frame-level targets are filled from every row of
        ``self.tp`` that overlaps the window and whose species is contained in
        ``RANGE_SPECIES_MAP[self.frequency_range]``.

        Side effect: ``fmin``, ``fmax`` and ``n_mels`` in
        ``self.melspectrogram_parameters`` are overwritten on every call
        according to ``self.frequency_range``.

        Parameters
        ----------
        idx : int
            Label of the row in ``self.tp`` (looked up with ``.loc``).

        Returns
        -------
        dict
            ``recording_id``, ``image`` (float32, channel-first), ``index``
            and ``targets`` holding ``weak`` (``N_CLASSES``), ``strong``
            (``n_frames x N_CLASSES``) and ``weak_songtype``
            (``N_CLASSES + 2``, returned all-zero by this method).
        """
        sample = self.tp.loc[idx, :]
        index = sample["index"]
        flac_id = sample["recording_id"]

        t_min = sample["t_min"]
        t_max = sample["t_max"]
        call_duration = t_max - t_min
        # Latest start that still leaves a full window inside the recording.
        latest_offset = CLIP_DURATION - self.duration

        if self.centering:
            if call_duration > self.duration:
                # Event longer than the window: centre the window on it.
                offset = (t_max + t_min) / 2 - self.duration / 2
            else:
                relative_offset = (self.duration - call_duration) / 2
                offset = min(max(0, t_min - relative_offset), latest_offset)
        else:
            if call_duration > self.duration:
                lower = max(t_min - call_duration / 2, 0)
                upper = t_min + call_duration / 2
            else:
                lower = max(t_max - self.duration, 0)
                upper = t_min
            candidates = np.arange(lower, upper, 0.1)
            # The candidate grid is empty when lower == upper (for example an
            # event starting at 0 s); np.random.choice would raise on it, so
            # fall back to the only admissible start.
            if candidates.size:
                offset = np.random.choice(candidates)
            else:
                offset = lower
            offset = min(latest_offset, offset)

        y, sr = librosa.load(self.datadir / f"{flac_id}.wav",
                             sr=self.sampling_rate,
                             mono=True,
                             offset=offset,
                             duration=self.duration)
        if self.waveform_transforms:
            y = self.waveform_transforms(y).astype(np.float32)

        if self.frequency_range == "high":
            fmin, fmax = 6000, 16000
        else:
            fmin, fmax = 0, 6000
        self.melspectrogram_parameters["fmin"] = fmin
        self.melspectrogram_parameters["fmax"] = fmax
        self.melspectrogram_parameters["n_mels"] = 96

        # `y` is passed by keyword: recent librosa releases reject a
        # positional waveform, older ones accept the keyword as well.
        melspec = librosa.feature.melspectrogram(
            y=y, sr=sr, **self.melspectrogram_parameters)
        pcen = librosa.pcen(melspec, sr=sr, **self.pcen_parameters)
        clean_mel = librosa.power_to_db(melspec**1.5)
        melspec = librosa.power_to_db(melspec)

        if self.spectrogram_transforms:
            melspec = self.spectrogram_transforms(image=melspec)["image"]
            pcen = self.spectrogram_transforms(image=pcen)["image"]
            clean_mel = self.spectrogram_transforms(image=clean_mel)["image"]

        norm_melspec = normalize_melspec(melspec)
        norm_pcen = normalize_melspec(pcen)
        norm_clean_mel = normalize_melspec(clean_mel)
        image = np.stack([norm_melspec, norm_pcen, norm_clean_mel], axis=-1)

        height, width, _ = image.shape
        if isinstance(self.img_size, int):
            # Single target height: scale the width by the same factor.
            image = cv2.resize(
                image, (int(width * self.img_size / height), self.img_size))
        else:
            image = cv2.resize(image, tuple(self.img_size))
        image = np.moveaxis(image, 2, 0)
        image = (image / 255.0).astype(np.float32)

        # All annotated events of this recording that overlap the window.
        tail = offset + self.duration
        query_string = f"recording_id == '{flac_id}' & "
        query_string += f"t_min < {tail} & t_max > {offset}"
        all_tp_events = self.tp.query(query_string)

        label = np.zeros(N_CLASSES, dtype=np.float32)
        songtype_label = np.zeros(N_CLASSES + 2, dtype=np.float32)

        n_frames = image.shape[2]
        seconds_per_frame = self.duration / n_frames
        strong_label = np.zeros((n_frames, N_CLASSES), dtype=np.float32)

        species_in_range = RANGE_SPECIES_MAP[self.frequency_range]

        for species_id in all_tp_events["species_id"].unique():
            if species_id in species_in_range:
                label[int(species_id)] = 1.0

        for _, row in all_tp_events.iterrows():
            species_id = row.species_id
            if species_id not in species_in_range:
                continue

            # Clamp to the window. An event that starts before the window
            # yields a negative start frame, which a slice would interpret as
            # counting from the end and thereby silently drop the event.
            start_index = max(
                int((row.t_min - offset) / seconds_per_frame), 0)
            end_index = min(
                int((row.t_max - offset) / seconds_per_frame), n_frames)

            strong_label[start_index:end_index, int(species_id)] = 1.0

        return {
            "recording_id": flac_id,
            "image": image,
            "targets": {
                "weak": label,
                "strong": strong_label,
                "weak_songtype": songtype_label
            },
            "index": index
        }
                # Gather frequency bins according to the Mel scale. The mel
                # projection is applied to the precomputed `abs2_stft`
                # (y=None), with band edges and band count read from
                # `pcen_settings`.
                melspec = librosa.feature.melspectrogram(
                    y=None,
                    S=abs2_stft,
                    sr=pcen_settings["sr"],
                    n_fft=pcen_settings["n_fft"],
                    n_mels=pcen_settings["n_mels"],
                    htk=True,
                    fmin=pcen_settings["fmin"],
                    fmax=pcen_settings["fmax"])

                # Compute PCEN, with every parameter taken from
                # `pcen_settings`.
                pcen = librosa.pcen(
                    melspec,
                    sr=pcen_settings["sr"],
                    hop_length=pcen_settings["hop_length"],
                    gain=pcen_settings["pcen_norm_exponent"],
                    bias=pcen_settings["pcen_delta"],
                    power=pcen_settings["pcen_power"],
                    time_constant=pcen_settings["pcen_time_constant"])

                # Convert to single floating-point precision.
                pcen = pcen.astype('float32')

                # Keep only the first `top_freq_id` rows of the spectrum.
                # NOTE(review): the original comment says this is the
                # 2-10 kHz range -- confirm against pcen_settings.
                pcen = pcen[:pcen_settings["top_freq_id"], :]

                # Save the result under the clip's name.
                lms_group[clip_name] = pcen

# Print the current timestamp to mark completion (this is the wall-clock time
# at the end of the run, not an elapsed duration).
print(str(datetime.datetime.now()) + " Finish.")
示例#29
0
def test_pcen_axes():
    """Check ``librosa.pcen`` axis handling on 2-D and stacked 3-D input.

    Covers the default axis versus an explicit ``axis=-1`` and a transposed
    ``axis=0`` call, then compares slice-by-slice results against a single
    call on the stacked array; each case runs with and without max-filtering.
    """
    srand()
    # Make a power spectrogram
    X = np.random.randn(3, 100, 50)**2

    # Axis selection, first without and then with max-filtering
    for opts in ({}, {"max_size": 3}):
        default_axis = librosa.pcen(X[0], **opts)
        last_axis = librosa.pcen(X[0], axis=-1, **opts)
        first_axis = librosa.pcen(X[0].T, axis=0, **opts).T

        assert np.allclose(default_axis, first_axis)
        assert np.allclose(default_axis, last_axis)

    # Multi-dimensional input: no filtering, then max-filtering along axis 1
    stacked_cases = (
        ({}, {}),
        ({"max_size": 3}, {"max_size": 3, "max_axis": 1}),
    )
    for slice_opts, stack_opts in stacked_cases:
        per_slice = [librosa.pcen(X[k], **slice_opts) for k in range(3)]
        stacked = librosa.pcen(X, **stack_opts)

        for k, expected in enumerate(per_slice):
            assert np.allclose(expected, stacked[k])
示例#30
0
def test_pcen_axes_nomax():
    """Max-filtering 3-D input without ``max_axis`` must be rejected.

    With more than two dimensions ``librosa.pcen`` cannot infer which axis to
    max-filter along and is documented to raise ``librosa.ParameterError``
    unless ``max_axis`` is given. The expectation is checked explicitly so
    the test passes when the error is raised and fails otherwise.
    """
    srand()
    # Make a power spectrogram
    X = np.random.randn(3, 100, 50)**2

    try:
        librosa.pcen(X, max_size=3)
    except librosa.ParameterError:
        return

    raise AssertionError(
        "librosa.pcen accepted max_size=3 on 3-D input without max_axis")
示例#31
0
    def __getitem__(self, idx: int):
        """Build one training example, optionally mixed with a second clip.

        A window of ``self.duration`` seconds is drawn at random around the
        event at ``idx`` and loaded from ``<recording_id><self.suffix>``.
        With probability ``self.mixup_prob`` a second event (with a different
        ``index`` value) is sampled from ``self.tp``, its clip is
        peak-normalised and blended into the first with a
        ``Beta(mixup_alpha, mixup_alpha)`` weight ``lam``. The resulting
        waveform becomes a three-channel image (dB mel spectrogram, PCEN, dB
        of the mel spectrogram raised to the power 1.5).

        Targets are filled from every ``self.tp`` row overlapping either
        window. When ``self.float_label`` is truthy and mixup is active, the
        first clip's events are written as ``lam`` and the second clip's as
        ``1 - lam`` (the second clip is written last, so it wins where both
        touch the same entry); otherwise every entry is ``1.0``.

        Parameters
        ----------
        idx : int
            Label of the row in ``self.tp`` (looked up with ``.loc``).

        Returns
        -------
        dict
            ``recording_id``, ``image`` (float32, channel-first), ``index``
            and ``targets`` holding ``weak`` (``N_CLASSES``) and ``strong``
            (``n_frames x N_CLASSES``).
        """
        sample = self.tp.loc[idx, :]
        index = sample["index"]
        flac_id = sample["recording_id"]

        # Latest start that still leaves a full window inside the recording.
        latest_offset = CLIP_DURATION - self.duration

        def _draw_offset(event_t_min, event_t_max):
            # Pick a random window start (0.1 s grid) around one event.
            event_duration = event_t_max - event_t_min
            if event_duration > self.duration:
                lower = max(event_t_min - event_duration / 2, 0)
                upper = event_t_min + event_duration / 2
            else:
                lower = max(event_t_max - self.duration, 0)
                upper = event_t_min
            candidates = np.arange(lower, upper, 0.1)
            # An empty grid (lower == upper, e.g. an event starting at 0 s)
            # would make np.random.choice raise; use the only valid start.
            drawn = np.random.choice(candidates) if candidates.size else lower
            return min(latest_offset, drawn)

        def _load_clip(recording_id, clip_offset):
            # Load one window and apply the optional waveform transforms.
            clip, rate = librosa.load(
                self.datadir / f"{recording_id}{self.suffix}",
                sr=self.sampling_rate,
                mono=True,
                offset=clip_offset,
                duration=self.duration)
            if self.waveform_transforms:
                clip = self.waveform_transforms(clip).astype(np.float32)
            return clip, rate

        offset = _draw_offset(sample["t_min"], sample["t_max"])
        y, sr = _load_clip(flac_id, offset)

        # Mixup needs a second, different row; with a single-row table the
        # resampling loop below could never terminate, so skip mixup then.
        use_mixup = bool(np.random.rand() < self.mixup_prob
                         and len(self.tp) > 1)
        signal = y
        if use_mixup:
            while True:
                mixup_sample = self.tp.sample(1).reset_index(drop=True).loc[0]
                if mixup_sample["index"] != index:
                    break
            mixup_flac_id = mixup_sample["recording_id"]

            # Use the same placement rule as for the primary event, so that a
            # mix-in event longer than the window no longer produces an empty
            # candidate grid.
            mixup_offset = _draw_offset(mixup_sample["t_min"],
                                        mixup_sample["t_max"])

            y_mixup, _ = _load_clip(mixup_flac_id, mixup_offset)
            y_mixup = librosa.util.normalize(y_mixup)

            lam = np.random.beta(self.mixup_alpha, self.mixup_alpha)
            signal = lam * y + (1 - lam) * y_mixup

        # Computed once, on the mixed waveform when mixup is active. `y` is
        # passed by keyword because recent librosa releases reject a
        # positional waveform.
        melspec = librosa.feature.melspectrogram(
            y=signal, sr=sr, **self.melspectrogram_parameters)

        pcen = librosa.pcen(melspec, sr=sr, **self.pcen_parameters)
        clean_mel = librosa.power_to_db(melspec**1.5)
        melspec = librosa.power_to_db(melspec)

        if self.spectrogram_transforms:
            melspec = self.spectrogram_transforms(image=melspec)["image"]
            pcen = self.spectrogram_transforms(image=pcen)["image"]
            clean_mel = self.spectrogram_transforms(image=clean_mel)["image"]

        norm_melspec = normalize_melspec(melspec)
        norm_pcen = normalize_melspec(pcen)
        norm_clean_mel = normalize_melspec(clean_mel)
        image = np.stack([norm_melspec, norm_pcen, norm_clean_mel], axis=-1)

        height, width, _ = image.shape
        image = cv2.resize(
            image, (int(width * self.img_size / height), self.img_size))
        image = np.moveaxis(image, 2, 0)
        image = (image / 255.0).astype(np.float32)

        # Events of the primary recording that overlap its window.
        tail = offset + self.duration
        query_string = f"recording_id == '{flac_id}' & "
        query_string += f"t_min < {tail} & t_max > {offset}"
        all_tp_events = self.tp.query(query_string)

        if use_mixup:
            mixup_tail = mixup_offset + self.duration
            query_string = f"recording_id == '{mixup_flac_id}' & "
            query_string += f"t_min < {mixup_tail} & t_max > {mixup_offset}"
            mixup_tp_events = self.tp.query(query_string)

        label = np.zeros(N_CLASSES, dtype=np.float32)

        n_frames = image.shape[2]
        seconds_per_frame = self.duration / n_frames
        strong_label = np.zeros((n_frames, N_CLASSES), dtype=np.float32)

        def _mark_events(events, clip_offset, weight):
            # Write `weight` into the weak and strong targets for `events`.
            for species_id in events["species_id"].unique():
                label[int(species_id)] = weight

            for _, row in events.iterrows():
                # Clamp to the window. An event that starts before the window
                # yields a negative start frame, which a slice would read as
                # counting from the end and thereby silently drop the event.
                start_index = max(
                    int((row.t_min - clip_offset) / seconds_per_frame), 0)
                end_index = min(
                    int((row.t_max - clip_offset) / seconds_per_frame),
                    n_frames)
                strong_label[start_index:end_index,
                             int(row.species_id)] = weight

        primary_weight = lam if (use_mixup and self.float_label) else 1.0
        _mark_events(all_tp_events, offset, primary_weight)

        if use_mixup:
            mixup_weight = (1 - lam) if self.float_label else 1.0
            _mark_events(mixup_tp_events, mixup_offset, mixup_weight)

        return {
            "recording_id": flac_id,
            "image": image,
            "targets": {
                "weak": label,
                "strong": strong_label
            },
            "index": index
        }
示例#32
0
#%%
import librosa
import librosa.display as display
import numpy
import IPython.display as ipd
#%%
# Path of the test recording explored in the cells below.
old_audio = '/scratch/richardso21/mp3splt files/nigliq1/NIGLIQ_short_test/NIGLIQ1_20160607_203214_2960m_00s__2980m_00s_17m_40s__17m_50s.wav'
#%%
# Load the waveform (librosa's default sample rate) and offer it for playback.
old, sr = librosa.load(old_audio)
ipd.Audio(old_audio)
#%%
# Mel power spectrogram in dB. The waveform is passed by keyword because
# recent librosa releases reject it positionally; the plot uses a mel
# frequency axis because the rows are mel bands, not linear-frequency bins.
Old_db = librosa.power_to_db(librosa.feature.melspectrogram(y=old, sr=sr))
display.specshow(Old_db, sr=sr, y_axis='mel')

#%%
# PCEN of the same mel spectrogram (librosa.pcen defaults), for comparison.
mel = librosa.feature.melspectrogram(y=old, sr=sr)
new = librosa.pcen(mel)
display.specshow(new, sr=sr, y_axis='mel')

#%%
import soundfile as sf
# NOTE(review): `new` is a PCEN mel spectrogram, not a waveform. Writing it
# with soundfile treats its two axes as (frames, channels), so the resulting
# file is presumably not a meaningful rendition of the audio -- confirm what
# this cell is meant to produce before relying on it.
sf.write('pcen_test.wav', new, samplerate=sr)
ipd.Audio('pcen_test.wav')

#%%
示例#33
0
def test_pcen_axes_nomax():
    """Check that 3-D input with ``max_size=3`` and no ``max_axis`` is refused.

    ``librosa.pcen`` is documented to raise ``librosa.ParameterError`` when
    asked to max-filter input with more than two dimensions without being
    told which axis to filter along. The test succeeds only when that error
    is raised.
    """
    srand()
    # Make a power spectrogram
    X = np.random.randn(3, 100, 50)**2

    rejected = False
    try:
        librosa.pcen(X, max_size=3)
    except librosa.ParameterError:
        rejected = True

    assert rejected, "expected ParameterError for 3-D input without max_axis"
示例#34
0
# Block-wise (streaming) PCEN: each audio block from `stream` is transformed
# on its own, while the PCEN filter state is carried from block to block.

# List collecting the frequency-averaged PCEN value of every frame
pcen_blocks = []

# No filter state yet: passing zi=None to the first librosa.pcen call leaves
# the choice of initial state to librosa (its documentation describes this
# default as a steady-state initialisation).
zi = None

for y_block in stream:
    # Compute the STFT (without padding, so center=False)
    D = librosa.stft(y_block, n_fft=n_fft, hop_length=hop_length,
                     center=False)

    # Compute PCEN on the magnitude spectrum, using initial delays
    # returned from our previous call (if any)
    # store the final delays for use as zi in the next iteration
    P, zi = librosa.pcen(np.abs(D), sr=sr, hop_length=hop_length,
                         zi=zi, return_zf=True)

    # Compute the average PCEN over frequency (axis 0), giving one value per
    # frame, and append those values to our list
    pcen_blocks.extend(np.mean(P, axis=0))

# Cast to a numpy array for use downstream
pcen_blocks = np.asarray(pcen_blocks)

#####################################################################
# For the sake of comparison, let's see how it would look had we
# run PCEN on the entire spectrum without block-wise processing

# NOTE(review): the sample rate is hard-coded to 44100 here and rebinds `sr`;
# presumably it matches the rate the stream was opened with -- confirm.
y, sr = librosa.load(filename, sr=44100)

# Keep the same parameters as before
D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length, center=False)
示例#35
0
def test_pcen_axes():
    """Verify that ``librosa.pcen`` is consistent across axis conventions.

    Two properties are checked, each with and without max-filtering: the
    default axis agrees with ``axis=-1`` and with a transposed ``axis=0``
    call, and a call on a stacked 3-D array agrees with separate calls on
    each of its slices.
    """
    srand()
    # Make a power spectrogram
    X = np.random.randn(3, 100, 50)**2
    first = X[0]

    # (keyword arguments for 2-D calls, keyword arguments for the 3-D call)
    plain = ({}, {})
    max_filtered = ({"max_size": 3}, {"max_size": 3, "max_axis": 1})

    # Axis setting: without max-filtering, then with it
    for flat_kwargs, _ in (plain, max_filtered):
        reference = librosa.pcen(first, **flat_kwargs)
        explicit_last = librosa.pcen(first, axis=-1, **flat_kwargs)
        via_transpose = librosa.pcen(first.T, axis=0, **flat_kwargs).T

        assert np.allclose(reference, via_transpose)
        assert np.allclose(reference, explicit_last)

    # Multi-dimensional input: without max-filtering, then with it
    for flat_kwargs, stacked_kwargs in (plain, max_filtered):
        singles = [librosa.pcen(layer, **flat_kwargs) for layer in X]
        combined = librosa.pcen(X, **stacked_kwargs)

        for single, combined_layer in zip(singles, combined):
            assert np.allclose(single, combined_layer)
示例#36
0
# Block-wise (streaming) PCEN: each audio block from `stream` is transformed
# on its own, while the PCEN filter state is carried from block to block.

# List collecting the per-frame maximum PCEN value (maximum over frequency)
pcen_blocks = []

# No filter state yet: passing zi=None to the first librosa.pcen call leaves
# the choice of initial state to librosa (its documentation describes this
# default as a steady-state initialisation).
zi = None

for y_block in stream:
    # Compute the STFT (without padding, so center=False)
    D = librosa.stft(y_block, n_fft=n_fft, hop_length=hop_length, center=False)

    # Compute PCEN on the magnitude spectrum, using initial delays
    # returned from our previous call (if any)
    # store the final delays for use as zi in the next iteration
    P, zi = librosa.pcen(np.abs(D),
                         sr=sr,
                         hop_length=hop_length,
                         zi=zi,
                         return_zf=True)

    # Compute the max PCEN over frequency (axis 0), giving one value per
    # frame, and append those values to our list
    pcen_blocks.extend(np.max(P, axis=0))

# Cast to a numpy array for use downstream
pcen_blocks = np.asarray(pcen_blocks)

#####################################################################
# For the sake of comparison, let's see how it would look had we
# run PCEN on the entire spectrum without block-wise processing

# Load the whole file at the same sample rate as used above.
y, sr = librosa.load(filename, sr=sr)
示例#37
0
def average_pcen(S, **kwargs):
    """Return the mean along axis 1 of the PCEN of ``|S|**2``.

    ``S`` is converted to squared magnitudes, handed to ``librosa.pcen``
    together with any extra keyword arguments, and the normalised result is
    averaged over axis 1.
    """
    power = np.square(np.abs(S))
    normalized = librosa.pcen(power, **kwargs)
    return normalized.mean(axis=1)