Example #1
def mashability(song1, song2):
    """
	Returns how well song1 transitions into song2 using cosine matrix similarity
	and FFT semitone bin approximation matrices
	"""
    # If the tempi differ by more than 30 BPM, never make that transition
    if abs(song1.bpm - song2.bpm) > 30:
        return 1  # maximum cosine distance for nonnegative chroma vectors
    sample_length = MIX_LENGTH  # beats per sample
    beats1 = song1.AudioFile.analysis.beats[song1.mix_out : song1.mix_out + sample_length]
    beats2 = song2.AudioFile.analysis.beats[song2.mix_in : song2.mix_in + sample_length]
    data1 = audio.getpieces(song1.AudioFile, beats1)
    data2 = audio.getpieces(song2.AudioFile, beats2)
    data1.encode("temp1.mp3")
    data2.encode("temp2.mp3")
    y1, sr1 = librosa.load("temp1.mp3")
    y2, sr2 = librosa.load("temp2.mp3")
    S1 = np.abs(librosa.stft(y1, n_fft=4096))
    chroma1 = librosa.feature.chroma_stft(S=S1, sr=sr1)
    S2 = np.abs(librosa.stft(y2, n_fft=4096))
    chroma2 = librosa.feature.chroma_stft(S=S2, sr=sr2)
    # im = librosa.display.specshow(chroma1,x_axis = "time",y_axis = "chroma")
    # im2 = librosa.display.specshow(chroma2,x_axis = "time",y_axis = "chroma")
    # plt.show()
    orthogonal_arr = []
    for i in range(min(chroma1.shape[1], chroma2.shape[1])):
        orthogonal_arr.append(dst.cosine(chroma1[:, i], chroma2[:, i]))
    return sum(orthogonal_arr) / len(orthogonal_arr)
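The score is just the mean frame-wise cosine distance between two chromagrams. A self-contained sketch of that comparison, using only librosa and scipy (the two file paths are placeholders):

import numpy as np
import librosa
import scipy.spatial.distance as dst

def chroma_distance(path1, path2):
    # FFT-based chromagrams of both clips
    y1, sr1 = librosa.load(path1)
    y2, sr2 = librosa.load(path2)
    chroma1 = librosa.feature.chroma_stft(S=np.abs(librosa.stft(y1, n_fft=4096)), sr=sr1)
    chroma2 = librosa.feature.chroma_stft(S=np.abs(librosa.stft(y2, n_fft=4096)), sr=sr2)
    # Mean cosine distance over the overlapping frames
    n = min(chroma1.shape[1], chroma2.shape[1])
    return np.mean([dst.cosine(chroma1[:, i], chroma2[:, i]) for i in range(n)])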
Example #2
def reverse_channel(a, b, n_fft=2**13, win_length=2**12, hop_length=2**10):
    '''
    Estimates the channel distortion in b relative to a and reverses it
    
    :parameters:
        - a : np.ndarray
            Some signal
        - b : np.ndarray
            Some other signal with channel distortion relative to a
        - n_fft : int
            Number of samples in each FFT computation, default 2**13
        - win_length : int
            Number of samples in each window, default 2**12
        - hop_length : int
            Number of samples between successive FFT computations, default 2**10
    
    :returns:
        - b_filtered : np.ndarray
            The signal b, filtered to reduce channel distortion
    '''
    # Compute spectrograms
    a_spec = librosa.stft(a, n_fft=n_fft, win_length=win_length, hop_length=hop_length)
    b_spec = librosa.stft(b, n_fft=n_fft, win_length=win_length, hop_length=hop_length)
    # Compute the best filter
    H = best_filter_coefficients(a_spec, b_spec)
    # Apply it in the frequency domain (ignoring aliasing!  Yikes)
    b_spec_filtered = H*b_spec
    # Get back to time domain
    b_filtered = librosa.istft(b_spec_filtered, win_length=win_length, hop_length=hop_length)
    return b_filtered
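best_filter_coefficients is defined elsewhere in the source. A minimal sketch, assuming the intent is the per-bin least-squares filter H minimizing the sum over t of |a[f, t] - H[f] * b[f, t]|^2:

import numpy as np

def best_filter_coefficients(a_spec, b_spec):
    # Closed-form per-bin least squares: H[f] = sum_t a*conj(b) / sum_t |b|^2
    num = np.sum(a_spec * np.conj(b_spec), axis=1)
    den = np.sum(np.abs(b_spec)**2, axis=1) + 1e-10  # guard against empty bins
    return (num / den)[:, np.newaxis]  # column vector broadcasts over frames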
Example #3
def test_stft_bad_window():

    y = np.zeros(22050 * 5)

    n_fft = 2048
    window = np.ones(n_fft // 2)

    librosa.stft(y, n_fft=n_fft, window=window)
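A vector window whose length matches neither n_fft nor win_length is rejected by librosa with a ParameterError; the expected-failure decorator appears to have been stripped from this test. A pytest-style equivalent that asserts the failure explicitly:

import numpy as np
import pytest
import librosa

def test_stft_bad_window():
    y = np.zeros(22050 * 5)
    n_fft = 2048
    # a half-length vector window should be rejected
    with pytest.raises(librosa.ParameterError):
        librosa.stft(y, n_fft=n_fft, window=np.ones(n_fft // 2))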
Example #4
def getSpectra(self, n_fft=4096, hop_length=1024):
    if self.spectra is None:
        stft = librosa.stft(self.signal, n_fft=n_fft, hop_length=hop_length)
        self.spectra = Spectra(stft, self.sr, n_fft, hop_length)
    elif self.spectra.n_fft != n_fft or self.spectra.hop_length != hop_length:
        stft = librosa.stft(self.signal, n_fft=n_fft, hop_length=hop_length)
        return Spectra(stft, self.sr, n_fft, hop_length)
    return self.spectra
Example #5
def hpss(y):

    D = librosa.stft(y)
    H, P = librosa.decompose.hpss(D, kernel_size=KERNEL_SIZE, power=HPSS_P)

    D_harm = np.abs(librosa.stft(librosa.istft(H), n_fft=N_FFT, hop_length=HOP))
    D_perc = np.abs(librosa.stft(librosa.istft(P), n_fft=N_FFT, hop_length=HOP))

    return D_harm, D_perc
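This snippet depends on module-level constants that are not shown. Plausible values, given purely as assumptions:

# Assumed module-level configuration (not part of the original source)
KERNEL_SIZE = 31   # median-filter kernel size for librosa.decompose.hpss
HPSS_P = 2.0       # soft-mask power
N_FFT = 2048       # FFT size for the re-analysis
HOP = 512          # hop length for the re-analysis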
Example #6
    def __test_stft(center, pad_mode):
        D1 = librosa.stft(y, center=center, pad_mode='reflect')
        D2 = librosa.stft(y, center=center, pad_mode=pad_mode)

        assert D1.shape == D2.shape

        if center and pad_mode != 'reflect':
            assert not np.allclose(D1, D2)
        else:
            assert np.allclose(D1, D2)
Example #7
def SaveSpectrogram(y_mix, y_inst, y_vocal, filename, orig_sr=44100):
    y_mix = librosa.core.resample(y_mix,orig_sr,SR)
    y_vocal = librosa.core.resample(y_vocal,orig_sr,SR)
    y_inst = librosa.core.resample(y_inst,orig_sr,SR)

    S_mix = np.abs(librosa.stft(y_mix,n_fft=window_size,hop_length=hop_length)).astype(np.float32)
    S_inst = np.abs(librosa.stft(y_inst,n_fft=window_size,hop_length=hop_length)).astype(np.float32)
    S_vocal = np.abs(librosa.stft(y_vocal,n_fft=window_size,hop_length=hop_length)).astype(np.float32)
    
    norm = S_mix.max()
    S_mix /= norm
    S_inst /= norm
    S_vocal /= norm
    
    np.savez(os.path.join('./Spectrogram',filename+'.npz'),mix=S_mix,inst=S_inst ,vocal=S_vocal)
Example #8
def wavs_to_specs(wavs_mono, wavs_src1, wavs_src2, n_fft=1024, hop_length=None):

    stfts_mono = list()
    stfts_src1 = list()
    stfts_src2 = list()

    for wav_mono, wav_src1, wav_src2 in zip(wavs_mono, wavs_src1, wavs_src2):
        stft_mono = librosa.stft(wav_mono, n_fft = n_fft, hop_length = hop_length)
        stft_src1 = librosa.stft(wav_src1, n_fft = n_fft, hop_length = hop_length)
        stft_src2 = librosa.stft(wav_src2, n_fft = n_fft, hop_length = hop_length)
        stfts_mono.append(stft_mono)
        stfts_src1.append(stft_src1)
        stfts_src2.append(stft_src2)

    return stfts_mono, stfts_src1, stfts_src2
Example #9
def midi_to_cqt(midi, sf2_path=None, fs=22050, hop=512):
    '''
    Feature extraction routine for midi data, converts to a drum-free, percussion-suppressed CQT.
    
    Input:
        midi - pretty_midi.PrettyMIDI object
        sf2_path - path to .sf2 file to pass to pretty_midi.fluidsynth
        fs - sampling rate to synthesize audio at, default 22050
        hop - hop length for cqt, default 512
    Output:
        midi_gram - Simulated CQT of the midi data
    '''
    # Synthesize the MIDI using the supplied sf2 path
    midi_audio = midi.fluidsynth(fs=fs, sf2_path=sf2_path)
    # Use the harmonic part of the signal
    H, P = librosa.decompose.hpss(librosa.stft(midi_audio))
    midi_audio_harmonic = librosa.istft(H)
    # Compute log frequency spectrogram of audio synthesized from MIDI
    midi_gram = np.abs(librosa.cqt(y=midi_audio_harmonic,
                                   sr=fs,
                                   hop_length=hop,
                                   fmin=librosa.midi_to_hz(36),
                                   n_bins=60,
                                   tuning=0.0))**2
    return midi_gram
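Using it requires a pretty_midi object; a minimal usage sketch, with placeholder paths:

import pretty_midi

midi = pretty_midi.PrettyMIDI('example.mid')        # placeholder path
gram = midi_to_cqt(midi, sf2_path='soundfont.sf2')  # placeholder path
print(gram.shape)  # (60 CQT bins, n_frames)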
Example #10
def amplitude_for_file(audio_path):
    y, sr = librosa.load(audio_path)
    # from http://bmcfee.github.io/librosa/librosa.html#librosa.core.logamplitude
    # Get a power spectrogram from a waveform y
    S = np.abs(librosa.stft(y)) ** 2
    log_S = librosa.logamplitude(S)
    return log_S
Example #11
def audio_to_cqt_and_onset_strength(audio, fs=22050, hop=512):
    '''
    Feature extraction for audio data.
    Gets a power CQT of harmonic component and onset strength signal of percussive.
    
    Input:
        audio - audio time series, np.ndarray
        fs - sampling rate of the audio, default 22050
        hop - hop length for cqt, default 512; onset strength hop will be 1/4 of this
    Output:
        audio_gram - CQT of audio data
        audio_onset_strength - onset strength signal
    '''
    # Use harmonic part for gram, percussive part for onsets
    H, P = librosa.decompose.hpss(librosa.stft(audio))
    audio_harmonic = librosa.istft(H)
    audio_percussive = librosa.istft(P)
    # Compute log-frequency spectrogram of original audio
    audio_gram = np.abs(librosa.cqt(y=audio_harmonic,
                                    sr=fs,
                                    hop_length=hop,
                                    fmin=librosa.midi_to_hz(36),
                                    n_bins=60))**2
    # Compute onset strength from the percussive part at 4x the hop rate
    audio_onset_strength = librosa.onset.onset_strength(audio_percussive, hop_length=hop // 4, sr=fs)
    return audio_gram, audio_onset_strength
Example #12
def test_piptrack():

    def __test(S, freq):
        pitches, mags = librosa.piptrack(S=S, fmin=100)

        idx = (mags > 0)

        assert len(idx) > 0

        recovered_pitches = pitches[idx]

        # We should be within one cent of the target
        assert np.all(np.abs(np.log2(recovered_pitches) - np.log2(freq)) <= 1e-2)

    sr = 22050
    duration = 3.0

    for freq in [110, 220, 440, 880]:
        # Generate a sine tone
        y = np.sin(2 * np.pi * freq * np.linspace(0, duration, num=int(duration * sr)))
        for n_fft in [1024, 2048, 4096]:
            # Using left-aligned frames eliminates reflection artifacts at the boundaries
            S = np.abs(librosa.stft(y, n_fft=n_fft, center=False))

            yield __test, S, freq
Example #13
def parse_audio(path, audio_conf, windows, normalize=False):
    '''
    Input:
        path       : string, path of the audio file to load
        audio_conf : dict, audio parameters for computing the spectrogram
        windows    : dict, window types
    Output:
        spect      : FloatTensor, spectrum of each frame
    '''
    y = load_audio(path)
    n_fft = int(audio_conf['sample_rate']*audio_conf["window_size"])
    win_length = n_fft
    hop_length = int(audio_conf['sample_rate']*audio_conf['window_stride'])
    window = windows[audio_conf['window']]
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                        win_length=win_length, window=window)
    spect, phase = librosa.magphase(D) 
    
    spect = torch.FloatTensor(spect)
    spect = spect.log1p()
    
    if normalize:
        mean = spect.mean()
        std = spect.std()
        spect.add_(-mean)
        spect.div_(std)
    
    return spect.transpose(0,1)
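The caller supplies the audio_conf and windows dictionaries; a sketch of the expected structure, with values that are common speech-recognition defaults rather than anything taken from the original project:

import scipy.signal

windows = {'hamming': scipy.signal.hamming, 'hann': scipy.signal.hann}
audio_conf = {'sample_rate': 16000,   # Hz
              'window_size': 0.02,    # seconds -> n_fft = 320
              'window_stride': 0.01,  # seconds -> hop_length = 160
              'window': 'hamming'}

spect = parse_audio('utterance.wav', audio_conf, windows)  # placeholder path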
Example #14
    def parse_audio(self, audio_path):
        if self.augment:
            y = load_randomly_augmented_audio(audio_path, self.sample_rate)
        else:
            y = load_audio(audio_path)
        if self.noiseInjector:
            add_noise = np.random.binomial(1, self.noise_prob)
            if add_noise:
                y = self.noiseInjector.inject_noise(y)
        n_fft = int(self.sample_rate * self.window_size)
        win_length = n_fft
        hop_length = int(self.sample_rate * self.window_stride)
        # STFT
        D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                         win_length=win_length, window=self.window)
        spect, phase = librosa.magphase(D)
        # S = log(S+1)
        spect = np.log1p(spect)
        spect = torch.FloatTensor(spect)
        if self.normalize:
            mean = spect.mean()
            std = spect.std()
            spect.add_(-mean)
            spect.div_(std)

        return spect
Example #15
def test_piptrack_properties():

    def __test(S, n_fft, hop_length, fmin, fmax, threshold):

        pitches, mags = librosa.core.piptrack(S=S,
                                              n_fft=n_fft,
                                              hop_length=hop_length,
                                              fmin=fmin,
                                              fmax=fmax,
                                              threshold=threshold)

        # Shape tests
        eq_(S.shape, pitches.shape)
        eq_(S.shape, mags.shape)

        # Make sure all magnitudes are positive
        assert np.all(mags >= 0)

        # Check the frequency estimates for bins with non-zero magnitude
        idx = (mags > 0)
        assert np.all(pitches[idx] >= fmin)
        assert np.all(pitches[idx] <= fmax)

        # And everywhere else, pitch should be 0
        assert np.all(pitches[~idx] == 0)

    y, sr = librosa.load('data/test1_22050.wav')

    for n_fft in [2048, 4096]:
        for hop_length in [None, n_fft // 4, n_fft // 2]:
            S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length))
            for fmin in [0, 100]:
                for fmax in [4000, 8000, sr // 2]:
                    for threshold in [0.1, 0.2, 0.5]:
                        yield __test, S, n_fft, hop_length, fmin, fmax, threshold
Example #16
def stretch_demo(input_file, output_file, speed):
    '''Phase-vocoder time stretch demo function.

    :parameters:
      - input_file : str
          path to input audio
      - output_file : str
          path to save output (wav)
      - speed : float > 0
          speed up by this factor
    '''

    N_FFT = 2048
    HOP_LENGTH = N_FFT // 4

    # 1. Load the wav file, resample
    print('Loading', input_file)

    y, sr = librosa.load(input_file)

    # 2. generate STFT @ 2048 samples
    print('Computing short-time Fourier transform...')
    D = librosa.stft(y, n_fft=N_FFT, hop_length=HOP_LENGTH)

    print('Playing back at %3.0f%% speed' % (speed * 100))
    D_stretch = librosa.phase_vocoder(D, speed, hop_length=HOP_LENGTH)

    y_stretch = librosa.istft(D_stretch, hop_length=HOP_LENGTH)

    print('Saving stretched audio to:', output_file)
    librosa.output.write_wav(output_file, y_stretch, sr)
Example #17
def compute_beat_mls(filename, beat_times, mel_bands=num_mel_bands, fft_size=1024, hop_size=512):
    """
    Compute average Mel log spectrogram per beat given previously
    extracted beat times.

    :param filename: path to audio file
    :param beat_times: list of beat times in seconds
    :param mel_bands: number of Mel bands
    :param fft_size: FFT size
    :param hop_size: hop size for FFT processing
    :return: beat Mel spectrogram (mel_bands x frames)
    """

    y, sr = librosa.load(filename, sr=22050, mono=True)

    spec = np.abs(librosa.stft(y=y, n_fft=fft_size, hop_length=hop_size, win_length=fft_size,
                               window=scipy.signal.hamming))

    mel_fb = librosa.filters.mel(sr=22050, n_fft=fft_size, n_mels=mel_bands, fmin=50, fmax=10000, htk=True)
    s = np.sum(mel_fb, axis=1)
    mel_fb = np.divide(mel_fb, s[:, np.newaxis])

    mel_spec = np.dot(mel_fb, spec)
    mel_spec = np.log10(1. + 1000. * mel_spec)

    beat_frames = np.round(beat_times * (22050. / hop_size)).astype(int)

    beat_melspec = np.max(mel_spec[:, beat_frames[0]:beat_frames[1]], axis=1)

    for k in range(1, beat_frames.shape[0] - 1):
        beat_melspec = np.column_stack((beat_melspec,
                                        np.max(mel_spec[:, beat_frames[k]:beat_frames[k+1]], axis=1)))

    return beat_melspec
Example #18
    def __test_consistency(frame_length, hop_length, center):
        y, sr = librosa.load(__EXAMPLE_FILE, sr=None)

        # Ensure audio is divisible into frame size.
        y = librosa.util.fix_length(y, y.size - y.size % frame_length)
        assert y.size % frame_length == 0

        # STFT magnitudes with a constant windowing function and no centering.
        S = librosa.magphase(librosa.stft(y,
                                          n_fft=frame_length,
                                          hop_length=hop_length,
                                          window=np.ones,
                                          center=center))[0]

        # Try both RMS methods.
        rms1 = librosa.feature.rms(S=S, frame_length=frame_length,
                                   hop_length=hop_length)
        rms2 = librosa.feature.rms(y=y, frame_length=frame_length,
                                   hop_length=hop_length, center=center)

        assert rms1.shape == rms2.shape
        # Normalize envelopes.
        rms1 /= rms1.max()
        rms2 /= rms2.max()

        # Ensure results are similar.
        np.testing.assert_allclose(rms1, rms2, rtol=5e-2)
Example #19
def get_spectrograms(sound_file): 
    '''Extracts melspectrogram and log magnitude from given `sound_file`.
    Args:
      sound_file: A string. Full path of a sound file.

    Returns:
      Transposed S: A 2d array. A transposed melspectrogram with shape (T, n_mels).
      Transposed magnitude: A 2d array with shape (T, 1+hp.n_fft//2).
    '''
    # Loading sound file
    y, sr = librosa.load(sound_file, sr=hp.sr)
    
    # stft. D: (1+n_fft//2, T)
    D = librosa.stft(y=y,
                     n_fft=hp.n_fft, 
                     hop_length=hp.hop_length, 
                     win_length=hp.win_length) 
    
    # magnitude spectrogram
    magnitude = np.abs(D) #(1+n_fft/2, T)
    
    # power spectrogram
    power = magnitude**2 #(1+n_fft/2, T) 
    
    # mel spectrogram
    S = librosa.feature.melspectrogram(S=power, n_mels=hp.n_mels) #(n_mels, T)

    return np.transpose(S.astype(np.float32)), np.transpose(magnitude.astype(np.float32)) # (T, n_mels), (T, 1+n_fft/2)
Example #20
def getChromagram(self, n_fft=4096, hop_length=1024):
    if self.chromagram is None:
        if self.spectra is None:
            stft = librosa.stft(self.signal, n_fft=n_fft, hop_length=hop_length)
            self.spectra = Spectra(stft, self.sr, n_fft, hop_length)
        self.chromagram = librosa.feature.chromagram(S=self.spectra.getMagnitude())
    return self.chromagram
Example #21
def mfcc_clustering(file_name, n_clusters):
    """
    Cluster learned spectral templates by their MFCCs (adapted from Prem).
    :return: array of separated time-domain sources
    """

    clusterer = KMeans(n_clusters=n_clusters)

    print(file_name)
    mix, sr = librosa.load(file_name)
    mix_stft = librosa.stft(mix)
    comps, acts = find_template(mix_stft, sr, 100, 101, 0, mix_stft.shape[1])
    cluster_comps = librosa.feature.mfcc(S=comps)[1:14]
    save_mfcc_img(file_name[:-4] + "_mfcc.png", np.flipud(cluster_comps))
    clusterer.fit_transform(cluster_comps.T)
    labels = clusterer.labels_
    # print(labels)
    sources = []

    for cluster_index in range(n_clusters):
        indices = np.where(labels == cluster_index)[0]
        template, residual = extract_template(comps[:, indices], mix_stft)
        t = librosa.istft(template)
        sources.append(t)

    return np.array(sources)
Example #22
def find_peaks(self, n_fft=2048, tau=10, kappa=10):
    ''' Extracts fingerprint peaks from the loaded audio data and returns
        them as a list of (frame, bin) tuples.
    '''
    # compute stft
    if self._audio_data is None:
        raise ValueError("No audio data loaded.")
    S = librosa.stft(self._audio_data, n_fft=n_fft)
    # librosa.stft returns shape (n_bins, n_frames)
    n_bins, n_frames = S.shape
    peaks = []
    for n in range(n_frames):
        for k in range(n_bins):
            p = S[k, n]
            is_peak = True
            # search the neighborhood for a bin with larger magnitude
            for i in range(tau):
                for j in range(kappa):
                    n_ = n + i - tau // 2
                    k_ = k + j - kappa // 2
                    if 0 <= n_ < n_frames and 0 <= k_ < n_bins and (n_, k_) != (n, k):
                        if abs(S[k_, n_]) > abs(p):
                            is_peak = False
            if is_peak:
                peaks.append((n, k))
    return peaks
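The quadruple loop is O(n_frames * n_bins * tau * kappa). The same local-maximum search can be vectorized with a 2-D maximum filter; a sketch of that alternative (not the original approach):

import numpy as np
import librosa
from scipy.ndimage import maximum_filter

def find_peaks_fast(audio_data, n_fft=2048, tau=10, kappa=10):
    # A bin is a peak iff it equals the maximum of its (kappa x tau) neighborhood
    S = np.abs(librosa.stft(audio_data, n_fft=n_fft))
    local_max = maximum_filter(S, size=(kappa, tau)) == S
    bins, frames = np.nonzero(local_max)
    return list(zip(frames, bins))  # (frame, bin) pairs, as in find_peaks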
Example #23
def percussive(y):
    '''Extract the percussive component of an audio time series'''

    D = librosa.stft(y)
    P = librosa.decompose.hpss(D)[1]
    
    return librosa.istft(P)
Example #24
def extract_features(audio_file: Path,
                     seconds: int = params.nsynth_max_seconds,
                     window_size: int = params.librosa_spec_windows,
                     hop_length: int = params.librosa_hop_length,
                     calc_chroma_stft=True,
                     calc_mfcc_stft=True,
                     calc_mfcc=True):

    audio_data, sr = librosa.load(audio_file, sr=None)

    if seconds:
        audio_data = audio_data[:seconds * sr]

    stft = np.abs(librosa.stft(
        audio_data,
        hop_length=hop_length)) if calc_chroma_stft or calc_mfcc_stft else None
    chroma_stft = librosa.feature.chroma_stft(
        S=stft, sr=sr, n_chroma=window_size,
        hop_length=hop_length) if calc_chroma_stft else None
    mfcc_stft = librosa.feature.mfcc(
        audio_data, S=stft, sr=sr, n_mfcc=window_size,
        hop_length=hop_length) if calc_mfcc_stft else None
    mfcc = librosa.feature.mfcc(
        audio_data, sr=sr, n_mfcc=window_size,
        hop_length=hop_length) if calc_mfcc else None

    return (chroma_stft, mfcc_stft, mfcc)
Example #25
def makeSpectragrams(filename):
    f, sr = librosa.load(filename)
    print("first")
    melSpectra = librosa.feature.melspectrogram(f)
    cqtSpectra = librosa.cqt(f)
    stftSpectra = librosa.stft(f)
    print("stuff")
    librosa.display.specshow(melSpectra)
    #    plt.specgram(melSpectra)
    imageName = filename + "MelSpectragram.png"
    title = "Mel Spectrogram \nof " + filename[26:]
    plt.title(title)
    plt.ion()
    # plt.savefig(imageName)
    plt.show()

    librosa.display.specshow(np.abs(cqtSpectra))  # magnitude for display
    title = "Constant Q Spectrogram \nof " + filename[26:]
    plt.title(title)
    # plt.spectrogram(cqtSpectra)
    plt.show()

    librosa.display.specshow(np.abs(stftSpectra))  # magnitude for display
    title = "STFT Spectrogram \nof " + filename[26:]
    plt.title(title)
    plt.show()

    return True
Example #26
def get_feature(fname):
    b, _ = librosa.load(fname, res_type='kaiser_fast')
    try:
        mfcc = np.mean(librosa.feature.mfcc(y = b,n_mfcc=60).T,axis=0)
        mels = np.mean(librosa.feature.melspectrogram(b, sr = SAMPLE_RATE).T,axis = 0)
        stft = np.abs(librosa.stft(b))
        chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr = SAMPLE_RATE).T,axis = 0)
        contrast=np.mean(librosa.feature.spectral_contrast(S=stft, sr=SAMPLE_RATE).T,axis=0)
        tonnetz=np.mean(librosa.feature.tonnetz(librosa.effects.harmonic(b), sr = SAMPLE_RATE).T,axis = 0)
        ft2 = librosa.feature.zero_crossing_rate(b)[0]
        ft3 = librosa.feature.spectral_rolloff(b)[0]
        ft4 = librosa.feature.spectral_centroid(b)[0]
        ft5 = librosa.feature.spectral_contrast(b)[0]
        ft6 = librosa.feature.spectral_bandwidth(b)[0]
        ft2_trunc = np.hstack([np.mean(ft2),np.std(ft2), skew(ft2), np.max(ft2), np.min(ft2)])
        ft3_trunc = np.hstack([np.mean(ft3),np.std(ft3), skew(ft3), np.max(ft3), np.min(ft3)])
        ft4_trunc = np.hstack([np.mean(ft4),np.std(ft4), skew(ft4), np.max(ft4), np.min(ft4)])
        ft5_trunc = np.hstack([np.mean(ft5),np.std(ft5), skew(ft5), np.max(ft5), np.min(ft5)])
        ft6_trunc = np.hstack([np.mean(ft6),np.std(ft6), skew(ft6), np.max(ft6), np.min(ft6)])
        return pd.Series(np.hstack((mfcc,mels,chroma,contrast,tonnetz,ft2_trunc, ft3_trunc, ft4_trunc, ft5_trunc, ft6_trunc)))
        #d = np.hstack([mfcc,mels,chroma,contrast,tonnetz,ft2_trunc,ft3_trunc,ft4_trunc,ft5_trunc,ft6_trunc])
        #features = np.empty((0,238))
        #d = np.vstack([features,d])
    except Exception:
        print('bad file')
        return pd.Series([0]*238)
Example #27
def parse_audio(path, audio_conf, windows, normalize=True):
    '''
    Input:
        path       : string, path of the audio file to load
        audio_conf : dict, audio parameters for computing the spectrogram
        windows    : dict, window types
    Output:
        spect      : ndarray, spectrum of each frame
    '''
    y = load_audio(path)
    n_fft = int(audio_conf['sample_rate']*audio_conf["window_size"])
    win_length = n_fft
    hop_length = int(audio_conf['sample_rate']*audio_conf['window_stride'])
    window = windows[audio_conf['window']]
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                        win_length=win_length, window=window)
    spect, phase = librosa.magphase(D)

    spect = np.log1p(spect)
    
    if normalize:
        mean = spect.mean()
        std = spect.std()
        spect = np.add(spect, -mean)
        spect = np.divide(spect, std)
    
    return spect.transpose()
Example #28
def melspectrogram(y, sr=22050, n_fft=256, hop_length=128, **kwargs):
    """Compute a mel spectrogram from a time series

    Arguments:
      y             --  (ndarray)   audio time-series
      sr            --  (int)       sampling rate of y        | default: 22050
      n_fft         --  (int)       number of FFT components  | default: 256
      hop_length    --  (int)       frames to hop             | default: 128

      **kwargs      --  Mel filterbank parameters
                        See melfb() documentation for details.

    Returns S:
      S             -- (ndarray)   Mel spectrogram

    """

    # Compute the STFT
    powspec     = np.abs(librosa.stft(y,   
                                      n_fft       =   n_fft, 
                                      hann_w      =   n_fft, 
                                      hop_length  =   hop_length))**2

    # Build a Mel filter
    mel_basis   = melfb(sr, n_fft, **kwargs)

    # Remove everything past the Nyquist frequency
    mel_basis   = mel_basis[:, :(n_fft // 2 + 1)]
    
    return np.dot(mel_basis, powspec)
Example #29
def compute_cqt(filename):
    a, sr = librosa.load(filename, sr=SR)
    spectrum = librosa.stft(a)
    harm_spec, _ = librosa.decompose.hpss(spectrum)
    harm = librosa.istft(harm_spec)
    cqt = np.abs(librosa.cqt(harm, sr=sr, hop_length=HOP, real=False))
    return cqt
Example #30
def test_real_hpss():

    # Load an audio signal
    y, sr = librosa.load('data/test1_22050.wav')

    D = np.abs(librosa.stft(y))
    
    def __hpss_test(window, power, mask, margin):
        H, P = librosa.decompose.hpss(D, kernel_size=window, power=power, mask=mask, margin=margin)

        if margin == 1.0 or margin == (1.0, 1.0):
            if mask:
                assert np.allclose(H + P, np.ones_like(D))
            else:
                assert np.allclose(H + P, D)
        else:
            if mask: 
                assert not np.any(H.astype(bool) & P.astype(bool))
            else:
                assert np.all(H + P <= D)

    for window in [31, (5, 5)]:
        for power in [1, 2, 10]:
            for mask in [False, True]:
                for margin in [1.0, 3.0, (1.0, 1.0), (9.0, 10.0)]:
                    yield __hpss_test, window, power, mask, margin
Example #31
    def local_audio_3d(self, filepath, mode="waveform"):
        """ Converts a local audio file into a 3D model.

        Args:
            filepath: string containing the path to the audio file
            mode: musical parameter to be used to create the 3D model
                  options: waveform, stft
        """

        print "Loading audio file " + filepath
        waveform, sr = librosa.load(filepath)

        if mode == "waveform":  # time domain analysis
            # Downsample waveform and store positive values only
            if len(waveform) > 1000:
                m = len(str(len(waveform)))
                downsample_factor = (10 ** (m-1-3) * int(str(len(waveform))[0]))  # 1k (i.e. 10^3) magnitude
            else:
                downsample_factor = 1
            half_waveform = [waveform[i] for i in range(len(waveform)) if waveform[i] > 0 and i % downsample_factor == 0]

            # Reshape and rescale waveform
            processed_waveform = self._movingaverage(half_waveform, self.ma_window_size)
            # processed_waveform = self._limit_spikes(half_waveform, np.mean(half_waveform), 5)
            processed_waveform = self._rescale_list(processed_waveform, 0, self.height_Y)
            processed_waveform = self.make_waveform_square(processed_waveform, self.n_waveform_bars)  # make waveform "square"

            # Convert 2D waveform into 3D
            print "Creating 3D model"
            model_3d = self.make_waveform_3d(processed_waveform, self.height_Z)

        else:  # frequency domain analysis
            self.mask_val /= 100
            # Get STFT magnitude
            print "Analyzing frequency components"
            stft = librosa.stft(waveform, n_fft=256)
            stft, phase = librosa.magphase(stft)
            # Downsample and rescale STFT
            if len(stft[0]) > 1000:
                m = len(str(len(stft[0])))
                downsample_factor = (10 ** (m-1-3)) * int(str(len(stft[0]))[0])  # 1k (i.e. 10^3) magnitude
            else:
                downsample_factor = 1
            new_stft = []
            for curr_fft in stft:
                min_loudness_value = max(curr_fft)
                ds_fft = [curr_fft[j] + min_loudness_value for j in range(len(curr_fft)) if j % downsample_factor == 0]
                ds_fft = self._rescale_list(ds_fft, self.min_absolute_value, self.height_Z)
                new_stft.append(ds_fft)
            print "Creating 3D model"
            model_3d = np.array(new_stft)

        print "Exporting the 3D file"
        if self.OUTPUT_FOLDER[-1] != '/':
            self.OUTPUT_FOLDER.append('/')
        output_filename = self.OUTPUT_FOLDER + filepath.split('/')[-1][:-4] + "_" + mode + ".stl"
        numpy2stl(model_3d, 
                  output_filename, 
                  scale=self.scale, 
                  mask_val=self.mask_val, 
                  solid=True)
Example #32
def plot_signal(idx, data):
    if len(idx) == 0:
        raise PreventUpdate
    figs = make_subplots(rows=2,
                         cols=1,
                         subplot_titles=('Waveform', 'Spectrogram'))
    try:
        filename = data[idx[0]]['audio_filepath']
        audio, fs = librosa.load(filename, sr=None)
        time_stride = 0.01
        hop_length = int(fs * time_stride)
        n_fft = 512
        # linear scale spectrogram
        s = librosa.stft(y=audio, n_fft=n_fft, hop_length=hop_length)
        s_db = librosa.power_to_db(np.abs(s)**2, ref=np.max, top_db=100)
        figs.add_trace(
            go.Scatter(
                x=np.arange(audio.shape[0]) / fs,
                y=audio,
                line={'color': 'green'},
                name='Waveform',
                hovertemplate=
                'Time: %{x:.2f} s<br>Amplitude: %{y:.2f}<br><extra></extra>',
            ),
            row=1,
            col=1,
        )
        figs.add_trace(
            go.Heatmap(
                z=s_db,
                colorscale=[
                    [0, 'rgb(30,62,62)'],
                    [0.5, 'rgb(30,128,128)'],
                    [1, 'rgb(30,255,30)'],
                ],
                colorbar=dict(yanchor='middle',
                              lenmode='fraction',
                              y=0.2,
                              len=0.5,
                              ticksuffix=' dB'),
                dx=time_stride,
                dy=fs / n_fft / 1000,
                name='Spectrogram',
                hovertemplate=
                'Time: %{x:.2f} s<br>Frequency: %{y:.2f} kHz<br>Magnitude: %{z:.2f} dB<extra></extra>',
            ),
            row=2,
            col=1,
        )
        figs.update_layout({
            'margin': dict(l=0, r=0, t=20, b=0, pad=0),
            'height': 500
        })
        figs.update_xaxes(title_text='Time, s', row=1, col=1)
        figs.update_yaxes(title_text='Amplitude', row=1, col=1)
        figs.update_xaxes(title_text='Time, s', row=2, col=1)
        figs.update_yaxes(title_text='Frequency, kHz', row=2, col=1)
    except Exception:
        pass

    return figs
Example #33
"""
Created on Fri Mar  8 08:41:08 2019

@author: MR toad
"""

import librosa
import os
import numpy as np
import librosa.display
import matplotlib.pyplot as plt
#from PIL import Image

file_path = 'E:/speech/'
mfcc_path = 'E:/mfcc/'
pic_path = 'E:/pic'

file_name_list = os.listdir(file_path)
for file_name in file_name_list:
    y, sr = librosa.load(file_path + file_name)
    mfcc_feature = librosa.feature.mfcc(y=y, sr=sr)
    np.save(mfcc_path + file_name.split('.')[0] + ".npy", mfcc_feature)

    plt.figure(figsize=(12, 8))
    D = librosa.amplitude_to_db(librosa.stft(y), ref=np.max)
    plt.subplot(4, 2, 1)
    librosa.display.specshow(D, y_axis='linear')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Linear-frequency power spectrogram')
    plt.savefig(file_name.split('.')[0] + ".png", dpi=300)
Example #34
 def _stft(self, x):
     return librosa.stft(x, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length)
Example #35
def GenerateLibrosaFeatures(wav_path):
    y, sr = librosa.load(wav_path)
    stft = librosa.stft(y, n_fft=BUFFER_LENGTH, hop_length=HOP_LENGTH)
    D = np.abs(stft)**2
    S = np.log(librosa.feature.melspectrogram(S=D, n_mels=MEL_COUNT))
    return S
Example #36
for chromagram_i in range(12):
    data[i,50+chromagram_i] = chromagram_mean[chromagram_i]
    data[i,62+chromagram_i] = chromagram_var[chromagram_i]

#plt.figure(figsize=(16, 6))
#librosa.display.specshow(chromagram, x_axis='time', y_axis='chroma', hop_length=hop_length, cmap='coolwarm')

#----------------------

#Fourier Transform
# Default FFT window size
n_fft = 2048                    # FFT window size
hop_length = 512                # number of audio frames between STFT columns (looks like a good default)
# Short-time Fourier transform (STFT)
D = np.abs(librosa.stft(audio_file, n_fft=n_fft, hop_length=hop_length))    # magnitude D(f, t)
frequent_weights = D.sum(axis=1)               # total weight of each frequency over time (rows: frequency, columns: time)
frequent_list = librosa.fft_frequencies(sr=sr, n_fft=n_fft)/sr
#print(np.shape(frequent_list))
#print(np.shape(frequent_weight))
#print(frequent_list)
#print(frequent_weight)
chroma_stft_mean = np.average(frequent_list,weights=frequent_weights)
chroma_stft_var = sum(((frequent_list-chroma_stft_mean)*frequent_weights)**2)/sum(frequent_weights)
#print(chroma_stft_mean)
#print(chroma_stft_var)

data[i,74] = chroma_stft_mean
data[i,75] = chroma_stft_var

print(data)
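The hand-rolled weighted mean and variance above are essentially a spectral centroid; librosa's built-in per-frame version can serve as a cross-check (note the manual code divides frequencies by sr, so the scales differ by that factor):

# Per-frame spectral centroid from the same magnitude spectrogram D (in Hz)
cent = librosa.feature.spectral_centroid(S=D, sr=sr, n_fft=n_fft, hop_length=hop_length)
print(cent.mean(), cent.var())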
Example #37
def get_spec(wav, n_fft=1024, window="hamming", hop_length=256):
    return librosa.stft(wav, window=window, n_fft=n_fft, hop_length=hop_length)
Example #38
def stft_fn(y):
    return librosa.stft(y=y,
                        n_fft=int(frame_size),
                        hop_length=hop_size,
                        center=False).T
Example #39
def specgram_lbrs(audiopath: str, plotpath: str=None, name: str=None,
                  cmap: str='gray_r', algorithm='default', y_axis=None,
                  **kwargs):
    """
    Generates a spectrogram of an audio file using librosa.display.specshow
    function.

    The output will be in png format.

    :param audiopath: string
        Path of the audio file.
    :param plotpath: string
        Path to plot the spectrogram. Default to the current working directory.
    :param name: string
        Name of the output image. Default to audio name.
    :param cmap: string
        Automatic colormap detection
        See matplotlib.pyplot.pcolormesh.
    :type algorithm: str or callable
    :param algorithm: Algorithm to use to compute the spectrogram.
        Available algorithms: 'default', 'mel', 'log'.
        The 'default' mode uses the librosa.stft function to compute the
        spectrogram data, the 'mel' uses the librosa.feature.melspectrogram
        function and 'log' uses librosa.cqt function.
        Expected return type of the algorithm: np.ndarray [shape=(Any, t)]
    :param y_axis: None or str.
        Range for the y-axes. This parameter is passed to the
        librosa.display.specshow function.
    :param kwargs:
        Additional kwargs are passed on to the defined algorithm function.

    """
    if plotpath is not None and not os.path.isdir(plotpath):
        os.makedirs(plotpath)
    if algorithm not in ['mel', 'log', 'default'] and not callable(algorithm):
        raise ValueError('Unrecognized scale or not a callable object.')

    # Load audio and convert it to mono
    y, sr = librosa.load(audiopath)
    y = librosa.core.to_mono(y)

    # Apply algorithm to obtain an array of spectrogram data
    if algorithm == 'default':
        spec_data = librosa.stft(y, **kwargs)
        # Convert the data spectrogram to decibel units
        spec_data = librosa.power_to_db(librosa.magphase(spec_data, power=2)[0],
                                        ref=np.max)
    elif algorithm == 'mel':
        kwargs.setdefault('n_mels', 128)
        kwargs.setdefault('fmax', 8000)
        spec_data = librosa.feature.melspectrogram(y=y, sr=sr, **kwargs)
        # Convert the data spectrogram to decibel units
        spec_data = librosa.power_to_db(spec_data, ref=np.max)
    elif algorithm == 'log':
        spec_data = librosa.cqt(y, sr, **kwargs)
        # Convert the data spectrogram to decibel units
        spec_data = librosa.power_to_db(librosa.magphase(spec_data, power=2)[0],
                                        ref=np.max)
    else:
        spec_data = algorithm(y=y, sr=sr, **kwargs)

    # Plot spectrogram
    fig = plt.figure(figsize=FIG_SIZE)
    librosa.display.specshow(spec_data, sr=sr, cmap=cmap, y_axis=y_axis)
    plt.axis('off')
    fig.subplots_adjust(left=0, right=1, bottom=0, top=1)

    if name is None:
        name = audiopath.split('/')[-1]

    if plotpath is not None:
        plt.savefig(plotpath + '/' + name + '.png')
    else:
        plt.savefig(name + '.png')
    plt.close()
Example #40
def to_spec(wav, len_frame=ModelConfig.L_FRAME, len_hop=ModelConfig.L_HOP):
    return librosa.stft(wav, n_fft=len_frame, hop_length=len_hop)
Example #41
def features(X, sample_rate):
    stft = np.abs(librosa.stft(X))

    # fmin and fmax correspond to the minimum/maximum fundamental frequency of human speech
    pitches, magnitudes = librosa.piptrack(X,
                                           sr=sample_rate,
                                           S=stft,
                                           fmin=70,
                                           fmax=400)
    pitch = []
    for i in range(magnitudes.shape[1]):
        index = magnitudes[:, i].argmax()
        pitch.append(pitches[index, i])

    pitch_tuning_offset = librosa.pitch_tuning(pitches)
    pitchmean = np.mean(pitch)
    pitchstd = np.std(pitch)
    pitchmax = np.max(pitch)
    pitchmin = np.min(pitch)

    # spectral centroid
    cent = librosa.feature.spectral_centroid(y=X, sr=sample_rate)
    cent = cent / np.sum(cent)
    meancent = np.mean(cent)
    stdcent = np.std(cent)
    maxcent = np.max(cent)

    # spectral flatness
    flatness = np.mean(librosa.feature.spectral_flatness(y=X))

    # MFCC features with 50 coefficients
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=50).T,
                    axis=0)
    mfccsstd = np.std(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=50).T,
                      axis=0)
    mfccmax = np.max(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=50).T,
                     axis=0)

    # chromagram
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,
                     axis=0)

    # mel spectrogram
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis=0)

    # octave-based spectral contrast
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft,
                                                         sr=sample_rate).T,
                       axis=0)

    # zero-crossing rate
    zerocr = np.mean(librosa.feature.zero_crossing_rate(X))

    S, phase = librosa.magphase(stft)
    meanMagnitude = np.mean(S)
    stdMagnitude = np.std(S)
    maxMagnitude = np.max(S)

    # RMS energy
    rmse = librosa.feature.rmse(S=S)[0]
    meanrms = np.mean(rmse)
    stdrms = np.std(rmse)
    maxrms = np.max(rmse)

    ext_features = np.array([
        flatness, zerocr, meanMagnitude, maxMagnitude, meancent, stdcent,
        maxcent, stdMagnitude, pitchmean, pitchmax, pitchstd,
        pitch_tuning_offset, meanrms, maxrms, stdrms
    ])

    ext_features = np.concatenate(
        (ext_features, mfccs, mfccsstd, mfccmax, chroma, mel, contrast))

    return ext_features
Example #42
plt.xlabel("Frequency")
plt.ylabel("Magnitude")
plt.title("Power spectrum")

# STFT -> spectrogram
hop_length = 512  # in num. of samples
n_fft = 2048  # window in num. of samples

# calculate duration hop length and window in seconds
hop_length_duration = float(hop_length) / sample_rate
n_fft_duration = float(n_fft) / sample_rate
print("STFT hop length duration is: {}s".format(hop_length_duration))
print("STFT window duration is: {}s".format(n_fft_duration))

# perform stft
stft = librosa.stft(signal, n_fft=n_fft, hop_length=hop_length)

# calculate abs values on complex numbers to get magnitude
spectrogram = np.abs(stft)

# display spectrogram
plt.figure(figsize=FIG_SIZE)
librosa.display.specshow(spectrogram, sr=sample_rate, hop_length=hop_length)
plt.xlabel("Time")
plt.ylabel("Frequency")
plt.colorbar()
plt.title("Spectrogram")

# apply logarithm to cast amplitude to Decibels
log_spectrogram = librosa.amplitude_to_db(spectrogram)
plt.figure(figsize=FIG_SIZE)
Example #43
        for j in range(
                len(info1)):  #Get the files in the folder inside each id
            #print(j)
            path1 = path + '/' + info1[j]  # Path of the info1 folder
            info2 = os.listdir(path1)  # Get the audio files

            for k in range(len(info2)):  # For each audio files
                path2 = path1 + '/' + info2[k]
                sig, rate = librosa.load(path2, duration=3.0)

                #rate, sig = wav.read(path2)   # Read .wav signal
                sig = scipy.signal.medfilt(sig, kernel_size=None)  # Filtering
                mfcc1 = librosa.feature.mfcc(
                    sig, rate, n_mfcc=13)  #, hop_length=800, n_fft=1600)
                #print(mfcc1.shape)
                librosa_stft = np.abs(librosa.stft(sig))

                cent = librosa.feature.spectral_centroid(y=sig, sr=rate)
                #print(cent.shape)
                flat = librosa.feature.spectral_flatness(y=sig)
                #print(flat.shape)
                rolloff = librosa.feature.spectral_rolloff(y=sig,
                                                           sr=rate,
                                                           roll_percent=0.99)
                #print(rolloff.shape)
                zcr = librosa.feature.zero_crossing_rate(sig)
                #print(zcr.shape)

                Tot_feat = np.vstack(
                    (mfcc1, librosa_stft, cent, flat, rolloff, zcr))
                #Tot_feat = speechpy.processing.cmvn(Tot_feat,variance_normalization=True)
Example #44
def separate_melody_accompaniment(x,
                                  Fs,
                                  N,
                                  H,
                                  traj,
                                  n_harmonics=10,
                                  tol_cent=50):
    """F0-based melody-accompaniement separation

    Notebook: C8/C8S2_MelodyExtractSep.ipynb

    Args:
        x: Audio signal
        Fs: Sampling frequency
        N: Window size in samples
        H: Hopsize in samples
        traj: F0 traj (time in seconds in 1st column, frequency in Hz in 2nd column)
        n_harmonics: Number of harmonics
        tol_cent: Tolerance in cents

    Returns:
        x_mel: Reconstructed audio signal for melody
        x_acc: Reconstructed audio signal for accompaniment
    """
    # Compute STFT
    X = librosa.stft(x,
                     n_fft=N,
                     hop_length=H,
                     win_length=N,
                     pad_mode='constant')
    Fs_feature = Fs / H
    T_coef = np.arange(X.shape[1]) / Fs_feature
    freq_res = Fs / N
    F_coef = np.arange(X.shape[0]) * freq_res

    # Adjust trajectory
    traj_X_values = interp1d(traj[:, 0],
                             traj[:, 1],
                             kind='nearest',
                             fill_value='extrapolate')(T_coef)
    traj_X = np.hstack((T_coef[:, None], traj_X_values[:, None]))

    # Compute binary masks
    mask_mel = convert_trajectory_to_mask_cent(traj_X,
                                               F_coef,
                                               n_harmonics=n_harmonics,
                                               tol_cent=tol_cent)
    mask_acc = np.ones(mask_mel.shape) - mask_mel

    # Compute masked STFTs
    X_mel = X * mask_mel
    X_acc = X * mask_acc

    # Reconstruct signals
    x_mel = librosa.istft(X_mel,
                          hop_length=H,
                          win_length=N,
                          window='hann',
                          center=True,
                          length=x.size)
    x_acc = librosa.istft(X_acc,
                          hop_length=H,
                          win_length=N,
                          window='hann',
                          center=True,
                          length=x.size)

    return x_mel, x_acc
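convert_trajectory_to_mask_cent comes from the same notebook's support code. A compact sketch of what it computes, assuming a binary mask marking every bin within tol_cent cents of any of the first n_harmonics multiples of the F0 trajectory:

import numpy as np

def convert_trajectory_to_mask_cent(traj, F_coef, n_harmonics=10, tol_cent=50):
    # Binary mask of shape (n_bins, n_frames)
    mask = np.zeros((len(F_coef), traj.shape[0]))
    ratio = 2 ** (tol_cent / 1200)  # cents tolerance as a frequency ratio
    for t, f0 in enumerate(traj[:, 1]):
        if f0 <= 0:
            continue  # unvoiced frame: nothing to mask
        for h in range(1, n_harmonics + 1):
            f = h * f0
            mask[(F_coef >= f / ratio) & (F_coef <= f * ratio), t] = 1
    return mask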
Example #45
def extract_IMU_sound_video(video_IMU_data):
    features = np.empty((0, Data_window_watch, 12))
    labels = []
    features_sound = np.empty((0, 193))
    features_video = np.empty((0, Data_window_video, 64, 64, 3))

    for i in range(len(video_IMU_data)):
        print(i)

        #Saving features after every 100 samples
        if (i + 1) % 100 == 0:
            features_video = features_video.astype('uint8')
            data = [features, labels, features_sound, features_video]
            with open('Data_train_' + str(i) + 'ser.pkl', 'wb') as f:
                pickle.dump(data, f)
            features = np.empty((0, Data_window_watch, 12))
            labels = []
            features_sound = np.empty((0, 193))
            features_video = np.empty((0, Data_window_video, 64, 64, 3))

        #print(video_IMU_data[i][0])
        V_name = video_IMU_data[i][0]
        V_type = video_IMU_data[i][1]
        V_stime = video_IMU_data[i][2]
        V_etime = video_IMU_data[i][3]
        #print(V_name,V_etime-V_stime)
        duration = V_etime - V_stime

        #Sound Features
        Sound_X = video_IMU_data[i][6]
        Sound_sample_rate_expected = video_IMU_data[i][5]
        #print('Leng of sound:',len(Sound_X))
        sound_len = len(Sound_X)
        sound_sampling = (sound_len *
                          1000) / duration  # duration is in milliseconds

        #Sound windows below
        sound_win = []
        start = 0
        end = start + sound_sampling * TimeWindow
        end = int(end)
        while end <= sound_len:
            winx = Sound_X[start:end]
            stft = np.abs(librosa.stft(winx))
            mfccs = np.mean(librosa.feature.mfcc(y=winx,
                                                 sr=Sound_sample_rate_expected,
                                                 n_mfcc=40).T,
                            axis=0)
            chroma = np.mean(librosa.feature.chroma_stft(
                S=stft, sr=Sound_sample_rate_expected).T,
                             axis=0)
            mel = np.mean(librosa.feature.melspectrogram(
                winx, sr=Sound_sample_rate_expected).T,
                          axis=0)
            contrast = np.mean(librosa.feature.spectral_contrast(
                S=stft, sr=Sound_sample_rate_expected).T,
                               axis=0)
            tonnetz = np.mean(librosa.feature.tonnetz(
                y=librosa.effects.harmonic(winx),
                sr=Sound_sample_rate_expected).T,
                              axis=0)
            #print (start,end,len(winx))
            ext_features = np.hstack([mfccs, chroma, mel, contrast, tonnetz])
            sound_win.append(ext_features)
            start = start + sound_sampling * TimeWindow_slide
            start = int(start)
            end = start + sound_sampling * TimeWindow
            end = int(end)

        #Video Features and Windows
        video_len = video_IMU_data[i][7]
        #print(video_len,duration)
        video_sampling = (video_len * 1000) / duration

        video_raw = []
        start = 0
        end = start + video_sampling * TimeWindow
        end = int(end)
        #print('Video Sampling Rate:',video_sampling)
        #print('Start is:',start,': End is:',end)
        #         cap = cv2.VideoCapture(V_name)
        #         video_continue=True
        #         while video_continue:
        #             video_continue, img = cap.read()
        #             #video_raw.append(cv2.resize(img, (img_size, img_size)))
        #             video_raw.append(cv2.resize(img, (img_size, img_size)))
        # #             video_raw.append(img)
        cap = cv2.VideoCapture(V_name)
        while True:
            ret, img = cap.read()
            if not ret:
                break
            video_raw.append(
                cv2.resize(img, (img_size, img_size)).astype('uint8'))

        Windows_video = []
        while end <= video_len:
            #print('Start is:',start,': End is:',end)
            win_video = video_raw[start:end]
            win_video = win_video[:Data_window_video]
            Windows_video.append(win_video)
            start = start + video_sampling * TimeWindow_slide
            start = int(start)
            end = start + video_sampling * TimeWindow
            end = int(end)

        #Dict of sensor data
        V_dict = dict()

        # Loop over the data items
        for j in range(len(video_IMU_data[i][4])):
            #print(video_IMU_data[i][4][j])
            S_type = video_IMU_data[i][4][j][0]
            timestamp = video_IMU_data[i][4][j][2]
            x = video_IMU_data[i][4][j][3]
            y = video_IMU_data[i][4][j][4]
            z = video_IMU_data[i][4][j][5]

            if S_type in V_dict:
                V_dict[S_type].append([timestamp, x, y, z])
            else:
                V_dict[S_type] = []
                V_dict[S_type].append([timestamp, x, y, z])
            #print(S_type,timestamp,x,y,z)
    # 3 = Acc Right Wrist
    # 4 = GYRO Right Wrist
    # 11 = Acc left Wrist
    # 12 = GYRO left Wrist

        raw3 = []
        raw4 = []
        raw11 = []
        raw12 = []
        for sid in V_dict:
            if sid == 'raw3':
                #print(sid,len(V_dict[sid]))
                # Timewindow 2 Sec, Sampling 25HZ.
                total_samples = len(V_dict[sid])
                sampling = (total_samples * 1000) / (duration)
                #print('Sampling',sampling)
                start = 0
                end = start + sampling * TimeWindow
                end = int(end)
                while end <= total_samples:
                    datawind = V_dict[sid][start:end]
                    #Fixing Datawind_leng Here:
                    datawind = datawind[:Data_window_watch]
                    raw3.append(datawind)

                    #print('window Samples:',len(datawind))
                    #winlen1.append(len(datawind))

                    #print(start,end)
                    start = start + (sampling) * (TimeWindow_slide)
                    start = int(start)
                    end = start + sampling * TimeWindow
                    end = int(end)

            if sid == 'raw4':
                #print(sid,len(V_dict[sid]))
                # Timewindow 2 Sec, Sampling 25HZ.
                total_samples = len(V_dict[sid])
                sampling = (total_samples * 1000) / (duration)
                #print('Sampling',sampling)
                start = 0
                end = start + sampling * TimeWindow
                end = int(end)
                while end <= total_samples:
                    datawind = V_dict[sid][start:end]
                    #print('window Samples:',len(datawind))
                    #winlen2.append(len(datawind))
                    datawind = datawind[:Data_window_watch]
                    raw4.append(datawind)
                    #print(start,end)
                    start = start + (sampling) * (TimeWindow_slide)
                    start = int(start)
                    end = start + sampling * TimeWindow
                    end = int(end)

            if sid == 'raw11':
                #print(sid,len(V_dict[sid]))
                # Timewindow 2 Sec, Sampling 25HZ.
                total_samples = len(V_dict[sid])
                sampling = (total_samples * 1000) / (duration)
                #print('Sampling',sampling)
                start = 0
                end = start + sampling * TimeWindow
                end = int(end)
                while end <= total_samples:
                    datawind = V_dict[sid][start:end]
                    #print('window Samples:',len(datawind))
                    #winlen3.append(len(datawind))
                    datawind = datawind[:Data_window_watch]
                    raw11.append(datawind)
                    #print(start,end)
                    start = start + (sampling) * (TimeWindow_slide)
                    start = int(start)
                    end = start + sampling * TimeWindow
                    end = int(end)

            if sid == 'raw12':
                #print(sid,len(V_dict[sid]))
                # Timewindow 2 Sec, Sampling 25HZ.
                total_samples = len(V_dict[sid])
                sampling = (total_samples * 1000) / (duration)
                #print('Sampling',sampling)
                start = 0
                end = start + sampling * TimeWindow
                end = int(end)
                while end <= total_samples:
                    datawind = V_dict[sid][start:end]
                    datawind = datawind[:Data_window_watch]
                    raw12.append(datawind)

                    #print('window Samples:',len(datawind))
                    #winlen4.append(len(datawind))

                    #print(start,end)
                    start = start + (sampling) * (TimeWindow_slide)
                    start = int(start)
                    end = start + sampling * TimeWindow
                    end = int(end)
            #print(sid,len(V_dict[sid]))
        #print(len(raw3),len(raw4),len(raw11),len(raw12))
        raw3 = np.array(raw3)
        raw4 = np.array(raw4)
        raw11 = np.array(raw11)
        raw12 = np.array(raw12)
        sound_win = np.array(sound_win)
        Windows_video = np.array(Windows_video)

        # We abandon the timestamp
        #print(raw3.shape,raw4.shape,raw11.shape,raw12.shape)
        raw3_windows = raw3.shape[0]
        raw4_windows = raw4.shape[0]
        raw11_windows = raw11.shape[0]
        raw12_windows = raw12.shape[0]
        #print('sensor wind:',raw12_windows)
        sound_win_windows = sound_win.shape[0]
        video_win_windows = Windows_video.shape[0]
        #print('Sound wind:',sound_win_windows)
        #print('Video wind:',video_win_windows)
        try:
            #Sometimes the watch has fewer features, and we don't know the reason
            min_features_watch = np.array(
                [raw3.shape[1], raw4.shape[1], raw11.shape[1],
                 raw12.shape[1]]).min()

            if min_features_watch < Data_window_watch:
                print('Skipping:', raw3.shape, raw4.shape, raw11.shape,
                      raw12.shape)
                continue
            min_features_Video = Windows_video.shape[1]
            if min_features_Video < Data_window_video:
                print('Skipping:', Windows_video.shape)
                continue
        except:
            print('Error Skipping:', raw3.shape, raw4.shape, raw11.shape,
                  raw12.shape)
            continue
        #print(sound_win_windows)
        min_windows = np.array([
            raw3_windows, raw4_windows, raw11_windows, raw12_windows,
            sound_win_windows, video_win_windows
        ]).min()

        if min_windows > 0:
            output = np.concatenate(
                (raw3[:min_windows, :, 1:4], raw4[:min_windows, :, 1:4],
                 raw11[:min_windows, :, 1:4], raw12[:min_windows, :, 1:4]),
                axis=2)
            #print(output.shape)
            features = np.vstack([features, output])
            # print(sound_win[:min_windows].shape)
            features_sound = np.vstack(
                [features_sound, sound_win[:min_windows]])
            #print(Windows_video[:min_windows].shape, features_video.shape)
            features_video = np.vstack(
                [features_video, Windows_video[:min_windows]])

            #output.shape[0],V_type
            for k in range(output.shape[0]):
                labels.append(V_type)

    #     print(raw4.shape)


#         print(output.shape)
#         print(features.shape, features_sound.shape, Windows_video[:min_windows].shape )
#         break
# 12-num of windows, 47-num of samples in window 12= 3*4 num of sensor reading types

    features_video = features_video.astype('uint8')
    data = [features, labels, features_sound, features_video]
    with open('Data_train_' + str(i) + '.pkl', 'wb') as f:
        pickle.dump(data, f)
    print('Saved: ' + 'Data_train_' + str(i) + '.pkl')

    print(features_video.shape)
    return features, labels, features_sound, features_video
Example #46
def stft(y):
    return librosa.stft(
        y=y,
        n_fft=hp.n_fft, hop_length=hp.hop_length, win_length=hp.win_length)
Example #47
     energy.append(np.mean(e))
     ent = 0.0
     m = np.mean(e)
     for j in range(0,len(e[0])):
          q = np.absolute(e[0][j] - m)
          ent = ent + (q * np.log10(q))
     entropy_of_energy.append(ent)
f_list_1 = []
f_list_1.append(zero_crossings)
f_list_1.append(energy)
f_list_1.append(entropy_of_energy)
f_np_1 = np.array(f_list_1)
f_np_1 = np.transpose(f_np_1)[:-1]
kmeans = KMeans(n_clusters=2, random_state=0).fit(f_np_1)
result = kmeans.predict(f_np_1)
D = li.amplitude_to_db(np.abs(li.stft(y)), ref=np.max)
plt.subplot(3,1,1)
plt.title("Audio Analog Signal")
plt.plot(y[1950:2000])
plt.subplot(3,1,2)
plt.title("Spectogram")
librosa.display.specshow(D, y_axis='linear')
plt.colorbar(format='%+2.0f dB')
plt.subplot(3,1,3)
plt.title("Audio Digital Signal")
plt.plot(result, marker='d', color='blue', drawstyle='steps')
plt.show()
stream.stop_stream()
stream.close()
audio.terminate()
Exemplo n.º 48
0
def LoadAudio(file_path):
    y, sr = load(file_path, sr=SR)
    stft = librosa.stft(y, n_fft=window_size, hop_length=hop_length)
    mag, phase = librosa.magphase(stft)
    return mag.astype(np.float32), phase
x = np.reshape(x, (-1, N_CHANNELS))

if np.array_equal(x.T, a_content):
    print("equal")
else:
    print("not equal")

print("content = ", a_content)
print("x = ", x)
diff = a_content - x.T
print("diff = ", diff)

a = np.zeros_like(a_content)
a[:N_CHANNELS, :] = np.exp(x.T) - 1
p = 2 * np.pi * np.random.random_sample(a.shape) - np.pi
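# Griffin-Lim style phase recovery: alternate between imposing the target
# magnitude 'a' and re-estimating the phase from the re-analysed signal.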
for i in range(500):
    S = a * np.exp(1j * p)
    x = librosa.istft(S)
    p = np.angle(librosa.stft(x, N_FFT))

OUTPUT_FILENAME = ('outputs/' + CONTENT_FILENAME[:-4] + '_' + STYLE_FILENAME[:-4]
                   + '_ctw-' + str(CONTENT_WEIGHT) + '_stw-' + str(STYLE_WEIGHT)
                   + '_iter-' + str(ITERATIONS) + '.wav')
librosa.output.write_wav(OUTPUT_FILENAME, x, fs)
print(OUTPUT_FILENAME)
print("done")
Exemplo n.º 50
0
my_dpi = 120

for index, row in speakers_filtered.iterrows():
    dir_ = root + '/' + row['SUBSET'] + '/' + str(row['ID']) + '/'
    print('working on df row {}, speaker {}'.format(index, row['CODE']))
    if not os.path.exists(dir_):
        print('dir {} does not exist, skipping'.format(dir_))
        continue

    files_iter = Path(dir_).glob('**/*.flac')
    files_ = [str(f) for f in files_iter]

    for f in files_:
        ay, sr = librosa.load(f)
        duration = ay.shape[0] / sr
        start = 0
        while start + 5 < duration:
            slice_ = ay[start * sr:(start + 5) * sr]
            start = start + 5 - 1  # advance 4 s, keeping 1 s of overlap between slices
            x = librosa.stft(slice_)
            xdb = librosa.amplitude_to_db(abs(x))
            plt.figure(figsize=(227 / my_dpi, 227 / my_dpi), dpi=my_dpi)
            plt.axis('off')
            librosa.display.specshow(xdb, sr=sr, x_axis='time', y_axis='log')
            plt.savefig(root + '/train-gram/' + str(row['CODE']) + '/' +
                        uuid.uuid4().hex + '.png',
                        dpi=my_dpi)
            plt.close()

    print('work done on index {}, speaker {}'.format(index, row['CODE']))
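One caveat with the loop above: plt.savefig fails if the per-speaker output directory does not exist yet. A small, assumed guard (path layout taken from the savefig call), placed before the inner loop:

# Hypothetical: create the output directory before saving spectrogram images.
out_dir = os.path.join(root, 'train-gram', str(row['CODE']))
os.makedirs(out_dir, exist_ok=True)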
Exemplo n.º 51
0
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt

# Get the file path to the included audio example
filename = 'Train/blues/blues.00000.au'

# Load the example clip
y, sr = librosa.load(filename)

# Compute spectral centroid
sc = librosa.feature.spectral_centroid(y=y, sr=sr)

# Compute spectrogram
S, phase = librosa.magphase(librosa.stft(y=y))
librosa.feature.spectral_centroid(S=S)

# Plot the result
plt.figure()
plt.subplot(2, 1, 1)
plt.semilogy(sc.T, label='Spectral centroid')
plt.ylabel('Hz')
plt.xticks([])
plt.xlim([0, sc.shape[-1]])
plt.legend()
plt.subplot(2, 1, 2)
librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max),
                         y_axis='log',
                         x_axis='time')
plt.title('log Power spectrogram')

def read_audio_spectum(filename):
    x, fs = librosa.load(filename)
    S = librosa.stft(x, N_FFT)
    p = np.angle(S)
    S = np.log1p(np.abs(S[:, :430]))
    return S, fs
Exemplo n.º 53
0
def getStft(y, n_fft=2048, hop_length=512):
    return librosa.feature.rmse(S=librosa.stft(y, n_fft=n_fft, hop_length=hop_length))[0]
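Despite its name, getStft returns the per-frame RMS energy computed from the STFT, not the STFT itself. librosa renamed feature.rmse to feature.rms in version 0.7; a hedged equivalent for newer versions, taking the magnitude explicitly:

def get_rms(y, n_fft=2048, hop_length=512):
    # Same computation for librosa >= 0.7, where rmse became rms.
    S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length))
    return librosa.feature.rms(S=S)[0]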
Exemplo n.º 54
0
def display_sample_info(file_path, label=''):
    """Generate various representations a given audio file.
    E.g. Mel, MFCC and power spectrogram's.

    Args:
        file_path (str): Path to the audio file.
        label (str): Optional label to display for the given audio file.

    Returns:
        Nothing.
    """

    if not os.path.isfile(file_path):
        raise ValueError('{} does not exist.'.format(file_path))

    # By default, all audio is mixed to mono and resampled to 22050 Hz at load time.
    y, sr = librosa.load(file_path, sr=None, mono=True)

    # At 16000 Hz: 512 samples ~= 32 ms, 200 samples = 12.5 ms, 16 samples = 1 ms.
    hop_length = 200  # Number of samples between successive frames, e.g. columns of a spectrogram.
    f_max = sr / 2.  # Maximum frequency (Nyquist rate).
    f_min = 64.  # Minimum frequency.
    n_fft = 1024  # Number of samples in a frame.
    n_mels = 80  # Number of Mel bins to generate.
    n_mfcc = 13  # Number of Mel cepstral coefficients to extract.
    win_length = 333  # Window length.

    # Create info string.
    num_samples = y.shape[0]
    duration = librosa.get_duration(y=y, sr=sr)
    info_str_format = 'Label: {}\nPath: {}\nDuration={:.3f}s with {:,d} Samples\n' \
                      'Sampling Rate={:,d} Hz\nMin, Max=[{:.2f}, {:.2f}]'
    info_str = info_str_format.format(label, file_path, duration, num_samples,
                                      sr, np.min(y), np.max(y))
    print(info_str)
    # Escape some LaTeX special characters
    info_str_tex = info_str.replace('_', '\\_')

    plt.figure(figsize=(10, 7))
    plt.subplot(3, 1, 1)
    display.waveplot(y, sr=sr)
    plt.title('Monophonic')

    # Plot waveforms.
    y_harm, y_perc = librosa.effects.hpss(y)
    plt.subplot(3, 1, 2)
    display.waveplot(y_harm, sr=sr, alpha=0.33)
    display.waveplot(y_perc, sr=sr, color='r', alpha=0.40)
    plt.title('Harmonic and Percussive')

    # Add file information.
    plt.subplot(3, 1, 3)
    plt.axis('off')
    plt.text(0.0, 1.0, info_str_tex, color='black', verticalalignment='top')
    plt.tight_layout()

    # Calculating MEL spectrogram and MFCC.
    db_pow = np.abs(
        librosa.stft(y=y,
                     n_fft=n_fft,
                     hop_length=hop_length,
                     win_length=win_length))**2

    s_mel = librosa.feature.melspectrogram(S=db_pow,
                                           sr=sr,
                                           hop_length=hop_length,
                                           fmax=f_max,
                                           fmin=f_min,
                                           n_mels=n_mels)

    s_mel = librosa.power_to_db(s_mel, ref=np.max)
    s_mfcc = librosa.feature.mfcc(S=s_mel, sr=sr, n_mfcc=n_mfcc)

    # STFT (Short-time Fourier Transform)
    # https://librosa.github.io/librosa/generated/librosa.core.stft.html
    plt.figure(figsize=(12, 10))
    db = librosa.amplitude_to_db(librosa.magphase(librosa.stft(y))[0],
                                 ref=np.max)
    plt.subplot(3, 2, 1)
    display.specshow(db,
                     sr=sr,
                     x_axis='time',
                     y_axis='linear',
                     hop_length=hop_length)
    plt.colorbar(format='%+2.0f dB')
    plt.title('Linear-frequency power spectrogram')

    plt.subplot(3, 2, 2)
    display.specshow(db,
                     sr=sr,
                     x_axis='time',
                     y_axis='log',
                     hop_length=hop_length)
    plt.colorbar(format='%+2.0f dB')
    plt.title('Log-frequency power spectrogram')

    plt.subplot(3, 2, 3)
    display.specshow(s_mfcc,
                     sr=sr,
                     x_axis='time',
                     y_axis='linear',
                     hop_length=hop_length)
    plt.colorbar(format='%+2.0f dB')
    plt.title('MFCC spectrogram')

    # # CQT (Constant-Q Transform)
    # # https://librosa.github.io/librosa/generated/librosa.core.cqt.html
    cqt = librosa.amplitude_to_db(librosa.magphase(librosa.cqt(y, sr=sr))[0],
                                  ref=np.max)
    # plt.subplot(3, 2, 3)
    # display.specshow(cqt, sr=sr, x_axis='time', y_axis='cqt_note', hop_length=hop_length)
    # plt.colorbar(format='%+2.0f dB')
    # plt.title('Constant-Q power spectrogram (note)')

    plt.subplot(3, 2, 4)
    display.specshow(cqt,
                     sr=sr,
                     x_axis='time',
                     y_axis='cqt_hz',
                     hop_length=hop_length)
    plt.colorbar(format='%+2.0f dB')
    plt.title('Constant-Q power spectrogram (Hz)')

    plt.subplot(3, 2, 5)
    display.specshow(db,
                     sr=sr,
                     x_axis='time',
                     y_axis='log',
                     hop_length=hop_length)
    plt.colorbar(format='%+2.0f dB')
    plt.title('Log power spectrogram')

    plt.subplot(3, 2, 6)
    display.specshow(s_mel, x_axis='time', y_axis='mel', hop_length=hop_length)
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel spectrogram')

    # TODO Import project used features (python_speech_features).
    # norm_features = 'none'
    # mfcc = load_sample(file_path, feature_type='mfcc', feature_normalization=norm_features)[0]
    # mfcc = np.swapaxes(mfcc, 0, 1)
    #
    # mel = load_sample(file_path, feature_type='mel', feature_normalization=norm_features)[0]
    # mel = np.swapaxes(mel, 0, 1)

    (__sr, __y) = wavfile.read(file_path)

    num_features = 26
    win_len = WIN_LENGTH
    win_step = WIN_STEP
    __mel = psf.logfbank(signal=__y,
                         samplerate=__sr,
                         winlen=win_len,
                         winstep=win_step,
                         nfilt=num_features,
                         nfft=n_fft,
                         lowfreq=f_min,
                         highfreq=f_max,
                         preemph=0.97)

    __mfcc = psf.mfcc(signal=__y,
                      samplerate=__sr,
                      winlen=win_len,
                      winstep=win_step,
                      numcep=num_features // 2,
                      nfilt=num_features,
                      nfft=n_fft,
                      lowfreq=f_min,
                      highfreq=f_max,
                      preemph=0.97,
                      ceplifter=22,
                      appendEnergy=False)

    __mfcc = __mfcc.astype(np.float32)
    __mel = __mel.astype(np.float32)
    __mfcc = np.swapaxes(__mfcc, 0, 1)
    __mel = np.swapaxes(__mel, 0, 1)

    plt.figure(figsize=(5.2, 1.6))
    display.waveplot(y, sr=sr)

    fig = plt.figure(figsize=(10, 4))
    plt.subplot(2, 1, 2)
    display.specshow(__mfcc,
                     sr=__sr,
                     x_axis='time',
                     y_axis='mel',
                     hop_length=win_step * __sr)
    # plt.set_cmap('magma')
    # plt.xticks(rotation=295)
    plt.xlabel('Time (s)')
    plt.ylabel('Frequency (Hz)')
    plt.xlim(xmin=0)
    plt.ylim(0, 8000)
    plt.colorbar(format='%+2.0f')
    plt.title('MFCC', visible=False)

    plt.subplot(2, 1, 1)
    display.specshow(__mel,
                     sr=__sr,
                     x_axis='time',
                     y_axis='mel',
                     hop_length=win_step * __sr)
    # plt.set_cmap('magma')
    # plt.xticks(rotation=295)
    plt.xlabel('Time (s)')
    plt.ylabel('Frequency (Hz)')
    plt.xlim(xmin=0)
    plt.ylim(0, 8000)
    plt.colorbar(format='%+2.0f', label='Power (dB)')
    plt.title('Mel Spectrogram', visible=False)

    plt.tight_layout()
    fig.savefig('/tmp/mel-mfcc-plot-we-did-it.pdf', bbox_inches='tight')
    plt.show()
Exemplo n.º 55
0
    def hpss_wav(y):
        H, P = librosa.decompose.hpss(librosa.stft(y))

        return librosa.istft(H), librosa.istft(P)
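A brief usage sketch, assuming hpss_wav is exposed as a static helper; soundfile and the file names are assumptions, not from the original:

import soundfile as sf

y, sr = librosa.load('mix.wav', sr=None)       # hypothetical input file
harmonic, percussive = hpss_wav(y)
sf.write('harmonic.wav', harmonic, sr)         # hypothetical output paths
sf.write('percussive.wav', percussive, sr)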
Exemplo n.º 56
0
def gen_audio_features(item, config):
    """Generate audio features and transformations
    Args:
        item (Dict): dictionary containing the attributes to encode.
        config (Dict): configuration dictionary.
    Returns:
        (bool): keep this sample or not.
        mel (ndarray): mel matrix in np.float32.
        energy (ndarray): energy audio profile.
        f0 (ndarray): fundamental frequency.
        item (Dict): dictionary containing the updated attributes.
    """
    # get info from sample.
    audio = item["audio"]
    utt_id = item["utt_id"]
    rate = item["rate"]

    # check audio properties
    assert len(audio.shape) == 1, f"{utt_id} seems to be a multi-channel signal."
    assert np.abs(audio).max() <= 1.0, \
        f"{utt_id} does not look like normalized 16-bit PCM (values outside [-1, 1])."

    # check sample rate
    if rate != config["sampling_rate"]:
        audio = librosa.resample(audio, rate, config["sampling_rate"])
        logging.info(
            f"{utt_id} sampling rate is {rate}, not {config['sampling_rate']}, we resample it."
        )

    # trim silence
    if config["trim_silence"]:
        if "trim_mfa" in config and config["trim_mfa"]:
            _, item["text_ids"], audio = ph_based_trim(
                config,
                utt_id,
                item["text_ids"],
                item["raw_text"],
                audio,
                config["hop_size"],
            )
            if len(audio) < 1:
                # Very short files can be trimmed away entirely when MFA did not
                # extract any tokens; for LibriTTS one may want only longer files.
                logging.warning(
                    f"{utt_id} contains only silence, or MFA did not extract any tokens."
                )
                return False, None, None, None, item
        else:
            audio, _ = librosa.effects.trim(
                audio,
                top_db=config["trim_threshold_in_db"],
                frame_length=config["trim_frame_size"],
                hop_length=config["trim_hop_size"],
            )

    # resample audio if necessary (note: this uses the original 'rate', which
    # assumes no resampling happened above)
    if "sampling_rate_for_feats" in config:
        audio = librosa.resample(audio, rate,
                                 config["sampling_rate_for_feats"])
        sampling_rate = config["sampling_rate_for_feats"]
        assert (
            config["hop_size"] * config["sampling_rate_for_feats"] % rate == 0
        ), "hop_size must stay an integer after resampling; check 'sampling_rate_for_feats'."
        hop_size = config["hop_size"] * config[
            "sampling_rate_for_feats"] // rate
    else:
        sampling_rate = config["sampling_rate"]
        hop_size = config["hop_size"]

    # get spectrogram
    D = librosa.stft(
        audio,
        n_fft=config["fft_size"],
        hop_length=hop_size,
        win_length=config["win_length"],
        window=config["window"],
        pad_mode="reflect",
    )
    S, _ = librosa.magphase(D)  # (#bins, #frames)

    # get mel basis
    fmin = 0 if config["fmin"] is None else config["fmin"]
    fmax = sampling_rate // 2 if config["fmax"] is None else config["fmax"]
    mel_basis = librosa.filters.mel(
        sr=sampling_rate,
        n_fft=config["fft_size"],
        n_mels=config["num_mels"],
        fmin=fmin,
        fmax=fmax,
    )
    mel = np.log10(np.maximum(np.dot(mel_basis, S),
                              1e-10)).T  # (#frames, #bins)
    mel_eos = np.zeros(shape=[1, np.shape(mel)[1]
                              ])  # (1, #bins)  # represent mel for eos_token.
    mel = np.concatenate([mel, mel_eos], axis=0)  # (#frames + 1, #bins)

    # check audio and feature length
    audio_eos = np.zeros(
        shape=[hop_size])  # (hop_size)  # represent audio for eos_token.
    audio = np.concatenate([audio, audio_eos], axis=-1)
    audio = np.pad(audio, (0, config["fft_size"]), mode="edge")
    audio = audio[:len(mel) * hop_size]
    assert len(mel) * hop_size == len(
        audio), f"{len(mel) * hop_size}, {len(audio)}"

    # extract raw pitch
    _f0, t = pw.dio(
        audio.astype(np.double),
        fs=sampling_rate,
        f0_ceil=fmax,
        frame_period=1000 * hop_size / sampling_rate,
    )
    f0 = pw.stonemask(audio.astype(np.double), _f0, t, sampling_rate)
    if len(f0) >= len(mel):
        f0 = f0[:len(mel)]
    else:
        f0 = np.pad(f0, (0, len(mel) - len(f0)))

    # extract energy
    energy = np.sqrt(np.sum(S**2, axis=0))
    energy = np.concatenate([energy, [0]],
                            axis=-1)  # represent energy for the eos token.
    assert len(mel) == len(f0) == len(
        energy), f"{len(mel)}, {len(f0)}, {len(energy)}"

    # apply global gain
    if config["global_gain_scale"] > 0.0:
        audio *= config["global_gain_scale"]
    if np.abs(audio).max() >= 1.0:
        logging.warning(
            f"{utt_id} causes clipping. It is better to reconsider the global gain scale value."
        )
    item["audio"] = audio
    item["mel"] = mel
    item["f0"] = remove_outlier(f0)
    item["energy"] = remove_outlier(energy)
    return True, mel, energy, f0, item
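A minimal configuration sketch for gen_audio_features; the keys are the ones the function reads, but every value is illustrative, not from the original:

config = {
    "sampling_rate": 22050,          # assumed target rate
    "fft_size": 1024,
    "hop_size": 256,
    "win_length": 1024,
    "window": "hann",
    "num_mels": 80,
    "fmin": 80,
    "fmax": 7600,
    "trim_silence": True,
    "trim_threshold_in_db": 60,
    "trim_frame_size": 2048,
    "trim_hop_size": 512,
    "global_gain_scale": 1.0,
}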
Exemplo n.º 57
0
t=np.linspace(0,N/fe,N);
s = 0.2*np.cos(2*np.pi*200*t) + 2*np.cos(2*np.pi*400*t);
tf=np.linspace(0,fe/N,N);
plt.subplot(1,2,1);
plt.plot(t[:200],s[:200]);
plt.title('280 Hz and 500 Hz, fe = 8000 Hz')
plt.subplot(1,2,2);
plt.plot(np.abs(np.fft.fft(s)));
plt.title('280 Hz and 500 Hz, fe = 8000 Hz')
"""
#x, fe = librosa.load('ressources/mesange-tete-noire.wav')
x, fe = librosa.load('ressources/PIANO.wav')
plt.figure(figsize=(14, 5))
librosa.display.waveplot(x, sr=fe)
plt.title('')
plt.show()
fe /= 2
n = len(x)
t = np.linspace(0, n / fe, n, endpoint=False)
s = 0.75 * np.cos(2 * np.pi * 440 * t)
plt.plot(t, x)
plt.plot(np.abs(np.fft.fft(s)))

S = np.abs(librosa.stft(s))
Sdb = librosa.amplitude_to_db(abs(S))
#librosa.display.specshow(Sdb, sr=fe, x_axis='time', y_axis='hz')

sd.play(x, fe)
status = sd.wait()
Exemplo n.º 58
0
def calculate_melsp(x, n_fft=1024, hop_length=128):
    stft = np.abs(librosa.stft(x, n_fft=n_fft, hop_length=hop_length))**2
    log_stft = librosa.power_to_db(stft)
    melsp = librosa.feature.melspectrogram(S=log_stft, n_mels=128)
    return melsp
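Note that calculate_melsp applies the mel filterbank to already dB-scaled values; the conventional ordering filters the power spectrogram first and converts to dB last. A hedged sketch of that ordering (sr is an assumption, since the original never passes it):

def calculate_melsp_conventional(x, sr=22050, n_fft=1024, hop_length=128):
    # Mel filterbank on the power spectrogram first, dB scaling last.
    stft = np.abs(librosa.stft(x, n_fft=n_fft, hop_length=hop_length)) ** 2
    melsp = librosa.feature.melspectrogram(S=stft, sr=sr, n_mels=128)
    return librosa.power_to_db(melsp)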
Exemplo n.º 59
0
def decompose_audio(y):
    harmonic, percussive = librosa.decompose.hpss(librosa.stft(y), margin=2)
    harmonic = librosa.istft(harmonic)
    percussive = librosa.istft(percussive)
    return harmonic, percussive
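margin=2 tightens the HPSS masks so only clearly harmonic or clearly percussive energy survives. A short usage sketch (input file assumed); note that istft without a length argument can differ from len(y) by up to a frame:

y, sr = librosa.load('loop.wav', sr=None)   # hypothetical input
harmonic, percussive = decompose_audio(y)
n = min(len(harmonic), len(percussive))     # align lengths before remixing
remix = harmonic[:n] + percussive[:n]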
Exemplo n.º 60
0
import ntpath

root = os.path.dirname(os.path.realpath(__file__))
path_name = r'test'
direc_name = os.path.join(root, path_name)
train_path = r'test/audio'
csv_file = os.path.join(direc_name, 'features_test.csv')
folders = os.listdir(path_name)

with open(csv_file, "w", newline='') as output:
    writer = csv.writer(output, delimiter=',')  # create the writer once, not per file
    audio_class_folder = os.path.join(root, train_path)
    files = os.listdir(audio_class_folder)
    for file in files:
        print(file)
        X, samp_rate = librosa.load(os.path.join(audio_class_folder, file))
        stft = np.array(np.abs(librosa.stft(X)))
        mfcc = np.array(
            np.mean(librosa.feature.mfcc(y=X, sr=samp_rate, n_mfcc=40).T,
                    axis=0))
        chroma = np.array(
            np.mean(librosa.feature.chroma_stft(S=stft, sr=samp_rate).T,
                    axis=0))
        contrast = np.array(
            np.mean(librosa.feature.spectral_contrast(S=stft, sr=samp_rate).T,
                    axis=0))
        features = np.append(mfcc, chroma)
        features = np.append(features, contrast)
        features_full = features.tolist()
        writer.writerow(features_full)
print('Yay')
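A minimal sketch for loading the features back, assuming the headerless layout written above (40 MFCC + 12 chroma + 7 spectral-contrast values per row, given librosa's defaults):

import pandas as pd

# Hypothetical reload of the headerless feature CSV written above.
feats = pd.read_csv(csv_file, header=None)
print(feats.shape)   # (n_files, 59) under the assumed layout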