Example #1
def decompose(y, n_components=8):
    # How about something more advanced?  Let's decompose a spectrogram with
    # NMF, and then resynthesize an individual component.
    D = librosa.stft(y)

    # Separate the magnitude and phase
    S, phase = librosa.magphase(D)

    # Decompose by nmf
    components, activations = librosa.decompose.decompose(S, n_components, sort=True)

    plt.figure(figsize=(12,4))

    plt.subplot(1,2,1)
    librosa.display.specshow(librosa.power_to_db(components**2.0, ref=np.max),
                             y_axis='log')
    plt.xlabel('Component')
    plt.ylabel('Frequency')
    plt.title('Components')

    plt.subplot(1,2,2)
    librosa.display.specshow(activations)
    plt.xlabel('Time')
    plt.ylabel('Component')
    plt.title('Activations')

    plt.tight_layout()
    plt.savefig('components_activations.png')

    print('components', components.shape)
    print('activations', activations.shape)
    return components, activations, phase
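
The comment at the top promises resynthesis of an individual component, but the function stops at returning the factors. A minimal follow-up sketch, assuming the same imports, where k indexes the component to render:

def resynthesize_component(components, activations, phase, k):
    # Outer product of one basis spectrum with its activation row
    # gives that component's magnitude spectrogram.
    S_k = np.outer(components[:, k], activations[k])
    # Reattach the original phase and invert with matching defaults.
    return librosa.istft(S_k * phase)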
Example #2
    def transform_audio(self, y):

        mag, phase = librosa.magphase(librosa.stft(y,
                                                   hop_length=self.hop_length,
                                                   n_fft=self.n_fft,
                                                   dtype=np.float32))
        return {'mag': mag.T, 'phase': np.angle(phase.T)}
Example #3
    def __test_consistency(frame_length, hop_length, center):
        y, sr = librosa.load(__EXAMPLE_FILE, sr=None)

        # Ensure audio is divisible into frame size.
        y = librosa.util.fix_length(y, y.size - y.size % frame_length)
        assert y.size % frame_length == 0

        # STFT magnitudes with a constant windowing function and no centering.
        S = librosa.magphase(librosa.stft(y,
                                          n_fft=frame_length,
                                          hop_length=hop_length,
                                          window=np.ones,
                                          center=center))[0]

        # Try both RMS methods.
        rms1 = librosa.feature.rms(S=S, frame_length=frame_length,
                                   hop_length=hop_length)
        rms2 = librosa.feature.rms(y=y, frame_length=frame_length,
                                   hop_length=hop_length, center=center)

        assert rms1.shape == rms2.shape
        # Normalize envelopes.
        rms1 /= rms1.max()
        rms2 /= rms2.max()

        # Ensure results are similar.
        np.testing.assert_allclose(rms1, rms2, rtol=5e-2)
Example #4
def parse_audio(path, audio_conf, windows, normalize=False):
    '''
    Input:
        path       : string, path of the audio file to load
        audio_conf : dict, audio parameters for computing the spectrogram
        windows    : dict, window types
    Output:
        spect      : FloatTensor, spectrum of each frame
    '''
    y = load_audio(path)
    n_fft = int(audio_conf['sample_rate']*audio_conf["window_size"])
    win_length = n_fft
    hop_length = int(audio_conf['sample_rate']*audio_conf['window_stride'])
    window = windows[audio_conf['window']]
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                        win_length=win_length, window=window)
    spect, phase = librosa.magphase(D) 
    
    spect = torch.FloatTensor(spect)
    spect = spect.log1p()
    
    if normalize:
        mean = spect.mean()
        std = spect.std()
        spect.add_(-mean)
        spect.div_(std)
    
    return spect.transpose(0,1)
Example #5
def parse_audio(path, audio_conf, windows, normalize=False):
    '''Compute the log-magnitude spectrogram of an audio file with librosa.
    Args:
        path(string)       : path of the audio file
        audio_conf(dict)   : parameters for computing the spectrogram
        windows(dict)      : window types
    Returns:
        spect(FloatTensor) : log-magnitude spectrogram (numFrames * nFeatures),
                             nFeatures = n_fft / 2 + 1
    '''
    y = load_audio(path)
    n_fft = int(audio_conf['sample_rate']*audio_conf["window_size"])
    win_length = n_fft
    hop_length = int(audio_conf['sample_rate']*audio_conf['window_stride'])
    window = windows[audio_conf['window']]
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                        win_length=win_length, window=window)
    spect, phase = librosa.magphase(D) 
    spect = torch.FloatTensor(spect)
    spect = spect.log1p()
    
    # normalize each utterance independently
    if normalize:
        mean = spect.mean()
        std = spect.std()
        spect.add_(-mean)
        spect.div_(std)  
    return spect.transpose(0,1)
Example #6
    def parse_audio(self, audio_path):
        if self.augment:
            y = load_randomly_augmented_audio(audio_path, self.sample_rate)
        else:
            y = load_audio(audio_path)
        if self.noiseInjector:
            add_noise = np.random.binomial(1, self.noise_prob)
            if add_noise:
                y = self.noiseInjector.inject_noise(y)
        n_fft = int(self.sample_rate * self.window_size)
        win_length = n_fft
        hop_length = int(self.sample_rate * self.window_stride)
        # STFT
        D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                         win_length=win_length, window=self.window)
        spect, phase = librosa.magphase(D)
        # S = log(S+1)
        spect = np.log1p(spect)
        spect = torch.FloatTensor(spect)
        if self.normalize:
            mean = spect.mean()
            std = spect.std()
            spect.add_(-mean)
            spect.div_(std)

        return spect
Example #7
def parse_audio(path, audio_conf, windows, normalize=True):
    '''
    Input:
        path       : string, path of the audio file to load
        audio_conf : dict, audio parameters for computing the spectrogram
        windows    : dict, window types
    Output:
        spect      : ndarray, spectrum of each frame
    '''
    y = load_audio(path)
    n_fft = int(audio_conf['sample_rate']*audio_conf["window_size"])
    win_length = n_fft
    hop_length = int(audio_conf['sample_rate']*audio_conf['window_stride'])
    window = windows[audio_conf['window']]
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                        win_length=win_length, window=window)
    spect, phase = librosa.magphase(D)

    spect = np.log1p(spect)
    
    if normalize:
        mean = spect.mean()
        std = spect.std()
        spect = np.add(spect, -mean)
        spect = np.divide(spect, std)
    
    return spect.transpose()
Example #8
def Magnitude_phase(spectrogram):
    Magnitude_list = []
    Phase_list = []
    for X in spectrogram:
        mag, phase = librosa.magphase(X)
        Magnitude_list.append(mag)
        Phase_list.append(phase)
    return Magnitude_list, Phase_list
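
A usage sketch, assuming the input is an iterable of complex STFT matrices (the file names are placeholders):

specs = [librosa.stft(librosa.load(p)[0]) for p in ['a.wav', 'b.wav']]
mags, phases = Magnitude_phase(specs)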
Example #9
def test_magphase():

    (y, sr) = librosa.load(os.path.join('data', 'test1_22050.wav'))

    D = librosa.stft(y)

    S, P = librosa.magphase(D)

    assert np.allclose(S * P, D)
Example #10
def test_magphase():

    (y, sr) = librosa.load('data/test1_22050.wav')

    D = librosa.stft(y)

    S, P = librosa.magphase(D)

    assert np.allclose(S * P, D)
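
Both tests rest on the defining identity of magphase: S is np.abs(D) and P is the unit phasor np.exp(1j * np.angle(D)), so their product recovers D exactly. The same identity gives a round trip back to audio; a short sketch with the same test file:

y, sr = librosa.load('data/test1_22050.wav')
S, P = librosa.magphase(librosa.stft(y))
# Any processing applied to S alone can be rendered back to audio
# by reattaching P before the inverse transform.
y_hat = librosa.istft(S * P, length=len(y))  # matches y up to float error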
Example #11
def mix_by_spectogram(src_path, tgt_paths, n_fft = 4096, hop_length = 1024):
  #STFT from source
  src_signal, sr = librosa.load(src_path)
  src_stft = librosa.stft(src_signal, n_fft, hop_length)
  src_mag, src_phase = librosa.magphase(src_stft)
  src_spectra = Spectra(src_mag, src_phase, sr)
  targets = {}


  #STFT from paths
  for path in tgt_paths:
    signal, sr = librosa.load(path)
    D_tgt = librosa.stft(signal, n_fft, hop_length)
    tgt_mag, tgt_phase = librosa.magphase(D_tgt)
    tgt_spectra = Spectra(tgt_mag, tgt_phase, sr)
    targets[path] = tgt_spectra    
  length = len(src_stft[0])
  
  #Compute distances
  for i in range(len(src_spectra.magnitude)):
    print(i)
    distance = None
    closest = src_spectra.magnitude*0
    for target in targets.values():
      try:
        cap = min(len(target.magnitude[i]), len(src_spectra.magnitude[i]))
        new_dist = norm(target.magnitude[i][:cap] - src_mag[i][:cap])
        if distance is None or new_dist < distance:
          distance = new_dist
          closest = target
      except IndexError:
        print('IDX Error')
    cap = min(len(closest.magnitude[i]) , len(src_mag[i]))   
    #Add magnitudes and phases
    src_spectra.magnitude[i][:cap] += closest.magnitude[i][:cap]  
    src_spectra.phase[i][:cap] += closest.phase[i][:cap]
  #Average magnitudes and phases
  src_spectra.magnitude *= 0.5
  src_spectra.phase *= 0.5  
    
  signal = librosa.istft(src_spectra.magnitude * src_spectra.phase)
  librosa.output.write_wav(src_path[:-4]+"-mix.wav", signal, 2*sr)
Example #12
    def transform_audio(self, y):

        cqt, phase = librosa.magphase(librosa.cqt(y=y,
                                                  sr=self.sr,
                                                  hop_length=self.hop_length,
                                                  fmin=self.fmin,
                                                  n_bins=self.n_octaves *
                                                         self.over_sample * 12,
                                                  bins_per_octave=self.over_sample * 12,
                                                  real=False))

        return {'mag': cqt.T.astype(np.float32),
                'phase': np.angle(phase).T.astype(np.float32)}
Example #13
def spectrogram2wav(mag, n_fft, win_length, hop_length, num_iters, phase_angle=None, length=None):
    assert(num_iters > 0)
    if phase_angle is None:
        phase_angle = np.pi * np.random.rand(*mag.shape)
    spec = mag * np.exp(1.j * phase_angle)
    for i in range(num_iters):
        wav = librosa.istft(spec, win_length=win_length, hop_length=hop_length, length=length)
        if i != num_iters - 1:
            spec = librosa.stft(wav, n_fft=n_fft, win_length=win_length, hop_length=hop_length)
            _, phase = librosa.magphase(spec)
            phase_angle = np.angle(phase)
            spec = mag * np.exp(1.j * phase_angle)
    return wav
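
A usage sketch for the routine above; y is assumed to be an already loaded mono signal, and the parameter values are illustrative:

n_fft, hop = 1024, 256
mag = np.abs(librosa.stft(y, n_fft=n_fft, win_length=n_fft, hop_length=hop))
y_rec = spectrogram2wav(mag, n_fft=n_fft, win_length=n_fft,
                        hop_length=hop, num_iters=50, length=len(y))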
Example #14
 def _get_features(self, audio_data):
     _fs = 16000  # sampling rate
     hop_length = 160
     win_length = 400
     #utt, sr = librosa.load(audio_path,sr=None)
     audio_data = audio_data / np.max(audio_data)
     utt = self.pre_emp(audio_data)
     linear_spect = self.lin_spectogram_from_wav(utt,
                                                 hop_length,
                                                 win_length,
                                                 n_fft=512)
     mag, _ = librosa.magphase(linear_spect)  # magnitude
     spec = mag.T
     logspec = np.log(spec + 1e-8).T
     return logspec
Example #15
def wav_to_spectrogram_clips(wav_file):
    """convert audio into spectorgram, then chop it into 2d-segmentation of 100 frames"""
    # convert audio into spectorgram
    sound, sr = librosa.load(wav_file, sr=SR, mono=True)
    stft = librosa.stft(sound,
                        n_fft=N_FFT,
                        hop_length=HOP_LEN,
                        win_length=WIN_LEN)
    mag, phase = librosa.magphase(stft)
    # chop magnitude of spectrogram into clips, each has 1025 bins, 100 frames
    stft_clips = np.empty((0, FREQ_BINS, 100))
    for i in range(mag.shape[1] // 100):
        stft_clips = np.concatenate((stft_clips, mag[np.newaxis, :,
                                                     i * 100:(i + 1) * 100]))
    return stft_clips
Example #16
def make_feature(y, sr):
    if FEATURE == 'fft':
        S = librosa.stft(y, n_fft=N_FFT, hop_length=HOP_LEN, window=hamming)
        feature, _ = librosa.magphase(S)
        feature = np.log1p(feature)
        feature = feature.transpose()
    else:
        if FEATURE == 'logfbank':
            feature = logfbank(y, sr, winlen=WIN_LEN, winstep=WIN_STEP)
        else:
            feature = mfcc(y, sr, winlen=WIN_LEN, winstep=WIN_STEP)
        feature_d1 = delta(feature, N=1)
        feature_d2 = delta(feature, N=2)
        feature = np.hstack([feature, feature_d1, feature_d2])
    return normalize(feature)
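Example #17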
def load_custom_data(args):
    datalist = os.listdir(args.custompath)
    input_data = list()
    for dataname in datalist:
        speech_data, _ = librosa.load(args.custompath + dataname, sr=16000)
        speech_data = speech_data / np.max(speech_data)
        mel_data = get_mel_feature(speech_data, args)
        mel_data, _ = librosa.magphase(mel_data)

        text = os.path.splitext(dataname)[0]
        filtered_data = sentence_filter(text)
        jamo = split_syllables(filtered_data)
        label = np.array(jamo_to_label(jamo))
        input_data.append([mel_data, filtered_data, label])
    return input_data
Example #18
    def transform_audio(self, y):

        cqt, phase = librosa.magphase(
            librosa.cqt(y=y,
                        sr=self.sr,
                        hop_length=self.hop_length,
                        fmin=self.fmin,
                        n_bins=self.n_octaves * self.over_sample * 12,
                        bins_per_octave=self.over_sample * 12,
                        real=False))

        return {
            'mag': cqt.T.astype(np.float32),
            'phase': np.angle(phase).T.astype(np.float32)
        }
Example #19
    def __inverse_transform(self, data, n_iter):
        data = self.scaler.inverse_transform(data)
        data = data.T
        complex_specgram = self.inv_magphase(data, 0.0)

        for i in range(n_iter):
            audio = librosa.core.istft(complex_specgram, win_length=self.n_fft)

            if i != n_iter - 1:
                complex_specgram = librosa.core.stft(audio, n_fft=self.n_fft)
                _, phase = librosa.magphase(complex_specgram)
                phase_angle = np.angle(phase)
                complex_specgram = self.inv_magphase(data, phase_angle)

        return audio
Example #20
def spectralCent(song):
    y, sr = librosa.load("C:\Users\Katherine\Music\\" + song + ".mp3",
                         duration=60)
    cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    S, phase = librosa.magphase(librosa.stft(y=y))
    librosa.feature.spectral_centroid(S=S)
    if_gram, D = librosa.ifgram(y)
    librosa.feature.spectral_centroid(S=np.abs(D), freq=if_gram)
    plt.figure()
    plt.subplot(2, 1, 1)
    plt.semilogy(cent.T, label=song)
    plt.ylabel('Hz')
    plt.xticks([])
    plt.xlim([0, cent.shape[-1]])
    plt.legend()
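Example #21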
    def __getitem__(self, item):
        noisy_path, clean_path = self.dataset_list[item].split(" ")
        name = os.path.splitext(os.path.basename(noisy_path))[0]
        noisy, _ = librosa.load(os.path.abspath(
            os.path.expanduser(noisy_path)),
                                sr=self.sr)
        clean, _ = librosa.load(os.path.abspath(
            os.path.expanduser(clean_path)),
                                sr=self.sr)

        if self.train:
            noisy_mag, _ = librosa.magphase(
                librosa.stft(noisy,
                             n_fft=self.n_fft,
                             hop_length=self.hop_length,
                             win_length=self.n_fft))
            clean_mag, _ = librosa.magphase(
                librosa.stft(clean,
                             n_fft=self.n_fft,
                             hop_length=self.hop_length,
                             win_length=self.n_fft))
            return noisy_mag, clean_mag, noisy_mag.shape[-1], name
        else:
            return noisy, clean, name
Example #22
def griffin_lim(magnitude, n_fft, hop_length, n_iterations):
    """Iterative algorithm for phase retrival from a magnitude spectrogram."""
    phase_angle = np.pi * np.random.rand(*magnitude.shape)
    D = invert_magnitude_phase(magnitude, phase_angle)
    signal = librosa.istft(D, hop_length=hop_length)

    for i in range(n_iterations):
        D = librosa.stft(signal, n_fft=n_fft, hop_length=hop_length)
        _, phase = librosa.magphase(D)
        phase_angle = np.angle(phase)

        D = invert_magnitude_phase(magnitude, phase_angle)
        signal = librosa.istft(D, hop_length=hop_length)

    return signal
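
invert_magnitude_phase is not shown in this example; a plausible one-line definition, consistent with its use above and with inv_magphase in other examples, recombines the magnitude with a unit phasor:

def invert_magnitude_phase(magnitude, phase_angle):
    # Complex spectrogram from magnitude and phase angle, ready for istft.
    return magnitude * np.exp(1j * phase_angle)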
Example #23
 def griffin_lim(self, magnitude, iters=30):
     '''
     based on:
     https://github.com/soobinseo/Tacotron-pytorch/blob/master/data.py
     in turn based on:
     librosa implementation of Griffin-Lim
     Based on https://github.com/librosa/librosa/issues/434
     '''
     angles = np.exp(2j * np.pi * np.random.rand(*magnitude.shape))
      S_complex = np.abs(magnitude).astype(np.complex128)  # np.complex was removed from NumPy
     y = librosa.istft(S_complex * angles)
     for i in range(iters):
         _, angles = librosa.magphase(librosa.stft(y))
         y = librosa.istft(S_complex * angles)
     return y
Example #24
def mix_strategies(src, tgt):
  src_signal, sr = librosa.load(src)
  src_stft = librosa.stft(src_signal, n_fft, hop_length)
  src_mag, src_phase = librosa.magphase(src_stft)
  src_spectra = Spectra(src_mag, src_phase, sr)

  tgt_signal, sr = librosa.load(tgt)
  tgt_stft = librosa.stft(tgt_signal, n_fft, hop_length)
  tgt_mag, tgt_phase = librosa.magphase(tgt_stft)
  tgt_spectra = Spectra(tgt_mag, tgt_phase, sr)

  for i in range(len(src_spectra.magnitude)):
    #Average of magnitude and Phase
    cap = min(len(src_spectra.magnitude[i]), len(tgt_spectra.magnitude[i]))
    src_spectra.magnitude[i][:cap] += tgt_spectra.magnitude[i][:cap]  
    src_spectra.phase[i][:cap] += tgt_spectra.phase[i][:cap]

  src_spectra.magnitude *= 0.5
  src_spectra.phase *= 0.5
  new_spectra = src_spectra  
  new_signal = librosa.istft(new_spectra.magnitude * new_spectra.phase)
  librosa.output.write_wav(src[:-4]+"-mix.wav", new_signal, 2*new_spectra.sr)

  """
Exemplo n.º 25
0
 def decompose(self):
     # filter out percussive parts
     hpss_y = self.hpss()
     #Perform Short-time Fourier transform
     D = librosa.stft(hpss_y)
     # Separate the magnitude and phase
     S, phase = librosa.magphase(D)
     #NMF decompose to components
     components, activations = self.decomposeNMF(hpss_y, S,
                                                 self.n_components)
     #reconstruct and return
     return [
         self.reconstructComponent(components[:, i], activations[i], phase)
         for i in range(0, len(activations))
     ]
Example #26
def add_noise(audio_path, noise_path, percent=0.5, sr=16000):
    src, sr = librosa.load(audio_path, sr=sr)
    src_noise, sr = librosa.load(noise_path, sr=sr)
    #print(len(src), len(src_noise))
    if len(src) > len(src_noise):
        n = int(len(src) / len(src_noise))
        src_noise = np.tile(src_noise, n + 1)  # tile end-to-end; repeat() would stretch each sample
    flag = random.randint(0, len(src_noise) - len(src))
    src_noise = src_noise[flag:flag + len(src)]
    percent = 0.002 * random.randint(1, 5)  # overrides the percent argument with a random noise level
    src = src + percent * src_noise
    S = librosa.core.stft(src, n_fft=N_FFT, hop_length=HOP_LEN,
                          window=hamming)
    feature, _ = librosa.magphase(S)
    return feature
Example #27
def load_data(path, win_length=400, sr=16000, hop_length=160, n_fft=512, spec_len=250, mode='train'):
    wav = load_wav(path, sr=sr, mode=mode)
    linear_spect = lin_spectogram_from_wav(wav, hop_length, win_length, n_fft)
    mag, _ = librosa.magphase(linear_spect)  # magnitude
    mag_T = mag.T
    freq, time = mag_T.shape
    if mode == 'train':
        randtime = np.random.randint(0, time - spec_len)
        spec_mag = mag_T[:, randtime:randtime + spec_len]
    else:
        spec_mag = mag_T
    # preprocessing, subtract mean, divided by time-wise var
    mu = np.mean(spec_mag, 0, keepdims=True)
    std = np.std(spec_mag, 0, keepdims=True)
    return (spec_mag - mu) / (std + 1e-5)
Example #28
 def load_vocal_audio(self, y, sr):
     S_full, phase = librosa.magphase(librosa.stft(y))
     S_filter = librosa.decompose.nn_filter(S_full,
                                            aggregate=np.median,
                                            metric='cosine',
                                            width=int(librosa.time_to_frames(2, sr=sr)))
     S_filter = np.minimum(S_full, S_filter)
     margin_i, margin_v = 2, 10
     power = 2
     mask_v = librosa.util.softmask(S_full - S_filter,
                                    margin_v * S_filter,
                                    power=power)
     S_foreground = mask_v * S_full
     output_data = librosa.griffinlim(S_foreground)
     return output_data, sr
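
A usage sketch, assuming separator is an instance of the enclosing class and that soundfile is available for writing; the paths are placeholders:

import soundfile as sf

y, sr = librosa.load('mixture.wav', sr=None)
vocals, sr = separator.load_vocal_audio(y, sr)
sf.write('vocals.wav', vocals, sr)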
Example #29
def get_spectrum(track_name, sr, N, M, H):
    W  = np.hanning(M) # Window Type
    ## Load WAV File
    track, sr = load(track_name+'.wav', sr=sr, mono=False)  # mono must be a bool, not the string 'False'
    ## Perform Short Term Fourier Transform
    stft_ = stft(y = track, n_fft = N,win_length=M, hop_length=H, window = 'hann')
    ## Magnitudes (excluding phase)
    magnitude, _ = magphase(stft_)
    magnitude = magnitude / np.sum(W) #normalising STFT output
    ## Spectrum Average
    spec_avg = np.average(magnitude,axis=1) 
    spec_avg = spec_avg/np.max(spec_avg)
    len_signal = spec_avg.shape[0] # filter bank length

    return spec_avg, len_signal
Example #30
def test_rms(y_ex, y2, frame_length, hop_length, center):
    y1, sr = y_ex
    # Ensure audio is divisible into frame size.
    y1 = librosa.util.fix_length(y1, y1.size - y1.size % frame_length)
    y2 = librosa.util.fix_length(y2, y2.size - y2.size % frame_length)
    assert y1.size % frame_length == 0
    assert y2.size % frame_length == 0

    # STFT magnitudes with a constant windowing function and no centering.
    S1 = librosa.magphase(
        librosa.stft(
            y1, n_fft=frame_length, hop_length=hop_length, window=np.ones, center=center
        )
    )[0]
    S2 = librosa.magphase(
        librosa.stft(
            y2, n_fft=frame_length, hop_length=hop_length, window=np.ones, center=center
        )
    )[0]

    # Try both RMS methods.
    rms1 = librosa.feature.rms(S=S1, frame_length=frame_length, hop_length=hop_length)
    rms2 = librosa.feature.rms(
        y=y1, frame_length=frame_length, hop_length=hop_length, center=center
    )
    rms3 = librosa.feature.rms(S=S2, frame_length=frame_length, hop_length=hop_length)
    rms4 = librosa.feature.rms(
        y=y2, frame_length=frame_length, hop_length=hop_length, center=center
    )

    assert rms1.shape == rms2.shape
    assert rms3.shape == rms4.shape

    # Ensure results are similar.
    np.testing.assert_allclose(rms1, rms2, atol=5e-4)
    np.testing.assert_allclose(rms3, rms4, atol=5e-4)
Example #31
def spectrogram(wav, normalize=True):
    D = librosa.stft(wav,
                     n_fft=n_fft,
                     hop_length=hop_length,
                     win_length=win_length,
                     window=window)

    spec, phase = librosa.magphase(D)
    spec = np.log1p(spec)
    spec = torch.FloatTensor(spec)

    if normalize:
        spec = (spec - spec.mean()) / spec.std()

    return spec
Example #32
def load_data(path_spk_tuples, win_length=400, sr=16000, hop_length=160, n_fft=512, min_win_time=240, max_win_time=1600):
    win_time = np.random.randint(min_win_time, max_win_time, 1)[
        0]  # win_length in [240,1600] ms
    win_spec = win_time//(1000//(sr//hop_length))  # win_length in spectrum
    hop_spec = win_spec//2

    wavs = np.array([])
    change_points = []
    paths = list(zip(*path_spk_tuples))[0]
    speakers = list(zip(*path_spk_tuples))[1]

    for path in paths:
        wav = load_wav(path, sr=sr)  # VAD
        wavs = np.concatenate((wavs, wav))
        # change_point in spectrum
        change_points.append(wavs.shape[0]//hop_length)

    linear_spect = lin_spectogram_from_wav(wavs, hop_length, win_length, n_fft)
    mag, _ = librosa.magphase(linear_spect)  # magnitude
    mag_T = mag.T
    freq, time = mag_T.shape
    spec_mag = mag_T

    utterance_specs = []
    utterance_speakers = []

    cur_spec = 0
    cur_speaker = speakers[0]
    i = 0
    while True:
        if cur_spec + win_spec > time:
            break
        spec_mag = mag_T[:, cur_spec:cur_spec+win_spec]

        # current window spans into the next speaker
        if cur_spec + win_spec//2 > change_points[i]:
            i += 1
            cur_speaker = speakers[i]

        # preprocessing, subtract mean, divided by time-wise var
        mu = np.mean(spec_mag, 0, keepdims=True)
        std = np.std(spec_mag, 0, keepdims=True)
        spec_mag = (spec_mag - mu) / (std + 1e-5)
        utterance_specs.append(spec_mag)
        utterance_speakers.append(cur_speaker)
        cur_spec += hop_spec

    return utterance_specs, utterance_speakers
Example #33
def calculate_SDR(music, model, n_fft=2048, hop_length=512, slice_duration=2):
    model.eval()
    scores = []
    sr = music.rate
    ind = 0
    mixture = librosa.to_mono(music.audio.transpose())
    vocal = librosa.to_mono(music.targets['vocals'].audio.transpose())
    for i in range(0, len(music.audio), slice_duration * sr):
        ind += 1
        # take the i-th segment from the full signals; the full mixture and
        # vocal arrays must stay intact for later slices
        mixture_seg = mixture[i:i + slice_duration * sr]
        vocal_seg = vocal[i:i + slice_duration * sr]

        if np.all(vocal_seg == 0):
            # print('[!] -  all 0s, skipping')
            continue

        if i + 2 * sr >= len(music.audio):
            break
        resampled_mixture = mixture_seg
        mixture_stft = librosa.stft(resampled_mixture,
                                    n_fft=n_fft,
                                    hop_length=512,
                                    window='hann',
                                    center=True)
        magnitude_mixture_stft, mixture_phase = librosa.magphase(mixture_stft)
        normalized_magnitude_mixture_stft = torch.Tensor(Normalize().forward(
            [magnitude_mixture_stft])[0])

        sr_v = music.rate
        with torch.no_grad():
            mask = model.forward(
                normalized_magnitude_mixture_stft.unsqueeze(0)).squeeze(0)
            out = mask * torch.Tensor(normalized_magnitude_mixture_stft)
        predicted_vocal_stft = out.numpy() * mixture_phase
        predicted_vocal_audio = librosa.istft(predicted_vocal_stft.squeeze(0),
                                              win_length=n_fft,
                                              hop_length=hop_length,
                                              window='hann',
                                              center=True)
        try:
            scores.append(
                mir_eval.separation.bss_eval_sources(
                    vocal_seg[:predicted_vocal_audio.shape[0]],
                    predicted_vocal_audio)[0])
        except ValueError:
            print(np.all(vocal_seg == 0))
            print(np.all(predicted_vocal_stft == 0))
            print('Error but skipping')
    return scores  # per-segment SDR values
Example #34
def spect_loader(path,
                 window_size,
                 window_stride,
                 window,
                 normalize,
                 max_len=101,
                 augment=False,
                 allow_speedandpitch=False,
                 allow_pitch=False,
                 allow_speed=False,
                 allow_dyn=False,
                 allow_noise=False,
                 allow_timeshift=False):
    y, sr = librosa.load(path, sr=None)
    n_fft = int(sr * window_size)
    win_length = n_fft
    hop_length = int(sr * window_stride)

    # STFT
    D = librosa.stft(y,
                     n_fft=n_fft,
                     hop_length=hop_length,
                     win_length=win_length,
                     window=window)
    spect, phase = librosa.magphase(D)

    # S = log(S+1)
    spect = np.log1p(spect)

    # make all spects with the same dims
    # TODO: change that in the future
    if spect.shape[1] < max_len:
        pad = np.zeros((spect.shape[0], max_len - spect.shape[1]))
        spect = np.hstack((spect, pad))
    elif spect.shape[1] > max_len:
        spect = spect[:, :max_len]  # truncate along the time axis to match the padding above
    spect = np.resize(spect, (1, spect.shape[0], spect.shape[1]))
    #spect = torch.FloatTensor(spect)

    # z-score normalization
    if normalize:
        mean = np.mean(np.ravel(spect))
        std = np.std(np.ravel(spect))
        if std != 0:
            spect = spect - mean
            spect = spect / std

    return spect
Example #35
def my_filter(y, sr):
    Y = librosa.stft(y, n_fft=4096, hop_length=512)
    Y_dB = librosa.amplitude_to_db(Y, ref=np.max)

    var_trust = var_trust_func(Y_dB)
    contrast = contrast_trust_func(np.abs(Y), sr)

    mask = np.multiply(contrast, var_trust)
    mask = (mask - np.min(mask)) / (np.max(mask) - np.min(mask))

    mag, phase = librosa.magphase(Y)
    newmag = np.multiply(mag, mask)
    # magphase returns the unit phasor exp(1j*angle), so recombining
    # is a direct multiplication rather than another exponentiation
    Y_rec = np.multiply(newmag, phase)
    y_rec = librosa.istft(Y_rec, hop_length=512)

    return y_rec, Y_rec
Example #36
 def parse_audio(self,audio_path):
     y = load_audio(audio_path)
     n_fft = int(self.sample_rate * self.window_size)
     win_length = n_fft
     hop_length = int(self.sample_rate * self.window_stride)
     
     D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=self.window)
     
     spect, phase = librosa.magphase(D)
     
     spect = np.log1p(spect)
     mean = spect.mean()
     std = spect.std()
     spect = np.add(spect,-mean)
     spect = spect / std
     return spect
Example #37
def mask_from_timeseries(sr, x):
    x = x.astype('float16')
    S, ph = librosa.magphase(librosa.stft(x))
    # I'm not sure what the value of "time" should be; 0.1 works well for segment lengths of 0.5 seconds.
    time = 0.1
    S_filter = librosa.decompose.nn_filter(S,
                                           aggregate=np.median,
                                           metric='cosine',
                                           width=int(librosa.time_to_frames(time, sr=sr)))
    S_filter = np.minimum(S, S_filter)
    margin = 5
    power = 2
    mask = librosa.util.softmask(S - S_filter,
                                 margin * S_filter,
                                 power=power)
    return mask
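
The returned soft mask is meant to scale the magnitude before resynthesis; one way to apply it, recomputing the STFT the same way the function does (sr is illustrative):

sr = 22050
S, ph = librosa.magphase(librosa.stft(x.astype('float16')))
mask = mask_from_timeseries(sr, x)
# Foreground estimate: masked magnitude recombined with the original phase.
y_foreground = librosa.istft(S * mask * ph)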
Example #38
def make_feature(y, sr):  # extract features; y is the speech data, sr is the sample rate
    if FEATURE == 'fft':  # FFT features
        S = librosa.stft(y, n_fft=N_FFT, hop_length=HOP_LEN,
                         window=hamming)  # short-time Fourier transform; parameters are defined at the top
        feature, _ = librosa.magphase(S)
        feature = np.log1p(feature)  # log1p compression
        feature = feature.transpose()
    else:
        if FEATURE == 'logfbank':  # log filterbank features
            feature = logfbank(y, sr, winlen=WIN_LEN, winstep=WIN_STEP)
        else:
            feature = mfcc(y, sr, winlen=WIN_LEN, winstep=WIN_STEP)  # MFCC features
        feature_d1 = delta(feature, N=1)  # append two deltas, tripling the feature dimension
        feature_d2 = delta(feature, N=2)
        feature = np.hstack([feature, feature_d1, feature_d2])  # stack horizontally
    return normalize(feature)  # return the normalized features
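Example #39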
def plot_spectral_centroid(number):
    example_mp3, sr, song_name = load_music.load_song(number)

    cent = librosa.feature.spectral_centroid(example_mp3, sr)
    S, phase = librosa.magphase(librosa.stft(y=example_mp3))

    fig, ax = plt.subplots()
    times = librosa.times_like(cent)
    librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max),
                             y_axis='log',
                             x_axis='time',
                             ax=ax)
    ax.plot(times, cent.T, label='Spectral centroid', color='w')
    ax.legend(loc='upper right')
    fig.suptitle('log power spectrogram of' + ' ' + song_name, fontsize=8)
    plt.show()
Example #40
def plt_stft(data, title):
    titles = [
        'Sample', '440Hz Beep', 'No Sound', 'English Male', 'English Female',
        'Japanese Female', 'Japanese Male', 'OK, Google']
    plt.figure(figsize = (16, 9))
    plt.suptitle(title)
    for i in range(0,len(titles)):
        D = data[i]
        plt.subplot(2, 4, i + 1)
        librosa.display.specshow(
            librosa.amplitude_to_db(librosa.magphase(D)[0]),
            y_axis='log', x_axis='time')
        plt.title(titles[i])
        plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.savefig('./fig/fig_stft/' + title + '.png')
Example #41
 def filter_background(self, margin_v: int = 5, power: int = 2):
     stft = librosa.stft(self.waveform)
     S_full, phase = librosa.magphase(stft)
     S_filter = librosa.decompose.nn_filter(
         S_full,
         aggregate=np.median,
         metric='cosine',
         width=int(librosa.time_to_frames(2, sr=self.sample_rate)))
     S_filter = np.minimum(S_full, S_filter)
     mask_v = librosa.util.softmask(S_full - S_filter,
                                    margin_v * S_filter,
                                    power=power)
     S_foreground = mask_v * S_full
     y_foreground = librosa.istft(S_foreground * phase)
     self.waveform = y_foreground
     return self
Example #42
def griffin_lim(magnitudes, n_iters=50, n_fft=1024):
    """
    Griffin-Lim algorithm to convert magnitude spectrograms to audio signals
    """
    phase = np.exp(2j * np.pi * np.random.rand(*magnitudes.shape))
    complex_spec = magnitudes * phase
    signal = librosa.istft(complex_spec)
    if not np.isfinite(signal).all():
        logging.warning("audio was not finite, skipping audio saving")
        return np.array([0])

    for _ in range(n_iters):
        _, phase = librosa.magphase(librosa.stft(signal, n_fft=n_fft))
        complex_spec = magnitudes * phase
        signal = librosa.istft(complex_spec)
    return signal
Example #43
 def parse_audio(self, audio_path):
     """
     parse audio
     """
     audio = self.load_audio(audio_path)
     n_fft = int(self.sample_rate * self.window_size)
     win_length = n_fft
     hop_length = int(self.sample_rate * self.window_stride)
     D = librosa.stft(y=audio, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=self.window)
     mag, _ = librosa.magphase(D)
     mag = np.log1p(mag)
     if self.is_normalization:
         mean = mag.mean()
         std = mag.std()
         mag = (mag - mean) / std
     return mag
Example #44
def reconstruct(file, spec1):
    y, sr_ = librosa.load(file)
    D = librosa.core.stft(y, n_fft=512)
    mag, phase = librosa.magphase(D)

    spec1 = np.transpose(spec1)

    # small epsilon avoids division by zero in silent bins
    mask = np.abs(spec1) / (np.abs(D) + 1e-8)

    rec_a = D * mask
    rec_b = D * (1 - mask)

    o_a = librosa.core.istft(rec_a)
    o_b = librosa.core.istft(rec_b)

    return o_a, o_b
Example #45
def decompose(filename, offset=0, duration=30, voice=True):
    '''Decompose a song into its pieces

    :parameters:
    - filename : str
        path to the audio
    - offset : float
        initial offset for loading audio
    - duration : float
        maximum amount of audio to load

    :returns:
    - D : np.array, dtype=complex
        STFT of the full signal
    - D_1, D_2 : np.array, dtype=complex
        STFTs of the two separated layers: voice/background when voice=True,
        harmonic/percussive otherwise
    '''
    y, sr = librosa.load(filename, sr=SR, offset=offset, duration=duration)
    
    # Step 1: compute STFT
    D = librosa.stft(y, n_fft=N_FFT, hop_length=HOP_LENGTH).astype(np.complex64)
    
    # Step 2: separate magnitude and phase
    S, P = librosa.magphase(D)
    S    = S / S.max()
    
    if voice:
        tau = (D.shape[0] * 3) // 4  # integer division: tau is used as a slice index
    
        # Step 3: RPCA to separate voice and background
        S1, S2, _ = rpca.robust_pca(S[:tau,:], max_iter=25)
        S1, S2    = rpca_correct(S[:tau,:], S1, S2)
    
        S1 = np.vstack((S1, S[tau:,:]))
        S2 = np.vstack((S2, S[tau:,:]))
    else:
        S1, S2 = librosa.hpss.hpss_median(S, win_H=WIN_HPSS, win_P=WIN_HPSS, p=1.0)
    
    # Step 4: recombine with phase
    return D, S1 * P, S2 * P
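
The returned complex STFTs can be rendered straight back to audio; a short sketch reusing the module's constants ('song.mp3' is a placeholder):

D, D_1, D_2 = decompose('song.mp3', voice=True)
y_1 = librosa.istft(D_1, hop_length=HOP_LENGTH)  # first separated layer
y_2 = librosa.istft(D_2, hop_length=HOP_LENGTH)  # second separated layer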
Example #46
def reconstruct(y, a_W, a_H, b_W, b_H):
    a = np.dot(a_W, a_H)
    b = np.dot(b_W, b_H)
    D = librosa.core.stft(y, n_fft=NMF.d_w, hop_length=NMF.d_h)
    mag, phase = librosa.magphase(D)
    
    rec_a = a * phase
    # soft mask from the two NMF reconstructions; nan_to_num guards 0/0 bins
    mask_b = np.nan_to_num(b / (a + b))
    rec_b = b * mask_b * phase
    

    o_a = librosa.core.istft(rec_a, win_length=NMF.d_w, hop_length=NMF.d_h)
    o_b = librosa.core.istft(rec_b, win_length=NMF.d_w, hop_length=NMF.d_h)
    
    return o_a, o_b
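Example #47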
def tf_wave_to_stft(wave):
    sample_rate = 16000
    window_size = 0.02
    window_stride = 0.01
    window = 'hamming'
    normalize = True
    
    # y = librosa.core.load(wave, sr=sample_rate)[0]
    # print(len(y))
    
    n_fft = 320  # int(sample_rate * window_size)
    win_length = n_fft
    hop_length = 160  # int(sample_rate * window_stride)
    # STFT
    D = librosa.stft(wave, n_fft=n_fft, hop_length=hop_length,
                     win_length=win_length, window=window)
    
    spect, phase = librosa.magphase(D)
    # S = log(S+1)
    spect = np.log1p(spect)
    return spect
Example #48
def parse_audio(path, audio_conf, windows):
    '''
    Input:
        path       : string, path of the audio file to load
        audio_conf : dict, audio parameters for computing the spectrogram
        windows    : dict, window types
    Output:
        spect      : FloatTensor, spectrum of each frame
    '''
    y = load_audio(path)
    n_fft = int(audio_conf['sample_rate']*audio_conf["window_size"])
    win_length = n_fft
    hop_length = int(audio_conf['sample_rate']*audio_conf['window_stride'])
    window = windows[audio_conf['window']]
    #D = librosa.cqt(y, sr=audio_conf['sample_rate'])
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                        win_length=win_length, window=window)
    spect, phase = librosa.magphase(D)
    spect = np.log1p(spect)
    spect = torch.FloatTensor(spect)
    
    return spect.transpose(0,1)
Example #49
def griffin_lim(mag, phase_angle, n_fft, hop, num_iters):
  """Iterative algorithm for phase retrival from a magnitude spectrogram.
  Args:
    mag: Magnitude spectrogram.
    phase_angle: Initial condition for phase.
    n_fft: Size of the FFT.
    hop: Stride of FFT. Defaults to n_fft/2.
    num_iters: Griffin-Lim iterations to perform.
  Returns:
    audio: 1-D array of float32 sound samples.
  """
  fft_config = dict(n_fft=n_fft, win_length=n_fft, hop_length=hop, center=True)
  ifft_config = dict(win_length=n_fft, hop_length=hop, center=True)
  complex_specgram = inv_magphase(mag, phase_angle)
  for i in range(num_iters):
    audio = librosa.istft(complex_specgram, **ifft_config)
    if i != num_iters - 1:
      complex_specgram = librosa.stft(audio, **fft_config)
      _, phase = librosa.magphase(complex_specgram)
      phase_angle = np.angle(phase)
      complex_specgram = inv_magphase(mag, phase_angle)
  return audio
Example #50
def spectrogram2wav(mag, n_fft, win_length, hop_length, num_iters, phase_angle=None, length=None):
    '''

    :param mag: [f, t]
    :param n_fft: n_fft
    :param win_length: window length
    :param hop_length: hop length
    :param num_iters: num of iteration when griffin-lim reconstruction
    :param phase_angle: phase angle
    :param length: length of wav
    :return: wav, the reconstructed time-domain signal
    '''
    assert (num_iters > 0)
    if phase_angle is None:
        phase_angle = np.pi * np.random.rand(*mag.shape)
    spec = mag * np.exp(1.j * phase_angle)
    for i in range(num_iters):
        wav = librosa.istft(spec, win_length=win_length, hop_length=hop_length, length=length)
        if i != num_iters - 1:
            spec = librosa.stft(wav, n_fft=n_fft, win_length=win_length, hop_length=hop_length)
            _, phase = librosa.magphase(spec)
            phase_angle = np.angle(phase)
            spec = mag * np.exp(1.j * phase_angle)
    return wav
Example #51
    def extract_features(self, audio_path):
        # torchaudio loading options recently changed. It's probably
        # straightforward to rewrite the audio handling to make use of
        # up-to-date torchaudio, but in the meantime there is a legacy
        # method which uses the old defaults
        sound, sample_rate_ = torchaudio.legacy.load(audio_path)
        if self.truncate and self.truncate > 0:
            if sound.size(0) > self.truncate:
                sound = sound[:self.truncate]

        assert sample_rate_ == self.sample_rate, \
            'Sample rate of %s != -sample_rate (%d vs %d)' \
            % (audio_path, sample_rate_, self.sample_rate)

        sound = sound.numpy()
        if len(sound.shape) > 1:
            if sound.shape[1] == 1:
                sound = sound.squeeze()
            else:
                sound = sound.mean(axis=1)  # average multiple channels

        n_fft = int(self.sample_rate * self.window_size)
        win_length = n_fft
        hop_length = int(self.sample_rate * self.window_stride)
        # STFT
        d = librosa.stft(sound, n_fft=n_fft, hop_length=hop_length,
                         win_length=win_length, window=self.window)
        spect, _ = librosa.magphase(d)
        spect = np.log1p(spect)
        spect = torch.FloatTensor(spect)
        if self.normalize_audio:
            mean = spect.mean()
            std = spect.std()
            spect.add_(-mean)
            spect.div_(std)
        return spect
Example #52
def get_sound_features(audio_path):
    
    print(audio_path)
    
    y, sr = librosa.load(audio_path)
    n_fft=2048
    hop_length=512
    output={}
    # Separate harmonics and percussives into two waveforms
    y_harmonic, y_percussive = librosa.effects.hpss(y)
    
    #y is wave domain, y_harmonic is frequency domain
    output['y_harmonic_mean']= y_harmonic.mean()
    output['y_harmonic_std']= y_harmonic.std()

    output['y_percussive_mean']= y_percussive.mean()    
    output['y_percussive_std']= y_percussive.std()    
    
    # Beat track on the percussive signal
    tempo, beat_frames = librosa.beat.beat_track(y=y_percussive,sr=sr)
    output['tempo']= tempo

    # Compute MFCC features from the raw signal
    mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=13)
    output['mfcc_mean']=mfcc.mean()
    output['mfcc_std']=mfcc.std()
    
    # And the first-order differences (delta features)
    mfcc_delta = librosa.feature.delta(mfcc)
    output['mfcc_delta_mean']=mfcc_delta.mean()
    output['mfcc_delta_std']=mfcc_delta.std()
    # Stack and synchronize between beat events
    # This time, we'll use the mean value (default) instead of median
    beat_mfcc_delta = librosa.util.sync(np.vstack([mfcc, mfcc_delta]), beat_frames)
    output['beat_mfcc_delta_mean']=beat_mfcc_delta.mean()
    output['beat_mfcc_delta_std']=beat_mfcc_delta.std()
    
    # Compute chroma features from the harmonic signal
    chromagram = librosa.feature.chroma_cqt(y=y_harmonic,sr=sr)
    output['chromagram_mean']=chromagram.mean()
    output['chromagram_std']=chromagram.std()
    
    
    # Aggregate chroma features between beat events
    # We'll use the median value of each feature between beat frames
    beat_chroma = librosa.util.sync(chromagram, beat_frames, aggregate=np.median)
    output['beat_chroma_mean']=beat_chroma.mean()
    output['beat_chroma_std']=beat_chroma.std()
    
    # Finally, stack all beat-synchronous features together
#    beat_features = np.vstack([beat_chroma, beat_mfcc_delta])
#    output['beat_features']=beat_features

    # n_fft here differs from the librosa example, which uses 4096
    # Compute a chromagram from a waveform or power spectrogram.
    S = np.abs(librosa.stft(y, n_fft=n_fft))
    chroma_stft = librosa.feature.chroma_stft(S=S, sr=sr)
    output['chroma_stft_mean']=chroma_stft.mean()
    output['chroma_stft_std']=chroma_stft.std()
    
    # Constant-Q chromagram
    chroma_cq = librosa.feature.chroma_cqt(y=y, sr=sr)
    output['chroma_cq_mean']=chroma_cq.mean()
    output['chroma_cq_std']=chroma_cq.std()
    
    # Compute a Mel-scaled power spectrogram.
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128,fmax=8000)
    output['S_mean']=S.mean()
    output['S_std']=S.std()
    
    # Compute root-mean-square (RMS) energy for each frame.
    S, phase = librosa.magphase(librosa.stft(y))
    rms = librosa.feature.rms(S=S)
    output['rms_mean']=rms.mean()
    output['rms_std']=rms.std()
        
    # Compute the spectral centroid.
    cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    output['cent_mean']=cent.mean()
    output['cent_std']=cent.std()    
    
    S, phase = librosa.magphase(librosa.stft(y=y))
    spec_bw=librosa.feature.spectral_bandwidth(S=S)
    output['spec_bw_mean']=spec_bw.mean()
    output['spec_bw_std']=spec_bw.std() 
    
    S = np.abs(librosa.stft(y))
    contrast = librosa.feature.spectral_contrast(S=S, sr=sr)
    output['contrast_mean']=contrast.mean()
    output['contrast_std']=contrast.std()
    
    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)
    output['tonnetz_mean']=tonnetz.mean()
    output['tonnetz_std']=tonnetz.std()
    
    zero_cross = librosa.feature.zero_crossing_rate(y)
    output['zero_cross_mean']=zero_cross.mean()
    output['zero_cross_std']=zero_cross.std()
    
    oenv = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
    output['oenv_mean']=oenv.mean()
    output['oenv_std']=oenv.std() 
    
    tempogram = librosa.feature.tempogram(onset_envelope=oenv, sr=sr,hop_length=hop_length)
    output['tempogram_mean']=tempogram.mean()
    output['tempogram_std']=tempogram.std()
    
    L = librosa.feature.logfsgram(y=y, sr=sr)  # only available in older librosa releases
    output['L_mean']=L.mean()
    output['L_std']=L.std()

    R = librosa.segment.recurrence_matrix(mfcc)
    output['R_mean']=R.mean()
    output['R_std']=R.std()
    
    output['audio_path'] = audio_path
    
    return output
Example #53
def LoadAudio(file_path):
    y, sr = load(file_path, sr=SR)
    stft = librosa.stft(y, n_fft=window_size, hop_length=hop_length)
    mag, phase = librosa.magphase(stft)
    return mag.astype(np.float32), phase
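
Because LoadAudio keeps the complex phase, the pair inverts cleanly; a round-trip sketch assuming the same window_size and hop_length globals ('some_file.wav' is a placeholder):

mag, phase = LoadAudio('some_file.wav')
y_rec = librosa.istft(mag * phase, win_length=window_size, hop_length=hop_length)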
Example #54
    def read_audio_file(path, src_dir, side, sample_rate, window_size,
                        window_stride, window, normalize_audio,
                        truncate=None):
        """
        Args:
            path (str): location of a src file containing audio paths.
            src_dir (str): location of source audio files.
            side (str): 'src' or 'tgt'.
            sample_rate (int): sample_rate.
            window_size (float) : window size for spectrogram in seconds.
            window_stride (float): window stride for spectrogram in seconds.
            window (str): window type for spectrogram generation.
            normalize_audio (bool): subtract spectrogram by mean and divide
                by std or not.
            truncate (int): maximum audio length (0 or None for unlimited).

        Yields:
            a dictionary containing audio data for each line.
        """
        assert (src_dir is not None) and os.path.exists(src_dir),\
            "src_dir must be a valid directory if data_type is audio"

        global torchaudio, librosa, np
        import torchaudio
        import librosa
        import numpy as np

        with codecs.open(path, "r", "utf-8") as corpus_file:
            index = 0
            for line in corpus_file:
                audio_path = os.path.join(src_dir, line.strip())
                if not os.path.exists(audio_path):
                    audio_path = line

                assert os.path.exists(audio_path), \
                    'audio path %s not found' % (line.strip())

                # keep the loaded rate under its own name so the check below
                # compares it against the expected sample_rate argument
                sound, sample_rate_ = torchaudio.load(audio_path)
                if truncate and truncate > 0:
                    if sound.size(0) > truncate:
                        continue

                assert sample_rate_ == sample_rate, \
                    'Sample rate of %s != -sample_rate (%d vs %d)' \
                    % (audio_path, sample_rate_, sample_rate)

                sound = sound.numpy()
                if len(sound.shape) > 1:
                    if sound.shape[1] == 1:
                        sound = sound.squeeze()
                    else:
                        sound = sound.mean(axis=1)  # average multiple channels

                n_fft = int(sample_rate * window_size)
                win_length = n_fft
                hop_length = int(sample_rate * window_stride)
                # STFT
                d = librosa.stft(sound, n_fft=n_fft, hop_length=hop_length,
                                 win_length=win_length, window=window)
                spect, _ = librosa.magphase(d)
                spect = np.log1p(spect)
                spect = torch.FloatTensor(spect)
                if normalize_audio:
                    mean = spect.mean()
                    std = spect.std()
                    spect.add_(-mean)
                    spect.div_(std)

                example_dict = {side: spect,
                                side + '_path': line.strip(),
                                'indices': index}
                index += 1

                yield example_dict
Example #55
##################
# Standard imports
from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
import librosa

import librosa.display

#############################################
# Load an example signal
y, sr = librosa.load('audio/sir_duke_slow.mp3')


# And compute the spectrogram magnitude and phase
S_full, phase = librosa.magphase(librosa.stft(y))


###################
# Plot the spectrum
plt.figure(figsize=(12, 4))
librosa.display.specshow(librosa.amplitude_to_db(S_full, ref=np.max),
                         y_axis='log', x_axis='time', sr=sr)
plt.colorbar()
plt.tight_layout()

###########################################################
# As you can see, there are periods of silence and
# non-silence throughout this recording.
#
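Example #56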
def get_librosa_large(path):
  
    #Load an audio file as a floating point time series

    y, sr = librosa.load(path)

    # Decompose an audio time series into harmonic and percussive components

    y_harmonic, y_percussive = librosa.effects.hpss(y)

    # -------------BEAT AND TEMPO--------------------
    
    # Compute a spectral flux onset strength envelope

    hop_length = 512
    onset_env = librosa.onset.onset_strength(y = y_percussive, sr = sr, aggregate = numpy.median,  hop_length=hop_length)

    # Dynamic programming beat tracker  
    
    tempo, beats = librosa.beat.beat_track(onset_envelope = onset_env, sr = sr)

    # Locate note onset events by picking peaks in an onset strength envelope
    
    onset_frames = librosa.onset.onset_detect(onset_envelope = onset_env, sr = sr)

    
    # Compute the tempogram: local autocorrelation of the onset strength envelope.

    tempogram = librosa.feature.tempogram(onset_envelope = onset_env, sr = sr, hop_length = hop_length)

    #--------------SPECTRAL FEATURES----------------

    # Compute a chromagram from a waveform or power spectrogram
    
    chroma = librosa.feature.chroma_stft(y = y_harmonic, sr = sr)

    # Compute a Mel-scaled power spectrogram

    mel = librosa.feature.melspectrogram(y = y, sr = sr)
    mel_h = librosa.feature.melspectrogram(y = y_harmonic, sr = sr)
    mel_p = librosa.feature.melspectrogram(y = y_percussive, sr = sr)

    # Convert to log scale (dB). We'll use the peak power as reference.

    log_mel = librosa.power_to_db(mel, ref = numpy.max)
    log_mel_h = librosa.power_to_db(mel_h, ref = numpy.max)
    log_mel_p = librosa.power_to_db(mel_p, ref = numpy.max)

    # Mel-frequency cepstral coefficients

    mfcc = librosa.feature.mfcc(S = log_mel)
    delta_mfcc  = librosa.feature.delta(mfcc)
    delta2_mfcc = librosa.feature.delta(mfcc, order=2)

    # Compute root-mean-square (RMS) energy for each frame

    S, phase = librosa.magphase(chroma)
    rms = librosa.feature.rms(S = S)
    
    # Compute the spectral centroid
    
    cent = librosa.feature.spectral_centroid(S = S)

    # Compute p’th-order spectral bandwidth

    spec_bw = librosa.feature.spectral_bandwidth(S = S)

    # Compute spectral contrast

    S_abs = numpy.abs(chroma)
#    contrast = librosa.feature.spectral_contrast(S = S_abs, sr = sr)
    
    # Compute roll-off frequency

    rolloff = librosa.feature.spectral_rolloff(S = S, sr = sr)

    # Get coefficients of fitting an nth-order polynomial to the columns of a spectrogram
    
    line = librosa.feature.poly_features(S = S_abs, sr = sr)
    quad = librosa.feature.poly_features(S = S_abs, order = 2)

    # Compute the zero-crossing rate of an audio time series
    
    z_cross = librosa.feature.zero_crossing_rate(y)


    #-------------WRITING TO FILE----------------
    
    features = [tempo, onset_env, beats, onset_frames, tempogram, chroma, log_mel, log_mel_h, log_mel_p, mfcc,
                delta_mfcc, delta2_mfcc, rms, cent, spec_bw, rolloff, line, quad, z_cross]

    features_names = ["Tempo", "Onset strength envelope", "Beats", "Onset events", "Tempogram", "Chromagram",
                      "Log-scaled melspectrogram", "Log-scaled harmonic melspectrogram",
                      "Log-scaled percussive melspectrogram", "Mel-frequency cepstral coefficients", "Delta features",
                      "Delta square features", "Root mean square energy", "Spectral centroid", "Spectral bandwidth",
                      "Roll-off frequency", "Linear coefficients", "Quadratic coefficients", "Zero crossing rate"]

    
    suf_ind = path.find(".")

    if (suf_ind != -1):
        file_name = path[0:suf_ind] + ".dat"
    else:
        file_name = path + ".dat"


    with open(file_name, 'w') as f:
        
        tmp = "%s : %s\n" % (features_names[0], str(features[0]))     # Tempo
        f.write(tmp)
        
        for i in range(1, len(features)):

            tmp = "%s : %s\n" % (features_names[i], str(features[i].tolist()))
            f.write(tmp)         


    return