예제 #1
0
def encode(data_file, output_file, key_file=None):
    """Hide the bytes of ``data_file`` in a spectrogram and write it as audio.

    One spectrogram column is used per input byte: the bin whose row index
    equals the byte value is raised above every other bin of that column,
    and its two neighbouring bins are zeroed.

    Args:
        data_file: Path of the file whose bytes are encoded.
        output_file: Path of the audio file to write.
        key_file: Optional path of a carrier audio file. When omitted, a
            sine wave sized from the data file length is used instead.

    Raises:
        IndexError: If the spectrogram has fewer columns than the data file
            has bytes, or fewer rows than the largest byte value.
    """
    print('* * encoding message in audio file...')
    data_file_size = os.path.getsize(data_file)

    if key_file is not None:
        signal, _ = librosa.load(key_file, sr=RATE)
    else:
        signal = make_sinewave(1, math.ceil(data_file_size / 20.), RATE)
    spec = stft(signal, WINDOW_LENGTH, HOP_SIZE)

    print('data file size: {}'.format(data_file_size))
    print('spec shape {}'.format(spec.shape))

    # The payload is arbitrary bytes, so read it in binary mode. A bytearray
    # yields integer byte values on both Python 2 and Python 3.
    with open(data_file, 'rb') as dfile:
        payload = bytearray(dfile.read())

    n_bins = spec.shape[0]
    for i, h in enumerate(payload):
        peak = np.max(np.abs(spec[:, i]))
        if key_file is not None:
            spec[h][i] = peak + 200
        else:
            spec[h][i] = peak * 200
        # Clear the adjacent bins, but never wrap around to the other end of
        # the frequency axis (h - 1 == -1 would address the last bin).
        if h > 0:
            spec[h - 1][i] = 0
        if h + 1 < n_bins:
            spec[h + 1][i] = 0

    # Keep only the columns that carry data.
    spec = spec[:, :len(payload)]
    spec = add_start_stop(spec)
    wavwrite(output_file, istft(spec, 1024, 2048), RATE)
예제 #2
0
def audio2spec(audio_list, window_size, window_overlap, n_fft):
    """Compute magnitude spectrograms for a batch of equal-length clips.

    Args:
        audio_list: a numpy array of audio samples, with dimensions (naudio, nsample)
        window_size: the size of the stft window, in samples
        window_overlap: passed to the STFT as ``hop_length``, in samples
        n_fft: size of windowed signal after zero padding
    Returns:
        spec_tens: np array of spectrograms, shape=(naudio, 1 + nfft//2, 1 + nsamples // window_overlap)
    """
    naudio, nsamples = audio_list.shape
    spec_tens = np.zeros(shape=(naudio, 1 + n_fft // 2,
                                1 + nsamples // window_overlap))

    for idx, audio in enumerate(tqdm(audio_list)):
        # One STFT per clip; only the magnitude is kept.
        spec_tens[idx] = np.abs(
            lc.stft(audio,
                    win_length=window_size,
                    hop_length=window_overlap,
                    n_fft=n_fft))

    return spec_tens
예제 #3
0
def main(argv):
    """Run the Unet model on one mixture and write audio, plots and metrics.

    Reads its configuration from ``FLAGS`` (``output_dir``, ``ckpt_path``,
    ``original_wav``, ``gt`` and ``original_gt``). All outputs are written
    inside ``FLAGS.output_dir``.

    Args:
        argv: Command-line arguments; not used by this function.
    """
    os.makedirs(FLAGS.output_dir, exist_ok=True)

    def out_path(name):
        # Joining (rather than concatenating) keeps the files inside
        # output_dir even when the flag has no trailing path separator.
        return os.path.join(FLAGS.output_dir, name)

    def load_mag_phase(path):
        # Load a file and return its magnitude/phase cropped to START:END.
        wav, _ = load(path, sr=SAMPLE_RATE)
        mag, phase = magphase(
            stft(wav, n_fft=WINDOW_SIZE, hop_length=HOP_LENGTH))
        return mag[:, START:END], phase[:, START:END]

    def save_audio(name, mag, phase):
        # Resynthesize a waveform from magnitude * phase and write it.
        write_wav(out_path(name),
                  istft(mag * phase,
                        win_length=WINDOW_SIZE,
                        hop_length=HOP_LENGTH),
                  SAMPLE_RATE, norm=True)

    # Initialize model
    unet = Unet()
    restore(net=unet, ckpt_path=FLAGS.ckpt_path)

    # Load data
    mix_wav_mag, mix_wav_phase = load_mag_phase(FLAGS.original_wav)

    # Load ground truth
    if FLAGS.gt:
        gt_wav_mag, gt_wav_phase = load_mag_phase(FLAGS.original_gt)

    # Save input audio, ground truth audio and their spectrogram images
    save_audio('original_mix.wav', mix_wav_mag, mix_wav_phase)
    spectogram_librosa(out_path('original_mix.wav'), 0)
    if FLAGS.gt:
        save_audio('gt.wav', gt_wav_mag, gt_wav_phase)
        spectogram_librosa(out_path('gt.wav'), 0)

    # Run the model; the first frequency row is dropped to get 512 rows.
    inputs = mix_wav_mag[1:].reshape(1, 512, 128, 1)
    mask = unet(inputs).numpy().reshape(512, 128)
    predict = inputs.reshape(512, 128) * mask

    # Evaluation metrics
    if FLAGS.gt:
        expand_pre = np.expand_dims(predict.flatten(), axis=0)
        expand_gt = np.expand_dims(gt_wav_mag[1:].flatten(), axis=0)
        expand_input = np.expand_dims(inputs.flatten(), axis=0)
        (SDR, SIR, SAR, _) = mir_eval.separation.bss_eval_sources(
            expand_gt, expand_pre)
        (SDR2, _, _, _) = mir_eval.separation.bss_eval_sources(
            expand_gt, expand_input)
        NSDR = SDR - SDR2  # SDR(Se, Sr) - SDR(Sm, Sr)

        report = ('*****SDR = ' + str(SDR) + ', SIR = ' + str(SIR) +
                  ', SAR = ' + str(SAR) + ', NSDR = ' + str(NSDR) + '*****')
        print(report)
        # The file is opened in append mode, so end each record with a
        # newline to keep successive runs on separate lines.
        with open(out_path('metrics.txt'), 'a') as fout:
            fout.write(report + '\n')

    # Convert model output to target magnitude by restoring the dropped row
    target_pred_mag = np.vstack((np.zeros(128), predict))

    # Write vocal prediction audio file and its spectrogram image
    save_audio('pred_vocal.wav', target_pred_mag, mix_wav_phase)
    spectogram_librosa(out_path('pred_vocal.wav'), 1)
예제 #4
0
def gl_rec(mag_stft, hop, wlen, init_rec, n_iter=40):
    """Griffin-Lim style reconstruction of a waveform from a magnitude STFT.

    Args:
        mag_stft: Target magnitude spectrogram; it is raised to the power
            1.2 before use.
        hop: Hop length in samples.
        wlen: Window length in samples.
        init_rec: Initial waveform estimate supplying the starting phase.
        n_iter: Number of phase-refinement iterations.

    Returns:
        The reconstructed waveform, or a copy of ``init_rec`` when
        ``n_iter`` is 0.

    The FFT size is read from the module-level name ``nfft``.
    """
    rec = 1.0 * init_rec
    target_mag = np.abs(mag_stft**1.2)  # loop invariant
    for _ in range(n_iter):
        rec_stft = core.stft(rec, n_fft=nfft, hop_length=hop, win_length=wlen)
        # Unit-magnitude phase. Built from the angle instead of
        # rec_stft / |rec_stft| so that zero-valued bins give 1 rather
        # than NaN, which would otherwise spread through the inverse STFT.
        angles = np.exp(1j * np.angle(rec_stft))
        rec = core.istft(target_mag * angles, hop_length=hop, win_length=wlen)
    return rec
예제 #5
0
def comp_lsd(ref_file, pred_file):
    """Compute a log-spectral distance between two audio files.

    Both files are loaded at the module-level sample rate ``sr`` and
    transformed with the module-level ``nfft``, ``hop`` and ``wlen``. The
    distance is taken over frequency rows 7..219 of ``log(0.1 + |STFT|)``
    and averaged over frames.

    Args:
        ref_file: Path of the reference audio file.
        pred_file: Path of the predicted audio file.

    Returns:
        The mean over frames of the per-frame Euclidean distance.
    """
    ref = core.load(ref_file, sr=sr)[0]
    pred = core.load(pred_file, sr=sr)[0]
    stft_ref = np.abs(
        core.stft(ref, n_fft=nfft, hop_length=hop, win_length=wlen))
    stft_pred = np.abs(
        core.stft(pred, n_fft=nfft, hop_length=hop, win_length=wlen))

    # Compare only the frames both signals have; truncating just the
    # prediction fails when the prediction is the shorter of the two.
    n_frames = min(stft_ref.shape[1], stft_pred.shape[1])
    logstft_ref = np.log(0.1 + stft_ref[:, :n_frames])
    logstft_pred = np.log(0.1 + stft_pred[:, :n_frames])

    lsd = np.mean(
        np.sqrt(np.sum((logstft_ref[7:220] - logstft_pred[7:220])**2, axis=0)))
    return lsd
예제 #6
0
def SaveSpectrogram(y_mix, y_vocal, y_inst, fname, original_sr=44100):
    """Cut mixture/instrumental spectrograms into patches and save them.

    The signals are resampled to ``C.SR``, transformed to magnitude
    spectrograms and divided by the mixture's maximum. Consecutive patches
    of ``C.PATCH_LENGTH`` frames (first frequency row dropped) are written
    to ``C.VAL_PATH_FFT`` as ``<fname><index>.npz`` with arrays ``data``
    (mixture) and ``label`` (instrumental). A final patch aligned to the
    end of the signal is written whenever at least one full patch fits.

    Args:
        y_mix: Mixture waveform.
        y_vocal: Vocal waveform; accepted for interface compatibility but
            not used, since only mixture and instrumental are saved.
        y_inst: Instrumental waveform.
        fname: File name prefix for the saved patches.
        original_sr: Sample rate of the input waveforms.
    """
    y_mix = resample(y_mix, original_sr, C.SR)
    y_inst = resample(y_inst, original_sr, C.SR)

    S_mix = np.abs(stft(y_mix, n_fft=C.FFT_SIZE,
                        hop_length=C.H)).astype(np.float32)
    S_inst = np.abs(stft(y_inst, n_fft=C.FFT_SIZE,
                         hop_length=C.H)).astype(np.float32)

    norm = S_mix.max()
    S_mix /= norm
    S_inst /= norm

    patch_len = C.PATCH_LENGTH
    n_frames = S_mix.shape[1]

    def save_patch(start, index):
        # Write one (1, 512, patch_len) mixture/instrumental pair.
        mix_spec = np.zeros((1, 512, patch_len), dtype=np.float32)
        inst_spec = np.zeros((1, 512, patch_len), dtype=np.float32)
        mix_spec[0, :, :] = S_mix[1:, start:start + patch_len]
        inst_spec[0, :, :] = S_inst[1:, start:start + patch_len]
        np.savez(os.path.join(C.VAL_PATH_FFT, fname + str(index) + ".npz"),
                 data=mix_spec,
                 label=inst_spec)

    # Non-overlapping patches from the start of the signal.
    cnt = 1
    start = 0
    while start + patch_len < n_frames:
        save_patch(start, cnt)
        start += patch_len
        cnt += 1

    # Final patch aligned to the end. The guard uses the configured patch
    # length instead of a literal 128 so both stay in agreement.
    if n_frames >= patch_len:
        save_patch(n_frames - patch_len, cnt)
예제 #7
0
def dual_stft(signal_0, signal_1, window_size, hop_percentage):
    """Compute the STFT of two signals with identical parameters.

    Args:
        signal_0: First input signal.
        signal_1: Second input signal.
        window_size: FFT size passed to the STFT as ``n_fft``.
        hop_percentage: Hop length as a percentage of ``window_size``.

    Returns:
        Tuple ``(Zxx_0, Zxx_1, n_freqs, n_frames, hop_length)`` where the
        counts are taken from the shape of the first STFT.
    """
    hop_length = int(hop_percentage * window_size / 100)

    Zxx_0, Zxx_1 = (
        lc.stft(sig, n_fft=window_size, hop_length=hop_length)
        for sig in (signal_0, signal_1)
    )

    n_freqs = Zxx_0.shape[0]
    n_frames = Zxx_0.shape[1]

    print('Number of frames: {}'.format(n_frames))
    print('Frequency resolution: {}'.format(n_freqs))

    return Zxx_0, Zxx_1, n_freqs, n_frames, hop_length
예제 #8
0
    def pre_sff(self):
        """Extract log-magnitude STFT features for the train and test sets.

        Features are stored under ``<feature_path>/pre_sff`` through
        ``self._save_pickles``. For every row the wav file is read, its
        STFT is cut to the first 200 frequency rows, converted to
        ``log(|S| + 1e-10)`` and shifted so that its minimum is zero.
        """
        feature_path = os.path.join(self.dataset['feature_path'], 'pre_sff')
        if not os.path.exists(feature_path):
            os.mkdir(feature_path)

        def collect(rows, banner):
            # Build the feature, label and file-name lists for one split.
            features, labels, names = [], [], []
            for idx, row in rows:
                print(banner.format(idx, row['cur_name']), end='')
                wav_path = os.path.join(self.dataset['data_path'],
                                        row['cur_name'])
                _, wav_data = wavfile.read(wav_path)

                # Keep the first 200 bins (up to 4 kHz per the original note).
                spec = stft(buf_to_float(wav_data),
                            n_fft=800,
                            hop_length=160,
                            win_length=320)[:200, :]
                spec = np.log(np.abs(spec) + 1e-10)
                spec -= np.min(spec)

                features.append(spec)
                labels.append(self._build_multilabel(row))
                names.append(row['cur_name'])
                print('done.')
            return features, labels, names

        x_train, y_train, f_train = collect(
            self.dataset.train_data.iterrows(),
            '[Train] {}) Getting pre_sff from {}...')
        x_test, y_test, f_test = collect(
            self.dataset.test_data.iterrows(),
            '[Test] {}) Getting sff from {}...')

        self._save_pickles(feature_path, x_train, y_train, f_train, x_test,
                           y_test, f_test)
예제 #9
0
def Savespec(y_mix, y_inst, fname):
    """Save normalized mixture, instrumental and vocal magnitudes.

    The vocal magnitude is estimated as ``max(0, |mix| - |inst|)``. All
    three spectrograms are divided by the mixture's maximum and stored in
    ``<C.PATH_FFT>/<fname>.npz`` under the keys ``vocal``, ``mix`` and
    ``inst``.

    Args:
        y_mix: Mixture waveform.
        y_inst: Instrumental waveform.
        fname: Output file name without extension.
    """
    def magnitude(y):
        # float32 magnitude spectrogram of one waveform
        return np.abs(
            stft(y, n_fft=C.FFT_SIZE, hop_length=C.H)).astype(np.float32)

    mix_mag = magnitude(y_mix)
    inst_mag = magnitude(y_inst)
    vocal_mag = np.maximum(0, mix_mag - inst_mag)

    peak = mix_mag.max()
    target = os.path.join(C.PATH_FFT, fname+".npz")
    np.savez(target,
             vocal=vocal_mag / peak,
             mix=mix_mag / peak,
             inst=inst_mag / peak)
예제 #10
0
def phase_MISI(inst_esti, vocal_esti, mix):
    """Return unit-magnitude phase spectrograms for two source estimates.

    The part of the mixture that the two estimates do not explain is split
    equally between them before each STFT is taken.

    Args:
        inst_esti: Instrumental waveform estimate.
        vocal_esti: Vocal waveform estimate.
        mix: Mixture waveform.

    Returns:
        Tuple ``(P_inst, P_vocal)`` of complex arrays ``exp(1j * angle)``.
    """
    half_residual = (mix - (inst_esti + vocal_esti)) / 2

    def unit_phase(waveform):
        # STFT of one source reduced to its phase factor
        spectrum = stft(waveform,
                        n_fft=C.FFT_SIZE,
                        hop_length=C.H,
                        win_length=C.FFT_SIZE)
        return np.exp(1.j * np.angle(spectrum))

    P_inst = unit_phase(inst_esti + half_residual)
    P_vocal = unit_phase(vocal_esti + half_residual)
    return P_inst, P_vocal
예제 #11
0
def compute_features(data_loc='../data/genres/'):
    """Pickle magnitude spectrograms of the genre dataset, 100 songs per file.

    The sorted ``*.au`` files under ``data_loc`` are processed in order and
    every 100 spectrograms are dumped to ``../ckpt/<genre>.pkl``, taking the
    genre names from the sorted keys of ``song_labels_dic``. Nothing is
    computed when ``../ckpt`` already exists.

    Args:
        data_loc: Directory containing one sub-directory per genre.

    Returns:
        Always ``True``.
    """
    file_names = sorted(glob.glob(data_loc + '*/*.au'))

    assert len(file_names) == 1000, (
        "ERROR: Couldn't read files properly. Is your data_loc correct?")

    n_fft = 1024
    genres_list = sorted(song_labels_dic.keys())

    # An existing checkpoint directory is treated as "already computed".
    if os.path.exists('../ckpt'):
        return True
    os.makedirs('../ckpt')

    batch = []
    genre_idx = 0
    for path in file_names:
        song, _ = lc.load(path)
        batch.append(np.abs(lc.stft(song, n_fft=n_fft)))
        if len(batch) != 100:
            continue

        genre = genres_list[genre_idx]
        print('Writing: ' + genre + '.pkl file...')
        with open('../ckpt/' + genre + '.pkl', 'wb') as f:
            pickle.dump(batch, f)
        batch = []
        genre_idx += 1

    return True
예제 #12
0
def psd(audio, preprocess=False):
    """Build normalized dB-magnitude and phase features from a waveform.

    The STFT (last frequency row dropped) is padded along time to a
    multiple of 256 frames: the magnitude with scaled Gaussian noise and
    the phase with zeros. Magnitudes are passed through ``norm`` and
    ``mag2dB``, floored at -120 and mapped linearly onto [-1, 1].

    Args:
        audio: Input waveform.
        preprocess: When true, also return a version computed from
            ``optimal_smoothing`` of the padded magnitude.

    Returns:
        Dict with keys ``MagdB``, ``Phase`` and ``Norm`` (the min/max used
        for the mapping), plus ``MagdB_smooth`` and ``Norm_smooth`` when
        ``preprocess`` is true.
    """
    spectrum = stft(audio, n_fft=512, hop_length=128, win_length=512)[:-1, :]
    Mag = np.abs(spectrum)
    Phase = np.angle(spectrum)

    # Pad the time axis up to the next multiple of 256 frames.
    n_cols = np.shape(Mag)[1]
    nframes = int(256 * np.ceil(n_cols / 256))
    pad_size = nframes - n_cols
    head_mean = np.mean(Mag[:10])
    scale = head_mean if head_mean < 0.01 else 0.01
    pad_seq = scale * np.random.randn(256, pad_size)
    Mag = np.hstack((Mag, pad_seq))
    Phase = np.hstack((Phase, 0.0 * pad_seq))

    if preprocess:
        Mag_smooth = mag2dB(norm(optimal_smoothing(Mag)))
        Mag_smooth[Mag_smooth < -120] = -120
        minmax_smooth = [np.min(Mag_smooth), np.max(Mag_smooth)]
        Mag_smooth_norm = np.interp(Mag_smooth, minmax_smooth, [-1, 1])

    Mag = mag2dB(norm(Mag))
    Mag[Mag < -120] = -120
    minmax = [np.min(Mag), np.max(Mag)]
    Mag_norm = np.interp(Mag, minmax, [-1, 1])

    features = {}
    if preprocess:
        features['MagdB_smooth'] = Mag_smooth_norm
        features['Norm_smooth'] = minmax_smooth
    features['MagdB'] = Mag_norm
    features['Phase'] = Phase
    features['Norm'] = minmax
    return features
예제 #13
0
def librosaSpec(data):
    """Return the dB magnitude and the phase angle of the STFT of ``data``.

    Args:
        data: Input waveform.

    Returns:
        Tuple ``(magnitude_db, phase_angle)``.
    """
    from librosa import amplitude_to_db, magphase
    from librosa.core import resample, stft

    magnitude, phase = magphase(stft(data))
    magnitude_db = amplitude_to_db(magnitude)
    return magnitude_db, np.angle(phase)
예제 #14
0
def audio_to_array(audio):
    """Turn an audio file into a 5-component PCA of its magnitude STFT.

    Note that the file at ``audio`` is rewritten in place with the samples
    and sample rate that were just read from it, and is then read back.

    Args:
        audio: Path of the audio file.

    Returns:
        The array returned by ``PCA(n_components=5).fit_transform`` on the
        magnitude spectrogram.
    """
    # Read the samples and rewrite the same file at the same sample rate.
    samples, rate = sf.read(audio)
    sf.write(audio, samples, rate)

    # Read the rewritten file back; element 1 of the result holds the samples.
    wav = read(audio)
    signal = np.array(wav[1], dtype=float)

    # Normalize; presumably infinity-norm along axis 0 -- confirm against
    # the imported ``normalize``.
    signal = normalize(signal, np.inf, 0)

    # Magnitude of the short-time Fourier transform.
    spectrum = np.abs(stft(signal))

    # Reduce the number of dimensions.
    return PCA(n_components=5).fit_transform(spectrum)
예제 #15
0
def find_peaks(y, size):
    """Return a boolean mask of local spectrogram maxima above 0.2.

    Args:
        y: Input waveform.
        size: Neighbourhood size passed to the maximum filter.

    Returns:
        Boolean array with the shape of the magnitude spectrogram.
    """
    magnitude = np.abs(stft(y, n_fft=512, hop_length=256))
    neighbourhood_max = ndi.maximum_filter(magnitude,
                                           size=size,
                                           mode="constant")
    is_local_max = magnitude == neighbourhood_max
    is_strong = magnitude > 0.2
    return is_local_max & is_strong
예제 #16
0
def pncc(audio_wave,
         n_fft=1024,
         sr=16000,
         window="hamming",
         n_mels=40,
         n_pncc=13,
         weight_N=4,
         power=2,
         dct=True):
    """Compute PNCC-style features from a waveform.

    The signal is pre-emphasized, converted to a power spectrogram,
    projected onto a mel filter bank and passed through the chain of
    processing helpers defined elsewhere in this module.

    Args:
        audio_wave: Input waveform.
        n_fft: FFT size for the STFT and the mel filter bank.
        sr: Sample rate used to build the mel filter bank.
        window: Window passed to the STFT.
        n_mels: Number of mel bands.
        n_pncc: Number of DCT coefficients.
        weight_N: ``N`` argument of ``weight_smoothing``.
        power: Exponent applied to the STFT magnitude and the mel filters.
        dct: When true, return the DCT of the non-linearity output;
            otherwise return the non-linearity output itself.

    Returns:
        ``dct_v.T`` when ``dct`` is true, else ``v_.T``.
        NOTE(review): the two branches appear to return differently
        oriented arrays (the DCT result is transposed back, the raw
        output is transposed once) -- confirm which one callers expect.
    """
    pre_emphasis_signal = scipy.signal.lfilter([1.0, -0.97], 1, audio_wave)
    stft_pre_emphasis_signal = np.abs(
        stft(pre_emphasis_signal, n_fft=n_fft, window=window))**power
    mel_filter = np.abs(filters.mel(sr, n_fft=n_fft, n_mels=n_mels))**power
    power_stft_pre_signal = np.dot(stft_pre_emphasis_signal.T, mel_filter.T)
    q_ = medium_time_power_calculation(power_stft_pre_signal)
    q_le = asymmetric_lawpass_filtering(q_, 0.999, 0.5)
    pre_q_0 = q_ - q_le
    q_0 = halfwave_rectification(pre_q_0)
    q_f = asymmetric_lawpass_filtering(q_0)
    q_th = temporal_masking(q_0)
    r_sp = after_temporal_masking(q_th, q_f)
    r_ = switch_excitation_or_non_excitation(r_sp=r_sp,
                                             q_f=q_f,
                                             q_le=q_le,
                                             q_power_stft_pre_signal=q_)
    s_ = weight_smoothing(r_=r_, q_=q_, N=weight_N)
    t_ = time_frequency_normalization(p_=power_stft_pre_signal, s_=s_)
    u_ = mean_power_normalization(t_, r_)
    v_ = power_function_nonlinearity(u_)

    if not dct:
        return v_.T

    # Only build the DCT basis when its result is actually returned.
    dct_v = np.dot(filters.dct(n_pncc, v_.shape[1]), v_.T)
    return dct_v.T
예제 #17
0
def time_stretch_hpss(audio, f):
    """Time-stretch ``audio`` by treating its HPSS components separately.

    The signal is split into harmonic and percussive STFT components. Both
    are converted back to waveforms and stretched with
    ``time_stretch_sola`` (the harmonic part with ``wsola=True``), then
    summed.

    Args:
        audio: Input waveform.
        f: Stretch factor; ``1.0`` returns ``audio`` unchanged.

    Returns:
        The stretched waveform.
    """
    if f == 1.0:
        return audio

    spectrum = core.stft(audio)

    # Perform HPSS
    stft_harm, stft_perc = decompose.hpss(spectrum, kernel_size=31)

    # Stretch the percussive part
    y_perc = librosa.util.fix_length(core.istft(stft_perc, dtype=audio.dtype),
                                     size=len(audio))
    y_perc = time_stretch_sola(y_perc, f)

    # Stretch the harmonic part and match its length to the percussive one.
    # This is the only harmonic result that is used, so the inverse STFT
    # is computed once.
    y_harm = librosa.util.fix_length(
        time_stretch_sola(core.istft(stft_harm, dtype=audio.dtype),
                          f,
                          wsola=True), size=len(y_perc))

    # Add them together
    return y_harm + y_perc
def LoadAudio(fname):
    """Load a file and return its peak-normalized magnitude and its phase.

    Args:
        fname: Path of the audio file, loaded at ``C.SR``.

    Returns:
        Tuple ``(mag, phase)``: the magnitude divided by its maximum and
        the unit-magnitude complex phase factor.
    """
    waveform, _ = load(fname, sr=C.SR)
    spectrum = stft(waveform,
                    n_fft=C.FFT_SIZE,
                    hop_length=C.H,
                    win_length=C.FFT_SIZE)
    magnitude = np.abs(spectrum)
    magnitude = magnitude / np.max(magnitude)
    return magnitude, np.exp(1.j*np.angle(spectrum))
예제 #19
0
def LoadAudio(fname):
    """Return the peak-normalized magnitude and the phase of an audio file.

    Args:
        fname: Path of the audio file, loaded at ``C.SR``.

    Returns:
        Tuple ``(mag, phase)`` where ``phase`` is ``exp(1j * angle)``.
    """
    samples = load(fname, sr=C.SR)[0]
    stft_kwargs = dict(n_fft=C.FFT_SIZE, hop_length=C.H,
                       win_length=C.FFT_SIZE)
    complex_spec = stft(samples, **stft_kwargs)

    mag = np.abs(complex_spec)
    mag = mag / np.max(mag)
    phase = np.exp(1.j * np.angle(complex_spec))
    return mag, phase
예제 #20
0
 def __call__(self, data):
     """Return the STFT of ``data``, as magnitudes when ``is_abs`` is True.

     The transform uses the ``n_fft``, ``win_length``, ``hop_length`` and
     ``window`` attributes of this instance.
     """
     spectrum = stft(data,
                     n_fft=self.n_fft,
                     win_length=self.win_length,
                     hop_length=self.hop_length,
                     window=self.window)
     # Only the literal True selects magnitudes; other values keep the
     # complex spectrum.
     if self.is_abs is True:
         return np.abs(spectrum)
     return spectrum
예제 #21
0
def load_audio(fname):
    """Load a file and return padded magnitude, phase and sample count.

    The STFT is zero-padded along time up to the next multiple of 1024
    frames. The magnitude is divided by its maximum.

    Args:
        fname: Path of the audio file, loaded at 16 kHz.

    Returns:
        Tuple ``(mag, phase, n_samples)`` where ``phase`` is
        ``exp(1j * angle)`` and ``n_samples`` is the length of the loaded
        waveform.
    """
    y = load(fname, sr=16000)[0]
    spec = stft(y, n_fft=1024, hop_length=512, win_length=1024)

    # Frames needed to reach the next multiple of 1024. This is 0 when the
    # length is already aligned, where ``1024 - n % 1024`` would add a
    # whole extra block of 1024 empty frames.
    pad_frames = (-spec.shape[1]) % 1024
    spec = np.pad(spec, [(0, 0), (0, pad_frames)], 'constant')

    mag = np.abs(spec)
    mag /= np.max(mag)
    phase = np.exp(1.j * np.angle(spec))
    return mag, phase, y.shape[0]
예제 #22
0
def audio_to_spectrogram(input_signal,
                         n_fft,
                         hop_length,
                         win_length,
                         window='hann',
                         center=True):
    """Return ``audio.stft`` of the input after converting it to an array.

    Args:
        input_signal: Samples; converted with ``np.asarray``.
        n_fft: FFT size.
        hop_length: Hop length.
        win_length: Window length.
        window: Window specification.
        center: Centering flag forwarded to ``audio.stft``.
    """
    samples = np.asarray(input_signal)
    spectrogram = audio.stft(samples, n_fft, hop_length, win_length, window,
                             center)
    return spectrogram
def separate(PATH_INPUT, PATH_OUTPUT, MODEL, SR=16000, FFT_SIZE = 1024, H = 512):
    """Separate vocal (speech) from mixdown audio with a trained UNet.

    Args:
        PATH_INPUT: A wav file, or a directory searched for wav files.
        PATH_OUTPUT: Output directory (a path without extension; created
            if missing) or an output file path. When a file path is given
            and several inputs are processed, each result is written to
            that same path.
        MODEL: Path of the npz file holding the trained weights.
        SR: Sample rate used for loading and saving.
        FFT_SIZE: FFT size and window length of the STFT.
        H: Hop length of the STFT.
    """
    if os.path.isdir(PATH_INPUT):
        # Input is a directory: build the list of files.
        filelist_mixdown = find_files(PATH_INPUT, ext="wav", case_sensitive=True)
    else:
        # Input is a single file.
        filelist_mixdown = [PATH_INPUT]
    print ('number of mixdown file', len(filelist_mixdown))

    # Create the output directory when it does not exist yet.
    _, path_output_ext = os.path.splitext(PATH_OUTPUT)
    print ('path_output_ext',path_output_ext)
    if len(path_output_ext)==0  and  not os.path.exists(PATH_OUTPUT):
        os.mkdir(PATH_OUTPUT)

    # Load the model.
    unet = train.UNet()
    chainer.serializers.load_npz( MODEL,unet)
    config.train = False
    config.enable_backprop = False

    # Load each mix and try to separate the vocal (speech) from it.
    for fmixdown in filelist_mixdown:
        # Fall back to the alternative reader when the primary loader fails
        # (the original note says: use scipy if audioread raises). Only
        # ordinary errors trigger the fallback, so Ctrl-C and interpreter
        # exit are not swallowed.
        try:
            y_mixdown, _ = load(fmixdown,  sr=SR, mono=True)
        except Exception:
            sr_mixdown, y_mixdown = read(fmixdown)
            if not sr_mixdown == SR:
                y_mixdown = resample(y_mixdown, sr_mixdown, SR)

        # Compute the short-time spectrum of the input and normalize it.
        spec = stft(y_mixdown, n_fft=FFT_SIZE, hop_length=H, win_length=FFT_SIZE)
        mag = np.abs(spec)
        mag /= np.max(mag)
        phase = np.exp(1.j*np.angle(spec))
        print ('mag.shape', mag.shape)
        start = 0
        # Must not exceed the number of input frames; pick a value that
        # suits the network definition.
        end = 128 * (mag.shape[1] // 128)
        # Compute the mask that separates the speech (vocal).
        mask = unet(mag[:, start:end][np.newaxis, np.newaxis, 1:, :]).data[0, 0, :, :]
        mask = np.vstack((np.zeros(mask.shape[1], dtype="float32"), mask))
        # Apply the mask to the input spectrum and resynthesize the
        # waveform with the inverse transform.
        mag2=mag[:, start:end]*mask
        phase2=phase[:, start:end]
        y = istft(mag2*phase2, hop_length=H, win_length=FFT_SIZE)

        # Save the separated speech (vocal) as the output file.
        if len(path_output_ext)==0:
            # Output into the directory.
            foutname, _ = os.path.splitext( os.path.basename(fmixdown) )
            fname= os.path.join(PATH_OUTPUT, (foutname + '.wav'))
        else:
            # Output to the given file.
            fname= PATH_OUTPUT
        print ('saving... ', fname)
        write_wav(fname, y, SR, norm=True)
예제 #24
0
파일: pncc.py 프로젝트: ironiksk/PNCC
def pncc(audio_wave,
         n_fft=512,
         sr=16000,
         winlen=0.020,
         winstep=0.010,
         n_mels=128,
         n_pncc=13,
         weight_N=4,
         power=2):
    """Compute PNCC-style features from a waveform.

    The signal is pre-emphasized, reduced to mono, framed with a
    rectangular window and converted to a mel-weighted power spectrum,
    which is then passed through the processing helpers defined elsewhere
    in this module and finally projected with a DCT.

    Args:
        audio_wave: Input waveform.
        n_fft: FFT size for the STFT and the mel filter bank.
        sr: Sample rate; also converts ``winlen``/``winstep`` to samples.
        winlen: Window length in seconds.
        winstep: Hop length in seconds.
        n_mels: Number of mel bands; also passed as ``L`` to the
            smoothing and normalization helpers.
        n_pncc: Number of DCT coefficients.
        weight_N: Not used by this implementation.
        power: Exponent applied to the STFT magnitude and the mel filters.

    Returns:
        The dot product of the non-linearity output with the transposed
        DCT basis.
    """
    emphasized = scipy.signal.lfilter([1.0, -0.97], 1, audio_wave)
    mono_wave = to_mono(emphasized.T)

    hop_samples = int(sr * winstep)
    win_samples = int(sr * winlen)
    power_spectrum = np.abs(
        stft(mono_wave,
             n_fft=n_fft,
             hop_length=hop_samples,
             win_length=win_samples,
             window=np.ones(win_samples),
             center=False))**power

    mel_weights = np.abs(filters.mel(sr, n_fft=n_fft, n_mels=n_mels))**power
    mel_power = np.dot(power_spectrum.T, mel_weights.T)

    medium_power = medium_time_power_calculation(mel_power)
    envelope = asymmetric_lawpass_filtering(medium_power, 0.999, 0.5)
    rectified = halfwave_rectification(medium_power - envelope)
    floor = asymmetric_lawpass_filtering(rectified)
    masked = temporal_masking(rectified)

    switched = switch_excitation_or_non_excitation(masked, floor, envelope,
                                                   medium_power)
    smoothed = weight_smoothing(switched, medium_power, L=n_mels)
    transfer = time_frequency_normalization(mel_power, smoothed)
    normalized = mean_power_normalization(transfer, switched, L=n_mels)
    nonlinear = power_function_nonlinearity(normalized)

    dct_basis = filters.dct(n_pncc, nonlinear.shape[1])
    return np.dot(nonlinear, dct_basis.T)
예제 #25
0
def find_peaks(data):
    """Return the indices of local spectrogram maxima above 0.2.

    Args:
        data: Input waveform.

    Returns:
        Tuple ``(peaks_freq, peaks_time)`` of index arrays.
    """
    magnitude = np.abs(stft(data, n_fft=512, window='hamming'))

    morphology = sp.ndimage.morphology
    base_structure = morphology.generate_binary_structure(2, 1)
    footprint = morphology.iterate_structure(base_structure, 8)

    local_max = sp.ndimage.maximum_filter(magnitude,
                                          footprint=footprint,
                                          mode='constant')
    peak_mask = (magnitude == local_max) & (magnitude > 0.2)
    return np.asarray(peak_mask).nonzero()
예제 #26
0
def to_stft(seq, nfft):
    """Return the STFT of ``seq`` as stacked magnitude and phase channels.

    The input is first brought to length ``len(seq) + nfft / 2`` with
    ``fix_length``.

    :param seq:  Raw audio
    :param nfft: FFT size of the STFT
    :return: Array whose last axis holds the magnitude (index 0) and the
        phase angle (index 1).
    """
    padded_len = int(len(seq) + nfft / 2)
    padded = fix_length(seq, padded_len)
    spectrum = lc.stft(padded, n_fft=nfft)

    magnitude = np.abs(spectrum)
    phase = np.angle(spectrum)
    channels_first = np.array([magnitude, phase])
    return channels_first.transpose(1, 2, 0)
예제 #27
0
 def __call__(self, x):
     """Return magnitude and phase of the STFT of ``x`` stacked on axis 2.

     Args:
         x: Input signal; a ``torch.Tensor`` is converted to numpy first.

     Returns:
         Array whose last axis holds the magnitude (the log of the squared
         magnitude when ``self.logpower`` is set) and the phase angle.
     """
     if isinstance(x, torch.Tensor):
         x = x.numpy()
     X = stft(x, n_fft=self.n_fft)
     X_mag = np.abs(X)[:, :, None]
     # The phase is needed in both modes. Computing it only in the
     # logpower branch left it undefined for the plain-magnitude case.
     X_pha = np.angle(X)[:, :, None]
     if self.logpower:
         X_mag = np.log((X_mag ** 2))
     return np.concatenate((X_mag, X_pha), axis=2)
예제 #28
0
def __magphase(y, n_fft, hop_length, win_length):
    """Return the peak-normalized float32 magnitude and the phase of ``y``.

    The STFT uses the window configured as ``C.WINDOW``.

    Args:
        y: Input waveform.
        n_fft: FFT size.
        hop_length: Hop length.
        win_length: Window length.

    Returns:
        Tuple ``(mag, phase)`` where ``phase`` is ``exp(1j * angle)``.
    """
    spectrum = stft(y,
                    n_fft=n_fft,
                    hop_length=hop_length,
                    win_length=win_length,
                    window=C.WINDOW)
    magnitude = np.abs(spectrum).astype(np.float32)
    magnitude = magnitude / np.max(magnitude)
    return magnitude, np.exp(1.j * np.angle(spectrum))
예제 #29
0
def GetMag(sig, rate, winlen, winstep, NFFT, fuc_name='Rect'):
    """Return the STFT of the input audio.

    Args:
        sig: Input samples; converted with ``np.asfortranarray``.
        rate: Sample rate, used to convert the durations to samples.
        winlen: Window length in seconds.
        winstep: Hop length in seconds.
        NFFT: FFT size.
        fuc_name: Window specification passed to the STFT.

    Returns:
        The value returned by ``stft``, unchanged.
    """
    samples = np.asfortranarray(sig)
    hop_samples = int(winstep * rate)
    win_samples = int(winlen * rate)

    # NOTE(review): the original comment said the result is rotated so the
    # spectral values lie on the y axis, but no rotation is performed.
    return stft(samples,
                n_fft=NFFT,
                hop_length=hop_samples,
                win_length=win_samples,
                window=fuc_name)
예제 #30
0
def make_mag_spec(filelist, args):
    """Save fixed-length magnitude-spectrogram segments for each file.

    Channel 0 of every file is used as the vocal and channel 1 as the mix.
    Their magnitude spectrograms (first frequency row dropped) are stacked
    and split into segments of ``args.batch_length`` frames, each saved as
    ``<basename>_seg<k>.npy`` in ``args.dst_dir``. Trailing frames that do
    not fill a whole segment are not saved.

    Args:
        filelist: Iterable of audio file paths.
        args: Object providing ``batch_length``, ``fs``, ``frame_size``,
            ``shift_size`` and ``dst_dir``.
    """
    batch_length = args.batch_length
    for filename in filelist:
        basename = os.path.splitext(os.path.basename(filename))[0]

        # load wav
        channels = load(filename, args.fs, mono=False)[0]
        vocal_wav = channels[0].copy()
        mix_wav = channels[1].copy()

        # make magnitude spectrogram
        vocal_spec = stft(vocal_wav, args.frame_size, args.shift_size)
        mix_spec = stft(mix_wav, args.frame_size, args.shift_size)
        stacked = np.stack((vocal_spec, mix_spec))
        mag_spec = np.abs(stacked[:, 1:, :]).copy()

        n_segments = mag_spec.shape[-1] // batch_length
        for seg in range(n_segments):
            first = seg * batch_length
            segment = mag_spec[..., first:first + batch_length]
            target = os.path.join(args.dst_dir,
                                  basename + '_seg{}.npy'.format(seg))
            np.save(target, segment)
예제 #31
0
 def process(self, data):
     """
     Return the STFT of ``data`` with real and imaginary parts stacked.

     :param data: input signal
     :return: 3-d array whose last axis holds the real part (index 0) and
         the imaginary part (index 1); the original note gives the size
         as [257,301,2], which depends on the input length
     """
     spectrum = stft(data, n_fft=512, hop_length=160)
     real_part = spectrum.real
     imag_part = spectrum.imag
     return np.stack((real_part, imag_part), axis=2)
def SaveSpectrogram(y_mix, y_vocal, y_inst, fname, original_sr=44100):
    """Save normalized magnitude spectrograms of mix, vocal and instrumental.

    Each signal is resampled to ``C.SR`` and transformed; all three
    magnitudes are divided by the mixture's maximum and stored in
    ``<C.PATH_FFT>/<fname>.npz`` under ``mix``, ``vocal`` and ``inst``.

    Args:
        y_mix: Mixture waveform.
        y_vocal: Vocal waveform.
        y_inst: Instrumental waveform.
        fname: Output file name without extension.
        original_sr: Sample rate of the input waveforms.
    """
    def magnitude(waveform):
        # Resample, then take the float32 magnitude spectrogram.
        resampled = resample(waveform, original_sr, C.SR)
        return np.abs(
            stft(resampled, n_fft=C.FFT_SIZE,
                 hop_length=C.H)).astype(np.float32)

    mix_mag = magnitude(y_mix)
    vocal_mag = magnitude(y_vocal)
    inst_mag = magnitude(y_inst)

    peak = mix_mag.max()
    target = os.path.join(C.PATH_FFT, fname+".npz")
    np.savez(target,
             mix=mix_mag / peak,
             vocal=vocal_mag / peak,
             inst=inst_mag / peak)
예제 #33
0
# -*- coding: utf-8 -*-
"""
Created on Sat May  7 13:51:42 2016

@author: parallels
"""
import numpy as np
from librosa.core import load,stft
import matplotlib.pyplot as plt
from librosa.display import specshow
import functions
 #from scipy.spatial.distance import euclidean

# Load the reference track and the query recording; keep the first
# 30 seconds of the reference.
y, sr = load("wiwym.wav")
rec, sr = load("recording.wav")
y = y[:sr*30]

# Magnitude spectrograms of both signals (top-level statements must share
# one indentation level, otherwise the script does not parse).
spec = np.abs(stft(y, n_fft=4960, hop_length=512))
query = np.abs(stft(rec, n_fft=4960, hop_length=512))

# NOTE(review): find_peak is not defined or imported by name in this
# script; presumably it lives in the imported ``functions`` module --
# confirm and qualify the call.
maximum_spec = find_peak(spec, 30)
maximum_query = find_peak(query, 30)

# NOTE(review): ``overlap`` is never assigned in the visible script; the
# step that computes it appears to be missing.
plt.plot(overlap)
def plotStructure(fullpath, order=1, sr=4, cutoff=.1, n_singv=3, window=8,
                  step_size=2, feature='chroma', dim_red='SVD', as_diff=0,
                  round_to=0, normalize=1, scale=1, medfil_len=0):
    """Plot several feature representations of a track with their distances.

    A beat-aligned feature is extracted, low-pass filtered and reduced in
    dimension (NMF or SVD). An STFT along time is added for every feature.
    For each representation a windowed distance curve is computed, and all
    of them are drawn into one PNG saved next to the input file.

    Args:
        fullpath: Path of the audio file to analyze.
        order: Order passed to ``lpf``.
        sr: Rate passed to ``lpf``.
        cutoff: Cutoff passed to ``lpf``.
        n_singv: Number of components for the dimensionality reduction.
        window: Window length used for the distance curve.
        step_size: Offset between the two windows that are compared.
        feature: Name of the feature to extract.
        dim_red: ``'NMF'`` or ``'SVD'``.
        as_diff: When true, replace every feature by its first difference.
        round_to: When non-zero, quantize the features to this step.
        normalize: Forwarded to ``extractFeature``.
        scale: Forwarded to ``extractFeature``.
        medfil_len: When non-zero, median filter length for the distances.

    Raises:
        Exception: If ``dim_red`` is neither ``'NMF'`` nor ``'SVD'``.
    """
    print('Analyzing {}'.format(fullpath))
    # extract filename, filepath and beat aligned feature
    filename, file_ext = os.path.splitext(fullpath)
    feats = {}
    feats[feature], beat_times = extractFeature(
        filename, file_ext, feature, scale, round_to, normalize)

    # apply low-pass filter and running mean on featgram
    feats['LPF'] = lpf(feats[feature], cutoff, sr, order)

    # perform dimensionality reduction (NMF or SVD)

    if dim_red == 'NMF':
        print('\tNon-Negative Matrix Factorization for {}'.format(feature))
        feats['NMF'] = NMF(n_singv).fit_transform(feats[feature].astype(float))
        feats['NMF(LPF)'] = NMF(n_singv).fit_transform(feats['LPF'])
        feats['LPF(NMF)'] = lpf(feats['NMF'], cutoff, sr, order)
    elif dim_red == 'SVD':
        # This branch previously tested 'NMF' a second time, so the
        # default dim_red='SVD' always fell through to the exception.
        print('\tSingular Vector Decomposition')
        feats['SVD'] = svd(feats[feature], n_singv, inc_proj=False)
        feats['SVD(LPF)'] = svd(feats['LPF'], n_singv, inc_proj=False)
        feats['LPF(SVD)'] = lpf(feats['SVD'], cutoff, sr, order)
    else:
        raise Exception(
            "{} is not a supported dimensionality reduction".format(dim_red))

    if round_to:
        feats['LPF'] = np.round(
            lpf(feats[feature], cutoff, sr, order) / round_to) * round_to
        feats[dim_red] = np.round(
            dim_red_fn(dim_red, feats[feature], n_singv) / round_to) * round_to
        feats['{}(LPF)'.format(dim_red)] = np.round(dim_red_fn(dim_red,
            feats['LPF'], n_singv) / round_to) * round_to
        feats['LPF({})'.format(dim_red)] = np.round(
            lpf(feats[dim_red], cutoff, sr, order) / round_to) * round_to
    else:
        feats['LPF'] = lpf(feats[feature], cutoff, sr, order)
        feats[dim_red] = dim_red_fn(dim_red, feats[feature], n_singv)
        feats['{}(LPF)'.format(dim_red)] = dim_red_fn(
            dim_red, feats['LPF'], n_singv)
        feats['LPF({})'.format(dim_red)] = lpf(feats[dim_red], cutoff, sr, order)

    # FFT on all features
    n_fft = 8
    hop_length = 1
    # Iterate over a snapshot: new 'FFT(...)' keys are added inside the loop.
    for k, v in list(feats.items()):
        data = np.array([stft(f, n_fft, hop_length)[1:, :] for f in v.T])
        data = data.T
        data = data.reshape(data.shape[0], data.shape[1]*data.shape[2])
        feats['FFT({})'.format(k)] = np.abs(data) ** 2

    def compute_distance(i, X, window, step_size):
        # Euclidean distance between two windows that are step_size apart.
        return np.sqrt(np.sum((
            X[i:i+window] - X[i+step_size:i+step_size+window]) ** 2))

    distances = {}
    for k, v in feats.items():
        # A list is built explicitly so the result is a numeric array on
        # both Python 2 and Python 3.
        distances[k] = np.array([
            compute_distance(i, X=v, window=window, step_size=step_size)
            for i in range(0, len(v)+1-window*2)])
    if as_diff:
        print("\tComputing features as difference")
        for k, v in feats.items():
            feats[k] = np.append([0], np.diff(v))

    if medfil_len:
        print("\tApplying median filter {} to distances".format(medfil_len))
        for k, v in distances.items():
            distances[k] = medfilt(v, medfil_len)
    i = 0
    j = 0
    gs = mpl.gridspec.GridSpec(len(feats), 2, width_ratios=[1, 1])
    fig = plt.figure(figsize=(36, 18))
    for k in feats.keys():
        ts = np.arange(0, len(feats[k]))
        # From here on step_size is the tick spacing, not the distance offset.
        step_size = max(4, int(len(ts) * .02))
        data = feats[k]

        if data.shape[1] == 3:
            data = data.reshape(1, data.shape[0], data.shape[1])
        else:
            data = data.T
        if 'FFT' in k:
            step = hop_length * 2
        else:
            step = step_size

        ax = fig.add_subplot(gs[i, j])
        ax.set_title(k)
        ax.imshow(data,
                  interpolation='nearest',
                  origin='low',
                  aspect='auto',
                  cmap=plt.cm.Oranges)
        ax.set_xticks(ts[::step])
        ax.set_xticklabels(beat_times[::step], rotation=60)
        ax.grid(False)

        ax = fig.add_subplot(gs[i+1, j], sharex=ax)
        ax.set_title('{} Distances'.format(k))
        ax.plot(distances[k])
        ax.set_xticks(ts[::step])
        ax.set_xticklabels(beat_times[::step], rotation=60)
        ax.grid(False)
        # Fill the two columns left to right, then move down two rows.
        if j == 1:
            i += 2
        j = (j+1) % 2

    plt.tight_layout()
    plt.savefig("{}_{}_{}_asdiff_{}_wab_{}_r_{}_n_{}_s_{}_{}.png".format(
        filename, feature, cutoff, as_diff, window, round_to, normalize, scale,
    dim_red))
    plt.close(fig)
예제 #35
0
#!/usr/bin/env python
from librosa.core import stft, istft
import numpy as np
import scipy

# Round-trip a random signal through the STFT and its inverse, using a
# Hann analysis window and a rectangular synthesis window, then print the
# squared reconstruction error.
y = np.random.rand(44032)

stft_matrix = stft(y, window=scipy.signal.hann(2048), hop_length=1024)
y_hat = istft(stft_matrix, window=np.ones(2048), hop_length=1024)

diff = y - y_hat
# The function-call form prints the same text on Python 2 and Python 3.
print(np.dot(diff, diff))