Example #1
def time_stretch_hpss(audio, f):

    if f == 1.0:
        return audio

    stft = core.stft(audio)

    # Perform HPSS
    stft_harm, stft_perc = decompose.hpss(
        stft, kernel_size=31)  # original kernel size 31

    # OLA the percussive part
    y_perc = librosa.util.fix_length(core.istft(stft_perc, dtype=audio.dtype),
                                     len(audio))
    y_perc = time_stretch_sola(y_perc, f)

    # Alternative: phase-vocode the harmonic part instead of WSOLA
    # stft_stretch = core.phase_vocoder(stft_harm, 1.0 / f)
    # y_harm = librosa.util.fix_length(core.istft(stft_stretch, dtype=y_perc.dtype), len(y_perc))

    # WSOLA-stretch the harmonic part
    y_harm = librosa.util.fix_length(
        time_stretch_sola(core.istft(stft_harm, dtype=audio.dtype),
                          f,
                          wsola=True), len(y_perc))

    # Add them together
    return y_harm + y_perc
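A minimal usage sketch for time_stretch_hpss(), assuming the surrounding module already provides the imports it relies on (librosa.core as core, librosa.decompose as decompose) and a time_stretch_sola helper; the file name and stretch factor below are placeholders.

import librosa
import soundfile as sf

# Hypothetical input file and stretch factor; the meaning of f is defined by
# the time_stretch_sola helper used above.
audio, sr = librosa.load("input.wav", sr=None)
stretched = time_stretch_hpss(audio, 1.5)
sf.write("stretched.wav", stretched, sr)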
Example #2
def main(argv):
    os.makedirs(FLAGS.output_dir, exist_ok=True)
    ''' Initialize model '''
    unet = Unet()
    restore(net=unet, ckpt_path=FLAGS.ckpt_path)

    ''' Load data '''
    mix_wav, _ = load(FLAGS.original_wav, sr=SAMPLE_RATE)
    mix_wav_mag, mix_wav_phase = magphase(stft(mix_wav, n_fft=WINDOW_SIZE, hop_length=HOP_LENGTH))
    mix_wav_mag = mix_wav_mag[:, START:END]
    mix_wav_phase = mix_wav_phase[:, START:END]

    '''Load gt '''
    if FLAGS.gt:
        gt_wav, _ = load(FLAGS.original_gt, sr=SAMPLE_RATE)
        gt_wav_mag, gt_wav_phase = magphase(stft(gt_wav, n_fft=WINDOW_SIZE, hop_length=HOP_LENGTH))
        gt_wav_mag = gt_wav_mag[:, START:END]
        gt_wav_phase = gt_wav_phase[:, START:END]

    '''Save input spectrogram image and gt'''
    write_wav(FLAGS.output_dir+'original_mix.wav', 
                istft(mix_wav_mag * mix_wav_phase,win_length=WINDOW_SIZE,hop_length=HOP_LENGTH),
                SAMPLE_RATE, norm=True)
    spectogram_librosa(FLAGS.output_dir+'original_mix.wav',0)
    if FLAGS.gt:
        write_wav(FLAGS.output_dir+'gt.wav', 
                    istft(gt_wav_mag * gt_wav_phase,win_length=WINDOW_SIZE,hop_length=HOP_LENGTH),
                    SAMPLE_RATE, norm=True)
        spectogram_librosa(FLAGS.output_dir+'gt.wav',0)

    ''' run data '''
    inputs = mix_wav_mag[1:].reshape(1, 512, 128, 1)
    mask = unet(inputs).numpy().reshape(512, 128)
    predict = inputs.reshape(512, 128)*mask

    ''' evaluation metrics '''
    if FLAGS.gt:
        expand_pre = np.expand_dims(predict.flatten(), axis=0)
        expand_gt = np.expand_dims(gt_wav_mag[1:].flatten(), axis=0)
        expand_input = np.expand_dims(inputs.flatten(), axis=0)
        (SDR, SIR, SAR, _) = mir_eval.separation.bss_eval_sources(expand_gt,expand_pre)
        (SDR2, _, _, _) = mir_eval.separation.bss_eval_sources(expand_gt,expand_input)
        NSDR = SDR - SDR2 #SDR(Se, Sr) − SDR(Sm, Sr)

        metrics = '*****SDR = {}, SIR = {}, SAR = {}, NSDR = {}*****'.format(
            SDR, SIR, SAR, NSDR)
        print(metrics)
        with open(FLAGS.output_dir + 'metrics.txt', 'a') as fout:
            fout.write(metrics)

    ''' Convert model output to target magnitude '''
    target_pred_mag = np.vstack((np.zeros((128)), predict))

    ''' Write vocal prediction audio files '''
    write_wav(FLAGS.output_dir+'pred_vocal.wav', 
                istft(target_pred_mag * mix_wav_phase,win_length=WINDOW_SIZE,hop_length=HOP_LENGTH),
                SAMPLE_RATE, norm=True)

    spectogram_librosa(FLAGS.output_dir+'pred_vocal.wav',1)
Example #3
def SaveStereoAudio(fname, mag, phase, norm=True, save_path=None):
    y_l = istft(mag[0] * phase[0],
                hop_length=C.H,
                win_length=C.FFT_SIZE,
                window=C.WINDOW)
    y_r = istft(mag[1] * phase[1],
                hop_length=C.H,
                win_length=C.FFT_SIZE,
                window=C.WINDOW)
    stereo = np.array((y_l, y_r))
    if save_path is None:
        write_wav(C.PATH_MUSIC / fname, stereo, C.SR, norm=norm)
    else:
        write_wav(save_path / fname, stereo, C.SR, norm=norm)
Example #4
File: main.py  Project: zyy341/U-net-svs
def test():
    vis = Visualizer(env='svs')
    model = getattr(models, 'Unet')().eval()
    #    model.cuda()
    model.load_state_dict(
        t.load('G:/Unet_svs/check/epoch_219__0724_16_57_35.pth'))
    mix_wav, _ = load("C:/Users/lenovo/Music/c.mp3", sr=8192)
    mix_wav_mag, mix_wav_phase = magphase(
        stft(mix_wav, n_fft=1024, hop_length=768))
    START = 700
    END = START + 128

    mix_wav_mag = mix_wav_mag[:, START:END]
    mix_wav_phase = mix_wav_phase[:, START:END]

    print(mix_wav_mag.shape)

    gg = mix_wav_mag[1:]
    gg = t.from_numpy(gg)
    gg.unsqueeze_(0)
    gg.unsqueeze_(0)
    vis.img('a', gg)
    print(gg.shape)
    with t.no_grad():
        gg = Variable(gg)
        score = model(gg)
    predict = gg.data * score.data
    print(predict.shape)
    target_pred_mag = predict.view(512, 128).cpu().numpy()
    target_pred_mag = np.vstack((np.zeros((128)), target_pred_mag))
    vis.img('b', t.from_numpy(target_pred_mag))
    print(target_pred_mag.shape)
    write_wav(
        'C:/Users/lenovo/Music/pred_vocal.wav',
        # Alternative: istft((mix_wav_mag * target_pred_mag) * mix_wav_phase, ...)
        istft(target_pred_mag * mix_wav_phase,
              win_length=1024,
              hop_length=768),
        8192,
        norm=True)
    write_wav('C:/Users/lenovo/Music/pred_mix.wav',
              istft(mix_wav_mag * mix_wav_phase,
                    win_length=1024,
                    hop_length=768),
              8192,
              norm=True)
Example #5
def spectrogram_to_audio(input_spectrogram,
                         hop_length,
                         win_length,
                         window='hann',
                         center=True):
    return (audio.istft(input_spectrogram, hop_length, win_length, window,
                        center))
Example #6
def encode(data_file, output_file, key_file=None):
    print '* * encoding message in audio file...'
    data_file_size = os.path.getsize(data_file)

    if key_file is not None:
        signal, sr = librosa.load(key_file, sr=RATE)
        spec = stft(signal, WINDOW_LENGTH, HOP_SIZE)
    else:
        signal = make_sinewave(1, math.ceil(data_file_size / 20.), RATE)
        spec = stft(signal, WINDOW_LENGTH, HOP_SIZE)

    print 'data file size:', data_file_size
    print 'spec shape', spec.shape

    with open(data_file) as dfile:
        d = dfile.read(1)
        i = 0
        while d:
            h = int(d.encode("hex"), 16)
            if key_file is not None:
                spec[h][i] = np.max(
                    np.abs([spec[x][i] for x in range(spec.shape[0])])) + 200
            else:
                spec[h][i] = np.max(
                    np.abs([spec[x][i] for x in range(spec.shape[0])])) * 200
            spec[h - 1][i] = 0
            spec[h + 1][i] = 0
            d = dfile.read(1)
            i += 1
    spec = spec[:, :i]
    spec = add_start_stop(spec)
    wavwrite(output_file, istft(spec, 1024, 2048), RATE)
Example #7
def gl_rec(S):
    sr, nfft, wlen, hop = 22050, 1022, 1022, 256
    S = 10 ** S  # undo the log-magnitude scaling
    # Random complex phase initialization
    angles = np.exp(2j * np.pi * np.random.rand(S.shape[0], S.shape[1]))
    y = core.istft(S * angles, hop_length=hop, win_length=wlen)
    for i in range(40):
        angles = core.stft(y, n_fft=nfft, hop_length=hop, win_length=wlen)
        S = S[:, :angles.shape[1]]
        _, angles = core.magphase(angles)
        y = core.istft(S * angles, hop_length=hop, win_length=wlen)
    return y
Example #8
def separate(PATH_INPUT, PATH_OUTPUT, MODEL, SR=16000, FFT_SIZE=1024, H=512):

    if os.path.isdir(PATH_INPUT):
        # If the input is a directory, build a list of files
        filelist_mixdown = find_files(PATH_INPUT, ext="wav", case_sensitive=True)
    else:
        # If the input is a single file
        filelist_mixdown = [PATH_INPUT]
    print('number of mixdown files', len(filelist_mixdown))

    # Create the output directory if it does not exist
    _, path_output_ext = os.path.splitext(PATH_OUTPUT)
    print('path_output_ext', path_output_ext)
    if len(path_output_ext) == 0 and not os.path.exists(PATH_OUTPUT):
        os.mkdir(PATH_OUTPUT)

    # Load the model
    unet = train.UNet()
    chainer.serializers.load_npz(MODEL, unet)
    config.train = False
    config.enable_backprop = False

    # Load each mixdown and try to separate the vocal (speech)
    for fmixdown in filelist_mixdown:
        # If audioread raises an error, fall back to scipy
        try:
            y_mixdown, _ = load(fmixdown, sr=SR, mono=True)
        except Exception:
            sr_mixdown, y_mixdown = read(fmixdown)
            if not sr_mixdown == SR:
                y_mixdown = resample(y_mixdown, sr_mixdown, SR)

        # Compute the short-time spectrum of the input and normalize it
        spec = stft(y_mixdown, n_fft=FFT_SIZE, hop_length=H, win_length=FFT_SIZE)
        mag = np.abs(spec)
        mag /= np.max(mag)
        phase = np.exp(1.j * np.angle(spec))
        print('mag.shape', mag.shape)
        start = 0
        end = 128 * (mag.shape[1] // 128)  # Must not exceed the number of input frames; pick a value that matches the network definition.
        # Estimate the mask that separates speech (vocal)
        mask = unet(mag[:, start:end][np.newaxis, np.newaxis, 1:, :]).data[0, 0, :, :]
        mask = np.vstack((np.zeros(mask.shape[1], dtype="float32"), mask))
        # Apply the mask to the input spectrum and synthesize the waveform with the inverse FFT
        mag2 = mag[:, start:end] * mask
        phase2 = phase[:, start:end]
        y = istft(mag2 * phase2, hop_length=H, win_length=FFT_SIZE)

        # Save the separated speech (vocal) to an output file
        if len(path_output_ext) == 0:
            # Output into the directory
            foutname, _ = os.path.splitext(os.path.basename(fmixdown))
            fname = os.path.join(PATH_OUTPUT, (foutname + '.wav'))
        else:
            # Output to the specified file
            fname = PATH_OUTPUT
        print('saving... ', fname)
        write_wav(fname, y, SR, norm=True)
Example #9
def gl_rec(mag_stft, hop, wlen, init_rec, n_iter=40):
    # Function for Griffin-Lim reconstruction
    rec = 1.0 * init_rec
    rec_stft = core.stft(rec, n_fft=nfft, hop_length=hop, win_length=wlen)
    angles = rec_stft / np.abs(rec_stft)
    for i in range(n_iter):
        rec = core.istft(np.abs(mag_stft**1.2) * angles, hop, wlen)
        rec_stft = core.stft(rec, n_fft=nfft, hop_length=hop, win_length=wlen)
        angles = rec_stft / np.abs(rec_stft)
    return rec
Example #10
def eval(net1, net2, speech_file_loc, melody_file_loc):
    # Evaluates the result of net1, net2 on a given speech file and melody file
    # speech_file_loc, melody_file_loc are strings that specify the location of the respective audio files
    network1, network2 = net1.eval(), net2.eval()
    # Read input audio
    orig_speech = core.load(speech_file_loc, sr)[0]
    inp_speech = DL.remove_silent_frames(orig_speech)
    #inp_speech = 1.0 * orig_speech
    stft_inp = core.stft(inp_speech,
                         n_fft=nfft,
                         hop_length=hop,
                         win_length=wlen)

    # Extract melody and create its image
    melody = utils.MelodyExt.melody_extraction(melody_file_loc,
                                               'runtime_folder/ref_melody')[0]
    ref_pc = melody[:, 1]
    ref_time = melody[:, 0]
    const = hop * 1.0 / sr
    new_sampling_time = np.arange(const, ref_time[-1], const)
    interp_melody = np.interp(new_sampling_time, ref_time, ref_pc)
    n_frames = new_sampling_time.shape[0]
    idx1 = (1.0 * interp_melody * nfft / sr).astype(int)
    idx2 = np.array(range(n_frames))
    pc = np.zeros([1 + nfft / 2, n_frames])
    pc[idx1, idx2] = 1
    pc[-1] = 1 * pc[0]
    pc[0] = 0 * pc[0]

    # Complete input preprocessing
    rate = stft_inp.shape[1] * 1.0 / n_frames
    stft_inp = core.phase_vocoder(stft_inp, rate,
                                  hop)  # Stretch input speech to target length
    n_frames += 8 - n_frames % 8
    # Append zeros to make it suitable for network
    stft_inp = np.concatenate([
        stft_inp,
        np.zeros([stft_inp.shape[0], n_frames - stft_inp.shape[1]])
    ],
                              axis=1)
    pc = np.concatenate(
        [pc, np.zeros([pc.shape[0], n_frames - pc.shape[1]])], axis=1)
    stft_inp = np.log(1 + np.abs(stft_inp))
    stft_inp, pc = torch.from_numpy(stft_inp).float().unsqueeze(
        0), torch.from_numpy(pc).float().unsqueeze(0)  # Make tensors

    # Extract output
    encode2 = network2(Variable(pc.to(device)))
    pred, encode1 = network1(Variable(stft_inp.to(device)), encode2)
    pred = pred[0].cpu().data.numpy()
    pred[pred < 0] = 0
    pred = np.exp(pred) - 1
    time_pred = 3.0 * utils.gl_rec(pred, hop, wlen, core.istft(
        pred, hop, wlen))  # Adding a multiplier to increase loudness
    return time_pred
Example #11
def extract(spec, model, max_norm, fs, frame_size, shift_size):
    input = np.abs(spec[None, None, 1:, :].copy())
    input /= max_norm
    soft_mask = model(Variable(torch.from_numpy(
        input)).cuda()).data.cpu().numpy().squeeze()
    hard_mask = soft_mask > 0.5
    soft_vocal = istft(
        spec * np.vstack((np.zeros_like(spec[0, :]), soft_mask)),
        shift_size)
    soft_accom = istft(
        spec * np.vstack((np.zeros_like(spec[0, :]), 1 - soft_mask)),
        shift_size)
    hard_vocal = istft(
        spec * np.vstack((np.zeros_like(spec[0, :]), hard_mask)),
        shift_size)
    hard_accom = istft(
        spec * np.vstack((np.zeros_like(spec[0, :]), 1 - hard_mask)),
        shift_size)

    return soft_vocal, soft_accom, hard_vocal, hard_accom
Example #12
    def istft(self, _stft=None):
        # Take the inverse STFT using librosa.core.istft
        # returns a numpy array
        if _stft is None:
            X = self.Xc  # if an array is not specified, use Xc (STFT of xOriginal)
        else:
            X = _stft  # otherwise, use the specified signal

        _x = istft(X, hop_length=self.hopSize, win_length=self.frameSize)

        return _x
Example #13
def decode(audioName, locations, model, device):
    PSD_frames = spectralImages_1D(audioName, locations['audioloc'])
    nframes = len([key for key in PSD_frames if 'Phase' in key])
    audio = {}
    noisy_mag = []
    noisy_phase = []
    noisy_norm = []
    clean_mag = []
    clean_phase = []
    clean_norm = []

    for k in range(nframes):
        uttname = 'MagdB_' + audioName + '_frame_' + str(k)
        noisy_mag.append(PSD_frames[uttname])
        noisy_phase.append(PSD_frames[uttname.replace('MagdB', 'Phase')])

    noisy_norm = PSD_frames[uttname.replace('MagdB',
                                            'Norm').split('_frame')[0]]
    samples = PSD_frames[uttname.replace('MagdB',
                                         'Samples').split('_frame')[0]]

    audio['noisy_mag'] = torch.from_numpy(np.expand_dims(noisy_mag, axis=1))
    audio['noisy_phase'] = np.hstack(noisy_phase)
    audio['noisy_norm'] = noisy_norm
    audio['utt_samples'] = int(samples)
    audio['uttname'] = audioName

    with torch.no_grad():
        input_mag = audio['noisy_mag'].float().to(device)
        enhanced_mag = model(input_mag).cpu().numpy()
    if enhanced_mag.shape[0] > 1:
        enhanced_mag = np.hstack(np.squeeze(enhanced_mag))
    else:
        enhanced_mag = np.squeeze(enhanced_mag)
    noisy_mag = np.hstack(np.squeeze(audio['noisy_mag'].numpy()))
    noisy_mag = np.interp(noisy_mag, [-1, 1], audio['noisy_norm'])
    enhanced_mag = np.interp(enhanced_mag, [-1, 1], audio['noisy_norm'])

    temp = np.zeros((257, enhanced_mag.shape[1])) + 1j * np.zeros(
        (257, enhanced_mag.shape[1]))
    temp[:-1, :] = 10**(enhanced_mag / 20) * (
        np.cos(audio['noisy_phase']) + np.sin(audio['noisy_phase']) * 1j)
    enhanced_audio = istft(temp)
    enhanced_audio = 0.98 * enhanced_audio / np.max(np.abs(enhanced_audio))
    enhanced_audio = enhanced_audio[:audio['utt_samples']]

    enhanceloc = locations['enhanceloc']
    Path(os.path.dirname(enhanceloc)).mkdir(parents=True, exist_ok=True)
    sf.write(enhanceloc, enhanced_audio, 16000)
    return
Example #14
def fastgl_rec(mag_stft, hop, wlen, n_iter=40):
    angles = np.exp(2j * np.pi * np.random.rand(*mag_stft.shape))
    momentum = 1.1
    rebuilt = 0

    for i in range(n_iter):
        tprev = 1 * rebuilt
        inverse = core.istft(np.abs(mag_stft**1.2) * angles,
                             hop_length=hop,
                             win_length=wlen)
        rebuilt = core.stft(inverse,
                            n_fft=nfft,
                            hop_length=hop,
                            win_length=wlen)
        angles[:] = rebuilt - (momentum / (1 + momentum)) * tprev
        angles[:] /= np.abs(angles) + 1e-16

    return inverse
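For comparison, recent librosa releases (0.7 and later) ship librosa.griffinlim, which implements the same momentum-accelerated Griffin-Lim as the hand-rolled loops above. A minimal sketch with illustrative parameters mirroring these examples:

import numpy as np
import librosa

# Illustrative magnitude spectrogram built from white noise, just to exercise the call.
hop, wlen = 256, 1022
y_in = np.random.randn(22050).astype(np.float32)
mag = np.abs(librosa.stft(y_in, n_fft=wlen, hop_length=hop, win_length=wlen))

# momentum=0.99 is librosa's default; n_iter matches the loops above.
y_out = librosa.griffinlim(mag, n_iter=40, hop_length=hop, win_length=wlen, momentum=0.99)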
Example #15
File: loudness.py  Project: jarey/Automix
def loudnessSTFTMatrix(matrix, sr, **kwargs):
    """Calculates the loudness of a signal encoded by its STFT matrix.

    Args:
        matrix (np.ndarray): STFT matrix of the actual signal.
        sr (int): The sample rate of the input signal.
        **kwargs: Keywords for istft() (see
            https://librosa.github.io/librosa/generated/librosa.core.istft.html)

    Returns:
        float: The negative replay gain as the loudness in dB of the signal.

    """

    # Convert STFT matrix to signal and use loudnessSignal() to obtain loudness
    # TODO: this is inefficient
    y = istft(matrix, **kwargs)
    return loudnessSignal(y, sr)
Example #16
def change_speed(input_signal, rate):
    """Change the playback speed of an audio signal

    Parameters
    ----------
    input_signal : numpy.array
        Input array, must have numerical type.
    rate : numeric
        Desired rate of change to the speed.
        To increase the speed, pass in a value greater than 1.0.
        To decrease the speed, pass in a value between 0.0 and 1.0.

    Returns
    -------
    numpy.array representing the audio signal with changed speed.

    """

    if input_signal.dtype.kind not in 'iuf':
        raise TypeError(
            "'input_signal' must be an array of integers or floats")

    if rate <= 0:
        raise ValueError('rate must be a positive number')

    # Convert input signal to a -1.0 to 1.0 float if it's an integer type
    if input_signal.dtype.kind in 'iu':
        i = np.iinfo(input_signal.dtype)
        abs_max = 2**(i.bits - 1)
        offset = i.min + abs_max
        input_signal = (input_signal.astype('float32') - offset) / abs_max

    # Transform signal to frequency domain
    frequency_domain_signal = core.stft(input_signal)

    # Change speed with the phase vocoding method
    fds_changed_speed = core.phase_vocoder(frequency_domain_signal, rate)

    # Transform frequency domain signal back to time domain
    output_signal = core.istft(fds_changed_speed, dtype=input_signal.dtype)

    return output_signal
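A quick usage sketch for change_speed() on a synthetic tone; the 440 Hz sine and the rates are illustrative only.

import numpy as np

sr = 22050
t = np.linspace(0, 1.0, sr, endpoint=False)
tone = (0.5 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)

faster = change_speed(tone, 1.5)  # rate > 1.0: shorter output
slower = change_speed(tone, 0.5)  # rate < 1.0: longer output
print(len(tone), len(faster), len(slower))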
Example #17
    def istft(self, spectrogram, length):
        result = []

        # Store in column-major (Fortran) order, i.e., channel-major
        spectrogram = np.asfortranarray(spectrogram)
        window = hann(self.frame_length, sym=False)
        channels = spectrogram.shape[-1]
        for c in range(channels):
            data = spectrogram[..., c].T
            wave = istft(data,
                         hop_length=self.frame_step,
                         window=window,
                         center=False,
                         length=length)
            wave = np.expand_dims(wave.T, axis=1)
            result.append(wave)

        result = np.concatenate(result, axis=-1)

        return result
Example #18
def SaveAudio(fname, mag, phase, norm=True):
    """
    util.valid_audio(y, mono=False)
    if norm and np.issubdtype(y.dtype, np.floating):
        wav = util.normalize(y, norm=np.inf, axis=None)
    else:
        wav = y

    if wav.ndim > 1 and wav.shape[0] == 2:
        wav = wav.T
    sf.write(
        file=C.PATH_MUSIC / fname,
        data=wav,
        samplerate=C.SR)
    """
    y = istft(mag * phase,
              hop_length=C.H,
              win_length=C.FFT_SIZE,
              window=C.WINDOW)
    write_wav(C.PATH_MUSIC / fname, y, C.SR, norm=norm)
Example #19
def decode(wavfile, key_file=None):
    print '* decoding signal'
    signal, sr = librosa.load(wavfile, sr=RATE)
    spec = stft(signal, WINDOW_LENGTH, HOP_SIZE)
    message = ""

    if key_file is not None:
        key_signal, sr = librosa.load(key_file, sr=RATE)
        signal = np.pad(signal, (0, key_signal.shape[0] - signal.shape[0]),
                        'edge')
        try:
            spec = np.subtract(stft(key_signal, WINDOW_LENGTH, HOP_SIZE), spec)
            wavwrite('minus.wav', istft(spec, 1024, 2048), RATE)
        except ValueError:
            print "Oops! Your encoded signal must be at least as long as the key signal"
            return

    i, decode = 0, False
    while i < spec.shape[1]:
        h = np.argmax(np.abs([spec[x][i] for x in range(spec.shape[0])]))
        if h == 500:
            decode = True
            i = i + 1
            continue

        if h == 550:
            break

        if decode:
            while h > 255:
                spec[h][i] = 0
                h = np.argmax(
                    np.abs([spec[x][i] for x in range(spec.shape[0])]))
            char = str(chr(h))
            message += char

        i = i + 1

    print message
    return message
Example #20
def istft(mel_spec, phase):
    """
	mel_spec: numpy 256*256
	phase: numpy array 512*256
	audio_proc: numpy array 65280*1
	"""
    #mel_spec = mel_spec.numpy()
    yt = np.zeros((253, 256))
    #print(np.shape(mel_dedup))
    counter = 0
    for i in range(len(mel_idx)):
        if i > 0:
            if mel_idx[i] == mel_idx[i - 1]:
                #linear_spec[mel_idx[i]] = (mel_spec[i]+mel_spec[i-1])/2
                yt[counter - 1] = (yt[counter - 1] + mel_spec[i]) / 2
            else:
                #linear_spec[mel_idx[i]] = mel_spec[i]
                yt[counter] = mel_spec[i]
                counter += 1
        else:
            #linear_spec[mel_idx[i]] = mel_spec[i]
            yt[counter] = mel_spec[i]
            counter += 1

    f = interpolate.interp1d(mel_dedup, yt, 'cubic', axis=0)
    ynew = f(all_idx)
    ynew = ynew[:512]
    #print(ynew.shape)

    j = np.array([1j], dtype='complex')
    linear_spec = np.multiply(
        ynew, np.cos(phase)) + j * np.multiply(ynew, np.sin(phase))
    audio_proc = lc.istft(linear_spec, hop_length=256, win_length=1022)
    #print(audio_proc.shape)

    return audio_proc
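When the mel representation is produced with librosa's own mel filterbank (which may not be the case for the custom mel_idx mapping above), librosa 0.7+ also offers a least-squares inversion via librosa.feature.inverse.mel_to_stft. A hedged sketch with illustrative parameters:

import numpy as np
import librosa

sr, n_fft, hop = 22050, 1022, 256
y = np.random.randn(sr).astype(np.float32)
mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop,
                                     n_mels=128, power=1.0)

# Least-squares mapping back to a linear-frequency magnitude spectrogram,
# then Griffin-Lim to estimate the missing phase.
lin = librosa.feature.inverse.mel_to_stft(mel, sr=sr, n_fft=n_fft, power=1.0)
audio = librosa.griffinlim(lin, hop_length=hop, win_length=n_fft)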
Example #21
def deocdeData(dataloc, specImageloc, destfolder, model, device):
    for dataset in ['Dev', 'Eval']:
        audiofiles = __readscpFiles__(dataloc + '/' + dataset + '_SimData.scp')
        audiofiles.update(
            __readscpFiles__(dataloc + '/' + dataset + '_RealData.scp'))
        pbar = pkbar.Pbar(name='Decoding ' + dataset + ' AudioFiles: ',
                          target=len(audiofiles))

        data = SpecImages(specImageloc + '/' + dataset, mode='decode')
        with torch.no_grad():
            for i, (k, v) in enumerate(audiofiles.items()):
                uttID = data.uttname2idx('MagdB_' + k)
                audio = data.__getaudio__(uttID)
                input_mag = audio['noisy_mag'].unsqueeze(1).to(device)
                enhanced_mag = model(input_mag).cpu().numpy()
                if enhanced_mag.shape[0] > 1:
                    enhanced_mag = np.hstack(np.squeeze(enhanced_mag))
                else:
                    enhanced_mag = np.squeeze(enhanced_mag)
                enhanced_mag = np.interp(enhanced_mag, [-1, 1],
                                         audio['noisy_norm'])
                temp = np.zeros((257, enhanced_mag.shape[1])) + 1j * np.zeros(
                    (257, enhanced_mag.shape[1]))
                temp[:-1, :] = 10**(enhanced_mag / 20) * (np.cos(
                    audio['noisy_phase']) + np.sin(audio['noisy_phase']) * 1j)
                enhanced_audio = istft(temp)
                enhanced_audio = 0.98 * enhanced_audio / np.max(
                    np.abs(enhanced_audio))
                enhanced_audio = enhanced_audio[:audio['utt_samples']]

                destloc = destfolder + v.split('Reverb_Challenge')[1]
                Path(os.path.dirname(destloc)).mkdir(parents=True,
                                                     exist_ok=True)
                sf.write(destloc, enhanced_audio, 16000)
                del audio, input_mag, enhanced_mag, temp, enhanced_audio
                pbar.update(i)
Example #22
def stretch(x, factor, nfft=2048):
    '''
    From this repository: https://github.com/gaganbahga/time_stretch
    stretch an audio sequence by a factor using FFT of size nfft converting to frequency domain
    :param x: np.ndarray, audio array in PCM float32 format
    :param factor: float, stretching or shrinking factor, depending on if its > or < 1 respectively
    :return: np.ndarray, time stretched audio
    '''
    stft = core.stft(
        x, n_fft=nfft).transpose()  # i prefer time-major fashion, so transpose
    stft_rows = stft.shape[0]
    stft_cols = stft.shape[1]

    times = np.arange(0, stft.shape[0],
                      factor)  # times at which new FFT to be calculated
    hop = nfft / 4  # frame shift
    stft_new = np.zeros((len(times), stft_cols), dtype=np.complex_)
    phase_adv = (2 * np.pi * hop * np.arange(0, stft_cols)) / nfft
    phase = np.angle(stft[0])

    stft = np.concatenate((stft, np.zeros((1, stft_cols))), axis=0)

    for i, time in enumerate(times):
        left_frame = int(np.floor(time))
        local_frames = stft[[left_frame, left_frame + 1], :]
        right_wt = time - np.floor(time)  # weight on right frame out of 2
        local_mag = (1 - right_wt) * np.absolute(
            local_frames[0, :]) + right_wt * np.absolute(local_frames[1, :])
        local_dphi = np.angle(local_frames[1, :]) - np.angle(
            local_frames[0, :]) - phase_adv
        local_dphi = local_dphi - 2 * np.pi * np.floor(local_dphi /
                                                       (2 * np.pi))
        stft_new[i, :] = local_mag * np.exp(phase * 1j)
        phase += local_dphi + phase_adv

    return core.istft(stft_new.transpose())
Example #23
            os.makedirs(saveFolderLoc)

        monoLoader = es.MonoLoader(filename=mixFile, sampleRate=44100)
        x = monoLoader()[:nSeconds * 44100]

        _stft = stft(x,
                     n_fft=fftSize,
                     hop_length=hopSize,
                     win_length=frameSize,
                     window=winType)

        X_H, X_P = hpss(_stft,
                        kernel_size=150)  # Get harmonic and percussive stfts

        x_h = istft(
            X_H, hop_length=hopSize,
            win_length=frameSize)  # Convert stfts to time domain signals
        x_p = istft(X_P, hop_length=hopSize, win_length=frameSize)

        MonoWriter = es.MonoWriter(sampleRate=44100,
                                   format="mp3")  # Write to file
        MonoWriter.configure(filename=saveFolderLoc + filename +
                             "_median_percussive.mp3")
        MonoWriter(array(x_p))

        MonoWriter = es.MonoWriter(sampleRate=44100,
                                   format="mp3")  # Write to file
        MonoWriter.configure(filename=saveFolderLoc + filename +
                             "_median_harmonic.mp3")
        MonoWriter(array(x_h))
Example #24
def save_audio(mag, phase):
    return istft(mag * phase, hop_length=512, win_length=1024)
Example #25
#!/usr/bin/env python
from librosa.core import stft, istft
import numpy as np
import scipy

y = np.random.rand(44032)

stft_matrix = stft(y, window=scipy.signal.hann(2048), hop_length=1024)
y_hat = istft(stft_matrix, window=np.ones(2048), hop_length=1024)

diff = y - y_hat
print np.dot(diff, diff)
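The snippet above analyzes with a Hann window but resynthesizes with a rectangular one, so the printed error stays large. A sketch of the matched-window round trip, assuming a librosa version whose istft accepts a length argument:

from librosa.core import stft, istft
import numpy as np
import scipy.signal

y = np.random.rand(44032)

window = scipy.signal.get_window('hann', 2048)
stft_matrix = stft(y, n_fft=2048, hop_length=1024, window=window)
y_hat = istft(stft_matrix, hop_length=1024, window=window, length=len(y))

diff = y - y_hat
print(np.dot(diff, diff))  # should be close to zero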
Example #26
def SaveAudio(fname, mag, phase):
    y = istft(mag*phase, hop_length=C.H, win_length=C.FFT_SIZE)
    write_wav(fname, y, C.SR, norm=True)
Example #27
    vocal_wav_mag = vocal_wav_mag[:, START:END]
    vocal_wav_phase = vocal_wav_phase[:, START:END]

    # load saved model
    model = keras.models.load_model('../models/vocal_20_test_model.h5')
    #model = keras.models.load_model('../models/vocal_20.h5')

    # predict and write into file
    X = mix_wav_mag[1:].reshape(1, 512, 128, 1)
    y = model.predict(X, batch_size=32)

    target_pred_mag = np.vstack((np.zeros((128)), y.reshape(512, 128)))

    write_wav(f'../wav_files/vocal_20_sample_py.wav',
              istft(target_pred_mag * mix_wav_phase,
                    win_length=WINDOW_SIZE,
                    hop_length=HOP_LENGTH),
              SAMPLE_RATE,
              norm=True)
    write_wav(f'../wav_files/mix_downsampled.wav',
              istft(mix_wav_mag * mix_wav_phase,
                    win_length=WINDOW_SIZE,
                    hop_length=HOP_LENGTH),
              SAMPLE_RATE,
              norm=True)
    write_wav(f'../wav_files/vocals_downsampled.wav',
              istft(vocal_wav_mag * vocal_wav_phase,
                    win_length=WINDOW_SIZE,
                    hop_length=HOP_LENGTH),
              SAMPLE_RATE,
              norm=True)
Example #28
def random_pred(model_list=['PMTL'],
                n_samp=2,
                min_length=1.0,
                fld=test_fld,
                psongs=test_psongs):
    # Predicts output of specified list of systems on some random samples from the dataset
    # It also takes as arguments the number of samples for evaluation, minimum length of each sample,

    nus_train_data = DL.NUS_48E(data_key, [sr, nfft, wlen, hop])
    sampler = DL.nus_samp(data_dir,
                          1,
                          n_samp,
                          fld,
                          psongs,
                          use_word=True,
                          randomize=True,
                          print_elem=True,
                          min_len=min_length)
    dataload = DataLoader(dataset=nus_train_data,
                          batch_sampler=sampler,
                          collate_fn=my_collate_e8)
    samp_idx = -1
    lsd = []
    for data in dataload:
        # Initialize, Load the networks and their weights properly taking into account the exceptions
        samp_idx += 1
        print 'Processing sample', samp_idx
        for idx in range(len(model_list)):
            cur_model = model_list[idx]
            suffix = suffix_dict[cur_model]
            network2 = defModel.exp_net(512, 512, freq=513).to(device)
            if cur_model == 'B2' or cur_model == 'b2':
                network1 = defModel.net_base(512, 512, freq=513).to(device)
            else:
                network1 = defModel.net_in_v2(512, 512, freq=513).to(device)

            if not (cur_model == 'B1' or cur_model == 'b1'):
                network2.load_state_dict(
                    torch.load('output/models/net2_' + suffix + '.pt',
                               map_location=device))  # Complete
            network1.load_state_dict(
                torch.load('output/models/net1_' + suffix + '.pt',
                           map_location=device))
            network1, network2 = network1.eval(), network2.eval()

            # Make predictions
            encode2 = int(not cur_model == 'B1') * network2(
                Variable(data[3].to(device)))

            pred, encode1 = network1(Variable(data[0].to(device)), encode2)
            pred = pred.cpu().data.numpy()
            pred[pred < 0] = 0

            #Save log-STFTs of input, target and prediction
            saving_dir = 'output/random_predictions/'
            logstft_inp = data[0].numpy()
            logstft_out = data[1].numpy()
            logstft_pred = 1.0 * pred
            np.save(saving_dir + 'inp_lgstft' + str(samp_idx), logstft_inp)
            np.save(saving_dir + 'out_lgstft' + str(samp_idx), logstft_out)
            np.save(saving_dir + 'pred_lgstft' + str(samp_idx), logstft_pred)

            # Get time domain signals
            stft_pred = np.zeros([513, pred.shape[2]])
            stft_pred[:pred.shape[1]] = np.exp(pred[0]) - 1

            time_pred = utils.gl_rec(stft_pred, hop, wlen,
                                     core.istft(stft_pred**1.0, hop, wlen))
            time_inp_orig = core.istft(data[4][0], hop, wlen)
            time_inp_phase = core.istft(data[5][0], hop, wlen)
            time_target_phase = core.istft(data[6][0], hop, wlen)

            # Save predictions
            librosa.output.write_wav(
                saving_dir + 'original_speech_' + str(samp_idx) + '.wav',
                time_inp_orig, sr)
            librosa.output.write_wav(
                saving_dir + 'stretched_speech_' + str(samp_idx) + '.wav',
                time_inp_phase, sr)
            librosa.output.write_wav(
                saving_dir + 'true_singing_' + str(samp_idx) + '.wav',
                time_target_phase, sr)
            librosa.output.write_wav(
                saving_dir + 'predicted_singing_' + str(samp_idx) + cur_model +
                '.wav', time_pred, sr)

    return
Example #29
def to_time(image):
    """
    :param image: STFT with magnitude in one channel and phase in the other.
    :return: Raw audio
    """
    return lc.istft(image[:, :, 0] + 1j * image[:, :, 1])
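A hedged sketch of the forward direction that to_time() above inverts: packing a complex STFT into a two-channel real array (the function name and FFT parameters are illustrative, not part of the original code).

import numpy as np
import librosa.core as lc

def to_image(y, n_fft=2048, hop_length=512):
    """Real part in channel 0, imaginary part in channel 1."""
    spec = lc.stft(y, n_fft=n_fft, hop_length=hop_length)
    return np.stack([spec.real, spec.imag], axis=-1)

# With these defaults, to_time(to_image(y)) approximately reconstructs y.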
Example #30
    def score(self, loader, framewise=False, save_dir=None):
        """
        Score the model.

        Args
        ----
          loader : PyTorch DataLoader.

        """
        self.model.eval()
        class_sdr = defaultdict(list)
        class_sir = defaultdict(list)
        class_sar = defaultdict(list)

        # only perform framewise evaluation at testing time
        if self.n_fft == 1025:
            rate = 22050
            hop = 512
            win = 2048
        elif self.n_fft == 2049:
            rate = 44100
            hop = 1024
            win = 4096
        if not framewise:
            rate = np.inf

        if save_dir:
            class_map = {0: 'bass', 1: 'drums', 2: 'other', 3: 'vocals'}
            mus = musdb.DB(root_dir="data/musdb18")

        # list of batches
        preds, ys, cs, ts, _, nm = self.predict(loader)

        # for each batch
        for b_preds, b_ys, b_cs, b_ts, b_nm in tqdm(list(zip(preds, ys, cs, ts, nm))):
            # for each sample
            for pred, y, c, t, n in zip(b_preds, b_ys, b_cs, b_ts, b_nm):
                pred_recons = []
                y_recons = []
                pred_cs = []
                pred_recons_dict = defaultdict(list)
                y_recons_dict = defaultdict(list)
                # for each class
                for i, (c_pred, c_y, c_c) in enumerate(zip(pred, y, c)):
                    # if the class exists in the source signal
                    if c_c == 1 and np.abs(c_y).sum() > 0:
                        c_pred = c_pred[..., :t]
                        c_y = c_y[..., :t]
                        # predictions can be over multiple channels
                        pred_recon = []
                        y_recon = []
                        for c_pred_chan, c_y_chan in zip(c_pred, c_y):
                            pred_recon += [istft(
                                c_pred_chan, hop_length=hop, win_length=win)]
                            y_recon += [istft(
                                c_y_chan, hop_length=hop, win_length=win)]
                        pred_recon = np.stack(pred_recon, axis=-1)
                        y_recon = np.stack(y_recon, axis=-1)
                        # accumulate list of reconstructions for stacking
                        pred_recons += [pred_recon]
                        y_recons += [y_recon]
                        pred_cs += [i]
                        if save_dir:
                            pred_recons_dict[class_map[i]] = pred_recon
                            y_recons_dict[class_map[i]] = y_recon
                # possible to sample from targets that are all zeros
                if pred_recons:
                    pred_recons = np.stack(pred_recons)
                    # possible to predict all zeros...
                    # TODO: Figure out how to handle this case properly
                    if np.abs(pred_recons.sum()) > 0:
                        y_recons = np.stack(y_recons)
                        # nclassex x time
                        if self.eval_version == 'v3':
                            sdr, sir, sar, _ = bss_eval_sources(
                                y_recons, pred_recons,
                                compute_permutation=False)
                        elif self.eval_version == 'v4':
                            if save_dir:
                                name = loader.dataset.metadata.at[
                                    int(n.cpu().numpy()), 'urlId']
                                track = mus.load_mus_tracks(
                                    tracknames=[name])[0]
                                sdr, isr, sir, sar = evaluate(
                                    y_recons, pred_recons, win=rate, hop=rate,
                                    padding=True)
                                data = self._to_evalstore(
                                    sdr, sir, isr, sar, rate, rate, class_map)
                                self._save_framewise(data, save_dir, track)
                                continue
                            else:
                                sdr, isr, sir, sar = evaluate(
                                    y_recons, pred_recons, win=rate, hop=rate,
                                    padding=True)
                                cmb_sdr = np.concatenate([x for x in sdr])
                                sdr = np.nanmean(sdr, axis=1)
                                sir = np.nanmean(sir, axis=1)
                                sar = np.nanmean(sar, axis=1)
                        for m1, m2, m3, cl in zip(sdr, sir, sar, pred_cs):
                            class_sdr[cl] += [m1]
                            class_sir[cl] += [m2]
                            class_sar[cl] += [m3]

        class_sdr_out = defaultdict(list)
        class_sir_out = defaultdict(list)
        class_sar_out = defaultdict(list)

        class_sdr_out['median'] = {k: np.round(np.median(v), 2)
                                   for k, v in class_sdr.items()}
        class_sdr_out['mean'] = {k: np.round(np.mean(v), 2)
                                 for k, v in class_sdr.items()}
        class_sir_out['median'] = {k: np.round(np.median(v), 2)
                                   for k, v in class_sir.items()}
        class_sir_out['mean'] = {k: np.round(np.mean(v), 2)
                                 for k, v in class_sir.items()}
        class_sar_out['median'] = {k: np.round(np.median(v), 2)
                                   for k, v in class_sar.items()}
        class_sar_out['mean'] = {k: np.round(np.mean(v), 2)
                                 for k, v in class_sar.items()}

        return class_sdr_out, class_sir_out, class_sar_out, cmb_sdr
Example #31
def eval_sys(model_list=['PMTL', 'PMSE', 'B1', 'B2'],
             n_samp=30,
             min_length=1.0,
             random=True,
             fld=test_fld,
             psongs=test_psongs):
    # Currently evaluates the specified models on the NUS dataset for the given songs. Default songs comprise of our test set
    # It also takes as arguments the number of samples for evaluation (n_samp), minimum length of speech in each sample (min_length),
    # Returns array of all computed LSD's and prints the mean LSD for each model

    nus_train_data = DL.NUS_48E(data_key, [sr, nfft, wlen, hop])
    sampler = DL.nus_samp(data_dir,
                          1,
                          n_samp,
                          fld,
                          psongs,
                          use_word=True,
                          randomize=random,
                          print_elem=False,
                          min_len=min_length)
    dataload = DataLoader(dataset=nus_train_data,
                          batch_sampler=sampler,
                          collate_fn=my_collate_e8)
    samp_idx = -1
    lsd = []
    for data in dataload:
        # Initialize, Load the networks and their weights properly taking into account the exceptions
        samp_idx += 1
        print 'Processing sample ', samp_idx
        for idx in range(len(model_list)):
            cur_model = model_list[idx]
            suffix = suffix_dict[cur_model]
            network2 = defModel.exp_net(512, 512, freq=513).to(device)
            if cur_model == 'B2':
                network1 = defModel.net_base(512, 512, freq=513).to(device)
            else:
                network1 = defModel.net_in_v2(512, 512, freq=513).to(device)

            if not cur_model == 'B1':
                network2.load_state_dict(
                    torch.load('output/models/net2_' + suffix + '.pt',
                               map_location=device))  # Complete
            network1.load_state_dict(
                torch.load('output/models/net1_' + suffix + '.pt',
                           map_location=device))
            network1, network2 = network1.eval(), network2.eval()

            # Make predictions
            encode2 = int(not cur_model == 'B1') * network2(
                Variable(data[3].to(device)))
            pred, encode1 = network1(Variable(data[0].to(device)), encode2)
            pred = pred.cpu().data.numpy()
            pred[pred < 0] = 0

            #Temporarily save log-STFTs of input target and prediction
            logstft_inp = data[0].numpy()
            logstft_out = data[1].numpy()
            logstft_pred = 1.0 * pred
            np.save('runtime_folder/inp_stft', logstft_inp)
            np.save('runtime_folder/out_stft', logstft_out)
            np.save('runtime_folder/pred_stft', logstft_pred)

            # Get time domain signals
            stft_inp = np.zeros([513, pred.shape[2]])
            stft_pred = np.zeros([513, pred.shape[2]])
            stft_target = np.zeros([513, pred.shape[2]])

            stft_pred[:pred.shape[1]] = np.exp(pred[0]) - 1
            time_pred = utils.gl_rec(stft_pred, hop, wlen,
                                     core.istft(stft_pred**1.0, hop, wlen))
            time_target_phase = core.istft(data[6][0], hop, wlen)

            # Save predictions in the runtime folder
            true_file = 'runtime_folder/runtime_true.wav'
            pred_file = 'runtime_folder/runtime_pred.wav'
            librosa.output.write_wav(true_file, time_target_phase, sr)
            librosa.output.write_wav(pred_file, time_pred, sr)
            calc_lsd = utils.comp_lsd(true_file, pred_file)
            #print cur_model, calc_lsd
            lsd.append(calc_lsd)

    # Print the results
    arr = np.zeros([len(model_list), n_samp])
    for i in range(len(model_list) * n_samp):
        arr[i % len(model_list), i // len(model_list)] = lsd[i]
    for i in range(len(model_list)):
        print model_list[i] + ' (mean LSD):', np.mean(arr[i])

    return lsd
Example #32
def SaveAudio(fname, mag, phase):
    y = istft(mag * phase, hop_length=C.H, win_length=C.FFT_SIZE)
    write_wav(fname, y, C.SR, norm=True)