Example #1
def get_istft(spect, wsize=512, tstep=256, L=None):
    """ reshape the spectrum and get the inverse Fourier transform """

    if len(spect.shape) < 3:
        spect = spect.reshape((1, spect.shape[0], spect.shape[1]))
    if L is not None:
        return np.squeeze(stft.istft(spect, tstep, L))

    return stft.istft(spect, tstep)
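A minimal usage sketch, assuming the project-local `stft` module used above; the spectrum shape and the length `L` are hypothetical values for illustration:

import numpy as np

# Fake single-channel complex spectrum of shape (n_bins, n_frames)
spect = np.random.randn(257, 100) + 1j * np.random.randn(257, 100)
# Reconstruct the waveform, trimmed to a known original length L
signal = get_istft(spect, wsize=512, tstep=256, L=25600)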
Example #2
    def __fdndlp(self, data):
        """Frequency-domain variance-normalized delayed liner prediction 

        This is the core part of the WPE method. The variance-normalized 
        linear prediciton algorithm is implemented in each frequency bin 
        separately. Both the input and output signals are in time-domain.  

        Args:
            data: A 2-dimension numpy array with shape=(chanels, samples)

        Returns:
            A 2-dimension numpy array with shape=(output_channels, samples)
        """

        freq_data = stft.stft(data / np.abs(data).max(),
                              frame_size=self.frame_size,
                              overlap=self.overlap)
        self.freq_num = freq_data.shape[-1]
        drv_freq_data = freq_data[0:self.out_num].copy()
        for i in range(self.freq_num):
            xk = freq_data[:, :, i].T
            dk = self.__ndlp(xk)
            drv_freq_data[:, :, i] = dk.T
        drv_data = stft.istft(drv_freq_data,
                              frame_size=self.frame_size,
                              overlap=self.overlap)
        return drv_data / np.abs(drv_data).max()
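The loop above applies the prediction filter independently in every frequency bin, which is what makes the method "frequency-domain". A stand-alone sketch of that per-bin pattern (illustrative only; `process_bin` is a hypothetical callback, and the sketch assumes input and output share the same channel count, unlike the original):

import numpy as np

def apply_per_bin(freq_data, process_bin):
    """Apply process_bin to each (channels, frames) slice of a
    (channels, frames, bins) complex STFT array."""
    out = freq_data.copy()
    for k in range(freq_data.shape[-1]):
        out[:, :, k] = process_bin(freq_data[:, :, k])
    return out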
Example #3
    def predict_channel(audio):
        length = np.shape(audio)[0]
        # Resample the 44.1 kHz input to the 22.05 kHz rate used by the model
        m = resample(audio, 44100, 22050)
        M = stft(m.reshape(-1, 1), hop_size, win_size, fft_size)
        Mmag = np.abs(M).T
        spec_frames, n_bins = Mmag.shape
        # Zero-pad so every frame has a full context window of n_frames frames
        pad_size = int((n_frames - 1) / 2)
        Mmag = np.concatenate((np.zeros(
            (pad_size, n_bins)), Mmag, np.zeros((pad_size, n_bins))))
        # Sliding-window view of shape (spec_frames, n_frames, n_bins), no copy
        new_strides = (Mmag.strides[0], Mmag.strides[0], Mmag.strides[1])
        Mmag = as_strided(Mmag, (spec_frames, n_frames, n_bins), new_strides)
        Mmag = Mmag[:, np.newaxis, :, :]
        vocals = np.zeros(M.T.shape)
        bass = np.zeros(M.T.shape)
        drums = np.zeros(M.T.shape)
        other = np.zeros(M.T.shape)

        for i in range(spec_frames):
            X = Mmag[i, :, :, :]
            in_data = torch.from_numpy(
                X.astype(np.float32)[np.newaxis, :, :, :])
            if torch.cuda.is_available():
                in_data = in_data.cuda()
            i_result = model(Variable(in_data)).cpu().data.numpy()
            vocals[i, :] = i_result[0, :n_bins]
            drums[i, :] = i_result[0, n_bins:2 * n_bins]
            bass[i, :] = i_result[0, 2 * n_bins:3 * n_bins]
            other[i, :] = i_result[0, 3 * n_bins:4 * n_bins]

        # Normalize so the four soft masks sum to one in every bin
        all_masks = vocals + bass + drums + other

        vocals = vocals / all_masks
        bass = bass / all_masks
        drums = drums / all_masks
        other = other / all_masks
        vocal_est = resample(istft(M * vocals.T, hop_size, win_size, 22050),
                             22050, 44100, 0)[:length, :]
        bass_est = resample(istft(M * bass.T, hop_size, win_size, 22050),
                            22050, 44100, 0)[:length, :]
        drums_est = resample(istft(M * drums.T, hop_size, win_size, 22050),
                             22050, 44100, 0)[:length, :]
        other_est = resample(istft(M * other.T, hop_size, win_size, 22050),
                             22050, 44100, 0)[:length, :]
        return (vocal_est, bass_est, drums_est, other_est)
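Note that the normalization divides by all_masks, which is zero in any bin where all four network outputs vanish. A common guard, shown below as an addition that is not part of the original code, is a small epsilon in the denominator:

eps = 1e-8  # hypothetical guard against division by zero
all_masks = vocals + bass + drums + other + eps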
Example #4
def pvoc(x,
         sr,
         factor,
         Hs=512,
         window=signal.windows.hann(1024, sym=False),
         phase_lock=False):
    in_size = x.shape[0]
    win_len = window.shape[0]
    win_len_half = int(np.round(win_len / 2))
    out_size = int(np.ceil(factor * in_size))
    anchor_points = np.array([[0, 0], [in_size - 1, out_size - 1]])
    syn_positions = np.arange(0, out_size + win_len_half, Hs)
    an_positions = np.round(
        np.interp(syn_positions, anchor_points[:, 1], anchor_points[:, 0]))
    an_hops = np.concatenate(([0], an_positions[1:] - an_positions[:-1]))
    y = np.zeros((out_size + 2 * win_len))
    x = np.concatenate((np.zeros(
        (win_len_half)), x, np.zeros((win_len + int(an_hops[1])))))

    X = stft.stft(x, sr, an_positions, window, win_len)
    Y = np.zeros_like(X)
    Y[:, 0] = X[:, 0]  # assuming columns are frames
    k = np.arange(win_len_half + 1).T
    omega = 2 * np.pi * k / win_len
    for i in range(1, X.shape[1]):
        dphi = omega * an_hops[i]
        current_phase = np.angle(X[:, i])
        prev_phase = np.angle(X[:, i - 1])
        phase_inc = current_phase - prev_phase - dphi
        phase_inc = phase_inc - 2 * np.pi * np.round(phase_inc / (2 * np.pi))
        ipa_sample = omega + phase_inc / an_hops[i]
        ipa_hop = ipa_sample * Hs
        syn_phase = np.angle(Y[:, i - 1])
        if not phase_lock:
            theta = syn_phase + ipa_hop - current_phase
            phasor = np.exp(1j * theta)
        else:
            p, v = get_peaks(np.abs(X[:, i]))
            theta = np.zeros(Y.shape[0])  # real phase offsets, one per bin
            for j in range(len(p)):
                theta[v[j]:v[j + 1]] = syn_phase[p[j]] + ipa_hop[
                    p[j]] - current_phase[p[j]]
            phasor = np.exp(1j * theta)
        Y[:, i] = phasor * X[:, i]
    y = stft.istft(Y, Hs, window)
    return y
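The wrapping step `phase_inc - 2 * np.pi * np.round(phase_inc / (2 * np.pi))` maps the phase deviation to its principal value, which is what keeps the instantaneous-frequency estimate stable. The same operation in isolation (a minimal sketch):

import numpy as np

def princarg(phi):
    """Map phase values into the principal interval around zero."""
    return phi - 2.0 * np.pi * np.round(phi / (2.0 * np.pi))

# princarg(0.3 + 4 * np.pi) is (numerically) 0.3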
Example #5
def image2audio(image_filename, audio_filename):
    X_image = read_image(image_filename)
    print("show result of stft.")

    print(X_image.dtype, X_image.shape)
    r = inv_normal(X_image[:, :, 0].astype('float64') / MAX)
    g = inv_normal(X_image[:, :, 1].astype('float64') / MAX)
    X = P2R(r, g)
    # X = np.zeros(X_image.shape[:2], 'complex128')
    # X.real = r
    # X.imag = g
    print(np.max(X.real), np.min(X.real))
    print(np.max(X.imag), np.min(X.imag))
    # Compute the ISTFT.
    xhat = stft.istft(X, fs, T, hop_length)
    xhat = float2pcm(xhat)
    scipy.io.wavfile.write(audio_filename, fs, xhat)
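`P2R` is not shown in this example; a typical polar-to-rectangular helper consistent with how it is called above (an assumption about the project's helper, not its confirmed implementation) would be:

import numpy as np

def P2R(magnitude, phase):
    """Convert polar (magnitude, phase) arrays to a complex array."""
    return magnitude * np.exp(1j * phase)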
Example #6
def tf_agc(d, sr, t_scale=0.5, f_scale=1.0, causal_tracking=True, plot=False):
    """
    Perform frequency-dependent automatic gain control on an auditory
    frequency axis.
    d is the input waveform (at sampling rate sr);
    y is the output waveform with approximately constant
    energy in each time-frequency patch.
    t_scale is the "scale" for smoothing in time (default 0.5 sec).
    f_scale is the frequency "scale" (default 1.0 "mel").
    causal_tracking == 0 selects traditional infinite-attack, exponential release.
    causal_tracking == 1 selects symmetric, non-causal Gaussian-window smoothing.
    D returns actual STFT used in analysis.  E returns the
    smoothed amplitude envelope divided out of D to get gain control.
    """

    hop_size = 0.032  # in seconds

    # Make STFT on ~32 ms grid
    ftlen = int(2**np.round(np.log(hop_size * sr) / np.log(2.)))
    winlen = ftlen
    hoplen = winlen // 2  # integer hop length, also used as an array index
    D = stft(d, winlen, hoplen)  # using my code
    ftsr = sr / hoplen
    ndcols = D.shape[1]

    # Smooth in frequency on ~ mel resolution
    # Width of mel filters depends on how many you ask for,
    # so ask for fewer for larger f_scales
    nbands = max(10, 20 / f_scale)  # 10 bands, or more for very fine f_scale
    mwidth = f_scale * nbands / 10  # will be 2.0 for small f_scale
    (f2a_tmp, _) = fft2melmx(ftlen, sr, int(nbands), mwidth)
    f2a = f2a_tmp[:, :ftlen // 2 + 1]
    audgram = np.dot(f2a, np.abs(D))

    if causal_tracking:
        # traditional attack/decay smoothing
        fbg = np.zeros(audgram.shape)
        # state = zeros(size(audgram,1),1);
        state = np.zeros(audgram.shape[0])
        alpha = np.exp(-(1. / ftsr) / t_scale)
        for i in range(audgram.shape[1]):
            state = np.maximum(alpha * state, audgram[:, i])
            fbg[:, i] = state

    else:
        # noncausal, time-symmetric smoothing
        # Smooth in time with tapered window of duration ~ t_scale
        tsd = np.round(t_scale * ftsr) / 2
        htlen = int(6 * tsd)  # Go out to 6 sigma; int so it can slice arrays
        twin = np.exp(-0.5 * (((np.arange(-htlen, htlen + 1)) / tsd)**2)).T

        # reflect ends to get smooth stuff
        AD = audgram
        x = np.hstack((np.fliplr(AD[:, :htlen]), AD, np.fliplr(AD[:, -htlen:]),
                       np.zeros((AD.shape[0], htlen))))
        fbg = signal.lfilter(twin, 1, x, 1)

        # strip "warm up" points
        fbg = fbg[:, twin.size + np.arange(ndcols)]

    # map back to FFT grid, flatten bark loop gain
    sf2a = np.sum(f2a, 0)
    sf2a_fix = sf2a.copy()  # copy so the fix does not mutate sf2a in place
    sf2a_fix[sf2a == 0] = 1.
    E = np.dot(np.dot(np.diag(1. / sf2a_fix), f2a.T), fbg)
    # Remove any zeros in E (shouldn't be any, but who knows?)
    E[E <= 0] = np.min(E[E > 0])

    # invert back to waveform
    y = istft(D / E, winlen, hoplen, window=np.ones(winlen))  # using my code

    if plot:
        try:
            import matplotlib.pyplot as plt
            plt.subplot(3, 1, 1)
            plt.imshow(20. * np.log10(np.flipud(np.abs(D))))
            plt.subplot(3, 1, 2)
            plt.imshow(20. * np.log10(np.flipud(np.abs(E))))
            A = stft(y, winlen, hoplen)  # using my code
            plt.subplot(3, 1, 3)
            plt.imshow(20. * np.log10(np.flipud(np.abs(A))))
            plt.show()
        except Exception as e:
            print("Failed to plot results")
            print(e)
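The causal branch above is a one-pole peak tracker: the envelope jumps up instantly (infinite attack) and decays exponentially with time constant t_scale. The same logic as a self-contained sketch with illustrative names:

import numpy as np

def attack_release(audgram, frame_rate, t_scale=0.5):
    """Instantaneous-attack / exponential-release envelope per band."""
    alpha = np.exp(-(1.0 / frame_rate) / t_scale)
    state = np.zeros(audgram.shape[0])
    env = np.zeros(audgram.shape)
    for i in range(audgram.shape[1]):
        state = np.maximum(alpha * state, audgram[:, i])
        env[:, i] = state
    return env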
Example #7
    other = np.zeros(M.T.shape)

    for i in range(spec_frames):
        X = Mmag[i, :, :, :]
        in_data = torch.from_numpy(X.astype(np.float32)[np.newaxis, :, :, :])
        if torch.cuda.is_available():
            in_data = in_data.cuda()
        i_result = model(Variable(in_data)).cpu().data.numpy()
        # Binary masks: each time-frequency bin goes to the argmax class
        vocals[i, :] = np.argmax(i_result, 1) == 0
        drums[i, :] = np.argmax(i_result, 1) == 1
        bass[i, :] = np.argmax(i_result, 1) == 2
        other[i, :] = np.argmax(i_result, 1) == 3

    sr = 22050

    np.save("results/vocals/" + filename + "_mask", vocals.T)
    vocal_est = istft(M * vocals.T, hop_size, win_size, sr)
    wavfile.write("results/vocals/" + filename + "_target.wav", sr, vocal_est)

    np.save("results/bass/" + filename + "_mask", bass.T)
    bass_est = istft(M * bass.T, hop_size, win_size, sr)
    wavfile.write("results/bass/" + filename + "_target.wav", sr, bass_est)

    np.save("results/drums/" + filename + "_mask", drums.T)
    drums_est = istft(M * drums.T, hop_size, win_size, sr)
    wavfile.write("results/drums/" + filename + "_target.wav", sr, drums_est)

    np.save("results/other/" + filename + "_mask", other.T)
    other_est = istft(M * other.T, hop_size, win_size, sr)
    wavfile.write("results/other/" + filename + "_target.wav", sr, other_est)
def plot_mel_masks(args):
    
    # Arguments & parameters
    workspace = args.workspace
    holdout_fold = args.holdout_fold
    scene_type = args.scene_type
    snr = args.snr
    iteration = args.iteration
    model_type = args.model_type
    cuda = args.cuda

    labels = config.labels
    classes_num = len(labels)
    sample_rate = config.sample_rate
    window_size = config.window_size
    overlap = config.overlap
    hop_size = window_size-overlap
    mel_bins = config.mel_bins
    seq_len = config.seq_len
    ix_to_lb = config.ix_to_lb
    
    thres = 0.1
    batch_size = 24

    # Paths
    hdf5_path = os.path.join(workspace, 'features', 'logmel', 
        'scene_type={},snr={}'.format(scene_type, snr), 'development.h5')

    model_path = os.path.join(workspace, 'models', 'main_pytorch', 
        'model_type={}'.format(model_type), 'scene_type={},snr={}'
        ''.format(scene_type, snr), 'holdout_fold{}'.format(holdout_fold), 
        'md_{}_iters.tar'.format(iteration))
    
    yaml_path = os.path.join(workspace, 'mixture.yaml')
    
    audios_dir = os.path.join(workspace, 'mixed_audios', 
                              'scene_type={},snr={}'.format(scene_type, snr))
    
    sep_wavs_dir = os.path.join(workspace, 'separated_wavs', 'main_pytorch', 
        'model_type={}'.format(model_type), 
        'scene_type={},snr={}'.format(scene_type, snr), 
        'holdout_fold{}'.format(holdout_fold))
        
    create_folder(sep_wavs_dir)
    
    # Load yaml file
    load_yaml_time = time.time()
    with open(yaml_path, 'r') as f:
        meta = yaml.load(f, Loader=yaml.FullLoader)
    print('Load yaml file time: {:.3f} s'.format(time.time() - load_yaml_time))
    
    feature_extractor = LogMelExtractor(
        sample_rate=sample_rate, 
        window_size=window_size, 
        overlap=overlap, 
        mel_bins=mel_bins)

    inverse_melW = feature_extractor.get_inverse_melW()
    
    # Load model
    Model = get_model(model_type)
    model = Model(classes_num, seq_len, mel_bins, cuda)
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['state_dict'])

    if cuda:
        model.cuda()

    # Data generator
    generator = InferenceDataGenerator(
        hdf5_path=hdf5_path,
        batch_size=batch_size, 
        holdout_fold=holdout_fold)

    generate_func = generator.generate_validate(
        data_type='validate', 
        shuffle=False, 
        max_iteration=None)
    
    # Evaluate on mini-batch
    for (iteration, data) in enumerate(generate_func):
        
        (batch_x, batch_y, batch_audio_names) = data            
        batch_x = move_data_to_gpu(batch_x, cuda)

        # Predict
        with torch.no_grad():
            model.eval()
            (batch_output, batch_bottleneck) = model(
                batch_x, return_bottleneck=True)
    
        batch_output = batch_output.data.cpu().numpy()
        '''(batch_size, classes_num)'''
        
        batch_bottleneck = batch_bottleneck.data.cpu().numpy()  
        '''(batch_size, classes_num, seq_len, mel_bins)'''

        batch_pred_sed = np.mean(batch_bottleneck, axis=-1)
        batch_pred_sed = np.transpose(batch_pred_sed, (0, 2, 1))    
        '''(batch_size, seq_len, classes_num)'''
        
        batch_gt_masks = []
        
        for n in range(len(batch_audio_names)):
            curr_meta = search_meta_by_mixture_name(meta, batch_audio_names[n])
            curr_events = curr_meta['events']
              
            pred_indexes = np.where(batch_output[n] > thres)[0]
            gt_indexes = get_ground_truth_indexes(curr_events)
 
            gt_sed = get_sed_from_meta(curr_events) # (seq_len, classes_num)
            
            pred_sed = np.zeros((seq_len, classes_num))
            pred_sed[:, pred_indexes] = batch_pred_sed[n][:, pred_indexes]  # (seq_len, classes_num)
 
            (events_stft, scene_stft, _) = generator.get_events_scene_mixture_stft(batch_audio_names[n])
            events_stft = np.dot(events_stft, feature_extractor.melW)
            scene_stft = np.dot(scene_stft, feature_extractor.melW)
            
            gt_mask = ideal_binary_mask(events_stft, scene_stft)    # (seq_len, fft_size)
            
            gt_masks = gt_mask[:, :, None] * gt_sed[:, None, :] # (seq_len, fft_size, classes_num)
            gt_masks = gt_masks.astype(np.float32)
            batch_gt_masks.append(gt_masks)
            
            pred_masks = batch_bottleneck[n].transpose(1, 2, 0) # (seq_len, fft_size, classes_num)

            # Save out separated audio
            if True:
                curr_audio_name = curr_meta['mixture_name']
                audio_path = os.path.join(audios_dir, curr_audio_name)
                (mixed_audio, fs) = read_audio(audio_path, target_fs=sample_rate, mono=True)
                
                out_wav_path = os.path.join(sep_wavs_dir, curr_audio_name)
                write_audio(out_wav_path, mixed_audio, sample_rate)
                
                window = np.hamming(window_size)
                mixed_stft_cmplx = stft(x=mixed_audio, window_size=window_size, hop_size=hop_size, window=window, mode='complex')
                mixed_stft_cmplx = mixed_stft_cmplx[0 : seq_len, :]
                mixed_stft = np.abs(mixed_stft_cmplx)
                
                for k in gt_indexes:
                    masked_stft = np.dot(pred_masks[:, :, k], inverse_melW) * mixed_stft
                    masked_stft_cmplx = real_to_complex(masked_stft, mixed_stft_cmplx)
                    
                    frames = istft(masked_stft_cmplx)
                    cola_constant = get_cola_constant(hop_size, window)
                    sep_audio = overlap_add(frames, hop_size, cola_constant)
                    
                    sep_wav_path = os.path.join(sep_wavs_dir, '{}_{}.wav'.format(os.path.splitext(curr_audio_name)[0], ix_to_lb[k]))
                    write_audio(sep_wav_path, sep_audio, sample_rate)
                    print('Audio written to {}'.format(sep_wav_path))
      
        # Visualize learned representations
        if True:
            for n in range(len(batch_output)):
            
                # Plot segmentation masks. (00013.wav is used for plot in the paper)
                print('audio_name: {}'.format(batch_audio_names[n]))
                print('target: {}'.format(batch_y[n]))
                target_labels = target_to_labels(batch_y[n], labels)
                print('target labels: {}'.format(target_labels))
            
                (events_stft, scene_stft, _) = generator.get_events_scene_mixture_stft(batch_audio_names[n])
    
                fig, axs = plt.subplots(7, 7, figsize=(15, 10))
                for k in range(classes_num):
                    axs[k // 6, k % 6].matshow(batch_bottleneck[n, k].T, origin='lower', aspect='auto', cmap='jet')
                    if labels[k] in target_labels:
                        color = 'r'
                    else:
                        color = 'k'
                    axs[k // 6, k % 6].set_title(labels[k], color=color)
                    axs[k // 6, k % 6].xaxis.set_ticks([])
                    axs[k // 6, k % 6].yaxis.set_ticks([])
                    axs[k // 6, k % 6].set_xlabel('time')
                    axs[k // 6, k % 6].set_ylabel('mel bins')
                    
                axs[6, 5].matshow(np.log(events_stft + 1e-8).T, origin='lower', aspect='auto', cmap='jet')
                axs[6, 5].set_title('Spectrogram (in log scale)')
                axs[6, 5].xaxis.set_ticks([0, 310])
                axs[6, 5].xaxis.set_ticklabels(['0.0', '10.0 s'])
                axs[6, 5].xaxis.tick_bottom()
                axs[6, 5].yaxis.set_ticks([0, 1024])
                axs[6, 5].yaxis.set_ticklabels(['0', '1025'])
                axs[6, 5].set_xlabel('time')
                axs[6, 5].set_ylabel('FFT bins')
                
                axs[6, 6].matshow(np.log(np.dot(events_stft, feature_extractor.melW) + 1e-8).T, origin='lower', aspect='auto', cmap='jet')
                axs[6, 6].set_title('Log mel spectrogram')
                axs[6, 6].xaxis.set_ticks([0, 310])
                axs[6, 6].xaxis.set_ticklabels(['0.0', '10.0 s'])
                axs[6, 6].xaxis.tick_bottom()
                axs[6, 6].yaxis.set_ticks([0, 63])
                axs[6, 6].yaxis.set_ticklabels(['0', '64'])
                axs[6, 6].set_xlabel('time')
                axs[6, 6].set_ylabel('mel bins')
                
                plt.tight_layout(pad=0.5, w_pad=0.5, h_pad=0.5)
                plt.show()
                
                # Plot frame-wise SED
                fig, ax = plt.subplots(1, 1, figsize=(4, 4))
                score_mat = []
                for k in range(classes_num):
                    score = np.mean(batch_bottleneck[n, k], axis=-1)
                    score_mat.append(score)
                    
                score_mat = np.array(score_mat)
                
                ax.matshow(score_mat, origin='lower', aspect='auto', cmap='jet')
                ax.set_title('Frame-wise predictions')
                ax.xaxis.set_ticks([0, 310])
                ax.xaxis.set_ticklabels(['0.0', '10.0 s'])
                ax.xaxis.tick_bottom()
                ax.set_xlabel('time')
                ax.yaxis.set_ticks(np.arange(classes_num))
                ax.yaxis.set_ticklabels(config.labels, fontsize='xx-small')
                ax.yaxis.grid(color='k', linestyle='solid', linewidth=0.3)
                
                plt.tight_layout(pad=0.5, w_pad=0.5, h_pad=0.5)
                plt.show()
                
                # Plot event-wise SED
                est_event_list = get_est_event_list(batch_pred_sed[n:n+1], batch_audio_names[n:n+1], labels)
                event_mat = event_list_to_matrix(est_event_list)
                
                fig, ax = plt.subplots(1, 1, figsize=(4, 4))
                ax.matshow(event_mat.T, origin='lower', aspect='auto', cmap='jet')
                ax.set_title('Event-wise predictions')
                ax.xaxis.set_ticks([0, 310])
                ax.xaxis.set_ticklabels(['0.0', '10.0 s'])
                ax.xaxis.tick_bottom()
                ax.set_xlabel('time')
                ax.yaxis.set_ticks(np.arange(classes_num))
                ax.yaxis.set_ticklabels(config.labels, fontsize='xx-small')
                ax.yaxis.grid(color='k', linestyle='solid', linewidth=0.3)
                
                plt.tight_layout(pad=0.5, w_pad=0.5, h_pad=0.5)
                plt.show()
                
                # Plot event-wise ground truth
                ref_event_list = get_ref_event_list(meta, batch_audio_names[n:n+1])
                event_mat = event_list_to_matrix(ref_event_list)
                
                fig, ax = plt.subplots(1, 1, figsize=(4, 4))
                ax.matshow(event_mat.T, origin='lower', aspect='auto', cmap='jet')
                ax.set_title('Event-wise ground truth')
                ax.xaxis.set_ticks([0, 310])
                ax.xaxis.set_ticklabels(['0.0', '10.0 s'])
                ax.xaxis.tick_bottom()
                ax.set_xlabel('time')
                ax.yaxis.set_ticks(np.arange(classes_num))
                ax.yaxis.set_ticklabels(config.labels, fontsize='xx-small')
                ax.yaxis.grid(color='k', linestyle='solid', linewidth=0.3)
                
                plt.tight_layout(pad=0.5, w_pad=0.5, h_pad=0.5)
                plt.show()
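`ideal_binary_mask` is project code not shown in this example; a common definition consistent with its use above (an assumption, not the project's confirmed implementation) keeps a time-frequency bin when the target is stronger than the interference:

import numpy as np

def ideal_binary_mask(target_sp, interference_sp):
    """1 where the target dominates the bin, else 0."""
    return (np.abs(target_sp) > np.abs(interference_sp)).astype(np.float32)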
Example #9
def inference_wiener(args):
    workspace = args.workspace
    iteration = args.iteration  # avoid shadowing the builtin iter()
    stack_num = args.stack_num
    filename = args.filename
    mini_num = args.mini_num
    visualize = args.visualize
    cuda = args.use_cuda and torch.cuda.is_available()
    print("cuda:", cuda)

    sample_rate = cfg.sample_rate
    fft_size = cfg.fft_size
    hop_size = cfg.hop_size
    window_type = cfg.window_type

    if window_type == 'hamming':
        window = np.hamming(fft_size)
    else:
        raise ValueError("unsupported window type: %s" % window_type)

    # Audio
    audio_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/mixed_audios/spectrogram/test/0db"
    # audio_dir = "/user/HS229/qk00006/my_code2015.5-/python/pub_speech_enhancement/mixture2clean_dnn/workspace/mixed_audios/spectrogram/test/0db"
    names = os.listdir(audio_dir)

    # Load model.
    target_type = ['speech', 'noise']
    model_dict = {}
    for e in target_type:
        n_freq = 257
        model = DNN(stack_num, n_freq)
        model_path = os.path.join(workspace, "models", filename, e,
                                  "md_%d_iters.tar" % iteration)
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['state_dict'])

        # Move model to GPU.
        if cuda:
            model.cuda()
        model.eval()

        model_dict[e] = model

    # Load scalar
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    (mean_, std_) = cPickle.load(open(scalar_path, 'rb'))
    mean_ = move_data_to_gpu(mean_, cuda, volatile=True)
    std_ = move_data_to_gpu(std_, cuda, volatile=True)

    if mini_num > 0:
        n_every = len(names) // mini_num
    else:
        n_every = 1

    out_wav_dir = os.path.join(workspace, "enh_wavs", filename)
    pp_data.create_folder(out_wav_dir)

    for (cnt, name) in enumerate(names):
        if cnt % n_every == 0:
            audio_path = os.path.join(audio_dir, name)
            (audio, _) = pp_data.read_audio(audio_path, sample_rate)

            audio = pp_data.normalize(audio)
            cmplx_sp = pp_data.calc_sp(audio, fft_size, hop_size, window)
            x = np.abs(cmplx_sp)

            # Process data.
            n_pad = (stack_num - 1) // 2
            x = pp_data.pad_with_border(x, n_pad)
            x = pp_data.mat_2d_to_3d(x, stack_num, hop=1)

            # Predict.
            pred_dict = {}
            for e in target_type:
                pred = forward(model_dict[e], x, mean_, std_, cuda)
                pred = pred.data.cpu().numpy()
                pred_dict[e] = pred
            print(cnt, name)

            # Wiener filter.
            pred_mag_sp = pred_dict['speech'] / (
                pred_dict['speech'] + pred_dict['noise']) * np.abs(cmplx_sp)

            pred_cmplx_sp = stft.real_to_complex(pred_mag_sp, cmplx_sp)
            frames = stft.istft(pred_cmplx_sp)

            cola_constant = stft.get_cola_constant(hop_size, window)
            seq = stft.overlap_add(frames, hop_size, cola_constant)
            seq = seq[0:len(audio)]

            # Write out wav
            out_wav_path = os.path.join(out_wav_dir, name)
            pp_data.write_audio(out_wav_path, seq, sample_rate)
            print("Write out wav to: %s" % out_wav_path)

            if visualize:
                vmin = -5.
                vmax = 5.
                fig, axs = plt.subplots(3, 1, sharex=True)
                axs[0].matshow(np.log(np.abs(cmplx_sp)).T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                axs[1].matshow(np.log(np.abs(pred_dict['speech'])).T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                axs[2].matshow(np.log(np.abs(pred_dict['noise'])).T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                plt.show()
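The ratio pred_dict['speech'] / (pred_dict['speech'] + pred_dict['noise']) is a magnitude-domain Wiener-like mask. The same idea in isolation (a sketch; the eps guard and the squared-magnitude power-domain variant are common additions, not part of the original):

import numpy as np

def wiener_mask(speech_mag, noise_mag, eps=1e-8):
    """Soft mask in [0, 1]; squaring both terms gives the power-domain variant."""
    return speech_mag / (speech_mag + noise_mag + eps)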
Example #10
    def test_invertible(self):
        x = sawtooth(numpy.linspace(0, 1, 44100) * 2 * numpy.pi * 10)
        X = stft(x, 1024)
        xi = istft(X, len(x))
        self.assertTrue(len(xi) == len(x))
        self.assertTrue(numpy.allclose(xi, x, 1e-01))
Example #11
    def test_istft(self):

        """Test istft: istft(stft(x)) should reconstruct x for several
        input shapes, window functions, paddings and overlap exponents."""
        cases = [
            # (x, segment_length, shift_length, segment_length_padded,
            #  window_function, p)
            (np.ones(16), 4, 2, 4, boxcar, 1),
            (np.random.randn(16), 4, 2, 4, boxcar, 1),
            (np.random.randn(16, 2, 2), 4, 2, 4, boxcar, 1),
            (np.ones(16), 4, 2, 4, hann, 1),
            (np.ones(16), 4, 4, 4, hann, 2),
            (np.ones(16), 4, 2, 4, boxcar, 2),
            (np.ones(16), 4, 2, 4, hann, 2),
            (np.random.randn(16), 4, 2, 4, hann, 2),
            (np.random.randn(16, 3), 4, 2, 4, hann, 2),
            (np.ones(16), 4, 2, 7, boxcar, 1),
        ]
        for (x, segment_length, shift_length, segment_length_padded,
                window_function, p) in cases:
            original_size = x.shape
            x_stft, start_list, stop_list = stft(x, segment_length,
                segment_length_padded, shift_length, window_function)
            x_out = istft(x_stft, segment_length, segment_length_padded,
                start_list, stop_list,
                original_size, window_function, p)
            self.assertTrue(np.allclose(x_out, x))
Example #13
def render_estimate(est, sr, name="test.wav"):
    y = stft.istft(est, 256)
    write_mono(y, "result/" + name, sr)
Example #14
    def process(self):

        if (self.signals is None or len(self.signals) == 0):
            raise NameError('No signal to beamform')

        if self.processing == 'FrequencyDomain':

            # create window function
            win = np.concatenate((np.zeros(self.zpf),
                                  windows.hann(self.L), 
                                  np.zeros(self.zpb)))

            # do real STFT of first signal
            tfd_sig = stft.stft(self.signals[0], 
                                self.L, 
                                self.hop, 
                                zp_back=self.zpb, 
                                zp_front=self.zpf,
                                transform=np.fft.rfft, 
                                win=win) * np.conj(self.weights[0])
            for i in range(1, self.M):
                tfd_sig += stft.stft(self.signals[i],
                                     self.L,
                                     self.hop,
                                     zp_back=self.zpb,
                                     zp_front=self.zpf,
                                     transform=np.fft.rfft,
                                     win=win) * np.conj(self.weights[i])

            #  now reconstruct the signal
            output = stft.istft(
                tfd_sig,
                self.L,
                self.hop,
                zp_back=self.zpb,
                zp_front=self.zpf,
                transform=np.fft.irfft)

            # remove the zero padding from output signal
            if self.zpb == 0:
                output = output[self.zpf:]
            else:
                output = output[self.zpf:-self.zpb]

        elif self.processing == 'TimeDomain':

            # go back to time domain and shift DC to center
            tw = np.sqrt(self.weights.shape[1]) * np.fft.irfft(np.conj(self.weights), axis=1)
            tw = np.concatenate((tw[:, self.N // 2:], tw[:, :self.N // 2]), axis=1)

            from scipy.signal import fftconvolve

            # convolve each time-domain filter with its signal and sum
            output = fftconvolve(tw[0], self.signals[0])
            for i in range(1, len(self.signals)):
                output += fftconvolve(tw[i], self.signals[i])

        elif self.processing == 'Total':

            W = np.concatenate((self.weights, np.conj(self.weights[:, -2:0:-1])), axis=1)
            W[:, 0] = np.real(W[:, 0])
            W[:, self.N // 2] = np.real(W[:, self.N // 2])

            F_sig = np.zeros(self.signals.shape[1], dtype=complex)
            for i in range(self.M):
                F_sig += np.fft.fft(self.signals[i]) * np.conj(W[i, :])

            f_sig = np.fft.ifft(F_sig)
            print(np.abs(np.imag(f_sig)).mean())
            print(np.abs(np.real(f_sig)).mean())

            output = np.real(np.fft.ifft(F_sig))

        return output
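The frequency-domain branch computes, per frame and frequency bin, the weighted sum of channel STFTs, y(t, f) = sum_i conj(w_i(f)) * X_i(t, f). A compact stand-alone equivalent over stacked STFT arrays (illustrative shapes assumed, not the class's internal layout):

import numpy as np

def apply_weights(stfts, weights):
    """stfts: (M, frames, bins) complex; weights: (M, bins) complex."""
    return np.einsum('mfb,mb->fb', stfts, np.conj(weights))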
Example #15
                                                     line + '.wav')
                        clean_audio_2 = clean_audio_2.astype(
                            'float32') / np.power(2, 15)
                        sr, mix_audio = wav_read(wav_folders + 'mix/' + line +
                                                 '.wav')
                        mix_audio = mix_audio.astype('float32') / np.power(
                            2, 15)

                        # Compute time-domain estimated signals
                        RES_1 = est_1r + 1j * est_1i
                        RES_2 = est_2r + 1j * est_2i
                        RES_1 = np.concatenate(
                            (RES_1, np.conj(RES_1[:, ::-1][:, 1:-1])), axis=1)
                        RES_2 = np.concatenate(
                            (RES_2, np.conj(RES_2[:, ::-1][:, 1:-1])), axis=1)
                        res_1 = istft(RES_1, len(clean_audio_1))
                        res_2 = istft(RES_2, len(clean_audio_2))
                        res_1 = res_1.astype('float32')
                        res_2 = res_2.astype('float32')

                        # Save mixture, clean signals and estimates in the file folder for evaluation.
                        s_res = np.concatenate(
                            (res_1.reshape(-1, 1), res_2.reshape(-1, 1)), 1)
                        s_c = np.concatenate((clean_audio_1.reshape(
                            -1, 1), clean_audio_2.reshape(-1, 1)), 1)
                        # Pad or crop according to the clean source
                        if s_res.shape[0] > s_c.shape[0]:
                            s_res = s_res[:s_c.shape[0], :]
                        else:
                            s_res = np.concatenate(
                                (s_res,
    def process(self, FD=False):

        if self.signals is None or len(self.signals) == 0:
            raise NameError('No signal to beamform')

        if FD:

            # STFT processing

            if self.weights is None and self.filters is not None:
                self.weightsFromFilters()
            elif self.weights is None and self.filters is None:
                raise NameError('Beamforming weights or filters need to be computed first.')

            # create window function
            win = np.concatenate((np.zeros(self.zpf),
                                  windows.hann(self.L), 
                                  np.zeros(self.zpb)))

            # do real STFT of first signal
            tfd_sig = stft.stft(self.signals[0], 
                                self.L, 
                                self.hop, 
                                zp_back=self.zpb, 
                                zp_front=self.zpf,
                                transform=np.fft.rfft, 
                                win=win) * np.conj(self.weights[0])
            for i in range(1, self.M):
                tfd_sig += stft.stft(self.signals[i],
                                     self.L,
                                     self.hop,
                                     zp_back=self.zpb,
                                     zp_front=self.zpf,
                                     transform=np.fft.rfft,
                                     win=win) * np.conj(self.weights[i])

            #  now reconstruct the signal
            output = stft.istft(
                tfd_sig,
                self.L,
                self.hop,
                zp_back=self.zpb,
                zp_front=self.zpf,
                transform=np.fft.irfft)

            # remove the zero padding from output signal
            if self.zpb == 0:
                output = output[self.zpf:]
            else:
                output = output[self.zpf:-self.zpb]

        else:

            # TD processing

            if self.weights is not None and self.filters is None:
                self.filtersFromWeights()
            elif self.weights is None and self.filters is None:
                raise NameError('Beamforming weights or filters need to be computed first.')

            from scipy.signal import fftconvolve

            # convolve each time-domain filter with its signal and sum
            output = fftconvolve(self.filters[0], self.signals[0])
            for i in range(1, len(self.signals)):
                output += fftconvolve(self.filters[i], self.signals[i])


        return output