def get_mfcc(rate, sig):
    """Return rows 1-4 of the scaled MFCC matrix of a speech-boosted signal.

    The signal is analysed once with liftered log filterbank features to find
    the frame with the largest sum of squares. The smallest value obtained by
    passing that frame through ``mfcc.mel2hz`` is negated and used as the
    corner frequency of an equaliser chain (low-shelf boost, high-shelf cut,
    limiter). The MFCCs are then computed on the equalised signal and scaled.

    :param rate: sample rate of ``sig``, passed to the final MFCC computation.
    :param sig: audio samples.
    :returns: ``features[1:5, :]`` of the scaled MFCC matrix.
    """
    # NOTE(review): logfbank() is called without ``rate`` and therefore uses
    # whatever default sample rate the library defines -- confirm intended.
    features = mfcc.lifter(mfcc.logfbank(sig))

    # Squaring makes positive and negative values comparable by magnitude;
    # argmax returns the first frame on ties, like list.index(max(...)).
    frame_energy = np.sum(np.square(features), axis=1)
    strongest_frame = int(np.argmax(frame_energy))

    hz = mfcc.mel2hz(features[strongest_frame])
    min_hz = min(hz)

    # The shelf frequencies are derived from the negated minimum value.
    corner_hz = min_hz * (-1)
    speech_booster = (
        AudioEffectsChain()
        .lowshelf(frequency=corner_hz, gain=12.0, slope=0.5)
        .highshelf(frequency=corner_hz * 1.2, gain=-12.0, slope=0.5)
        .limiter(gain=8.0)
    )
    y_speech_boosted = speech_booster(sig)

    features = mfcc.mfcc(y_speech_boosted, rate, 0.025, 0.01, 16, nfilt=40,
                         nfft=512, appendEnergy=False, winfunc=np.hamming)
    # NOTE(review): presumably sklearn's preprocessing.scale, which
    # standardises each column (zero mean, unit variance); it does not map
    # values into [0, 1] as the previous comment claimed -- confirm.
    features = preprocessing.scale(features)
    return features[1:5, :]
def get_MFCC(sr, audio):
    """Return the scaled MFCC matrix of ``audio`` after a low-shelf boost.

    Step 1 (equaliser set-up): liftered log filterbank features are computed
    and the frame with the largest sum of squares is selected. The smallest
    value obtained by passing that frame through ``mfcc.mel2hz`` is negated
    and used as the low-shelf corner frequency.

    Step 2 (final MFCC calculation): the equalised audio is converted to
    MFCCs, which are then scaled.

    :param sr: sample rate of ``audio``, passed to the final MFCC computation.
    :param audio: audio samples.
    :returns: the scaled MFCC matrix.
    """
    # NOTE(review): logfbank() is called without ``sr`` and therefore uses
    # whatever default sample rate the library defines -- confirm intended.
    features = mfcc.logfbank(audio)  # filterbank energies of the signal
    features = mfcc.lifter(features)

    # Signals can be positive or negative, so compare frames by the sum of
    # squared values; argmax returns the first frame on ties.
    frame_energy = np.sum(np.square(features), axis=1)
    strongest_frame = int(np.argmax(frame_energy))

    hz = mfcc.mel2hz(features[strongest_frame])
    min_hz = min(hz)

    # Low-shelf stage with gain=20.0 at the negated minimum value.
    speech_booster = AudioEffectsChain().lowshelf(
        frequency=min_hz * (-1), gain=20.0, slope=0.5)
    y_speech_boosted = speech_booster(audio)  # applied to the original audio

    features = mfcc.mfcc(y_speech_boosted, sr, 0.025, 0.01, 16, nfilt=40,
                         nfft=512, appendEnergy=False, winfunc=np.hamming)
    # NOTE(review): presumably sklearn's preprocessing.scale, which
    # standardises each column (zero mean, unit variance); it does not map
    # values into [0, 1] as the previous comment claimed -- confirm.
    features = preprocessing.scale(features)
    return features
def __init__(self, input_dim, sr, num_filter, exp=False, filter_fix=False):
    """Set up the frequency axis and the per-filter lower edge and bandwidth.

    :param input_dim: number of points on the linear frequency axis.
    :param sr: sample rate; the frequency axis spans 0 .. sr / 2.
    :param num_filter: number of filters.
    :param exp: stored on the instance as ``self.exp``.
    :param filter_fix: when true, the filter parameters are created with
        ``requires_grad=False``.
    """
    super(fBPLayer, self).__init__()
    self.input_dim = input_dim
    self.num_filter = num_filter
    self.sr = sr
    self.exp = exp
    self.filter_fix = filter_fix

    trainable = not filter_fix

    def as_column(values):
        # float32 column of shape (num_filter, 1)
        return torch.from_numpy(values).float().reshape(num_filter, 1)

    # Linear frequency axis repeated for every filter; never trained.
    freq_axis = torch.from_numpy(np.linspace(0, self.sr / 2, input_dim))
    self.input_freq = nn.Parameter(
        freq_axis.expand(num_filter, input_dim).float(),
        requires_grad=False)

    # num_filter + 2 border points, evenly spaced on the mel axis and then
    # converted back to Hz.
    mel_borders = np.linspace(0, hz2mel(sr / 2), num_filter + 2)
    hz_borders = mel2hz(mel_borders)

    lower_edges = hz_borders[:-2]
    widths = hz_borders[2:] - hz_borders[:-2]
    self.bandwidth_low = nn.Parameter(as_column(lower_edges),
                                      requires_grad=trainable)
    self.bandwidth = nn.Parameter(as_column(widths),
                                  requires_grad=trainable)
def __init__(self, input_dim, sr, num_filter, exp=False, filter_fix=False):
    """Set up the frequency axis and the per-filter centre and side widths.

    :param input_dim: number of points on the linear frequency axis.
    :param sr: sample rate; the frequency axis spans 0 .. sr / 2.
    :param num_filter: number of filters.
    :param exp: stored on the instance as ``self.exp``.
    :param filter_fix: when true, the filter parameters are created with
        ``requires_grad=False``.
    """
    super(fBLayer, self).__init__()
    self.input_dim = input_dim
    self.num_filter = num_filter
    self.sr = sr
    self.exp = exp
    self.filter_fix = filter_fix

    trainable = not filter_fix

    def as_column(values):
        # float32 column of shape (num_filter, 1)
        return torch.from_numpy(values).float().reshape(num_filter, 1)

    # Linear frequency axis repeated for every filter; never trained.
    freq_axis = torch.from_numpy(np.linspace(0, self.sr / 2, input_dim))
    self.input_freq = nn.Parameter(
        freq_axis.expand(num_filter, input_dim).float(),
        requires_grad=False)

    # num_filter + 2 points, evenly spaced on the mel axis and converted back
    # to Hz; the inner points are the filter centres.
    mel_points = np.linspace(0, hz2mel(sr / 2), num_filter + 2)
    hz_points = mel2hz(mel_points)
    gaps = np.diff(hz_points)  # distance between neighbouring points

    self.frequency_center = nn.Parameter(as_column(hz_points[1:-1]),
                                         requires_grad=trainable)
    self.bandwidth_left = nn.Parameter(as_column(gaps[:-1]),
                                       requires_grad=trainable)
    self.bandwidth_right = nn.Parameter(as_column(gaps[1:]),
                                        requires_grad=trainable)
def _iter_pickled_sets(paths):
    """Yield the unpickled content of each file in ``paths``, in order."""
    for path in paths:
        # NOTE: pickle.load executes arbitrary code; only use trusted files.
        with open(str(path), 'rb') as f:
            sets = pickle.load(f)
        yield sets


def _collect_time_samples(paths, max_samples):
    """Collect ``(data, grad)`` pairs, stopping once ``max_samples`` is reached."""
    time_data = []
    for sets in _iter_pickled_sets(paths):
        for (data, grad, _uid) in tqdm(sets):
            time_data.append((data, grad))
            # Returning here stops the scan over the remaining files too;
            # a plain ``break`` would only leave the inner loop.
            if len(time_data) >= max_samples:
                return time_data
    return time_data


def _average_pair_statistics(paths, feat_dim):
    """Average per-feature statistics over the utterance pairs in ``paths``.

    Returns ``(data_mean, grad_abs_mean, grad_var)``; each value is the mean
    over both utterances of a pair, averaged over all pairs. The arrays stay
    zero when no pair is found.
    """
    data_mean = np.zeros((feat_dim))
    grad_mean = np.zeros((feat_dim))
    grad_var = np.zeros((feat_dim))
    num_utt = 0
    for sets in _iter_pickled_sets(paths):
        for (_label, grad_a, grad_b, data_a, data_b) in tqdm(sets):
            data_mean += (np.mean(data_a, axis=0) + np.mean(data_b, axis=0)) / 2
            grad_mean += (np.mean(np.abs(grad_a), axis=0) + np.mean(np.abs(grad_b), axis=0)) / 2
            grad_var += (np.var(grad_a, axis=0) + np.var(grad_b, axis=0)) / 2
            num_utt += 1
    if num_utt > 0:
        data_mean /= num_utt
        grad_mean /= num_utt
        grad_var /= num_utt
    return data_mean, grad_mean, grad_var


def _extract_freq_time_statistics(dir_path):
    """Compute ``(freq_data, time_data)`` from the ``*bin`` files and cache both as pickles."""
    train_lst = list(dir_path.glob('*train*bin'))
    veri_lst = list(dir_path.glob('*ver*bin'))
    valid_lst = list(dir_path.glob('*valid*bin'))
    test_lst = list(dir_path.glob('*test*bin'))

    print(' Train set extracting:')
    time_data = _collect_time_samples(train_lst, args.samples)
    with open(args.extract_path + '/time.data.pickle', 'wb') as f:
        pickle.dump(time_data, f, protocol=pickle.HIGHEST_PROTOCOL)

    freq_data = {}
    train_data_mean = np.zeros((args.feat_dim))
    train_time_mean = np.zeros((args.feat_dim))
    train_time_var = np.zeros((args.feat_dim))
    num_utt = 0
    for sets in _iter_pickled_sets(train_lst):
        for (data, grad, _uid) in tqdm(sets):
            # The train statistics use the signed gradient mean.
            train_time_mean += np.mean(grad, axis=0)
            train_time_var += np.var(grad, axis=0)
            train_data_mean += np.mean(data, axis=0)
            num_utt += 1
    train_time_mean /= num_utt
    train_time_var /= num_utt
    train_data_mean /= num_utt
    freq_data['train.time.mean'] = train_time_mean
    freq_data['train.time.var'] = train_time_var
    freq_data['train.data.mean'] = train_data_mean

    print(' Valid set extracting:')
    valid_data_mean = np.zeros((args.feat_dim))
    valid_time_mean = np.zeros((args.feat_dim))
    valid_time_var = np.zeros((args.feat_dim))
    num_utt = 0
    for sets in _iter_pickled_sets(valid_lst):
        for (data, grad, _uid) in tqdm(sets):
            # The valid statistics use absolute values for both means.
            valid_data_mean += np.mean(np.abs(data), axis=0)
            valid_time_mean += np.mean(np.abs(grad), axis=0)
            valid_time_var += np.var(grad, axis=0)
            num_utt += 1
    if num_utt > 0:
        valid_time_mean = valid_time_mean / num_utt
        valid_time_var = valid_time_var / num_utt
        valid_data_mean = valid_data_mean / num_utt
    freq_data['valid.time.mean'] = valid_time_mean
    freq_data['valid.time.var'] = valid_time_var
    freq_data['valid.data.mean'] = valid_data_mean

    print(' Train verification set extracting:')
    veri_data_mean, veri_time_mean, veri_time_var = _average_pair_statistics(
        veri_lst, args.feat_dim)
    freq_data['train.veri.time.mean'] = veri_time_mean
    freq_data['train.veri.time.var'] = veri_time_var
    freq_data['train.veri.data.mean'] = veri_data_mean

    print(' Test set extracting:')
    test_data_mean, test_time_mean, test_time_var = _average_pair_statistics(
        test_lst, args.feat_dim)
    freq_data['test.veri.time.mean'] = test_time_mean
    freq_data['test.veri.time.var'] = test_time_var
    freq_data['test.veri.data.mean'] = test_data_mean

    print('Saving inputs in %s' % args.extract_path)
    with open(args.extract_path + '/freq.data.pickle', 'wb') as f:
        pickle.dump(freq_data, f, protocol=pickle.HIGHEST_PROTOCOL)

    return freq_data, time_data


def main():
    """Plot gradient and input statistics for the files under ``args.extract_path``.

    Cached ``freq.data.pickle`` / ``time.data.pickle`` files are used when
    both can be read; otherwise the statistics are recomputed from the
    ``*bin`` files and the caches are rewritten. Outputs written to
    ``args.extract_path``: ``grad.veri.time.mean.pdf``, ``train.grad.npy``,
    ``inputs.freq.png`` and ``inputs.time.png``.
    """
    dir_path = pathlib.Path(args.extract_path)
    print('Path is %s' % str(dir_path))

    try:
        with open(args.extract_path + '/freq.data.pickle', 'rb') as f:
            freq_data = pickle.load(f)
        with open(args.extract_path + '/time.data.pickle', 'rb') as f:
            time_data = pickle.load(f)
    except (OSError, EOFError, pickle.UnpicklingError):
        # Missing, truncated or corrupt cache: rebuild it. A bare ``except``
        # would also swallow KeyboardInterrupt and start a full re-extraction.
        freq_data, time_data = _extract_freq_time_statistics(dir_path)

    train_input = freq_data['train.data.mean']
    valid_input = freq_data['valid.data.mean']
    test_input = freq_data['test.veri.data.mean']

    train_grad = freq_data['train.time.mean']
    valid_grad = freq_data['valid.time.mean']
    veri_grad = freq_data['train.veri.time.mean']
    test_grad = freq_data['test.veri.time.mean']

    # Equals 2595 * log10(1 + 8000 / 700): the inverse of the
    # 700 * (10 ** (m / 2595) - 1) mapping used below, evaluated at 8000.
    max_mel = 2840.0230467083188

    x = np.arange(args.feat_dim) * 8000 / (args.feat_dim - 1)  # [0-8000]
    if args.acoustic_feature == 'fbank':
        x = mel2hz(np.linspace(0, max_mel, args.feat_dim))

    # ---- Figure 1: normalised gradient weight per frequency ----
    pdf = PdfPages(args.extract_path + '/grad.veri.time.mean.pdf')
    plt.rc('font', family='Times New Roman')
    plt.figure(figsize=(12, 9))
    plt.xlabel('Frequency (Hz)', fontsize=24)
    plt.xticks(fontsize=22)
    plt.ylabel('Weight', fontsize=24)
    plt.yticks(fontsize=22)

    # Reference curve: inverse spacing of unit mel steps, normalised to sum 1.
    m = np.arange(0, max_mel)
    m = 700 * (10**(m / 2595.0) - 1)
    n = 1 / (m[1:] - m[:-1])
    f = interpolate.interp1d(m[1:], n)
    xnew = np.arange(np.min(m[1:]), np.max(m[1:]),
                     (np.max(m[1:]) - np.min(m[1:])) / 161)
    ynew = f(xnew)
    ynew = ynew / ynew.sum()
    plt.plot(xnew, ynew)

    xnew = np.linspace(np.min(x), np.max(x), 161)
    for s in train_grad, valid_grad, veri_grad, test_grad:
        ynew = interpolate.interp1d(x, s)(xnew)
        ynew = ynew / ynew.sum()
        plt.plot(xnew, ynew)

    # save the (un-normalised) train gradient to a npy file
    np.save(args.extract_path + '/train.grad.npy', train_grad)

    # Five curves are drawn (the mel reference first), so the legend needs
    # five labels in drawing order; with four, every label is shifted by one.
    plt.legend(['Mel-scale', 'Train', 'Valid', 'Train Verify', 'Test'],
               loc='upper right', fontsize=24)
    pdf.savefig()
    pdf.close()

    # ---- Figure 2: mean input per frequency ----
    plt.figure(figsize=(8, 6))
    plt.title('Data distributions', fontsize=22)
    plt.xlabel('Frequency (Hz)', fontsize=16)
    plt.ylabel('Log Power (-)', fontsize=16)
    for s in train_input, valid_input, test_input:
        ynew = interpolate.interp1d(x, s)(xnew)
        plt.plot(xnew, ynew)
    plt.legend(['Train', 'Valid', 'Test'], loc='upper right', fontsize=16)
    plt.savefig(args.extract_path + "/inputs.freq.png")
    plt.show()

    # ---- Figure 3: first collected sample, input and |gradient| images ----
    plt.figure(figsize=(16, 8))
    plt.title('Data distributions in Time Axis', fontsize=22)
    plt.xlabel('Time', fontsize=16)
    plt.ylabel('Magnetitude', fontsize=16)

    data = time_data[0][0]
    grad = time_data[0][1]

    ax = plt.subplot(2, 1, 1)
    im = ax.imshow(data.transpose(), cmap='viridis', aspect='auto')
    plt.colorbar(im)  # show the colour scale

    ax = plt.subplot(2, 1, 2)
    grad_mean = np.abs(grad)
    im = ax.imshow(grad_mean.transpose(), cmap='viridis', aspect='auto')
    ax.set_xlim(0, len(grad_mean))
    plt.colorbar(im)  # show the colour scale

    plt.savefig(args.extract_path + "/inputs.time.png")
    plt.show()
    print('Completed!\n')
def _average_single_utt_stats(paths, feat_dim):
    """Average ``[mean(data), sum(|grad|), var(grad)]`` over all items in ``paths``.

    Each file holds ``(data, grad)`` tuples; statistics are taken over axis 0.
    """
    stats = np.zeros((3, feat_dim))  # rows: data mean / |grad| sum / grad var
    count = 0
    for path in paths:
        with open(str(path), 'rb') as handle:
            utterances = pickle.load(handle)
        for (data, grad) in utterances:
            stats[1] += np.sum(np.abs(grad), axis=0)
            stats[2] += np.var(grad, axis=0)
            stats[0] += np.mean(data, axis=0)
            count += 1
    return stats / count


def _average_pair_utt_stats(paths, feat_dim):
    """Same statistics as the single-utterance case, kept separately for a and b.

    Each file holds ``(label, grad_a, grad_b, data_a, data_b)`` tuples.
    """
    stats = np.zeros((3, 2, feat_dim))  # [statistic, utt_a / utt_b, feature]
    count = 0
    for path in paths:
        with open(str(path), 'rb') as handle:
            pairs = pickle.load(handle)
        for (label, grad_a, grad_b, data_a, data_b) in pairs:
            stats[0][0] += np.mean(data_a, axis=0)
            stats[0][1] += np.mean(data_b, axis=0)
            stats[1][0] += np.sum(np.abs(grad_a), axis=0)
            stats[1][1] += np.sum(np.abs(grad_b), axis=0)
            stats[2][0] += np.var(grad_a, axis=0)
            stats[2][1] += np.var(grad_b, axis=0)
            count += 1
    return stats / count


def main():
    """Plot gradient and input statistics for the files under ``args.extract_path``.

    The ``inputs.*.npy`` caches are loaded when ``inputs.train.npy`` exists;
    otherwise the statistics are computed from the ``*bin`` files and saved.
    Outputs: ``grad.veri.pdf``, ``grad.veri.npy`` and ``inputs.png``.
    """
    root = args.extract_path
    dir_path = pathlib.Path(root)
    print('Path is %s' % str(dir_path))

    train_npy = root + '/inputs.train.npy'
    valid_npy = root + '/inputs.valid.npy'
    test_npy = root + '/inputs.test.npy'
    veri_npy = root + '/inputs.veri.npy'

    if os.path.exists(train_npy):
        train_data = np.load(train_npy)
        valid_data = np.load(valid_npy)
        test_data = np.load(test_npy)
        veri_data = np.load(veri_npy)
    else:
        train_lst = list(dir_path.glob('*train*bin'))
        veri_lst = list(dir_path.glob('*ver*bin'))
        valid_lst = list(dir_path.glob('*valid*bin'))
        test_lst = list(dir_path.glob('*test*bin'))

        print('Train set extracting:')
        train_data = _average_single_utt_stats(train_lst, args.feat_dim)

        print('Valid set extracting:')
        valid_data = _average_single_utt_stats(valid_lst, args.feat_dim)

        print('Train verification set extracting:')
        veri_data = _average_pair_utt_stats(veri_lst, args.feat_dim)

        print('Test set extracting:')
        test_data = _average_pair_utt_stats(test_lst, args.feat_dim)

        print('Saving inputs in %s' % args.extract_path)
        np.save(train_npy, train_data)
        np.save(valid_npy, valid_data)
        np.save(veri_npy, veri_data)
        np.save(test_npy, test_data)

    # Row 0 holds the input means, row 1 the summed |gradient|.
    input_curves = (train_data[0], valid_data[0], test_data[0][0], test_data[0][1])
    veri_set_grad = veri_data[1][0] + veri_data[1][1]
    test_set_grad = test_data[1][0] + test_data[1][1]
    grad_curves = (train_data[1], valid_data[1], veri_set_grad, test_set_grad)

    x = np.arange(args.feat_dim) * 8000 / (args.feat_dim - 1)  # [0-8000]
    if args.acoustic_feature == 'fbank':
        x = mel2hz(np.linspace(0, 2840.0230467083188, args.feat_dim))

    # ---- Figure 1: normalised gradient weight per frequency ----
    pdf = PdfPages(root + '/grad.veri.pdf')
    plt.rc('font', family='Times New Roman')
    plt.figure(figsize=(12, 9))
    plt.xlabel('Frequency (Hz)', fontsize=24)
    plt.xticks(fontsize=22)
    plt.ylabel('Weight', fontsize=24)
    plt.yticks(fontsize=22)

    # Reference curve: inverse spacing of unit mel steps, normalised to sum 1.
    mel_hz = 700 * (10 ** (np.arange(0, 2840.0230467083188) / 2595.0) - 1)
    density = 1 / (mel_hz[1:] - mel_hz[:-1])
    lo, hi = np.min(mel_hz[1:]), np.max(mel_hz[1:])
    ref_x = np.arange(lo, hi, (hi - lo) / 161)
    ref_y = interpolate.interp1d(mel_hz[1:], density)(ref_x)
    plt.plot(ref_x, ref_y / ref_y.sum())

    grid = np.linspace(np.min(x), np.max(x), 161)
    for curve in grad_curves:
        resampled = interpolate.interp1d(x, curve)(grid)
        plt.plot(grid, resampled / resampled.sum())

    np.save(root + '/grad.veri.npy', veri_set_grad / veri_set_grad.sum())

    plt.legend(['Mel-scale', 'Train Set', 'Valid Set', 'train Verify Set', 'Test Set'],
               loc='upper right', fontsize=24)
    pdf.savefig()
    pdf.close()

    # ---- Figure 2: mean input per frequency ----
    plt.figure(figsize=(8, 6))
    plt.title('Data distributions', fontsize=22)
    plt.xlabel('Frequency (Hz)', fontsize=16)
    plt.ylabel('Log Power Energy (CMVN)', fontsize=16)
    for curve in input_curves:
        plt.plot(grid, interpolate.interp1d(x, curve)(grid))
    plt.legend(['Train', 'Valid', 'Test_a', 'Test_b'], loc='upper right', fontsize=16)
    plt.savefig(root + "/inputs.png")
    plt.show()

    print('Completed!\n')
def noise_suppressed_example(plot=False):
    """Suppress noise in one sample file using per-band dynamic gains.

    The gain of each frequency band is the square root of the clean-to-noisy
    band-energy ratio, computed on short windows so that it can react
    quickly. The noisy signal is passed through a set of parallel band-pass
    filters (an equalizer) whose outputs are weighted by these fast-changing
    gains and summed. The same principle is used to generate the truth gains
    for the training data (y_train).

    Writes ``_filtered_sample.wav`` and ``_noisy_sample.wav``.

    :param plot: when true, also plot the gains and the filter responses.
    """
    # Change these to select the file and its noise mixing level.
    nfilt = 20
    test_num = 1            # which file
    test_noise_level = 10   # noise level in dB: 0, 10 or 20, depending on the dataset

    clean_file = "MS-SNSD/CleanSpeech_training/clnsp{}.wav".format(test_num)
    noisy_file = "MS-SNSD/NoisySpeech_training/noisy{0}_SNRdb_{1}.0_clnsp{0}.wav".format(
        test_num, test_noise_level)

    rate, clean_sig = wav.read(clean_file)
    rate, noisy_sig = wav.read(noisy_file)
    clean_sig = clean_sig / 32768
    noisy_sig = noisy_sig / 32768

    # Energy of each frequency band, same analysis settings for both signals.
    band_settings = dict(winlen=0.032, winstep=0.032 / 2, nfilt=nfilt, nfft=512,
                         lowfreq=20, highfreq=8000, preemph=0)
    clean_band_eng, _ = fbank(clean_sig, rate, **band_settings)
    noisy_band_eng, _ = fbank(noisy_sig, rate, **band_settings)

    gains = np.sqrt(clean_band_eng / noisy_band_eng)
    if plot:
        plt.title("Gains")
        plt.plot(gains[:, :10])
        plt.show()

    # Convert the mel scale back to frequency bands.
    band_freq = mel2hz(get_mel_scale(nfilt=nfilt, lowfreq=20, highfreq=8000))
    band_frequency = band_freq[1:-1]  # the middle point of each band
    print('band frequency', band_frequency)

    # The noisy audio is passed to a set of parallel band-pass filters that
    # act like an audio equalizer, except that the gain of each band changes
    # very quickly so that noise is suppressed while speech is kept.
    print("denoising using IIR filter")
    b, a = iir_design(band_freq, rate)
    if plot:
        plot_frequency_respond(b, a)
    print("b", b)
    print("a", a)

    step = int(0.03125 * rate / 2)
    print("audio process step:", step)

    filtered_signal = np.zeros(len(noisy_sig))
    for band in range(len(b)):
        band_output = bandpass_filter_iir(noisy_sig, b[band].copy(), a[band].copy(),
                                          step, gains[:, band])
        filtered_signal += band_output
        print("filtering with frequency: ", band_frequency[band])

    # The bands overlap, so scale the sum down to avoid overflow when
    # converting back to int16.
    filtered_signal = np.clip(filtered_signal * 0.6, -1, 1)

    filtered_path = "_filtered_sample.wav"
    noisy_path = "_noisy_sample.wav"
    wav.write(filtered_path, rate, np.asarray(filtered_signal * 32767, dtype=np.int16))
    wav.write(noisy_path, rate, np.asarray(noisy_sig * 32767, dtype=np.int16))

    print("noisy signal is saved to:", noisy_path)
    print("filtered signal is saved to:", filtered_path)
if __name__ == "__main__":
    # Script entry point: run the demo, export the equalizer coefficients,
    # then start the dataset generation.

    # The example generates 2 files: the noisy speech and the noise-suppressed
    # speech. Open them with an audio player to hear what this energy-based
    # noise suppression sounds like.
    noise_suppressed_example()

    # Changing this changes the whole system, including the equalizer and the
    # RNN: it sets the number of filters in the equalizer, the number of MFCC
    # features and the number of RNN outputs. Choose from 10 ~ 30.
    num_filter = 20

    # Generate the filter coefficients.
    mel_scale = get_mel_scale(nfilt=num_filter, lowfreq=20, highfreq=8000)
    band_freq = mel2hz(mel_scale)
    b, a = iir_design(
        band_freq, 16000,
        order=1)  # orders above 2 are not stable with only float32 accuracy in C
    # The order passed to the header generator is derived from the length of
    # the first coefficient row.
    generate_filter_header(b,
                           a,
                           order=int(b[0].shape[-1] / 2),
                           filename='equalizer_coeff.h')
    # Plot the frequency response (disabled):
    #plot_frequency_respond(b, a)

    print('Reading noisy and clean speech files...')
    # Dataset generation starts from here.
    # Energy threshold for voice activity detection in clean speech.
    vad_energy_threashold = 0.1
def get_filterbanks(nfilt=20,
                    nfft=512,
                    samplerate=16000,
                    lowfreq=0,
                    highfreq=None,
                    filtertype='mel',
                    multi_weight=False):
    """Compute a triangular filterbank.

    The filters are stored in the rows, the columns correspond to fft bins.
    The filters are returned as an array of size nfilt * (nfft/2 + 1).

    :param nfilt: the number of filters in the filterbank, default 20.
    :param nfft: the FFT size. Default is 512.
    :param samplerate: the samplerate of the signal we are working with.
        Affects the filter spacing.
    :param lowfreq: lowest band edge of the filters, default 0 Hz.
    :param highfreq: highest band edge of the filters, default samplerate/2.
    :param filtertype: how the band edges are placed: ``'mel'``, ``'amel'``
        or ``'linear'`` spacing, or a ``'dnn...'`` variant that places the
        edges at equal steps of cumulative weight of a stored weight curve.
    :param multi_weight: when true, every filter is additionally multiplied
        by the max-normalised ``c.TIMIT_FIlTER_VAR`` curve.
    :returns: A numpy array of size nfilt * (nfft/2 + 1) containing the
        filterbank. Each row holds 1 filter.
    :raises ValueError: if ``filtertype`` is not a supported value.
    """
    highfreq = highfreq or samplerate / 2
    assert highfreq <= samplerate / 2, "highfreq is greater than samplerate/2"

    # Band edges are computed in Hz and converted to fft bin numbers.
    if filtertype == 'mel':
        # points evenly spaced in mels
        melpoints = np.linspace(hz2mel(lowfreq), hz2mel(highfreq), nfilt + 2)
        bin_edges = np.floor((nfft + 1) * mel2hz(melpoints) / samplerate)
    elif filtertype == 'amel':
        melpoints = np.linspace(hz2amel(lowfreq), hz2amel(highfreq), nfilt + 2)
        bin_edges = np.floor((nfft + 1) * amel2hz(melpoints) / samplerate)
    elif filtertype == 'linear':
        linearpoints = np.linspace(lowfreq, highfreq, nfilt + 2)
        bin_edges = np.floor((nfft + 1) * linearpoints / samplerate)
    elif filtertype.startswith('dnn'):
        # The stored weight curves are sampled at 161 points over 0..samplerate/2.
        x = np.arange(0, 161) * samplerate / 2 / 160
        if filtertype.endswith('timit.fix'):
            y = np.array(c.TIMIT_FIlTER_FIX)
        elif filtertype.endswith('timit.var'):
            y = np.array(c.TIMIT_FIlTER_VAR)
        elif filtertype.endswith('timit.mdv'):
            y = np.array(c.TIMIT_FIlTER_MDV)
        elif filtertype.endswith('libri.fix'):
            y = np.array(c.LIBRI_FILTER_FIX)
        elif filtertype.endswith('libri.var'):
            y = np.array(c.LIBRI_FILTER_VAR)
        elif filtertype.endswith('vox1.soft'):
            y = np.array(c.VOX_FILTER_SOFT)
        elif filtertype == 'dnn.vox1':
            y = np.array(c.VOX_FILTER)
        else:
            raise ValueError("unknown dnn filtertype: %r" % (filtertype,))

        f = interpolate.interp1d(x, y)
        x_new = np.arange(nfft // 2 + 1) * samplerate / 2 / (nfft // 2)
        lowfreq_idx = np.where(x_new >= lowfreq)[0]
        highfreq_idx = np.where(x_new <= highfreq)[0]

        ynew = f(x_new)  # interpolated weight for every fft bin
        # Zero the weight outside [lowfreq, highfreq].
        ynew[:int(lowfreq_idx[0])] = 0
        if highfreq_idx[-1] < len(x_new) - 1:
            # The upper cut must use the bin index, not the scalar highfreq.
            ynew[int(highfreq_idx[-1] + 1):] = 0

        weight = ynew / np.sum(ynew)

        # Inner edge j sits one bin before the cumulative weight first
        # exceeds (j + 1) / (nfilt + 1).
        bin_edges = [lowfreq_idx[0]]
        for j in range(nfilt):
            threshold = (j + 1) / (nfilt + 1)
            num_wei = 0.
            for i in range(nfft // 2 + 1):
                num_wei += weight[i]
                if num_wei > threshold:
                    bin_edges.append(i - 1)
                    break
        bin_edges.append(highfreq_idx[-1])
    else:
        raise ValueError("unknown filtertype: %r" % (filtertype,))

    # Triangle j rises from edge j to edge j + 1 and falls to edge j + 2.
    fbank = np.zeros([nfilt, nfft // 2 + 1])
    for j in range(0, nfilt):
        left, centre, right = bin_edges[j], bin_edges[j + 1], bin_edges[j + 2]
        for i in range(int(left), int(centre)):
            fbank[j, i] = (i - left) / (centre - left)
        for i in range(int(centre), int(right)):
            fbank[j, i] = (right - i) / (right - centre)

    if multi_weight:
        # NOTE(review): this broadcasts only when the weight curve has
        # nfft // 2 + 1 entries -- confirm the intended nfft.
        y = np.array(c.TIMIT_FIlTER_VAR)
        fbank = fbank * (y / y.max())

    return fbank
csf_ssc = csf.ssc(audio) assert (np.shape(psf_ssc) == np.shape(csf_ssc)) error2d(psf_ssc, csf_ssc) print '' print 'hz2mel' print '======' assert (get_error(psf.hz2mel(8000), csf.hz2mel(8000)) <= acceptable_error) assert (get_error(psf.hz2mel(16000), csf.hz2mel(16000)) <= acceptable_error) assert (get_error(csf.mel2hz(csf.hz2mel(8000)), 8000) <= acceptable_error) print ' ✓' print '' print 'mel2hz' print '======' assert (get_error(psf.mel2hz(2595), csf.mel2hz(2595)) <= acceptable_error) assert (get_error(csf.mel2hz(5190), csf.mel2hz(5190)) <= acceptable_error) assert (get_error(csf.hz2mel(csf.mel2hz(2595)), 2595) <= acceptable_error) print ' ✓' print '' print 'get_filterbanks' print '===============' psf_filterbanks = psf.get_filterbanks() csf_filterbanks = csf.get_filterbanks() assert (np.shape(psf_filterbanks) == np.shape(csf_filterbanks)) error2d(psf_filterbanks, csf_filterbanks) print '' print 'lifter' print '======'
def getmelpoint(_n_filt=N_FILT):
    """Return ``_n_filt`` mel-spaced points, converted back to Hz.

    ``_n_filt + 1`` points are spaced evenly between ``hz2mel(0)`` and
    ``hz2mel(SAMPLING_RATE / 2)``; the first one is dropped before returning.
    """
    mel_low, mel_high = hz2mel(0), hz2mel(SAMPLING_RATE / 2)
    mel_grid = np.linspace(mel_low, mel_high, _n_filt + 1)
    hz_grid = mel2hz(mel_grid)
    return hz_grid[1:_n_filt + 1]
self.num_filter = num_filter self.sr = sr <<<<<<< HEAD ======= self.exp = exp self.filter_fix = filter_fix requires_grad = not filter_fix >>>>>>> Server/Server input_freq = np.linspace(0, self.sr / 2, input_dim) self.input_freq = nn.Parameter(torch.from_numpy(input_freq).expand(num_filter, input_dim).float(), requires_grad=False) centers = np.linspace(0, hz2mel(sr / 2), num_filter + 2) centers = mel2hz(centers) <<<<<<< HEAD self.frequency_center = nn.Parameter(torch.from_numpy(centers[1:-1]).float().reshape(num_filter, 1)) ======= self.frequency_center = nn.Parameter(torch.from_numpy(centers[1:-1]).float().reshape(num_filter, 1), requires_grad=requires_grad) >>>>>>> Server/Server bandwidth = [] for i in range(2, len(centers)): bandwidth.append(centers[i] - centers[i - 1]) <<<<<<< HEAD self.bandwidth = nn.Parameter(torch.tensor(bandwidth).reshape(num_filter, 1).float()) self.gain = nn.Parameter(torch.ones(num_filter, dtype=torch.float32).reshape(num_filter, 1)) def forward(self, input):