def process_and_serialize(data_type):
    """Slice, pre-emphasize and serialize paired clean/noisy audio.

    For each file in the clean folder (and its identically-named noisy
    counterpart) the audio is cut into overlapping windows, pre-emphasized,
    and two artifacts are written per slice: a stacked (clean, noisy) signal
    pair and a stacked (clean LPS, noisy LPS, noisy phase) spectrogram
    triple.  File naming goes [original_file]_[slice_number].npy, e.g.
    p293_154.wav_5.npy is the 5th slice of p293_154.wav.

    Args:
        data_type: 'train' selects the training folders, anything else the
            test folders (folder paths are module-level constants).
    """
    stride = 0.5
    is_train = data_type == 'train'
    clean_folder = clean_train_folder if is_train else clean_test_folder
    noisy_folder = noisy_train_folder if is_train else noisy_test_folder
    signal_save_folder = signal_train_folder if is_train else signal_test_folder
    spec_save_folder = spec_train_folder if is_train else spec_test_folder
    # walk through the path, slice the audio files, and save the serialized result
    for root, dirs, files in os.walk(clean_folder):
        if not files:
            continue
        progress = tqdm(files, desc='Serialize and down-sample {} audios'.format(data_type))
        for filename in progress:
            # slice both the clean signal and the matching noisy signal
            clean_sliced = slice_signal(os.path.join(clean_folder, filename),
                                        window_size, stride, sample_rate)
            noisy_sliced = slice_signal(os.path.join(noisy_folder, filename),
                                        window_size, stride, sample_rate)
            for idx, (clean_slice, noisy_slice) in enumerate(zip(clean_sliced, noisy_sliced)):
                clean_empha = emphasis(clean_slice, pre=True)
                noisy_empha = emphasis(noisy_slice, pre=True)
                # save the pre-emphasized signal pair
                np.save(os.path.join(signal_save_folder, '{}_{}'.format(filename, idx)),
                        arr=np.array([clean_empha, noisy_empha]))
                # save the spectrogram triple; the phase is taken from the
                # raw (non-emphasized) noisy slice
                clean_spec = log_power_spectrogram(signal_to_spectrogram(clean_empha))
                noisy_spec = log_power_spectrogram(signal_to_spectrogram(noisy_empha))
                noisy_phase = get_phase(signal_to_spectrogram(noisy_slice))
                np.save(os.path.join(spec_save_folder, '{}_{}'.format(filename, idx)),
                        arr=np.array([clean_spec, noisy_spec, noisy_phase]))
def process_and_serialize(data_type):
    """Slice, pre-emphasize and save paired clean/noisy audio as .wav files.

    For each file in the clean folder (and its identically-named noisy
    counterpart) the audio is cut into overlapping windows, pre-emphasized,
    and each slice is written to the clean/noisy save folder as
    [original_file]_[slice_number].wav at 16 kHz.

    Args:
        data_type: 'train' selects the training folders, anything else the
            test folders (folder paths are module-level constants).
    """
    stride = 0.5
    if data_type == 'train':
        folders = (clean_train_folder, noisy_train_folder,
                   signal_train_clean_folder, signal_train_noisy_folder)
    else:
        folders = (clean_test_folder, noisy_test_folder,
                   signal_test_clean_folder, signal_test_noisy_folder)
    clean_folder, noisy_folder, clean_save_folder, noisy_save_folder = folders
    # walk through the path, slice the audio files, and save the serialized result
    for root, dirs, files in os.walk(clean_folder):
        if not files:
            continue
        progress = tqdm(files, desc='Serialize and down-sample {} audios'.format(data_type))
        for filename in progress:
            # slice both the clean signal and the matching noisy signal
            clean_sliced = slice_signal(os.path.join(clean_folder, filename),
                                        window_size, stride, sample_rate)
            noisy_sliced = slice_signal(os.path.join(noisy_folder, filename),
                                        window_size, stride, sample_rate)
            stem = filename.replace('.wav', '')
            for idx, (clean_slice, noisy_slice) in enumerate(zip(clean_sliced, noisy_sliced)):
                # NOTE(review): librosa.output.write_wav was removed in
                # librosa 0.8 — confirm the pinned librosa version still
                # provides it (otherwise soundfile.write is the replacement).
                librosa.output.write_wav(
                    os.path.join(clean_save_folder, '{}_{}.wav'.format(stem, idx)),
                    emphasis(clean_slice, pre=True), sr=16000)
                librosa.output.write_wav(
                    os.path.join(noisy_save_folder, '{}_{}.wav'.format(stem, idx)),
                    emphasis(noisy_slice, pre=True), sr=16000)
def __getitem__(self, idx):
    """Load the serialized (clean, noisy) pair at index *idx*.

    Loads the 2 x window .npy slice pair, applies pre-emphasis
    (emph_coeff=0.95), and returns each signal as a float32 array of
    shape (1, window).

    Returns:
        (noisy, clean) — note the noisy signal comes first.
    """
    raw_pair = np.load(self.filenames[idx])
    emphasized = emphasis(raw_pair[np.newaxis, :, :], emph_coeff=0.95).reshape(2, -1)
    # row 0 is the clean signal, row 1 the noisy one
    clean_signal = emphasized[0].reshape(1, -1).astype('float32')
    noisy_signal = emphasized[1].reshape(1, -1).astype('float32')
    return noisy_signal, clean_signal
def _test_and_save(self, model, epoch):
    """Run the model over the test set, write enhanced audio, and checkpoint.

    Every noisy test utterance is passed through ``model``; the output
    (a spectrogram or a raw waveform depending on
    ``self.using_spectrogram``) is converted back to a time-domain signal,
    de-emphasized, and written as a 16 kHz .wav file under
    '<model_name>_results/results/<criterion_name>'.  Afterwards the
    model's state dict is saved under '<model_name>_results/model'.

    Args:
        model: network to evaluate (switched to eval mode here).
        epoch: current epoch number, embedded in the checkpoint file name.

    Raises:
        NotImplementedError: for spectrogram models that are neither
            GAN/auto-encoder nor MLP flavoured.
    """
    print('Saving test sample and model...')
    with torch.no_grad():
        model.eval()
        test_bar = tqdm(self.test_data_loader, desc='Test model and save generated audios')
        for test_file_name, clean_t, noisy_t in test_bar:
            # calculate the noisy phase for synthesis
            # 1 x 16384 -> 16384 -> 257 x 1025
            spec = signal_to_spectrogram(noisy_t.squeeze().numpy())
            phase = get_phase(spec)
            # prepare data to feed the model
            test_data = (clean_t, noisy_t)
            test_data = self._prepare_train_data(test_data)
            # only the noisy data is needed; its position in the prepared
            # tuple depends on the model flavour
            if self.model_name == 'adversarial_MLP':
                noisy_data = test_data[2]
            else:
                noisy_data = test_data[1]
            if self.using_spectrogram:
                if 'GAN' in self.model_name or 'auto' in self.model_name:
                    # 1 x 1 x 257 x 1025 -> 257 x 1025
                    fake_spec = model(noisy_data).detach().cpu().squeeze().numpy()
                elif 'MLP' in self.model_name:
                    # 1 x 1025 x 257 -> 1025 x 257 -> 257 x 1025
                    fake_spec = model(noisy_data).detach().cpu().squeeze().numpy()
                    fake_spec = fake_spec.T
                else:
                    # BUGFIX: was `raise NotImplemented`, which raises a
                    # TypeError because NotImplemented is a constant, not an
                    # exception class.
                    raise NotImplementedError
                # log-power spectrum back to magnitude
                fake_spec = lps_to_mag(fake_spec)
                # magnitude back to complex using the noisy phase
                fake_spec = magnitude_to_complex(fake_spec, phase)
                # back to a 16384-sample audio signal
                fake_speech = librosa.istft(fake_spec, win_length=32, hop_length=16,
                                            window='hann')
            else:
                # waveform model: output is already the 16384-sample signal
                fake_speech = model(noisy_data).detach().cpu().squeeze().numpy()
            save_path = os.path.join(f'{self.model_name}_results', 'results',
                                     f'{self.criterion_name}')
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            # undo the pre-emphasis applied during serialization
            fake_speech = emphasis(fake_speech, emph_coeff=0.95, pre=False)
            # save speech as a .wav file named after the test utterance
            file_name = os.path.join(save_path,
                                     '{}.wav'.format(test_file_name[0].replace('.wav', '')))
            wavfile.write(file_name, 16000, fake_speech)
    # save the model parameters for each epoch
    save_path = os.path.join(f'{self.model_name}_results', 'model')
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    model_path = os.path.join(save_path,
                              f'{self.model_name}-e{epoch}-{self.criterion_name}.pt')
    torch.save(model.state_dict(), model_path)
    print(f'model saved at {model_path}')
    return
# --- tail of the training loop (fragment: the enclosing epoch/batch loop
# headers are above this chunk; the set_description call belongs to the
# per-batch loop, the TEST/save sections run once per epoch) ---

# backprop + optimize the generator
g_loss.backward()
g_optimizer.step()
# BUGFIX: `tensor.data[0]` raises an IndexError on 0-dim loss tensors since
# PyTorch 0.4 — read the scalar with `.item()` instead.
train_bar.set_description(
    'Epoch {}: d_clean_loss {:.4f}, d_noisy_loss {:.4f}, g_loss {:.4f}, g_conditional_loss {:.4f}'
    .format(epoch + 1, clean_loss.item(), noisy_loss.item(),
            g_loss.item(), g_cond_loss.item()))

# TEST model: generate enhanced audio for every noisy test batch
test_bar = tqdm(test_data_loader, desc='Test model and save generated audios')
for test_file_names, test_noisy in test_bar:
    # fresh latent noise per batch (NOTE(review): Variable is a no-op
    # wrapper since PyTorch 0.4 and could be dropped)
    z = nn.init.normal_(torch.Tensor(test_noisy.size(0), 1024, 8))
    if torch.cuda.is_available():
        test_noisy, z = test_noisy.cuda(), z.cuda()
    test_noisy, z = Variable(test_noisy), Variable(z)
    fake_speech = generator(test_noisy, z).data.cpu().numpy()  # convert to numpy array
    # undo the pre-emphasis applied at serialization time
    fake_speech = emphasis(fake_speech, emph_coeff=0.95, pre=False)
    for idx in range(fake_speech.shape[0]):
        generated_sample = fake_speech[idx]
        file_name = os.path.join(
            'results',
            '{}_e{}.wav'.format(test_file_names[idx].replace('.npy', ''), epoch + 1))
        wavfile.write(file_name, sample_rate, generated_sample.T)

# save the model parameters for each epoch
g_path = os.path.join('epochs', 'generator-{}.pkl'.format(epoch + 1))
d_path = os.path.join('epochs', 'discriminator-{}.pkl'.format(epoch + 1))
torch.save(generator.state_dict(), g_path)
torch.save(discriminator.state_dict(), d_path)
# Command-line inference: enhance one noisy .wav with a trained generator
# and write 'enhanced_<name>.wav' next to the input file.
opt = parser.parse_args()
FILE_NAME = opt.file_name
EPOCH_NAME = opt.epoch_name

# load trained generator weights (checkpoint lives under 'epochs/')
generator = Generator()
generator.load_state_dict(
    torch.load('epochs/' + EPOCH_NAME, map_location='cpu'))
if torch.cuda.is_available():
    generator.cuda()

# stride 1 (no overlap) so the enhanced slices can be concatenated directly
noisy_slices = slice_signal(FILE_NAME, window_size, 1, sample_rate)
enhanced_speech = []
for noisy_slice in tqdm(noisy_slices, desc='Generate enhanced audio'):
    # latent noise vector for the GAN generator
    z = nn.init.normal_(torch.Tensor(1, 1024, 8))
    # pre-emphasize and shape to 1 x 1 x window for the conv generator
    # (NOTE(review): assumes emphasis() defaults to pre=True — confirm)
    noisy_slice = torch.from_numpy(
        emphasis(noisy_slice[np.newaxis, np.newaxis, :])).type(torch.FloatTensor)
    if torch.cuda.is_available():
        noisy_slice, z = noisy_slice.cuda(), z.cuda()
    noisy_slice, z = Variable(noisy_slice), Variable(z)
    generated_speech = generator(noisy_slice, z).data.cpu().numpy()
    # de-emphasize the generated audio and flatten to a 1-D signal
    generated_speech = emphasis(generated_speech, emph_coeff=0.95, pre=False)
    generated_speech = generated_speech.reshape(-1)
    enhanced_speech.append(generated_speech)

# concatenate all slices into a single 1 x N signal and write it out
enhanced_speech = np.array(enhanced_speech).reshape(1, -1)
file_name = os.path.join(
    os.path.dirname(FILE_NAME),
    'enhanced_{}.wav'.format(os.path.basename(FILE_NAME).split('.')[0]))
wavfile.write(file_name, sample_rate, enhanced_speech.T)