Example #1
def process_and_serialize(data_type):
    """
    Serialize, down-sample the sliced signals and save on separate folder.
    """
    stride = 0.5

    if data_type == 'train':
        clean_folder = clean_train_folder
        noisy_folder = noisy_train_folder
        signal_save_folder = signal_train_folder
        spec_save_folder = spec_train_folder
    else:
        clean_folder = clean_test_folder
        noisy_folder = noisy_test_folder
        signal_save_folder = signal_test_folder
        spec_save_folder = spec_test_folder

    # walk through the path, slice the audio file, and save the serialized result
    for root, dirs, files in os.walk(clean_folder):
        if len(files) == 0:
            continue
        for filename in tqdm(
                files,
                desc='Serialize and down-sample {} audios'.format(data_type)):
            clean_file = os.path.join(clean_folder, filename)
            noisy_file = os.path.join(noisy_folder, filename)
            # slice both clean signal and noisy signal
            clean_sliced = slice_signal(clean_file, window_size, stride,
                                        sample_rate)
            noisy_sliced = slice_signal(noisy_file, window_size, stride,
                                        sample_rate)
            # serialize - file format goes [original_file]_[slice_number].npy
            # ex) p293_154.wav_5.npy denotes 5th slice of p293_154.wav file
            for idx, slice_tuple in enumerate(zip(clean_sliced, noisy_sliced)):
                clean_empha = emphasis(slice_tuple[0], pre=True)
                noisy_empha = emphasis(slice_tuple[1], pre=True)
                # save the signal pair
                signal_pair = np.array([clean_empha, noisy_empha])
                np.save(os.path.join(signal_save_folder,
                                     '{}_{}'.format(filename, idx)),
                        arr=signal_pair)
                # save the spectrogram pair
                clean_spec = log_power_spectrogram(
                    signal_to_spectrogram(clean_empha))
                noisy_spec = log_power_spectrogram(
                    signal_to_spectrogram(noisy_empha))
                noisy_phase = get_phase(signal_to_spectrogram(slice_tuple[1]))
                spec_pair = np.array([clean_spec, noisy_spec, noisy_phase])
                np.save(os.path.join(spec_save_folder,
                                     '{}_{}'.format(filename, idx)),
                        arr=spec_pair)
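
All of these examples route audio through `emphasis`, whose definition is not shown on this page. Below is a minimal 1-D sketch of the usual first-order pre-/de-emphasis filter pair, assuming the standard formulation with the `emph_coeff=0.95` seen in the calls above (the actual projects apply it over batched, multi-channel arrays):

from scipy.signal import lfilter

def emphasis(signal, emph_coeff=0.95, pre=True):
    # pre-emphasis:  y[n] = x[n] - coeff * x[n-1]  (boosts high frequencies)
    # de-emphasis:   the inverse IIR filter, undoing the boost after synthesis
    if pre:
        return lfilter([1, -emph_coeff], [1], signal)
    return lfilter([1], [1, -emph_coeff], signal)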
Example #2
def process_and_serialize(data_type):
    """
    Serialize, down-sample the sliced signals and save on separate folder.
    """
    stride = 0.5

    if data_type == 'train':
        clean_folder = clean_train_folder
        noisy_folder = noisy_train_folder
        clean_save_folder = signal_train_clean_folder
        noisy_save_folder = signal_train_noisy_folder

    else:
        clean_folder = clean_test_folder
        noisy_folder = noisy_test_folder
        clean_save_folder = signal_test_clean_folder
        noisy_save_folder = signal_test_noisy_folder

    # walk through the path, slice the audio file, and save the serialized result
    for root, dirs, files in os.walk(clean_folder):
        if len(files) == 0:
            continue
        for filename in tqdm(
                files,
                desc='Serialize and down-sample {} audios'.format(data_type)):
            clean_file = os.path.join(clean_folder, filename)
            noisy_file = os.path.join(noisy_folder, filename)
            # slice both clean signal and noisy signal
            clean_sliced = slice_signal(clean_file, window_size, stride,
                                        sample_rate)
            noisy_sliced = slice_signal(noisy_file, window_size, stride,
                                        sample_rate)
            # serialize - file format goes [original_file]_[slice_number].npy
            # ex) p293_154.wav_5.npy denotes 5th slice of p293_154.wav file
            for idx, slice_tuple in enumerate(zip(clean_sliced, noisy_sliced)):
                clean_empha = emphasis(slice_tuple[0], pre=True)
                noisy_empha = emphasis(slice_tuple[1], pre=True)
                # save the signal (note: librosa.output.write_wav was removed
                # in librosa 0.8; use soundfile.write on newer versions)
                librosa.output.write_wav(os.path.join(
                    clean_save_folder,
                    '{}_{}.wav'.format(filename.replace('.wav', ''), idx)),
                                         clean_empha,
                                         sr=16000)
                librosa.output.write_wav(os.path.join(
                    noisy_save_folder,
                    '{}_{}.wav'.format(filename.replace('.wav', ''), idx)),
                                         noisy_empha,
                                         sr=16000)
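
Both versions above also depend on `slice_signal`, which is not shown either. A plausible sketch, assuming it resamples the file with librosa and cuts it into fixed windows whose overlap is set by `stride` (0.5 above gives 50% overlap; stride 1, as in Example #6, gives non-overlapping windows):

import librosa

def slice_signal(file, window_size, stride, sample_rate):
    # load (and implicitly down-sample) the file at the target rate
    wav, _ = librosa.load(file, sr=sample_rate)
    hop = int(window_size * stride)
    # fixed-length windows; a trailing remainder shorter than a window is dropped
    return [wav[start:start + window_size]
            for start in range(0, len(wav) - window_size + 1, hop)]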
Example #3
    def __getitem__(self, idx):
        # each serialized .npy file holds a (2, N) array: [clean, noisy]
        pair = np.load(self.filenames[idx])
        # pre-emphasize both signals in one call, then split the pair
        pair = emphasis(pair[np.newaxis, :, :], emph_coeff=0.95).reshape(2, -1)
        clean = pair[0].reshape(1, -1).astype('float32')
        noisy = pair[1].reshape(1, -1).astype('float32')

        return noisy, clean
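
For context, a minimal sketch of the Dataset class a `__getitem__` like this would belong to; the constructor, class name, and folder handling are assumptions, only the per-item logic comes from the snippet:

import os
import numpy as np
from torch.utils.data import Dataset, DataLoader

class PairDataset(Dataset):  # hypothetical name
    def __init__(self, data_folder):
        # one .npy file per slice, each holding a (2, N) [clean, noisy] array
        self.filenames = [os.path.join(data_folder, f)
                          for f in os.listdir(data_folder) if f.endswith('.npy')]

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, idx):
        pair = np.load(self.filenames[idx])
        pair = emphasis(pair[np.newaxis, :, :], emph_coeff=0.95).reshape(2, -1)
        clean = pair[0].reshape(1, -1).astype('float32')
        noisy = pair[1].reshape(1, -1).astype('float32')
        return noisy, clean

# usage (folder name is a guess):
# loader = DataLoader(PairDataset('serialized_train_data'), batch_size=50, shuffle=True)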
Example #4
    def _test_and_save(self, model, epoch):
        print('Saving test sample and model...')
        with torch.no_grad():
            model.eval()
            test_bar = tqdm(self.test_data_loader, desc='Test model and save generated audios')
            for test_file_name, clean_t, noisy_t in test_bar:
                # calculate phase for synthesis
                # 1 x 16384 -> 16384 -> 257 x 1025
                spec = signal_to_spectrogram(noisy_t.squeeze().numpy())
                phase = get_phase(spec)
                # prepare data to feed model
                test_data = (clean_t, noisy_t)
                test_data = self._prepare_train_data(test_data)
                # only need noisy data
                if self.model_name == 'adversarial_MLP':
                    noisy_data = test_data[2]
                else:
                    noisy_data = test_data[1]

                if self.using_spectrogram:
                    if 'GAN' in self.model_name or 'auto' in self.model_name:
                        # 1 x 1 x 257 x 1025 -> 257 x 1025
                        fake_spec = model(noisy_data).detach().cpu().squeeze().numpy()
                    elif 'MLP' in self.model_name:
                        # 1 x 1025 x 257 -> 1025 x 257 -> 257 x 1025
                        fake_spec = model(noisy_data).detach().cpu().squeeze().numpy()
                        fake_spec = fake_spec.T
                    else:
                        raise NotImplementedError

                    # log_power back to magnitude
                    fake_spec = lps_to_mag(fake_spec)
                    # magnitude back to complex
                    fake_spec = magnitude_to_complex(fake_spec, phase)
                    # back to audio signal
                    # 16384
                    fake_speech = librosa.istft(fake_spec, win_length=32, hop_length=16, window='hann')

                else:
                    # 16384
                    fake_speech = model(noisy_data).detach().cpu().squeeze().numpy()

                save_path = os.path.join(f'{self.model_name}_results', 'results', f'{self.criterion_name}')
                if not os.path.exists(save_path):
                    os.makedirs(save_path)
                # de-emphasis
                fake_speech = emphasis(fake_speech, emph_coeff=0.95, pre=False)
                # save speech as .wav file
                file_name = os.path.join(save_path, '{}.wav'.format(test_file_name[0].replace('.wav', '')))
                wavfile.write(file_name, 16000, fake_speech)

            # save the model parameters for each epoch
            save_path = os.path.join(f'{self.model_name}_results', 'model')
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            model_path = os.path.join(save_path, f'{self.model_name}-e{epoch}-{self.criterion_name}.pt')
            torch.save(model.state_dict(), model_path)
            print(f'model saved at {model_path}')
            return
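
Example #4 leans on several spectrogram helpers that are not shown (`signal_to_spectrogram`, `log_power_spectrogram`, `lps_to_mag`, `get_phase`, `magnitude_to_complex`). A sketch of mutually consistent definitions, with n_fft=512 inferred from the 257-bin shapes in the comments and win_length/hop_length copied from the istft call:

import numpy as np
import librosa

def signal_to_spectrogram(signal, n_fft=512, win_length=32, hop_length=16):
    # 16384 samples -> complex (257, 1025) matrix, matching the shapes above
    return librosa.stft(signal, n_fft=n_fft, win_length=win_length,
                        hop_length=hop_length, window='hann')

def log_power_spectrogram(spec, eps=1e-10):
    return np.log(np.abs(spec) ** 2 + eps)

def lps_to_mag(lps):
    # inverse of log_power_spectrogram (up to eps)
    return np.sqrt(np.exp(lps))

def get_phase(spec):
    return np.angle(spec)

def magnitude_to_complex(mag, phase):
    return mag * np.exp(1j * phase)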
Example #5
File: main.py  Project: yhgon/segan-pyt
            # backprop + optimize
            g_loss.backward()
            g_optimizer.step()

            train_bar.set_description(
                'Epoch {}: d_clean_loss {:.4f}, d_noisy_loss {:.4f}, g_loss {:.4f}, g_conditional_loss {:.4f}'
                    .format(epoch + 1, clean_loss.item(), noisy_loss.item(), g_loss.item(), g_cond_loss.item()))

        # TEST model
        test_bar = tqdm(test_data_loader, desc='Test model and save generated audios')
        for test_file_names, test_noisy in test_bar:
            z = nn.init.normal_(torch.Tensor(test_noisy.size(0), 1024, 8))
            if torch.cuda.is_available():
                test_noisy, z = test_noisy.cuda(), z.cuda()
            test_noisy, z = Variable(test_noisy), Variable(z)  # Variable is a no-op in PyTorch >= 0.4
            fake_speech = generator(test_noisy, z).data.cpu().numpy()  # convert to numpy array
            fake_speech = emphasis(fake_speech, emph_coeff=0.95, pre=False)

            for idx in range(fake_speech.shape[0]):
                generated_sample = fake_speech[idx]
                file_name = os.path.join('results',
                                         '{}_e{}.wav'.format(test_file_names[idx].replace('.npy', ''), epoch + 1))
                wavfile.write(file_name, sample_rate, generated_sample.T)

        # save the model parameters for each epoch
        g_path = os.path.join('epochs', 'generator-{}.pkl'.format(epoch + 1))
        d_path = os.path.join('epochs', 'discriminator-{}.pkl'.format(epoch + 1))
        torch.save(generator.state_dict(), g_path)
        torch.save(discriminator.state_dict(), d_path)
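
Example #5 picks up just after the generator losses are computed. In a SEGAN-style setup, `g_loss` combines an LSGAN term on the discriminator's verdict for the enhanced speech with `g_cond_loss`, a weighted L1 distance to the clean target. A hedged sketch of that step (the discriminator call signature and the 100.0 weight are assumptions, not taken from this snippet):

import torch

# hypothetical reconstruction of the loss step preceding g_loss.backward()
fake_speech = generator(noisy, z)
d_fake = discriminator(fake_speech, noisy)                        # condition on the noisy input
g_adv_loss = 0.5 * torch.mean((d_fake - 1.0) ** 2)                # LSGAN generator objective
g_cond_loss = 100.0 * torch.mean(torch.abs(fake_speech - clean))  # L1 toward clean speech
g_loss = g_adv_loss + g_cond_loss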
Example #6
    opt = parser.parse_args()
    FILE_NAME = opt.file_name
    EPOCH_NAME = opt.epoch_name

    generator = Generator()
    generator.load_state_dict(
        torch.load('epochs/' + EPOCH_NAME, map_location='cpu'))
    if torch.cuda.is_available():
        generator.cuda()

    # stride = 1 -> non-overlapping slices, so they can simply be concatenated later
    noisy_slices = slice_signal(FILE_NAME, window_size, 1, sample_rate)
    enhanced_speech = []
    for noisy_slice in tqdm(noisy_slices, desc='Generate enhanced audio'):
        z = nn.init.normal_(torch.Tensor(1, 1024, 8))
        noisy_slice = torch.from_numpy(
            emphasis(noisy_slice[np.newaxis, np.newaxis, :])).type(torch.FloatTensor)
        if torch.cuda.is_available():
            noisy_slice, z = noisy_slice.cuda(), z.cuda()
        noisy_slice, z = Variable(noisy_slice), Variable(z)
        generated_speech = generator(noisy_slice, z).data.cpu().numpy()
        generated_speech = emphasis(generated_speech,
                                    emph_coeff=0.95,
                                    pre=False)
        generated_speech = generated_speech.reshape(-1)
        enhanced_speech.append(generated_speech)

    enhanced_speech = np.array(enhanced_speech).reshape(1, -1)
    file_name = os.path.join(
        os.path.dirname(FILE_NAME),
        'enhanced_{}.wav'.format(os.path.basename(FILE_NAME).split('.')[0]))
    wavfile.write(file_name, sample_rate, enhanced_speech.T)
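
The `opt = parser.parse_args()` line implies an argparse block above the snippet; a plausible reconstruction, with flag names inferred from `opt.file_name` and `opt.epoch_name` (the description and help texts are guesses):

import argparse

parser = argparse.ArgumentParser(description='Enhance a single noisy audio file')
parser.add_argument('--file_name', type=str, required=True,
                    help='noisy .wav file to enhance')
parser.add_argument('--epoch_name', type=str, required=True,
                    help='generator checkpoint under epochs/, e.g. generator-80.pkl')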