Example No. 1
def create_rules_for_mixing_speech_with_noises(workspace, speech_dir, noise_dir, data_type, magnification):
    """Create csv containing mixture information.

    Each row in the .csv file contains [speech_file_name, noise_file_name, noise_begin, noise_end].

    Args:
      workspace: str, path of workspace.
      speech_dir: str, path of speech data.
      noise_dir: str, path of noise data.
      data_type: str, 'train' | 'test'.
      magnification: int, only used when data_type='train'; the number of
          noises randomly selected to mix with each speech clip. E.g., with
          magnification=3, 4620 speech clips yield 4620*3 mixtures.
          magnification should not be larger than the number of distinct noises.
    """
    time_start = time.time()

    random_state = np.random.RandomState(42)

    rules_dir = os.path.join(workspace, "mixing_rules")
    create_directory(rules_dir)

    rules_filename = os.path.join(rules_dir, "{}.csv".format(data_type))
    with open(rules_filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["speech_file_name", "noise_file_name", "noise_begin", "noise_end"])

        noise_paths = glob.glob(os.path.join(noise_dir, "*.wav"))
        speech_paths = glob.glob(os.path.join(speech_dir, "*.wav"))

        for speech_path in speech_paths:
            (speech_audio, _) = read_audio(speech_path)

            # For training data, mix each speech with `magnification` randomly picked noises.
            # For test data, mix each speech with all noises.
            if data_type == "train":
                selected_noise_paths = random_state.choice(noise_paths, size=magnification, replace=False)
            else:
                selected_noise_paths = noise_paths

            for noise_path in selected_noise_paths:
                (noise_audio, _) = read_audio(noise_path)

                if noise_audio.shape[0] <= speech_audio.shape[0]:
                    noise_begin = 0
                    noise_end = noise_audio.shape[0]
                else:
                    # If noise longer than speech then randomly select a segment of noise.
                    noise_begin = random_state.randint(0, noise_audio.shape[0] - speech_audio.shape[0], size=1)[0]
                    noise_end = noise_begin + speech_audio.shape[0]

                writer.writerow([os.path.basename(speech_path), os.path.basename(noise_path), noise_begin, noise_end])

    print()
    print("Mixing clean {} speech with noises time: {}".format(data_type, time.time() - time_start))
    print()
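A minimal usage sketch for the function above (the workspace and directory paths below are assumptions, not taken from the snippet):

# Hypothetical paths; adjust to your own data layout.
workspace = "workspace"
speech_train_dir = "data/train_speech/"
noise_train_dir = "data/train_noise/"

# Writes workspace/mixing_rules/train.csv with one row per (speech, noise) pair,
# drawing 3 noises for each training utterance.
create_rules_for_mixing_speech_with_noises(workspace, speech_train_dir,
                                           noise_train_dir, data_type="train",
                                           magnification=3)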
Example No. 2
def extract_features(workspace, speech_to_enhance_dir, snr):
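    """Compute a complex spectrogram for every .wav file found in
    <workspace>/<speech_to_enhance_dir> and pickle [spectrogram, base_name] to
    <workspace>/data/speech_to_enhance/features/spectrogram/<snr>db/<base_name>.pickle.

    Args:
      workspace: str, path of workspace.
      speech_to_enhance_dir: str, directory (relative to workspace) holding the audio to enhance.
      snr: float, only used to name the output directory.
    """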
    time_start = time.time()

    sample_rate = cfg.sample_rate

    audios_to_enhance_dir = os.path.join(workspace, speech_to_enhance_dir)
    for audio_id, audio_path in enumerate(
            glob.glob(audios_to_enhance_dir + "/*.wav")):
        speech_audio = read_audio(audio_path, target_fs=sample_rate)[0]

        speech_audio_complex_spectrogram = calculate_spectrogram(
            speech_audio,
            mode="complex",
            window_size=cfg.n_window,
            n_overlap=cfg.n_overlap)

        # Save features.
        features = [
            speech_audio_complex_spectrogram,
            os.path.basename(audio_path).split(".")[0]
        ]

        features_filename = "{}.pickle".format(
            os.path.basename(audio_path).split(".")[0])
        features_dir = os.path.join(workspace, "data", "speech_to_enhance",
                                    "features", "spectrogram",
                                    "{}db".format(int(snr)))
        create_directory(features_dir)

        features_path = os.path.join(features_dir, features_filename)
        with open(features_path, "wb") as features_file:
            pickle.dump(features, features_file, protocol=pickle.HIGHEST_PROTOCOL)

    print()
    print("Extracting features time: %s" % (time.time() - time_start))
    print()
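A small sketch of reading back one of the pickles written above (the path pieces mirror those used in extract_features; the file name here is made up):

import os
import pickle

features_path = os.path.join("workspace", "data", "speech_to_enhance",
                             "features", "spectrogram", "0db", "p233_003.pickle")
with open(features_path, "rb") as f:
    complex_spectrogram, audio_name = pickle.load(f)

print(audio_name, complex_spectrogram.shape)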
Example No. 3
def calculate_mixture_features(workspace, speech_dir, noise_dir, data_type, snr):
    """Calculate spectrogram for mixed, speech and noise audio. Then write the
    features to disk.

    Args:
      workspace: str, path of workspace.
      speech_dir: str, path of speech data.
      noise_dir: str, path of noise data.
      data_type: str, 'train' | 'test'.
      snr: float, signal-to-noise ratio at which speech and noise are mixed.
    """
    time_start = time.time()

    fs = cfg.sample_rate

    # Open mixture csv.
    rules_filename = os.path.join(workspace, "mixing_rules", "{}.csv".format(data_type))
    with open(rules_filename, "r", encoding="utf-8") as f:
        rules_reader = csv.reader(f)
        next(rules_reader, None)  # skip the headers

        for i, rule in enumerate(rules_reader):
            [speech_filename, noise_filename, noise_begin, noise_end] = rule

            speech_path = os.path.join(speech_dir, speech_filename)
            speech_audio = read_audio(speech_path, target_fs=fs)[0]

            noise_path = os.path.join(noise_dir, noise_filename)
            noise_audio = read_audio(noise_path, target_fs=fs)[0]

            # Repeat noise n_repeat times to cover entire clean speech sample.
            if noise_audio.shape[0] < speech_audio.shape[0]:
                n_repeat = int(np.ceil(speech_audio.shape[0] / noise_audio.shape[0]))
                noise_audio = np.tile(noise_audio, n_repeat)[:speech_audio.shape[0]]
            # Otherwise use the noise segment chosen in the mixing rules (same length as the speech).
            else:
                noise_audio = noise_audio[int(noise_begin):int(noise_end)]

            # Scale speech to given SNR.
            scaler = get_amplitude_scaling_factor(speech_audio, noise_audio, snr=snr)
            speech_audio *= scaler

            # Get normalized mixture, speech, noise.
            mixed_audio, speech_audio, noise_audio, alpha = additive_mixing(speech_audio, noise_audio)

            rule_name = "{}.{}".format(speech_filename.split(".")[0], noise_filename.split(".")[0])

            # Save mixed audio.
            mixed_audio_filename = "{}.wav".format(rule_name)
            mixed_audio_dir = os.path.join(workspace, "mixed_audios", "spectrogram", data_type, "{}db".format(int(snr)))
            create_directory(mixed_audio_dir)

            write_audio(os.path.join(mixed_audio_dir, mixed_audio_filename), mixed_audio, fs)

            # Extract spectrograms.
            mixed_audio_complex_spectrogram = calculate_spectrogram(mixed_audio, mode='complex',
                                                                    window_size=cfg.n_window, n_overlap=cfg.n_overlap)
            speech_spectrogram = calculate_spectrogram(speech_audio, mode='magnitude',
                                                       window_size=cfg.n_window, n_overlap=cfg.n_overlap)
            noise_spectrogram = calculate_spectrogram(noise_audio, mode='magnitude',
                                                      window_size=cfg.n_window, n_overlap=cfg.n_overlap)

            # Save features.
            features_filename = "{}.{}.pickle".format(speech_filename.split(".")[0], noise_filename.split(".")[0])
            features_dir = os.path.join(workspace, "features", "spectrogram", data_type, "{}db".format(int(snr)))
            create_directory(features_dir)

            features = [mixed_audio_complex_spectrogram, speech_spectrogram, noise_spectrogram, alpha, rule_name]
            feature_path = os.path.join(features_dir, features_filename)
            with open(feature_path, "wb") as features_file:
                pickle.dump(features, features_file, protocol=pickle.HIGHEST_PROTOCOL)

            if (i + 1) % 100 == 0:
                print("Iteration #{}".format(i + 1))

    print()
    print("Extracting features time: %s" % (time.time() - time_start))
    print()
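get_amplitude_scaling_factor is not defined in this snippet. Below is a minimal sketch of one common definition, assuming it returns the factor that brings the speech-to-noise RMS ratio to the requested SNR (an assumption, not necessarily the author's implementation):

import numpy as np

def rms(x):
    """Root-mean-square amplitude of a signal."""
    return np.sqrt(np.mean(np.square(x)))

def get_amplitude_scaling_factor_sketch(speech, noise, snr):
    """Factor f such that 20 * log10(rms(f * speech) / rms(noise)) == snr."""
    current_snr = 20.0 * np.log10(rms(speech) / rms(noise))
    return 10.0 ** ((snr - current_snr) / 20.0)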
Example No. 4
def test(args):

    if not os.path.exists('experiments'):
        os.makedirs('experiments')

    transfs = transforms.Compose([
        # transforms.Scale(),
        prepro.DB_Spec(sr=11025, n_fft=400, hop_t=0.010, win_t=0.025)
    ])

    # mel_basis = librosa.filters.mel(16000, 256, n_mels=80, norm=1)
    # sr = 16000

    if args.model_type == 'vae_g_l':
        model = vae_g_l.VAE(args)
        model.load_state_dict(
            torch.load('experiments/' + args.model_name,
                       map_location=lambda storage, loc: storage))
    elif args.model_type == 'vae_l':
        model = vae_l.VAE(args)
        model.load_state_dict(
            torch.load('experiments/' + args.model_name,
                       map_location=lambda storage, loc: storage))
    else:
        raise Exception('No valid model type provided (use --model_type)')

    model.eval()

    if args.dataset == "VCTK":
        # male example
        # data, sr = prepro.read_audio('/work/invx030/datasets/VCTK-Corpus/wav48/p245/p245_002.wav')
        # Female example
        data, sr = prepro.read_audio(
            '/work/invx030/datasets/VCTK-Corpus/wav48/p233/p233_003.wav')
    elif args.dataset == "LibriSpeech":
        # male
        # data, sr = prepro.read_audio('/work/invx030/datasets/LibriSpeech/test-clean/1089/134686/1089-134686-0001.flac')
        # female
        data, sr = prepro.read_audio(
            '/work/invx030/datasets/LibriSpeech/test-clean/4507/16021/4507-16021-0001.flac'
        )
    else:
        raise Exception('No valid dataset provided (use --dataset)')

    hop_length = int(sr * 0.010)
    n_fft = 400
    win_length = int(sr * 0.025)

    data = transfs(data)
    data = data / (torch.min(data))

    data = Variable(data)
    data = data.unsqueeze(0)

    data = data.transpose(1, 2)
    original = data

    if args.predictive:
        data = F.pad(data, (0, 0, 1, 0), "constant", 1.)
        original = F.pad(original, (0, 0, 0, 1), "constant", 1.)

    outs = model(data)
    reconstruction = outs.decoder_out
    reconstruction = reconstruction.transpose(1, 2)
    reconstruction = reconstruction.squeeze(0)
    reconstruction = (reconstruction.data.cpu()).numpy()
    reconstruction = reconstruction * -80.

    original = original.transpose(1, 2)
    original = original.squeeze(0).squeeze(0)
    original = (original.data.cpu()).numpy()
    original = original * -80.

    librosa.display.specshow(original,
                             sr=sr,
                             hop_length=hop_length,
                             x_axis='time',
                             y_axis='linear',
                             cmap='viridis')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Original DB spectrogram')
    pylab.savefig('experiments/original_spec.png')

    plt.clf()

    librosa.display.specshow(reconstruction,
                             sr=sr,
                             hop_length=hop_length,
                             x_axis='time',
                             y_axis='linear',
                             cmap='viridis')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Reconstruction DB spectrogram')
    pylab.savefig('experiments/reconstruction_spec.png')

    inverse = to_audio(original, sr=sr, n_fft=n_fft, hop_t=0.010, win_t=0.025)

    librosa.output.write_wav('experiments/original.wav',
                             inverse,
                             sr,
                             norm=True)

    inverse = to_audio(reconstruction,
                       sr,
                       n_fft=n_fft,
                       hop_t=0.010,
                       win_t=0.025)
    librosa.output.write_wav('experiments/reconstruction.wav',
                             inverse,
                             sr,
                             norm=True)
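Note that librosa.output.write_wav was removed in librosa 0.8, so the two write calls above require an older librosa release. A minimal replacement using the soundfile package (an assumed extra dependency, not part of the snippet) could look like:

import numpy as np
import soundfile as sf

def write_wav_normalized(path, audio, sr):
    # Peak-normalize to mimic librosa's norm=True, then write 16-bit PCM.
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak
    sf.write(path, audio, sr, subtype="PCM_16")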
 def __init__(self, root, downsample=True, transform=None, target_transform=None, dev_mode=False, preprocessed=False, person_filter=None, filter_mode = 'exclude', max_len=201, split='train'):
     self.person_filter = person_filter
     self.filter_mode = filter_mode
     self.root = os.path.expanduser(root)
     self.downsample = downsample
     self.transform = transform
     self.target_transform = target_transform
     self.dev_mode = dev_mode
     self.num_samples = 0
     self.max_len = max_len
     self.split = split
     
     if preprocessed:
         self.root_dir = os.path.expanduser('librispeech_preprocessed/')
         if self.split == 'train':
             self.data_paths = os.listdir(os.path.join(self.root_dir,'train'))
             self.root_dir = os.path.join(self.root_dir,'train/')
         elif self.split == 'test':
             self.data_paths = os.listdir(os.path.join(self.root_dir,'test'))
             self.root_dir = os.path.join(self.root_dir,'test/')
         
         if person_filter:
             if self.filter_mode == 'include':                    
                 self.data_paths = [sample for sample in self.data_paths if any(sample.startswith(pers+'-') for pers in self.person_filter)]
             elif self.filter_mode == 'exclude':
                 self.data_paths = [sample for sample in self.data_paths if not any(sample.startswith(pers+'-') for pers in self.person_filter)]
         
         self.num_samples = len(self.data_paths)
         
     else:            
         paths = make_manifest(self.root)
         os.mkdir('librispeech_preprocessed')
         os.mkdir('librispeech_preprocessed/train')
         os.mkdir('librispeech_preprocessed/test')
         
         test_splits = open("librispeech_splits/test_split.txt")
         train_splits = open("librispeech_splits/train_split.txt")
         split_reader = csv.reader(test_splits)
         test_data = [r[0] for r in split_reader]
         split_reader = csv.reader(train_splits)
         train_data = [r[0] for r in split_reader]
         
         with open(os.path.join(self.root,"SPEAKERS.TXT")) as csvfile:                
             csvreader = csv.reader(csvfile, delimiter='|')
             for i in range(12):
                 next(csvreader)
             rows = [r for r in csvreader]
             speaker_dict = {x[0].strip(): [x[1].strip()] for x in rows}
             for z, path in enumerate(paths):              
                 
                 keyword = 'train-clean-100/'
                 before_keyword, keyword, after_keyword = path.partition(keyword)
                 before_keyword, keyword, after_keyword = after_keyword.partition('/')
                 pers = before_keyword
                 before_keyword, keyword, after_keyword = after_keyword.partition('/')
                 before_keyword, keyword, after_keyword = after_keyword.partition('.flac')
                 
                 sig = read_audio(path)
                 if self.transform is not None:
                     sig = self.transform(sig[0])
                     
                 else:
                     sig = sig[0]
                 
                 try:
                     data = (sig.tolist(), speaker_dict[pers] + [pers])
                     if before_keyword in train_data:
                         ujson.dump(data,open("librispeech_preprocessed/train/{}.json".format(before_keyword), 'w'))
                     elif before_keyword in test_data:
                         ujson.dump(data,open("librispeech_preprocessed/test/{}.json".format(before_keyword), 'w'))
                     if z % 100 == 0:
                         print "{} iterations".format(z)
                     self.train_data_paths = os.listdir(os.path.expanduser('librispeech_preprocessed/train/'))
                     self.test_data_paths = os.listdir(os.path.expanduser('librispeech_preprocessed/test/'))
                 except Exception:
                     continue
         
         self.train_data_paths = os.listdir(os.path.expanduser('librispeech_preprocessed/train/'))
         self.test_data_paths = os.listdir(os.path.expanduser('librispeech_preprocessed/test/'))
         self.num_samples = len(self.train_data_paths)
         print "{} samples processed".format(self.num_samples)
    def __init__(self,
                 root,
                 downsample=True,
                 transform=None,
                 target_transform=None,
                 dev_mode=False,
                 preprocessed=False,
                 person_filter=None,
                 filter_mode='exclude',
                 max_len=201):
        self.person_filter = person_filter
        self.filter_mode = filter_mode
        self.root = os.path.expanduser(root)
        self.downsample = downsample
        self.transform = transform
        self.target_transform = target_transform
        self.dev_mode = dev_mode
        self.num_samples = 0
        self.max_len = max_len

        if preprocessed:
            self.root_dir = os.path.expanduser('vctk_preprocessed/')
            self.data_paths = os.listdir(self.root_dir)

            if person_filter:
                if self.filter_mode == 'include':
                    self.data_paths = [
                        sample for sample in self.data_paths
                        if any(pers in sample for pers in self.person_filter)
                    ]
                elif self.filter_mode == 'exclude':
                    self.data_paths = [
                        sample for sample in self.data_paths
                        if not any(pers in sample
                                   for pers in self.person_filter)
                    ]

            self.num_samples = len(self.data_paths)

        else:
            paths = make_manifest(self.root)
            os.mkdir('vctk_preprocessed/')
            with open(os.path.join(self.root, "speaker-info.txt")) as csvfile:
                csvreader = csv.reader(csvfile, delimiter=' ')
                next(csvreader)
                rows = [r for r in csvreader]
                speaker_dict = {x[0]: [x[4], x[2], x[8]] for x in rows}
                for z, path in enumerate(paths):

                    keyword = 'wav48/'
                    before_keyword, keyword, after_keyword = path.partition(
                        keyword)
                    pers = after_keyword[1:4]

                    sig = read_audio(path)
                    if self.transform is not None:
                        sig = self.transform(sig[0])
                    else:
                        sig = sig[0]
                    try:
                        self.data = (sig.tolist(), speaker_dict[pers] + [pers])
                        ujson.dump(
                            self.data,
                            open(
                                "vctk_preprocessed/{}.json".format(
                                    after_keyword[5:13]), 'w'))
                        if z % 100 == 0:
                            print "{} iterations".format(z)
                        self.data_paths = os.listdir(
                            os.path.expanduser('vctk_preprocessed/'))
                    except Exception:
                        continue

            self.data_paths = os.listdir(
                os.path.expanduser('vctk_preprocessed/'))
            self.num_samples = len(self.data_paths)
            print "{} samples processed".format(self.num_samples)