def write_lmdb(out_file_name, data_list):
    lmdb_output = lmdb.open(out_file_name, map_size=get_map_size(data_list))
    with lmdb_output.begin(write=True) as txn:
        # txn is a Transaction object
        for audio_indx, audio_path in enumerate(tqdm(data_list)):
            if 'mixture' not in audio_path:
                continue  # looping over mixture and getting vocals from it
            mixed_data = load_wav(audio_path).astype('float32')
            vocals_data = load_wav(audio_path.replace(
                'mixture', 'vocals')).astype('float32')
            '''
            # to remove zeros from mixed and vocals based on vocals
            silent_set = get_silent_set(vocals_data)
            mixed_data = remove_silence(mixed_data, silent_set)
            vocals_data = remove_silence(vocals_data, silent_set)
            '''

            vocals_indices = get_sequence_with_singing_indices(
                vocals_data, 800)

            datum = datanum_pb2.DataNum()
            datum.mixture = mixed_data.tobytes()
            datum.vocals = vocals_data.tobytes()
            datum.vocals_indices = vocals_indices.tobytes()  # stores the indices that contain voice
            str_id = '{:08}'.format(audio_indx)
            txn.put(str_id.encode('ascii'), datum.SerializeToString())
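# A hedged read-back sketch for the records written above (lmdb, numpy as np and
# datanum_pb2 are assumed to be imported at module level, as in write_lmdb):
def read_lmdb_example(lmdb_path, index=0):
    env = lmdb.open(lmdb_path, readonly=True)
    with env.begin() as txn:
        datum = datanum_pb2.DataNum()
        datum.ParseFromString(txn.get('{:08}'.format(index).encode('ascii')))
    mixture = np.frombuffer(datum.mixture, dtype='float32')
    vocals = np.frombuffer(datum.vocals, dtype='float32')
    return mixture, vocals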
Example No. 2
    def __getitem__(self, index):
        filename = self.files[index]
        filepath = os.path.join(config.TEST_DIR_PATH, filename)
        wave = load_wav(filepath)
        if self.transform:
            wave = self.transform(wave)
        return wave, filepath
Example No. 3
def synthesize_one(text, speaker='Aiyue', model_path='', with_alignment=False):
    if _mellotron is None:
        load_model_mellotron(model_path)

    text_encoded = torch.LongTensor(transform_text(
        text, text_cleaners='hanzi'))[None, :].to(_device)

    speaker_id = torch.LongTensor(transform_speaker(
        '', speaker_ids={})).to(_device)
    style_input = 0

    # pitch_contour = torch.ones(1, _hparams.prenet_f0_dim, text_encoded.shape[1] * 5, dtype=torch.float) * np.random.random()
    # pitch_contour = None

    wav = load_wav(str(speaker), sr=_hparams.sampling_rate)
    embed = transform_embed(wav, _encoder_model_fpath)
    embed = embed[::embed.shape[0] // _hparams.prenet_f0_dim]
    embed = embed if embed.shape[
        0] == _hparams.prenet_f0_dim else embed[:_hparams.prenet_f0_dim]
    f0 = np.tile(embed, (text_encoded.shape[1] * 5, 1)).T
    pitch_contour = torch.from_numpy(f0[None])

    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, gate_outputs, alignments = _mellotron.inference(
            (text_encoded, style_input, speaker_id, pitch_contour))

    out_mel = mel_outputs_postnet.data.cpu().numpy()[0]
    if with_alignment:
        return out_mel, alignments[0]
    else:
        return out_mel
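# A hedged usage sketch: `speaker` is also passed to load_wav above, so a
# reference-wav path is assumed here; both paths below are placeholders.
if __name__ == '__main__':
    mel = synthesize_one('你好', speaker='/path/to/reference.wav',
                         model_path='/path/to/mellotron.pt')
    print(mel.shape)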
Example No. 4
def preprocess_from_path(dataset_dir,
                         metadata_filename,
                         output_dir,
                         num_workers=1,
                         tqdm=lambda x: x):
    """
    Preprocess wav files step by step:
    Load -> Remove silences -> Divide into chunks -> Extract features
    Return: list of metadata samples
    """
    print("Start preprocess_from_path...")
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []
    with open(os.path.join(dataset_dir, metadata_filename),
              encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split('\t')
            wav_name = parts[0]
            target_class = get_label_number(parts[-1])
            wav_path = os.path.join(dataset_dir, 'audio',
                                    '%s' % wav_name)  # test_audio
            if os.path.isfile(wav_path):
                wav = utils.remove_all_silence(utils.load_wav(wav_path))
                index = 0
                for (start, end) in utils.windows(wav, hparams.window_size):
                    chunk = wav[start:end]
                    if (len(chunk) != hparams.window_size):
                        chunk = utils.pad_chunk(chunk, wav)
                    futures.append(
                        executor.submit(
                            partial(_process_utterance, output_dir, chunk,
                                    wav_name, target_class, index)))
                    index += 1
    results = [future.result() for future in tqdm(futures)]
    return [r for r in results if r is not None]
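# A hedged usage sketch (paths and the metadata filename are placeholders):
# each metadata line is expected to be tab-separated, with the wav name first
# and the class label last, and the wavs to live under <dataset_dir>/audio.
if __name__ == '__main__':
    metadata = preprocess_from_path('/path/to/dataset', 'meta.tsv',
                                    '/path/to/features', num_workers=4)
    print('%d chunks written' % len(metadata))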
Example No. 5
def process_wav(wav_path, audio_path, mel_path, params):
    wav = load_wav(wav_path,
                   sample_rate=params["preprocessing"]["sample_rate"])
    wav /= np.abs(wav).max() * 0.999
    mel = melspectrogram(wav,
                         sample_rate=params["preprocessing"]["sample_rate"],
                         num_mels=params["preprocessing"]["num_mels"],
                         num_fft=params["preprocessing"]["num_fft"],
                         preemph=params["preprocessing"]["preemph"],
                         min_level_db=params["preprocessing"]["min_level_db"],
                         hop_length=params["preprocessing"]["hop_length"],
                         win_length=params["preprocessing"]["win_length"],
                         fmin=params["preprocessing"]["fmin"])

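    # Pad the waveform so its length is exactly len(mel) * hop_length, then pad
    # both mel and audio symmetrically (presumably so each audio_slice_frames
    # training slice has sample_frames of mel context around it).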
    length_diff = len(mel) * params["preprocessing"]["hop_length"] - len(wav)
    wav = np.pad(wav, (0, length_diff), "constant")

    pad = (params["vocoder"]["sample_frames"] -
           params["vocoder"]["audio_slice_frames"]) // 2
    mel = np.pad(mel, ((pad, ), (0, )), "constant")
    wav = np.pad(wav, (pad * params["preprocessing"]["hop_length"], ),
                 "constant")
    wav = mulaw_encode(wav, mu=2**params["preprocessing"]["bits"])

    speaker = os.path.splitext(os.path.split(wav_path)[-1])[0].split("_")[0]
    np.save(audio_path, wav)
    np.save(mel_path, mel)
    return speaker, audio_path, mel_path, len(mel)
Example No. 6
def process_wav(dataset, wav_path, audio_path, mel_path, params):
    """Convert wav_path into a speaker id and save the processed audio and mel data at the given paths.
    """
    # auto resample based on params (internally, librosa)
    wav = load_wav(wav_path, sample_rate=params["preprocessing"]["sample_rate"])
    wav /= np.abs(wav).max() * 0.999
    mel = melspectrogram(wav, sample_rate=params["preprocessing"]["sample_rate"],
                         preemph=params["preprocessing"]["preemph"],
                         num_mels=params["preprocessing"]["num_mels"],
                         num_fft=params["preprocessing"]["num_fft"],
                         min_level_db=params["preprocessing"]["min_level_db"],
                         hop_length=params["preprocessing"]["hop_length"],
                         win_length=params["preprocessing"]["win_length"],
                         fmin=params["preprocessing"]["fmin"])

    length_diff = len(mel) * params["preprocessing"]["hop_length"] - len(wav)
    wav = np.pad(wav, (0, length_diff), "constant")

    pad = (params["vocoder"]["sample_frames"] - params["vocoder"]["audio_slice_frames"]) // 2
    mel = np.pad(mel, ((pad,), (0,)), "constant")
    wav = np.pad(wav, (pad * params["preprocessing"]["hop_length"],), "constant")
    wav = mulaw_encode(wav, mu=2 ** params["preprocessing"]["bits"])

    # speaker ID acquisition
    speaker = get_speakerid(wav_path, dataset)

    # save processed data
    np.save(audio_path, wav)
    np.save(mel_path, mel)
    
    return speaker, audio_path, mel_path, len(mel)
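# A hedged usage sketch (the config path, wav paths and dataset tag are
# placeholders; params must provide the "preprocessing" and "vocoder"
# sections read above):
if __name__ == '__main__':
    import json
    with open('config.json') as f:
        params = json.load(f)
    speaker, audio_path, mel_path, n_frames = process_wav(
        'vctk', '/path/to/p225_001.wav', 'out/p225_001', 'out/p225_001.mel', params)
    print(speaker, n_frames)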
Example No. 7
def evaluate(args):
    x = tf.placeholder("float", [None, hparams.n_steps, hparams.n_input],
                       name="x")

    bias = tf.Variable(tf.random_normal([hparams.n_classes]), name="bias")
    weight = tf.Variable(tf.truncated_normal(
        [hparams.n_hidden, hparams.n_classes], stddev=0.1),
                         name="weights")
    prediction = RNN(x, weight, bias)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver()
    saver.restore(sess, args.path_to_model)

    list_of_files = get_files(args.test_data_dir)

    for file in list_of_files:
        wav = utils.remove_all_silence(
            utils.load_wav(os.path.join(args.test_data_dir, '%s' % file)))
        features = preprocess_one(wav)
        pred = sess.run(prediction, feed_dict={x: features})
        # Prediction for one example
        join_pred = np.around(np.sum(pred, axis=0) / pred.shape[0], decimals=3)
        # Write to file
        with open("result.txt", "a") as f:
            f.write(file + '\t' + '{0:.3f}'.format(np.max(join_pred)) + '\t' +
                    get_text_label(np.argmax(join_pred)) + '\n')
Example No. 8
def transcribe(m, input, output, threshold):
    import magenta.music as mm
    import data
    wav = utils.load_wav(input, cfg.SAMPLE_RATE)
    frames, _ = data.audio2frame(wav, cfg.FRAME_SIZE, cfg.SPECTROGRAM)
    onset, _ = m.predict(frames, threshold)
    sequence = data.matrix2sequence(onset[0], onset=onset[0])
    mm.sequence_proto_to_midi_file(sequence, output)
Example No. 9
	def __getitem__(self, index):
		'''
		Returns the mixed waveform, the two source waveforms, and the length of the mixture.
		'''
		wavfile = self.wavfiles[index]

		mixed, s1, s2 = load_wav(wavfile, sr=self.sr)
		
		return mixed, s1, s2, len(mixed)
Example No. 10
def load_noise_waves():
    noise_waves = []
    noise_directory = os.path.join(config.TRAIN_DIR_PATH, '_background_noise_')
    for filepath in sorted(os.listdir(noise_directory)):
        if not filepath.endswith('.wav'):
            continue
        wave = load_wav(os.path.join(noise_directory, filepath))
        noise_waves.append(wave)
    return noise_waves
Example No. 11
    def __getitem__(self, index):
        i = self.dataset_index[index]
        if i == 'silence':
            label = label_to_idx['silence']
            wave = self.silence_wave
        else:
            filepath, label, user_id = self.data[i]
            wave = load_wav(filepath)
        if self.transform:
            wave = self.transform(wave)
        return wave, label
Example No. 12
    def load_samples(self):
        """
        Loads drum samples from the folder ./drum_samples/.
        The folder should contain the files bass_drum.wav, snare_drum.wav
        and hi_hat.wav.
        """
        path = 'drum_samples/'

        fs_bass, bass_drum = load_wav(path + 'bass_drum.wav')
        self._bass_drum = stereo_to_mono(bass_drum)

        fs_snare, snare_drum = load_wav(path + 'snare_drum.wav')
        self._snare_drum = stereo_to_mono(snare_drum)

        fs_hi_hat, hi_hat = load_wav(path + 'hi_hat.wav')
        self._hi_hat = stereo_to_mono(hi_hat)

        # Checking if loaded samples have matching sampling frequency with the set fs
        error_msg = ' sample does not have matching sample frequency'
        assert fs_bass == self._fs, 'Bass drum' + error_msg
        assert fs_snare == self._fs, 'Snare drum' + error_msg
        assert fs_hi_hat == self._fs, 'Hi-hat' + error_msg
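    # stereo_to_mono is not shown in this snippet; a hypothetical version
    # consistent with its use above would simply average the two channels, e.g.
    #     return np.asarray(audio).mean(axis=1) if np.asarray(audio).ndim > 1 else audio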
Example No. 13
def augmantation_from_path(dataset_dir,
                           metadata_filename,
                           output_dir,
                           current_class,
                           augmantation_amount,
                           num_workers=1,
                           tqdm=lambda x: x):
    """
    Preprocess and augment wav files step by step:
    Load -> Remove silences -> Random start -> Divide into chunks -> Extract features
    Return: list of metadata samples
    """
    print("Start augmantation_from_path...")
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []
    aug_index = 0
    while_loop = True
    while while_loop:
        with open(os.path.join(dataset_dir, metadata_filename),
                  encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split('\t')
                wav_name = parts[0]
                target_class = get_label_number(parts[-1])
                if target_class == current_class:
                    wav_path = os.path.join(dataset_dir, 'audio',
                                            '%s' % wav_name)  # audio
                    if os.path.isfile(wav_path):
                        wav = utils.remove_all_silence(
                            utils.load_wav(wav_path))
                        wav = wav[np.random.randint(1, 22050):]
                        index = 0
                        for (start,
                             end) in utils.windows(wav, hparams.window_size):
                            if aug_index >= augmantation_amount:
                                while_loop = False
                                break
                            chunk = wav[start:end]
                            if (len(chunk) != hparams.window_size):
                                chunk = utils.pad_chunk(chunk, wav)
                            futures.append(
                                executor.submit(
                                    partial(
                                        _process_utterance, output_dir, chunk,
                                        "aug-%s-%s" % (aug_index, wav_name),
                                        target_class, index)))
                            index += 1
                            aug_index += 1
    results = [future.result() for future in tqdm(futures)]
    return [r for r in results if r is not None]
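# A hedged usage sketch (dataset layout, class id and counts are placeholders):
# create 500 extra augmented chunks for class 3 from the same metadata file.
augmented = augmantation_from_path('/path/to/dataset', 'meta.tsv',
                                   '/path/to/features', current_class=3,
                                   augmantation_amount=500, num_workers=4)
print('%d augmented chunks' % len(augmented))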
Example No. 14
def gen_from_wav(model, wav_path, output):
    wav = load_wav(wav_path, params["preprocessing"]["sample_rate"], trim=False)
    utterance_id = os.path.basename(wav_path).split(".")[0]
    wav = wav / np.abs(wav).max() * 0.999
    mel = melspectrogram(wav,
                         sample_rate=params["preprocessing"]["sample_rate"],
                         preemph=params["preprocessing"]["preemph"],
                         num_mels=params["preprocessing"]["num_mels"],
                         num_fft=params["preprocessing"]["num_fft"],
                         min_level_db=params["preprocessing"]["min_level_db"],
                         ref_level_db=params["preprocessing"]["ref_level_db"],
                         hop_length=params["preprocessing"]["hop_length"],
                         fmin=params["preprocessing"]["fmin"],
                         fmax=params["preprocessing"]["fmax"])
    gen_from_mel(model, mel, output)
Example No. 15
def pad2drums(read_from_fname, save_to_fname):
    """
    Reads a .wav file recorded from a drum pad (with the mic about 10 cm away)
    from the folder "raw_audio" and converts it to a .wav file with drum sounds
    in place of the pad hits. The created file is placed in the folder "results".
    """

    load_path = 'raw_audio/'
    fs, raw_audio = load_wav(load_path + read_from_fname)

    # Detecting the pad hits from the raw_audio
    hit_indices, hit_strengths = detect_sound(raw_audio, stereo=True)

    dg = DrumGenerator(fs=fs)
    drum_audio = dg.generate_drum_audio(hit_indices, hit_strengths,
                                        raw_audio.size)

    # Save drum_audio under the file name (save_to_fname) provided by the user
    save_path = 'results/' + save_to_fname
    save_wav(save_path, drum_audio, fs)
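# A hedged usage sketch (file names are placeholders): read a recorded pad take
# from raw_audio/ and write the generated drum track to results/.
pad2drums('pad_take_01.wav', 'drum_take_01.wav')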
Example No. 16
def delete_error_audio(path):
    sounds_path = []
    for root, dirs, files in os.walk(path):
        for file in files:
            sound_path = os.path.join(root, file)
            if sound_path[-4:] == '.wav' or sound_path[-4:] == '.m4a':
                sounds_path.append(sound_path)
    for audio_path in tqdm(sounds_path):
        try:
            wav = utils.load_wav(audio_path, sr=16000, mode='train')
            linear_spect = utils.lin_spectogram_from_wav(wav, 160, 400, 512)
            mag, _ = librosa.magphase(linear_spect)  # magnitude
            mag_T = mag.T
            freq, time = mag_T.shape
            if time <= 250:
                # os.remove(audio_path)
                print('Audio too short, deleting: %s' % audio_path)
        except Exception:
            # os.remove(audio_path)
            print('Audio error, deleting: %s' % audio_path)
Example No. 17
def transform(ENV, args):
    train_wav_files, train_phn_files = load_wavPhn(ENV.train_data)
    test_wav_files = load_wav(ENV.test_data)
    train_output_path = os.path.join(ENV.output, 'train')
    test_output_path = os.path.join(ENV.output, 'test')
    if not os.path.exists(train_output_path):
        os.makedirs(train_output_path)
    if not os.path.exists(test_output_path):
        os.makedirs(test_output_path)

    for i in tqdm(range(len(train_wav_files))):
        transform_wav(train_wav_files[i], train_output_path)
        phn_file = os.path.join(os.path.dirname(train_wav_files[i]), 
                                os.path.basename(train_wav_files[i]).split('.')[0] + '.phn')
        copy_phn(phn_file, train_output_path)

    for i in tqdm(range(len(test_wav_files))):
        transform_wav(test_wav_files[i], test_output_path)
        phn_file = os.path.join(os.path.dirname(test_wav_files[i]), 
                                os.path.basename(test_wav_files[i]).split('.')[0] + '.phn')
        copy_phn(phn_file, test_output_path)
Example No. 18
    def load_from_browser(self, fpath=None):
        if fpath is None:
            fpath = Path(self.datasets_root, self.ui.current_dataset_name,
                         self.ui.current_src_spk,
                         self.ui.current_utterance_name)
            name = str(fpath.relative_to(self.datasets_root))
            speaker_name = self.ui.current_dataset_name + '_' + self.ui.current_src_spk

            # Select the next utterance
            if self.ui.auto_next_checkbox.isChecked():
                self.ui.browser_select_next()
        elif fpath == "":
            return
        else:
            name = fpath.name
            speaker_name = fpath.parent.name

        # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
        # playback, so as to have a fair comparison with the generated audio
        wav = utils.load_wav(str(fpath))
        self.ui.log("Loaded %s" % name)

        self.add_real_utterance(wav, cfg.data.sample_rate, name, speaker_name)
Example No. 19
all_wav_path = glob.glob(os.path.join(data_root, models[0], '*.wav'))
logf0_dict = {
    'm2m': [],
    'm2f': [],
    'f2m': [],
    'f2f': []
}
print(" [*] {} start!".format('GT'))
for wav_path in tqdm.tqdm(all_wav_path):
    wav_name = os.path.basename(wav_path)

    pattern = r"p[0-9]+_[0-9]+"
    src, trg = re.findall(pattern, wav_name)

    wav_path_gt = os.path.join(data_root, 'GT', trg + '.wav')
    wav_gt = load_wav(wav_path_gt, 22050)
    logf0_gt = get_logf0(wav_gt, 22050, frame_period=(256 / (0.001 * 22050)))
    logf0_gt = speaker_norm(logf0_gt)

    src_spk = src.split('_')[0]
    trg_spk = trg.split('_')[0]

    each_dict = {'GT': logf0_gt}
    for m in models:
        temp = os.path.join(data_root, m, wav_name)

        wav = load_wav(temp, 22050)
        logf0 = get_logf0(wav, 22050, frame_period=(256 / (0.001 * 22050)))
        logf0 = speaker_norm(logf0)
        each_dict[m] = logf0
Example No. 20
def main():
    parser = argparse.ArgumentParser('PreprocessingParser')
    parser.add_argument('--data_dir', type=str, help='data root directory')
    parser.add_argument('--save_dir',
                        type=str,
                        help='extracted feature save directory')
    parser.add_argument('--dev_rate',
                        type=float,
                        help='dev set rate',
                        default=0.05)
    parser.add_argument('--test_rate',
                        type=float,
                        help='test set rate',
                        default=0.05)
    args = parser.parse_args()
    # args validation
    if args.dev_rate < 0 or args.dev_rate >= 1:
        raise ValueError('dev rate should be in [0, 1)')
    if args.test_rate < 0 or args.test_rate >= 1:
        raise ValueError('test rate should be in [0, 1)')
    if args.test_rate + args.dev_rate >= 1:
        raise ValueError('dev rate + test rate should not be >= 1.')
    if not os.path.isdir(args.data_dir):
        raise FileNotFoundError('Directory {} not found!'.format(
            args.data_dir))
    if not os.path.isdir(args.save_dir):
        os.makedirs(args.save_dir)
    mel_dir = os.path.join(args.save_dir, 'mels')
    os.makedirs(mel_dir, exist_ok=True)
    linear_dir = os.path.join(args.save_dir, 'linears')
    os.makedirs(linear_dir, exist_ok=True)
    f0_dir = os.path.join(args.save_dir, 'f0s')
    os.makedirs(f0_dir, exist_ok=True)
    ppg_dir = os.path.join(args.save_dir, 'ppgs')
    os.makedirs(ppg_dir, exist_ok=True)
    for mode in ['train', 'dev', 'test']:
        if os.path.isfile(
                os.path.join(args.save_dir, "{}_meta.csv".format(mode))):
            os.remove(os.path.join(args.save_dir, "{}_meta.csv".format(mode)))
    wav_files = []
    for rootdir, subdir, files in os.walk(args.data_dir):
        for f in files:
            if f.endswith('.wav'):
                wav_files.append(os.path.join(rootdir, f))
    random.shuffle(wav_files)

    print('Set up PPGs extraction network')
    # Set up network
    ppg_extractor_hps = hps.PPGExtractor.CNNBLSTMClassifier
    mfcc_pl = tf.placeholder(dtype=tf.float32,
                             shape=[None, None, 3 * hps.Audio.n_mfcc],
                             name='mfcc_pl')
    ppg_extractor = CNNBLSTMClassifier(
        out_dims=hps.Audio.ppg_dim,
        n_cnn=ppg_extractor_hps.n_cnn,
        cnn_hidden=ppg_extractor_hps.cnn_hidden,
        cnn_kernel=ppg_extractor_hps.cnn_kernel,
        n_blstm=ppg_extractor_hps.n_blstm,
        lstm_hidden=ppg_extractor_hps.lstm_hidden)
    predicted_ppgs = ppg_extractor(inputs=mfcc_pl)['logits']

    # set up a session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    # load saved model
    saver = tf.train.Saver()
    print('Restoring ppgs extractor from {}'.format(ppg_extractor_hps.ckpt))
    saver.restore(sess, ppg_extractor_hps.ckpt)
    print('Extracting mel-spectrograms, spectrograms and log-f0s...')
    train_set = []
    dev_set = []
    test_set = []
    dev_start_idx = int(len(wav_files) * (1 - args.dev_rate - args.test_rate))
    test_start_idx = int(len(wav_files) * (1 - args.test_rate))
    for i, wav_f in tqdm(enumerate(wav_files)):
        try:
            wav_arr = load_wav(wav_f)
        except:
            continue
        pre_emphasized_wav = _preemphasize(wav_arr)
        fid = '{}_{}'.format(
            wav_f.split('/')[-3].split('_')[2],
            wav_f.split('/')[-1].split('.')[0].split('_')[1])
        # extract mel-spectrograms
        mel_fn = os.path.join(mel_dir, '{}.npy'.format(fid))
        try:
            mel_spec = melspectrogram(pre_emphasized_wav).astype(np.float32).T
        except:
            continue
        # extract spectrograms
        linear_fn = os.path.join(linear_dir, '{}.npy'.format(fid))
        try:
            linear_spec = spectrogram(pre_emphasized_wav).astype(np.float32).T
        except:
            continue
        # extract log-f0s
        f0_fn = os.path.join(f0_dir, '{}.npy'.format(fid))
        log_f0 = logf0(wav_f)
        try:
            log_f0 = lf0_normailze(log_f0)
        except:
            continue
        # extract ppgs
        mfcc_feats = wav2unnormalized_mfcc(wav_arr)
        ppg = sess.run(predicted_ppgs,
                       feed_dict={mfcc_pl: np.expand_dims(mfcc_feats, axis=0)})
        ppg = softmax(np.squeeze(ppg, axis=0))
        ppg_fn = os.path.join(ppg_dir, '{}.npy'.format(fid))

        # save features to respective directory
        mel_spec, linear_spec, log_f0, ppg = length_validate(
            (mel_spec, linear_spec, log_f0, ppg))
        np.save(mel_fn, mel_spec)
        np.save(linear_fn, linear_spec)
        np.save(f0_fn, log_f0)
        np.save(ppg_fn, ppg)

        # write to csv
        if i < dev_start_idx:
            train_set.append(fid)
            with open(os.path.join(args.save_dir, 'train_meta.csv'),
                      'a',
                      encoding='utf-8') as train_f:
                train_f.write(
                    '{}|ppgs/{}.npy|mels/{}.npy|linears/{}.npy|f0s/{}.npy\n'.
                    format(fid, fid, fid, fid, fid))
        elif i < test_start_idx:
            dev_set.append(fid)
            with open(os.path.join(args.save_dir, 'dev_meta.csv'),
                      'a',
                      encoding='utf-8') as dev_f:
                dev_f.write(
                    '{}|ppgs/{}.npy|mels/{}.npy|linears/{}.npy|f0s/{}.npy\n'.
                    format(fid, fid, fid, fid, fid))
        else:
            test_set.append(fid)
            with open(os.path.join(args.save_dir, 'test_meta.csv'),
                      'a',
                      encoding='utf-8') as test_f:
                test_f.write(
                    '{}|ppgs/{}.npy|mels/{}.npy|linears/{}.npy|f0s/{}.npy\n'.
                    format(fid, fid, fid, fid, fid))
    print('Done extracting features!')
    return
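# A hedged invocation sketch (the script name is a placeholder; the flags match
# the argparse definitions above):
#   python preprocess.py --data_dir /path/to/wavs --save_dir /path/to/features \
#       --dev_rate 0.05 --test_rate 0.05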
Example No. 21
    def load_utterance(self, spk_name, path):
        wav = utils.load_wav(path)
        return Utterance(wav,
                         cfg.data.sample_rate,
                         path=path,
                         spk_name=spk_name)
Example No. 22
def get_map_size(files):
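    # Rough LMDB map-size estimate: bytes of the first wav, times a 10x safety
    # factor, times the number of files plus a little slack.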
    return load_wav(files[0]).nbytes * 10 * (len(files) + 2)
Example No. 23
        embedding_dim=params["vocoder"]["embedding_dim"],
        rnn_channels=params["vocoder"]["rnn_channels"],
        fc_channels=params["vocoder"]["fc_channels"],
        bits=params["preprocessing"]["bits"],
        hop_length=params["preprocessing"]["hop_length"],
        nc=args.nc,
        device=device)
    model.to(device)

    print("Load checkpoint from: {}:".format(args.checkpoint))
    checkpoint = torch.load(args.checkpoint,
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint["model"])
    model_step = checkpoint["step"]

    wav = load_wav(args.wav_path, params["preprocessing"]["sample_rate"])
    utterance_id = os.path.basename(args.wav_path).split(".")[0]
    wav = wav / np.abs(wav).max() * 0.999
    mel = melspectrogram(wav,
                         sample_rate=params["preprocessing"]["sample_rate"],
                         preemph=params["preprocessing"]["preemph"],
                         num_mels=params["preprocessing"]["num_mels"],
                         num_fft=params["preprocessing"]["num_fft"],
                         min_level_db=params["preprocessing"]["min_level_db"],
                         hop_length=params["preprocessing"]["hop_length"],
                         win_length=params["preprocessing"]["win_length"],
                         fmin=params["preprocessing"]["fmin"])
    mel = torch.FloatTensor(mel).unsqueeze(0).to(device)
    output = model.generate(mel)
    path = os.path.join(
        args.gen_dir,
Example No. 24
m2m_gt = []
m2f_gt = []
f2m_gt = []
f2f_gt = []
print(" [*] {} start!".format('GT'))
for wav_path in tqdm.tqdm(all_wav_path):
    wav_name = os.path.basename(wav_path)

    pattern = r"p[0-9]+_[0-9]+"
    src, trg = re.findall(pattern, wav_name)

    wav_path_gt = os.path.join(data_root, 'GT', trg + '.wav')

    src_spk = src.split('_')[0]
    trg_spk = trg.split('_')[0]

    wav = load_wav(wav_path_gt, 22050)
    logf0 = get_logf0(wav, 22050, frame_period=(256 / (0.001 * 22050)))

    if src_spk in M and trg_spk in M:
        m2m_gt.append(logf0[logf0 > 0])
    elif src_spk in M and trg_spk in F:
        m2f_gt.append(logf0[logf0 > 0])
    elif src_spk in F and trg_spk in M:
        f2m_gt.append(logf0[logf0 > 0])
    elif src_spk in F and trg_spk in F:
        f2f_gt.append(logf0[logf0 > 0])

logf0_dict['m2m'] = np.concatenate(m2m_gt)
logf0_dict['m2f'] = np.concatenate(m2f_gt)
logf0_dict['f2m'] = np.concatenate(f2m_gt)
logf0_dict['f2f'] = np.concatenate(f2f_gt)
Example No. 25
os.chdir(proj_dir + '/data/train/wav')
wavfiles = glob.glob('*.wav')
trainfiles = []
os.chdir(proj_dir + '/data/train/mid')
for wname in wavfiles:
    base_fname = wname.split('_')[0]
    trainfiles += [('data/train/wav/' + wname,
                    'data/train/mid/' + base_fname + '.mid')]

# preprocess test
xs, ys = [], []
os.chdir(proj_dir)
for wav, mid in testfiles:
    # do constant-q transform on the wav file
    wavdata = utils.load_wav(wav)
    # cqt_windows = utils.cqt_windows(wavdata, 7, hop_length=hop_len)
    cqt_windows = utils.cqt(wavdata)
    savefile = 'data/test/preprocessed/' + wav.split('/')[-1]
    np.save(savefile, cqt_windows)
    xs.append(cqt_windows)
    print('wrote {}.npy\n dimensions: {}'.format(savefile, cqt_windows.shape),
          file=sys.stderr)

    pm = pmidi.PrettyMIDI(mid)
    t = librosa.frames_to_time(np.arange(cqt_windows.shape[1]),
                               sr=sample_rate,
                               hop_length=hop_len)
    piano_roll = pm.get_piano_roll(fs=sample_rate, times=t)
    savefile = 'data/test/preprocessed/' + mid.split('/')[-1]
    np.save(savefile, piano_roll)
Example No. 26
def example_wav():
    wav = load_wav(
        os.path.join(os.path.dirname(__file__),
                     "../datasets/test/example/example.wav"))
    assert len(wav.shape) == 1
    return wav
Example No. 27
def main():
    hps = Hparams
    parser = argparse.ArgumentParser('VC inference')
    parser.add_argument('--src_wav', type=str, help='source wav file path')
    parser.add_argument('--ckpt', type=str, help='model ckpt path')
    parser.add_argument('--save_dir', type=str, help='synthesized wav save directory')
    args = parser.parse_args()
    # 0.
    src_wav_arr = load_wav(args.src_wav)
    pre_emphasized_wav = _preemphasize(src_wav_arr)
    # 1. extract ppgs
    ppg_extractor_hps = hps.PPGExtractor.CNNBLSTMClassifier
    mfcc_pl = tf.placeholder(dtype=tf.float32,
                             shape=[None, None, 3 * hps.Audio.n_mfcc],
                             name='mfcc_pl')
    ppg_extractor = CNNBLSTMClassifier(out_dims=hps.Audio.ppg_dim,
                                       n_cnn=ppg_extractor_hps.n_cnn,
                                       cnn_hidden=ppg_extractor_hps.cnn_hidden,
                                       cnn_kernel=ppg_extractor_hps.cnn_kernel,
                                       n_blstm=ppg_extractor_hps.n_blstm,
                                       lstm_hidden=ppg_extractor_hps.lstm_hidden)
    predicted_ppgs = ppg_extractor(inputs=mfcc_pl)['logits']
    # set up a session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    # load saved model
    saver = tf.train.Saver()
    print('Restoring ppgs extractor from {}'.format(ppg_extractor_hps.ckpt))
    saver.restore(sess, ppg_extractor_hps.ckpt)
    mfcc_feats = wav2unnormalized_mfcc(src_wav_arr)
    ppg = sess.run(predicted_ppgs,
                   feed_dict={mfcc_pl: np.expand_dims(mfcc_feats, axis=0)})
    sess.close()
    ppg = softmax(np.squeeze(ppg, axis=0))

    # 2. extract lf0, mel-spectrogram
    log_f0 = logf0(args.src_wav)
    log_f0 = lf0_normailze(log_f0)
    # mel-spectrogram is extracted for comparison
    mel_spec = melspectrogram(pre_emphasized_wav).astype(np.float32).T

    # 3. prepare inputs
    min_len = min(log_f0.shape[0], ppg.shape[0])
    vc_inputs = np.concatenate([ppg[:min_len, :], log_f0[:min_len, :]], axis=1)
    vc_inputs = np.expand_dims(vc_inputs, axis=1)  # [time, batch, dim]

    # 4. setup vc model and do the inference
    model = BLSTMConversionModel(in_channels=hps.Audio.ppg_dim + 2,
                                 out_channels=hps.Audio.num_mels,
                                 lstm_hidden=hps.BLSTMConversionModel.lstm_hidden)
    device = torch.device('cpu')
    model.load_state_dict(torch.load(args.ckpt, map_location=device))
    model.eval()
    predicted_mels = model(torch.tensor(vc_inputs))
    predicted_mels = np.squeeze(predicted_mels.detach().numpy(), axis=1)

    # 5. synthesize wav
    synthesized_wav = inv_preemphasize(inv_mel_spectrogram(predicted_mels.T))
    resynthesized_wav = inv_preemphasize(inv_mel_spectrogram(mel_spec.T))
    ckpt_name = args.ckpt.split('/')[-1].split('.')[0]
    wav_name = args.src_wav.split('/')[-1].split('.')[0]
    save_wav(synthesized_wav, os.path.join(args.save_dir, '{}-{}-converted.wav'.format(wav_name, ckpt_name)))
    save_wav(resynthesized_wav, os.path.join(args.save_dir, '{}-{}-src-resyn.wav'.format(wav_name, ckpt_name)))
    return
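# A hedged invocation sketch (the script name is a placeholder; the flags match
# the parser above):
#   python vc_infer.py --src_wav /path/to/source.wav --ckpt /path/to/model.pt \
#       --save_dir /path/to/output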
Example No. 28
parser.add_argument(
    'weight_path',
    help="Path of checkpoint (ex:./result/weights/wavenet_0800)")
args = parser.parse_args()


def synthesize(mel_sp, save_path, weight_path):
    wavenet = WaveNet(hparams.num_mels, hparams.upsample_scales)
    wavenet.load_weights(weight_path)
    mel_sp = tf.expand_dims(mel_sp, axis=0)

    outputs = wavenet.synthesis(mel_sp)
    outputs = np.squeeze(outputs)
    outputs = inv_mulaw_quantize(outputs)

    save_wav(outputs, save_path, hparams.sampling_rate)


if __name__ == '__main__':
    wav = load_wav(args.input_path, hparams.sampling_rate)
    wav = normalize(wav) * 0.95

    mel_sp = melspectrogram(wav,
                            hparams.sampling_rate,
                            hparams.num_mels,
                            n_fft=hparams.n_fft,
                            hop_size=hparams.hop_size,
                            win_size=hparams.win_size)

    synthesize(mel_sp, args.output_path, args.weight_path)
Example No. 29
def convert(src_wav_dir, trg_wav_file):
    all_src_wav_files = glob.glob(f'{src_wav_dir}/*.wav')
    # This glob pattern for src_wav_files matches about 20 source files, enough for a
    # good sample without taking too much time or memory. It can be altered (including
    # setting it to a single file or to all_src_wav_files) to create fewer or more output files.
    src_wav_files = glob.glob(f'{src_wav_dir}/p???_0[01][0-9].wav')
    src_wavs = [
        utils.load_wav(src_wav_file, utils.SAMPLING_RATE)
        for src_wav_file in src_wav_files
    ]
    trg_wav = utils.load_wav(trg_wav_file, utils.SAMPLING_RATE)
    trg_wav_name = splitext(basename(trg_wav_file))[0]
    converted_dir = VCTK_PATH.joinpath('converted_audio',
                                       'trg_' + trg_wav_name)
    os.makedirs(converted_dir, exist_ok=True)

    src_stats = get_stats(all_src_wav_files)
    trg_stats = get_stats([trg_wav_file])

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    G = get_model(device)

    _, _, trg_sp, _ = utils.world_decompose(wav=trg_wav,
                                            fs=utils.SAMPLING_RATE,
                                            frame_period=utils.FRAME_PERIOD)
    trg_coded_sp = utils.world_encode_spectral_envelop(sp=trg_sp,
                                                       fs=utils.SAMPLING_RATE,
                                                       dim=utils.NUM_MCEP)
    trg_coded_sp_norm = (trg_coded_sp - trg_stats['coded_sps_mean']
                         ) / trg_stats['coded_sps_std']
    assert trg_coded_sp_norm.shape[0] >= 8192
    trg_coded_sp_norm = trg_coded_sp_norm[:8192, :]
    trg_coded_sp_norm_tensor = torch.FloatTensor(
        trg_coded_sp_norm.T).unsqueeze_(0).unsqueeze_(1).to(device)

    trg_embed = G.trg_downsample(trg_coded_sp_norm_tensor)

    with torch.no_grad():
        for i, src_wav in enumerate(tqdm(src_wavs)):
            f0, _, sp, ap = utils.world_decompose(
                wav=src_wav,
                fs=utils.SAMPLING_RATE,
                frame_period=utils.FRAME_PERIOD)
            coded_sp = utils.world_encode_spectral_envelop(
                sp=sp, fs=utils.SAMPLING_RATE, dim=utils.NUM_MCEP)

            f0_converted = utils.pitch_conversion(
                f0=f0,
                mean_log_src=src_stats['log_f0s_mean'],
                std_log_src=src_stats['log_f0s_std'],
                mean_log_target=trg_stats['log_f0s_mean'],
                std_log_target=trg_stats['log_f0s_std'])

            coded_sp_norm = (coded_sp - src_stats['coded_sps_mean']
                             ) / src_stats['coded_sps_std']
            coded_sp_norm_tensor = torch.FloatTensor(
                coded_sp_norm.T).unsqueeze_(0).unsqueeze_(1).to(device)

            # coded_sp_converted_norm = G(coded_sp_norm_tensor, trg_embed).data.cpu().numpy()
            coded_sp_converted_norm = G.forward_with_trg_embed(
                coded_sp_norm_tensor, trg_embed)
            coded_sp_converted_norm = coded_sp_converted_norm.data.cpu().numpy(
            )
            coded_sp_converted = np.squeeze(coded_sp_converted_norm).T
            coded_sp_converted = coded_sp_converted * trg_stats[
                'coded_sps_std'] + trg_stats['coded_sps_mean']
            coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
            coded_sp_converted = coded_sp_converted.astype('double')
            wav_transformed = utils.world_speech_synthesis(
                f0=f0_converted,
                coded_sp=coded_sp_converted,
                ap=ap,
                fs=utils.SAMPLING_RATE,
                frame_period=utils.FRAME_PERIOD)

            output_path = converted_dir.joinpath(
                'src_' + os.path.basename(src_wav_files[i]))
            print(f'Saving to {output_path}')
            librosa.output.write_wav(output_path, wav_transformed,
                                     utils.SAMPLING_RATE)
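# A hedged usage sketch (VCTK-style paths are placeholders): convert a sample of
# utterances from a source-speaker directory toward a single target utterance.
convert('/path/to/VCTK/wav48/p225', '/path/to/VCTK/wav48/p226/p226_003.wav')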
Example No. 30
    input_root = args.input_root
    output_root = args.output_root
    os.makedirs(output_root, exist_ok=True)
    input_dir_names = sorted(os.listdir(input_root))

    gpu = args.gpu

    # ================ separation ================
    for idx, f in enumerate(input_dir_names):
        input_dir = os.path.join(input_root, f)
        print("Processing {}...".format(input_dir), end="")
        save_dir = os.path.join(output_root, f)
        os.makedirs(save_dir, exist_ok=True)

        # Input data and resample
        mix = utils.load_wav(input_dir, STFTPara['fs'])
        ns = mix.shape[1]

        # STFT
        frames_ = np.floor((mix.shape[0] + 2 * STFTPara['window_shift']) /
                           STFTPara['window_shift'])  # to meet NOLA
        frames = int(np.ceil(frames_ / 8) * 8)

        X = np.zeros(
            (int(STFTPara['window_size'] / 2 + 1), int(frames), mix.shape[1]),
            dtype=np.complex128)
        for n in range(mix.shape[1]):
            f, t, X[:, :int(frames_), n] = signal.stft(
                mix[:, n],
                nperseg=STFTPara['window_size'],
                window=STFTPara['type'],