import pickle

import numpy as np


def main():
    # Load all data and labels (the pickle holds two objects back to back)
    with open('music_genres_dataset.pkl', 'rb') as f:
        train_set = pickle.load(f)
        test_set = pickle.load(f)
    train_set_data = train_set['data']
    train_set_labels = train_set['labels']
    train_set_id = train_set['track_id']
    test_set_data = test_set['data']
    test_set_labels = test_set['labels']
    test_set_id = test_set['track_id']

    # Convert audio data to mel spectrograms
    train_mel = []
    for i in range(np.shape(train_set_data)[0]):
        train_mel.append(melspectrogram(train_set_data[i][:]))
        print(i / 11250)  # crude progress indicator (training set size)
    test_mel = []
    for i in range(np.shape(test_set_data)[0]):
        test_mel.append(melspectrogram(test_set_data[i][:]))
        print(i / 3750)  # crude progress indicator (test set size)
    return train_mel, test_mel, train_set_labels, test_set_labels, test_set_id

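# Every snippet in this section calls a melspectrogram() helper that is not
# shown. Below is a minimal sketch of what such a helper typically looks like,
# built on librosa; the parameter names mirror the calls in this section, but
# the defaults and the log/normalization scheme are assumptions, not the
# originals' actual values.
import librosa
import numpy as np


def melspectrogram(wav, sample_rate=22050, num_mels=80, num_fft=2048,
                   hop_length=275, win_length=1100, fmin=40, preemph=0.97,
                   min_level_db=-100):
    # optional pre-emphasis filter: y[n] = x[n] - preemph * x[n-1]
    wav = np.append(wav[0], wav[1:] - preemph * wav[:-1])
    # linear-frequency STFT magnitudes
    spec = np.abs(librosa.stft(wav, n_fft=num_fft, hop_length=hop_length,
                               win_length=win_length))
    # project onto a mel filterbank
    mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=num_fft,
                                    n_mels=num_mels, fmin=fmin)
    mel = np.dot(mel_basis, spec)
    # log-compress and normalize to [0, 1] relative to min_level_db
    mel_db = 20 * np.log10(np.maximum(1e-5, mel))
    return np.clip((mel_db - min_level_db) / -min_level_db, 0, 1).T
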
def process_wav(wav_path, audio_path, mel_path, params):
    wav = load_wav(wav_path, sample_rate=params["preprocessing"]["sample_rate"])
    # peak-normalize to just below 1.0
    wav = wav / np.abs(wav).max() * 0.999

    mel = melspectrogram(wav,
                         sample_rate=params["preprocessing"]["sample_rate"],
                         num_mels=params["preprocessing"]["num_mels"],
                         num_fft=params["preprocessing"]["num_fft"],
                         preemph=params["preprocessing"]["preemph"],
                         min_level_db=params["preprocessing"]["min_level_db"],
                         hop_length=params["preprocessing"]["hop_length"],
                         win_length=params["preprocessing"]["win_length"],
                         fmin=params["preprocessing"]["fmin"])

    # pad the waveform so it covers exactly hop_length samples per mel frame
    length_diff = len(mel) * params["preprocessing"]["hop_length"] - len(wav)
    wav = np.pad(wav, (0, length_diff), "constant")

    # pad both features so random training slices stay within bounds
    pad = (params["vocoder"]["sample_frames"]
           - params["vocoder"]["audio_slice_frames"]) // 2
    mel = np.pad(mel, ((pad,), (0,)), "constant")
    wav = np.pad(wav, (pad * params["preprocessing"]["hop_length"],), "constant")

    wav = mulaw_encode(wav, mu=2 ** params["preprocessing"]["bits"])

    # the speaker id is the file-name prefix, e.g. "p225_001.wav" -> "p225"
    speaker = os.path.splitext(os.path.split(wav_path)[-1])[0].split("_")[0]

    np.save(audio_path, wav)
    np.save(mel_path, mel)
    return speaker, audio_path, mel_path, len(mel)

def process_wav(dataset, wav_path, audio_path, mel_path, params):
    """Convert wav_path into a speaker id and save the processed features
    to the given paths."""
    # load_wav resamples to the target rate internally (via librosa)
    wav = load_wav(wav_path, sample_rate=params["preprocessing"]["sample_rate"])
    # peak-normalize to just below 1.0
    wav = wav / np.abs(wav).max() * 0.999

    mel = melspectrogram(wav,
                         sample_rate=params["preprocessing"]["sample_rate"],
                         preemph=params["preprocessing"]["preemph"],
                         num_mels=params["preprocessing"]["num_mels"],
                         num_fft=params["preprocessing"]["num_fft"],
                         min_level_db=params["preprocessing"]["min_level_db"],
                         hop_length=params["preprocessing"]["hop_length"],
                         win_length=params["preprocessing"]["win_length"],
                         fmin=params["preprocessing"]["fmin"])

    # pad the waveform so it covers exactly hop_length samples per mel frame
    length_diff = len(mel) * params["preprocessing"]["hop_length"] - len(wav)
    wav = np.pad(wav, (0, length_diff), "constant")

    pad = (params["vocoder"]["sample_frames"]
           - params["vocoder"]["audio_slice_frames"]) // 2
    mel = np.pad(mel, ((pad,), (0,)), "constant")
    wav = np.pad(wav, (pad * params["preprocessing"]["hop_length"],), "constant")

    wav = mulaw_encode(wav, mu=2 ** params["preprocessing"]["bits"])

    # speaker id acquisition (dataset-specific lookup)
    speaker = get_speakerid(wav_path, dataset)

    # save processed data
    np.save(audio_path, wav)
    np.save(mel_path, mel)
    return speaker, audio_path, mel_path, len(mel)

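# Both process_wav variants above quantize the waveform with mulaw_encode(),
# which is not defined in this section. A minimal sketch of standard mu-law
# companding with uniform quantization (assumed behavior, mu = 2**bits):
import numpy as np


def mulaw_encode(wav, mu):
    """Map float audio in [-1, 1] to integer classes in [0, mu - 1]."""
    mu = mu - 1
    # compress: sign(x) * ln(1 + mu * |x|) / ln(1 + mu)
    compressed = np.sign(wav) * np.log1p(mu * np.abs(wav)) / np.log1p(mu)
    # shift from [-1, 1] to [0, mu] and round to the nearest class
    return np.floor((compressed + 1) / 2 * mu + 0.5).astype(np.int64)
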
def _process_utterance(output_dir, chunk, wav_name, target_class, index):
    mel_spectrogram = utils.melspectrogram(chunk).astype(np.float32)
    mel_filename = "mel-%s-%s.npy" % (wav_name.split('.')[0], index)
    trimmed_wav_name = "%s-%s.wav" % (wav_name.split('.')[0], index)
    utils.save_feature(mel_spectrogram.T, os.path.join(output_dir, mel_filename))
    # utils.save_wav(chunk, os.path.join(output_dir, trimmed_wav_name))
    return (trimmed_wav_name, mel_filename, target_class)

def preprocess_one(wav):
    """Divide the wav into fixed-size chunks and compute a mel spectrogram
    for each chunk."""
    chunked_features = []
    for (start, end) in utils.windows(wav, hparams.window_size):
        chunk = wav[start:end]
        if len(chunk) != hparams.window_size:
            chunk = utils.pad_chunk(chunk, wav)
        mel_spectrogram = utils.melspectrogram(chunk).astype(np.float32)
        chunked_features.append(mel_spectrogram.T)
    return np.array(chunked_features)

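# preprocess_one() relies on utils.windows() to produce (start, end) index
# pairs. A plausible sketch, assuming half-window overlap between chunks
# (the original hop size may differ):
def windows(wav, window_size):
    start = 0
    while start < len(wav):
        yield start, start + window_size
        start += window_size // 2
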
def gen_from_wav(model, wav_path, output):
    wav = load_wav(wav_path, params["preprocessing"]["sample_rate"], trim=False)
    # peak-normalize to just below 1.0
    wav = wav / np.abs(wav).max() * 0.999
    mel = melspectrogram(wav,
                         sample_rate=params["preprocessing"]["sample_rate"],
                         preemph=params["preprocessing"]["preemph"],
                         num_mels=params["preprocessing"]["num_mels"],
                         num_fft=params["preprocessing"]["num_fft"],
                         min_level_db=params["preprocessing"]["min_level_db"],
                         ref_level_db=params["preprocessing"]["ref_level_db"],
                         hop_length=params["preprocessing"]["hop_length"],
                         fmin=params["preprocessing"]["fmin"],
                         fmax=params["preprocessing"]["fmax"])
    gen_from_mel(model, mel, output)

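# gen_from_mel() is not shown. A plausible companion, mirroring the
# generation code later in this section (the model.generate() API and the
# params dict are assumptions carried over from those snippets):
import torch


def gen_from_mel(model, mel, output):
    device = next(model.parameters()).device
    mel = torch.FloatTensor(mel).unsqueeze(0).to(device)  # add batch dim
    wav = model.generate(mel)
    save_wav(output, wav, params["preprocessing"]["sample_rate"])
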
def main():
    parser = argparse.ArgumentParser('PreprocessingParser')
    parser.add_argument('--data_dir', type=str, help='data root directory')
    parser.add_argument('--save_dir', type=str,
                        help='extracted feature save directory')
    parser.add_argument('--dev_rate', type=float, help='dev set rate',
                        default=0.05)
    parser.add_argument('--test_rate', type=float, help='test set rate',
                        default=0.05)
    args = parser.parse_args()

    # args validation
    if args.dev_rate < 0 or args.dev_rate >= 1:
        raise ValueError('dev rate should be in [0, 1)')
    if args.test_rate < 0 or args.test_rate >= 1:
        raise ValueError('test rate should be in [0, 1)')
    if args.test_rate + args.dev_rate >= 1:
        raise ValueError('dev rate + test rate should not be >= 1.')
    if not os.path.isdir(args.data_dir):
        raise FileNotFoundError('Directory {} not found!'.format(args.data_dir))
    if not os.path.isdir(args.save_dir):
        os.makedirs(args.save_dir)

    # one subdirectory per feature type
    mel_dir = os.path.join(args.save_dir, 'mels')
    os.makedirs(mel_dir, exist_ok=True)
    linear_dir = os.path.join(args.save_dir, 'linears')
    os.makedirs(linear_dir, exist_ok=True)
    f0_dir = os.path.join(args.save_dir, 'f0s')
    os.makedirs(f0_dir, exist_ok=True)
    ppg_dir = os.path.join(args.save_dir, 'ppgs')
    os.makedirs(ppg_dir, exist_ok=True)

    # start from clean meta files
    for mode in ['train', 'dev', 'test']:
        meta_path = os.path.join(args.save_dir, "{}_meta.csv".format(mode))
        if os.path.isfile(meta_path):
            os.remove(meta_path)

    wav_files = []
    for rootdir, subdir, files in os.walk(args.data_dir):
        for f in files:
            if f.endswith('.wav'):
                wav_files.append(os.path.join(rootdir, f))
    random.shuffle(wav_files)

    print('Set up PPGs extraction network')
    # Set up network
    ppg_extractor_hps = hps.PPGExtractor.CNNBLSTMClassifier
    mfcc_pl = tf.placeholder(dtype=tf.float32,
                             shape=[None, None, 3 * hps.Audio.n_mfcc],
                             name='mfcc_pl')
    ppg_extractor = CNNBLSTMClassifier(
        out_dims=hps.Audio.ppg_dim,
        n_cnn=ppg_extractor_hps.n_cnn,
        cnn_hidden=ppg_extractor_hps.cnn_hidden,
        cnn_kernel=ppg_extractor_hps.cnn_kernel,
        n_blstm=ppg_extractor_hps.n_blstm,
        lstm_hidden=ppg_extractor_hps.lstm_hidden)
    predicted_ppgs = ppg_extractor(inputs=mfcc_pl)['logits']

    # set up a session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    # load saved model
    saver = tf.train.Saver()
    print('Restoring ppgs extractor from {}'.format(ppg_extractor_hps.ckpt))
    saver.restore(sess, ppg_extractor_hps.ckpt)

    print('Extracting mel-spectrograms, spectrograms and log-f0s...')
    train_set = []
    dev_set = []
    test_set = []
    dev_start_idx = int(len(wav_files) * (1 - args.dev_rate - args.test_rate))
    test_start_idx = int(len(wav_files) * (1 - args.test_rate))
    for i, wav_f in tqdm(enumerate(wav_files)):
        try:
            wav_arr = load_wav(wav_f)
        except Exception:
            continue
        pre_emphasized_wav = _preemphasize(wav_arr)
        fid = '{}_{}'.format(
            wav_f.split('/')[-3].split('_')[2],
            wav_f.split('/')[-1].split('.')[0].split('_')[1])
        # extract mel-spectrograms
        mel_fn = os.path.join(mel_dir, '{}.npy'.format(fid))
        try:
            mel_spec = melspectrogram(pre_emphasized_wav).astype(np.float32).T
        except Exception:
            continue
        # extract spectrograms
        linear_fn = os.path.join(linear_dir, '{}.npy'.format(fid))
        try:
            linear_spec = spectrogram(pre_emphasized_wav).astype(np.float32).T
        except Exception:
            continue
        # extract log-f0s
        f0_fn = os.path.join(f0_dir, '{}.npy'.format(fid))
        log_f0 = logf0(wav_f)
        try:
            log_f0 = lf0_normailze(log_f0)
        except Exception:
            continue
        # extract ppgs
        mfcc_feats = wav2unnormalized_mfcc(wav_arr)
        ppg = sess.run(predicted_ppgs,
                       feed_dict={mfcc_pl: np.expand_dims(mfcc_feats, axis=0)})
        ppg = softmax(np.squeeze(ppg, axis=0))
        ppg_fn = os.path.join(ppg_dir, '{}.npy'.format(fid))

        # align all features to a common length, then save
        mel_spec, linear_spec, log_f0, ppg = length_validate(
            (mel_spec, linear_spec, log_f0, ppg))
        np.save(mel_fn, mel_spec)
        np.save(linear_fn, linear_spec)
        np.save(f0_fn, log_f0)
        np.save(ppg_fn, ppg)

        # append to the split's meta csv
        meta_line = ('{}|ppgs/{}.npy|mels/{}.npy|linears/{}.npy|f0s/{}.npy\n'
                     .format(fid, fid, fid, fid, fid))
        if i < dev_start_idx:
            train_set.append(fid)
            with open(os.path.join(args.save_dir, 'train_meta.csv'), 'a',
                      encoding='utf-8') as train_f:
                train_f.write(meta_line)
        elif i < test_start_idx:
            dev_set.append(fid)
            with open(os.path.join(args.save_dir, 'dev_meta.csv'), 'a',
                      encoding='utf-8') as dev_f:
                dev_f.write(meta_line)
        else:
            test_set.append(fid)
            with open(os.path.join(args.save_dir, 'test_meta.csv'), 'a',
                      encoding='utf-8') as test_f:
                test_f.write(meta_line)
    print('Done extracting features!')
    return

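# softmax() above is a project helper; a standard numerically-stable
# implementation (assumed behavior):
import numpy as np


def softmax(x, axis=-1):
    # subtract the max for numerical stability before exponentiating
    e = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e / np.sum(e, axis=axis, keepdims=True)
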
# (this snippet begins mid-way through the model's constructor call)
    bits=params["preprocessing"]["bits"],
    hop_length=params["preprocessing"]["hop_length"],
    nc=args.nc,
    device=device)
model.to(device)

print("Load checkpoint from: {}".format(args.checkpoint))
checkpoint = torch.load(args.checkpoint,
                        map_location=lambda storage, loc: storage)
model.load_state_dict(checkpoint["model"])
model_step = checkpoint["step"]

wav = load_wav(args.wav_path, params["preprocessing"]["sample_rate"])
utterance_id = os.path.basename(args.wav_path).split(".")[0]
# peak-normalize to just below 1.0
wav = wav / np.abs(wav).max() * 0.999
mel = melspectrogram(wav,
                     sample_rate=params["preprocessing"]["sample_rate"],
                     preemph=params["preprocessing"]["preemph"],
                     num_mels=params["preprocessing"]["num_mels"],
                     num_fft=params["preprocessing"]["num_fft"],
                     min_level_db=params["preprocessing"]["min_level_db"],
                     hop_length=params["preprocessing"]["hop_length"],
                     win_length=params["preprocessing"]["win_length"],
                     fmin=params["preprocessing"]["fmin"])

mel = torch.FloatTensor(mel).unsqueeze(0).to(device)  # add batch dim
output = model.generate(mel)
path = os.path.join(
    args.gen_dir,
    "gen_{}_model_steps_{}.wav".format(utterance_id, model_step))
save_wav(path, output, params["preprocessing"]["sample_rate"])

parser.add_argument(
    'weight_path',
    help="Path of checkpoint (ex: ./result/weights/wavenet_0800)")
args = parser.parse_args()


def synthesize(mel_sp, save_path, weight_path):
    wavenet = WaveNet(hparams.num_mels, hparams.upsample_scales)
    wavenet.load_weights(weight_path)
    mel_sp = tf.expand_dims(mel_sp, axis=0)
    outputs = wavenet.synthesis(mel_sp)
    outputs = np.squeeze(outputs)
    outputs = inv_mulaw_quantize(outputs)
    save_wav(outputs, save_path, hparams.sampling_rate)


if __name__ == '__main__':
    wav = load_wav(args.input_path, hparams.sampling_rate)
    wav = normalize(wav) * 0.95
    mel_sp = melspectrogram(wav, hparams.sampling_rate, hparams.num_mels,
                            n_fft=hparams.n_fft,
                            hop_size=hparams.hop_size,
                            win_size=hparams.win_size)
    synthesize(mel_sp, args.output_path, args.weight_path)

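# inv_mulaw_quantize() undoes mu-law quantization at the vocoder output.
# A minimal sketch of the standard inverse transform (assumed behavior,
# with mu = 256 for 8-bit audio):
import numpy as np


def inv_mulaw_quantize(quantized, mu=256):
    """Map integer classes in [0, mu - 1] back to float audio in [-1, 1]."""
    mu = mu - 1
    # undo the quantization shift back to [-1, 1]
    x = 2 * quantized.astype(np.float32) / mu - 1.0
    # expand: sign(x) * ((1 + mu)^|x| - 1) / mu
    return np.sign(x) * np.expm1(np.abs(x) * np.log1p(mu)) / mu
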
def main():
    hps = Hparams
    parser = argparse.ArgumentParser('VC inference')
    parser.add_argument('--src_wav', type=str, help='source wav file path')
    parser.add_argument('--ckpt', type=str, help='model ckpt path')
    parser.add_argument('--save_dir', type=str,
                        help='synthesized wav save directory')
    args = parser.parse_args()

    # 0. load the source wav and pre-emphasize it
    src_wav_arr = load_wav(args.src_wav)
    pre_emphasized_wav = _preemphasize(src_wav_arr)

    # 1. extract ppgs
    ppg_extractor_hps = hps.PPGExtractor.CNNBLSTMClassifier
    mfcc_pl = tf.placeholder(dtype=tf.float32,
                             shape=[None, None, 3 * hps.Audio.n_mfcc],
                             name='mfcc_pl')
    ppg_extractor = CNNBLSTMClassifier(
        out_dims=hps.Audio.ppg_dim,
        n_cnn=ppg_extractor_hps.n_cnn,
        cnn_hidden=ppg_extractor_hps.cnn_hidden,
        cnn_kernel=ppg_extractor_hps.cnn_kernel,
        n_blstm=ppg_extractor_hps.n_blstm,
        lstm_hidden=ppg_extractor_hps.lstm_hidden)
    predicted_ppgs = ppg_extractor(inputs=mfcc_pl)['logits']

    # set up a session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    # load saved model
    saver = tf.train.Saver()
    print('Restoring ppgs extractor from {}'.format(ppg_extractor_hps.ckpt))
    saver.restore(sess, ppg_extractor_hps.ckpt)
    mfcc_feats = wav2unnormalized_mfcc(src_wav_arr)
    ppg = sess.run(predicted_ppgs,
                   feed_dict={mfcc_pl: np.expand_dims(mfcc_feats, axis=0)})
    sess.close()
    ppg = softmax(np.squeeze(ppg, axis=0))

    # 2. extract lf0 and mel-spectrogram
    log_f0 = logf0(args.src_wav)
    log_f0 = lf0_normailze(log_f0)
    # the mel-spectrogram is extracted only for comparison
    mel_spec = melspectrogram(pre_emphasized_wav).astype(np.float32).T

    # 3. prepare inputs
    min_len = min(log_f0.shape[0], ppg.shape[0])
    vc_inputs = np.concatenate([ppg[:min_len, :], log_f0[:min_len, :]], axis=1)
    vc_inputs = np.expand_dims(vc_inputs, axis=1)  # [time, batch, dim]

    # 4. set up the vc model and run inference
    model = BLSTMConversionModel(
        in_channels=hps.Audio.ppg_dim + 2,
        out_channels=hps.Audio.num_mels,
        lstm_hidden=hps.BLSTMConversionModel.lstm_hidden)
    device = torch.device('cpu')
    model.load_state_dict(torch.load(args.ckpt, map_location=device))
    model.eval()
    predicted_mels = model(torch.tensor(vc_inputs))
    predicted_mels = np.squeeze(predicted_mels.detach().numpy(), axis=1)

    # 5. synthesize wavs: one from the converted mels, one resynthesized
    # from the source mels as a reference
    synthesized_wav = inv_preemphasize(inv_mel_spectrogram(predicted_mels.T))
    resynthesized_wav = inv_preemphasize(inv_mel_spectrogram(mel_spec.T))
    ckpt_name = args.ckpt.split('/')[-1].split('.')[0]
    wav_name = args.src_wav.split('/')[-1].split('.')[0]
    save_wav(synthesized_wav,
             os.path.join(args.save_dir,
                          '{}-{}-converted.wav'.format(wav_name, ckpt_name)))
    save_wav(resynthesized_wav,
             os.path.join(args.save_dir,
                          '{}-{}-src-resyn.wav'.format(wav_name, ckpt_name)))
    return

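# Neither inv_mel_spectrogram() nor inv_preemphasize() is shown above. Rough
# sketches of what they could look like: librosa's Griffin-Lim-based mel
# inversion and the standard inverse pre-emphasis IIR filter. The original
# inv_mel_spectrogram almost certainly also undoes the log/normalization
# applied at extraction time; the parameter defaults here are assumptions.
import librosa
import numpy as np
from scipy.signal import lfilter


def inv_mel_spectrogram(mel, sample_rate=16000, num_fft=1024, hop_length=256):
    # mel: (num_mels, frames) linear-amplitude mel spectrogram
    return librosa.feature.inverse.mel_to_audio(
        mel, sr=sample_rate, n_fft=num_fft, hop_length=hop_length)


def inv_preemphasize(wav, preemph=0.97):
    # inverse of the pre-emphasis filter y[n] = x[n] - preemph * x[n-1]
    return lfilter([1], [1, -preemph], wav)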