import os

import torch
from scipy.io.wavfile import write

import nv_wavenet
import utils


def main(mel_files, model_filename, output_dir, batch_size, implementation):
    mel_files = utils.files_to_list(mel_files)
    model = torch.load(model_filename)['model']
    wavenet = nv_wavenet.NVWaveNet(**(model.export_weights()))

    for files in chunker(mel_files, batch_size):
        mels = []
        for file_path in files:
            print(file_path)
            mel = torch.load(file_path)
            mel = utils.to_gpu(mel)
            mels.append(torch.unsqueeze(mel, 0))
        cond_input = model.get_cond_input(torch.cat(mels, 0))
        audio_data = wavenet.infer(cond_input, implementation)

        for i, file_path in enumerate(files):
            file_name = os.path.splitext(os.path.basename(file_path))[0]
            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              wavenet.A)
            # Scale from [-1, 1] to 16-bit PCM range exactly once
            # (utils.MAX_WAV_VALUE == 32768.0).
            audio = utils.MAX_WAV_VALUE * audio
            wavdata = audio.astype('int16')
            write("{}/{}.wav".format(output_dir, file_name), 16000, wavdata)
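
# `chunker` is called above but not defined in this excerpt. A minimal
# sketch of the batching helper it implies: yield consecutive slices of
# at most `size` elements from a sequence.
def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))
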
def main(audio_files, model_filename, output_dir, batch_size, speaker_id,
         implementation):
    audio_files = utils.files_to_list(audio_files)
    model = torch.load(model_filename)['model']
    model.eval()
    wavenet = nv_wavenet.NVWaveNet(
        **(model.decoders[speaker_id].export_weights()))

    for files in chunker(audio_files, batch_size):
        audio_batch = []
        for file_path in files:
            print(file_path)
            audio, sampling_rate = utils.load_wav_to_torch(file_path)
            if sampling_rate != 16000:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, 16000))
            audio = utils.mu_law_encode(audio / utils.MAX_WAV_VALUE, 256)
            audio = utils.to_gpu(audio)
            audio_batch.append(torch.unsqueeze(audio, 0))
        latent = model.get_latent_input(torch.cat(audio_batch, 0))
        cond_input = model.decoders[speaker_id].get_cond_input(latent)
        audio_data = wavenet.infer(cond_input, implementation)

        for i, file_path in enumerate(files):
            file_name = os.path.splitext(os.path.basename(file_path))[0]
            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              wavenet.A)
            audio = utils.MAX_WAV_VALUE * audio
            wavdata = audio.astype('int16')
            write("{}/{}.wav".format(output_dir, file_name), 16000, wavdata)
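
# utils.mu_law_encode / utils.mu_law_decode_numpy are referenced above but
# not shown. A minimal sketch of standard mu-law companding, assuming
# signals normalized to [-1, 1] and `quantization` levels (256 here):
import numpy as np
import torch


def mu_law_encode(signal, quantization):
    mu = quantization - 1
    signal = torch.clamp(signal, -1.0, 1.0)
    magnitude = torch.log1p(mu * torch.abs(signal)) / np.log1p(mu)
    signal = torch.sign(signal) * magnitude
    # Map [-1, 1] to integer codes [0, mu].
    return ((signal + 1) / 2 * mu + 0.5).long()


def mu_law_decode_numpy(codes, quantization):
    mu = quantization - 1
    signal = 2.0 * (codes.astype(np.float32) / mu) - 1.0
    magnitude = (1.0 / mu) * ((1.0 + mu) ** np.abs(signal) - 1.0)
    return np.sign(signal) * magnitude
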
def main(input_files, model_dir, output_dir, batch_size, implementation,
         data_config, audio_config, preload_mels=False):
    model_filename = get_latest_checkpoint(model_dir)
    print("Model path: {}".format(model_filename))
    model = torch.load(model_filename)['model']
    wavenet = nv_wavenet.NVWaveNet(**(model.export_weights()))
    print("Wavenet num layers: {}, max_dilation: {}".format(
        wavenet.num_layers, wavenet.max_dilation))
    writer = SummaryWriter(output_dir)
    mel_extractor = Mel2SampOnehot(audio_config=audio_config, **data_config)
    input_files = utils.files_to_list(input_files)
    audio_processor = AudioProcessor(audio_config)

    for files in chunker(input_files, batch_size):
        mels = []
        for i, file_path in enumerate(files):
            if preload_mels:
                # Preloaded entries are (mel_path, ...) tuples.
                mel = np.load(file_path[0]).T
                mel = torch.from_numpy(mel)
                mel = utils.to_gpu(mel)
            else:
                audio, _ = utils.load_wav_to_torch(file_path)
                file_name = os.path.splitext(os.path.basename(file_path))[0]
                writer.add_audio("eval_true/{}/{}".format(i, file_name),
                                 audio / utils.MAX_WAV_VALUE, 0, 22050)
                mel = mel_extractor.get_mel(audio)
                mel = mel.t().cuda()
            mels.append(torch.unsqueeze(mel, 0))
        mels = torch.cat(mels, 0)
        cond_input = model.get_cond_input(mels)
        audio_data = wavenet.infer(cond_input, implementation)

        for i, file_path in enumerate(files):
            # Entries are tuples only when mels are preloaded; indexing
            # file_path[0] unconditionally breaks on plain paths.
            path = file_path[0] if preload_mels else file_path
            file_name = os.path.splitext(os.path.basename(path))[0]
            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              256)
            print("Range of {}.wav before deemphasis : {} to {}".format(
                file_name, audio.min(), audio.max()))
            if mel_extractor.apply_preemphasis:
                audio = audio.astype("float32")
                audio = audio_processor.deemphasis(audio[None, :])
                audio = audio.numpy()[0]
                print("Range of {}.wav after deemphasis : {} to {}".format(
                    file_name, audio.min(), audio.max()))
            audio = np.tanh(audio)  # soft-clip to [-1, 1] before writing
            output_filepath = os.path.join(output_dir,
                                           "{}.wav".format(file_name))
            assert audio.dtype in [np.float64, np.float32]
            assert np.abs(audio).max() <= 1
            writer.add_audio(output_filepath, audio, 0, 22050)
            audio = (audio * 32767).astype("int16")
            scipy.io.wavfile.write(output_filepath, 22050, audio)
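
# `get_latest_checkpoint` is referenced above but not defined in this
# excerpt. A minimal sketch, assuming checkpoints live directly under
# model_dir and "latest" means most recently modified:
import glob
import os


def get_latest_checkpoint(model_dir):
    checkpoints = glob.glob(os.path.join(model_dir, "*"))
    if not checkpoints:
        raise FileNotFoundError("No checkpoints in {}".format(model_dir))
    return max(checkpoints, key=os.path.getmtime)
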
def __init__(self, training_files, segment_length, mu_quantization,
             filter_length, hop_length, win_length, sampling_rate):
    audio_files = utils.files_to_list(training_files)
    self.audio_files = audio_files
    random.seed(1234)
    random.shuffle(self.audio_files)
    self.segment_length = segment_length
    self.mu_quantization = mu_quantization
    self.sampling_rate = sampling_rate
    # filter_length, hop_length, and win_length are accepted for config
    # compatibility but unused by this raw-audio loader.
def __init__(self, audio_files, mu_quantization, no_chunks, audio_config,
             segment_length, use_tf=False, use_lws=True, load_mel=False,
             verbose=False):
    audio_files = utils.files_to_list(audio_files)
    self.audio_files = audio_files
    random.seed(1234)
    random.shuffle(self.audio_files)

    if not load_mel:
        if use_tf:
            audio_processor_cls = AudioProcessor
        elif use_lws:
            audio_processor_cls = LwsAudioProcessor
        else:
            raise ValueError(
                "Mel spectrum can be calculated only with tf or lws!")
        self.audio_processor = audio_processor_cls(audio_config)

    self.mu_quantization = mu_quantization
    self.segment_length = segment_length
    audio_params = AudioProcessor._load_params(audio_config)
    if verbose:
        print("Audio params:")
        pprint(audio_params)
    self.audio_params = audio_params
    self.window_size = audio_params["window_size"]
    self.preemphasis_coeff = audio_params["preemphasis_coef"]
    self.apply_preemphasis = audio_params["apply_preemphasis"]
    self.window_step = audio_params["window_step"]
    self.sample_rate = audio_params["sample_rate"]
    # Number of mel frames that fit in one audio segment.
    self.mel_segment_length = int(
        np.ceil((segment_length - self.window_size) / self.window_step))
    self.num_mels = audio_params["num_mel_bins"]
    self.use_tf = use_tf
    self.load_mel = load_mel
    self.no_chunks = no_chunks
    self.use_lws = use_lws
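
# Worked example of the mel_segment_length computation above, with
# illustrative values (not from the source config): a 16000-sample
# segment, 1024-sample window, and 256-sample hop give
# ceil((16000 - 1024) / 256) = ceil(58.5) = 59 mel frames per segment.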
def __init__(self, training_files, segment_length, mu_quantization,
             filter_length, hop_length, win_length, sampling_rate,
             mel_fmin, mel_fmax):
    audio_files = utils.files_to_list(training_files)
    self.audio_files = audio_files
    random.seed(1234)
    random.shuffle(self.audio_files)
    # A config value of -1 is a sentinel for "no upper mel frequency bound".
    mel_fmax = None if mel_fmax == -1 else mel_fmax
    self.stft = TacotronSTFT(filter_length=filter_length,
                             hop_length=hop_length,
                             win_length=win_length,
                             sampling_rate=sampling_rate,
                             mel_fmin=mel_fmin,
                             mel_fmax=mel_fmax)
    self.segment_length = segment_length
    self.mu_quantization = mu_quantization
    self.sampling_rate = sampling_rate
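
# A sketch of how a loader like this typically turns raw audio into mel
# frames with the TacotronSTFT built above. mel_spectrogram is the method
# name from the Tacotron 2 reference implementation; normalizing by
# utils.MAX_WAV_VALUE is an assumption carried over from the inference
# entry points earlier in this listing.
def get_mel(self, audio):
    audio_norm = (audio / utils.MAX_WAV_VALUE).unsqueeze(0)  # (1, T) in [-1, 1]
    melspec = self.stft.mel_spectrogram(audio_norm)          # (1, n_mels, T')
    return torch.squeeze(melspec, 0)
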
parser = argparse.ArgumentParser()
parser.add_argument('-a', "--audio_list", required=True, type=str,
                    help='File containing list of wavefiles')
parser.add_argument('-o', "--output_dir", required=True, type=str,
                    help='Directory to put Mel-Spectrogram Tensors')
parser.add_argument('-c', '--config', type=str,
                    help='JSON file for configuration')
args = parser.parse_args()

filepaths = utils.files_to_list(args.audio_list)

# Make directory if it doesn't exist
if not os.path.isdir(args.output_dir):
    os.makedirs(args.output_dir)
    os.chmod(args.output_dir, 0o775)

# Parse config; only the data-processing section is used here.
with open(args.config) as f:
    config = json.loads(f.read())
data_config = config["data_config"]
mel_factory = Mel2SampOnehot(**data_config)

for filepath in filepaths:
    audio, sampling_rate = utils.load_wav_to_torch(filepath)
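    # The rest of the loop is truncated in this excerpt. A plausible
    # completion, assuming Mel2SampOnehot exposes get_mel and that mels
    # are saved as .pt tensors named after the source wav (both are
    # assumptions): compute the mel and write it to the output directory.
    mel = mel_factory.get_mel(audio)
    file_name = os.path.splitext(os.path.basename(filepath))[0]
    torch.save(mel, os.path.join(args.output_dir, file_name + ".pt"))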