def main():
    model = WaveNet()
    checkpoint = torch.load(
        'runs/Oct09_11-24-52_K-00030-LIN/checkpoint_9000.pth')
    model.load_state_dict(checkpoint['model'])
    weights = model.export_weights()
    wavenet = nv_wavenet.NVWaveNet(**weights)

    # TODO: experiment with batch size 1 for now
    # TODO: when inferring several utterances at once, the inputs must be
    # padded to a common length
    filename = 'data/arctic_a0001.wav'
    audio, sampling_rate = load_wav_to_torch(filename)
    mel = get_mel(audio)
    mel.unsqueeze_(0)
    print(mel.shape)

    # Reshape to match NVWaveNet's expected input:
    # (channels, batch=1, num_layers, samples)
    cond_input = get_cond_input(mel, model)

    # Generate the waveform.
    # The generated samples are still mu-law encoded and must be decoded.
    audio_data = wavenet.infer(cond_input, nv_wavenet.Impl.AUTO)
    print(audio_data.shape)
    print(audio_data.min(), audio_data.max())

    # wavenet.A is mu_quantization
    audio = mu_law_decode_numpy(audio_data[0].cpu().numpy(), wavenet.A)
    audio = MAX_WAV_VALUE * audio
    wavdata = audio.astype('int16')
    scipy.io.wavfile.write('gen.wav', 16000, wavdata)
def main(mel_files, model_filename, output_dir, batch_size, implementation):
    mel_files = utils.files_to_list(mel_files)
    model = torch.load(model_filename)['model']
    wavenet = nv_wavenet.NVWaveNet(**(model.export_weights()))

    for files in chunker(mel_files, batch_size):
        mels = []
        for file_path in files:
            print(file_path)
            mel = torch.load(file_path)
            mel = utils.to_gpu(mel)
            mels.append(torch.unsqueeze(mel, 0))
        cond_input = model.get_cond_input(torch.cat(mels, 0))
        audio_data = wavenet.infer(cond_input, implementation)

        for i, file_path in enumerate(files):
            file_name = os.path.splitext(os.path.basename(file_path))[0]
            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              wavenet.A)
            # Scale from [-1, 1] to 16-bit range once (MAX_WAV_VALUE = 32768.0);
            # the original additionally multiplied by 32768.0 a second time,
            # which double-scales the output.
            audio = utils.MAX_WAV_VALUE * audio
            wavdata = audio.astype('int16')
            write("{}/{}.wav".format(output_dir, file_name), 16000, wavdata)
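# The chunker() helper used by these inference loops is not shown in this
# collection. A minimal sketch, assuming it simply yields consecutive slices of
# at most `size` files (so the last batch may be smaller than batch_size);
# this is a stand-in, not necessarily the repository's exact implementation.
def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))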
def main(audio_files, model_filename, output_dir, batch_size, speaker_id,
         implementation):
    audio_files = utils.files_to_list(audio_files)
    model = torch.load(model_filename)['model']
    model.eval()
    wavenet = nv_wavenet.NVWaveNet(
        **(model.decoders[speaker_id].export_weights()))

    for files in chunker(audio_files, batch_size):
        audio_ = []
        for file_path in files:
            print(file_path)
            audio, sampling_rate = utils.load_wav_to_torch(file_path)
            if sampling_rate != 16000:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, 16000))
            audio = utils.mu_law_encode(audio / utils.MAX_WAV_VALUE, 256)
            audio = utils.to_gpu(audio)
            audio_.append(torch.unsqueeze(audio, 0))

        latent = model.get_latent_input(torch.cat(audio_, 0))
        cond_input = model.decoders[speaker_id].get_cond_input(latent)
        audio_data = wavenet.infer(cond_input, implementation)

        for i, file_path in enumerate(files):
            file_name = os.path.splitext(os.path.basename(file_path))[0]
            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              wavenet.A)
            audio = utils.MAX_WAV_VALUE * audio
            wavdata = audio.astype('int16')
            write("{}/{}.wav".format(output_dir, file_name), 16000, wavdata)
def main(input_files, model_dir, output_dir, batch_size, implementation,
         data_config, audio_config, preload_mels=False):
    model_filename = get_latest_checkpoint(model_dir)
    print("Model path: {}".format(model_filename))
    model = torch.load(model_filename)['model']
    wavenet = nv_wavenet.NVWaveNet(**(model.export_weights()))
    print("Wavenet num layers: {}, max_dilation: {}".format(
        wavenet.num_layers, wavenet.max_dilation))

    writer = SummaryWriter(output_dir)
    mel_extractor = Mel2SampOnehot(audio_config=audio_config, **data_config)
    input_files = utils.files_to_list(input_files)
    audio_processor = AudioProcessor(audio_config)

    for j, files in enumerate(chunker(input_files, batch_size)):
        mels = []
        for i, file_path in enumerate(files):
            if preload_mels:
                mel = np.load(file_path[0]).T
                mel = torch.from_numpy(mel)
                mel = utils.to_gpu(mel)
            else:
                audio, _ = utils.load_wav_to_torch(file_path)
                file_name = os.path.splitext(os.path.basename(file_path))[0]
                writer.add_audio("eval_true/{}/{}".format(i, file_name),
                                 audio / utils.MAX_WAV_VALUE, 0, 22050)
                mel = mel_extractor.get_mel(audio)
                mel = mel.t().cuda()
            mels.append(torch.unsqueeze(mel, 0))

        mels = torch.cat(mels, 0)
        cond_input = model.get_cond_input(mels)
        audio_data = wavenet.infer(cond_input, implementation)

        for i, file_path in enumerate(files):
            file_name = os.path.splitext(os.path.basename(file_path[0]))[0]
            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              256)
            print("Range of {}.wav before deemphasis : {} to {}".format(
                file_name, audio.min(), audio.max()))
            if mel_extractor.apply_preemphasis:
                audio = audio.astype("float32")
                audio = audio_processor.deemphasis(audio[None, :])
                audio = audio.numpy()[0]
                print("Range of {}.wav after deemphasis : {} to {}".format(
                    file_name, audio.min(), audio.max()))
            audio = np.tanh(audio)
            output_filepath = "{}.wav".format(file_name)
            output_filepath = os.path.join(output_dir, output_filepath)
            assert audio.dtype in [np.float64, np.float32]
            assert (np.abs(audio)).max() <= 1
            writer.add_audio(output_filepath, audio, 0, 22050)
            audio = (audio * 32767).astype("int16")
            scipy.io.wavfile.write(output_filepath, 22050, audio)
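# get_latest_checkpoint() is referenced above but not defined in these
# excerpts. A minimal sketch, under the assumption that checkpoints are saved
# as "wavenet_<iteration>" files inside model_dir (as in the training loop
# further below); treat the filename parsing as an assumption, not the
# project's confirmed behavior.
import glob
import os

def get_latest_checkpoint(model_dir):
    checkpoints = glob.glob(os.path.join(model_dir, "wavenet_*"))
    if not checkpoints:
        raise FileNotFoundError("No checkpoints found in {}".format(model_dir))
    # Sort by the trailing iteration number and return the newest one.
    return max(checkpoints, key=lambda p: int(p.rsplit("_", 1)[-1]))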
def main(midi_files, model_filename, output_dir, batch_size, implementation):
    midi_files = utils.files_to_list(midi_files)
    model = torch.load(model_filename)['model']
    wavenet = nv_wavenet.NVWaveNet(**(model.export_weights()))

    for files in chunker(midi_files, batch_size):
        midi_batch = []
        for file_path in files:
            print(file_path)
            midi = torch.load(file_path)  # .pin_memory()
            midi = utils.to_gpu(midi)
            midi_batch.append(torch.unsqueeze(midi, 0))

        # Get conditional input for the inference wavenet
        cond_input = model.get_cond_input(torch.cat(midi_batch, 0))
        audio_data = wavenet.infer(cond_input, implementation)
        print(audio_data)
        print(audio_data.size())
        print(np.max(audio_data.cpu().numpy()))
        print(np.min(audio_data.cpu().numpy()))

        for i, file_path in enumerate(files):
            file_name = os.path.splitext(os.path.basename(file_path))[0]
            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              wavenet.A)
            write("{}/{}_infer_noMul.wav".format(output_dir, file_name),
                  16000, audio)
            print(audio.shape)
            print(np.max(audio))
            print(np.min(audio))
            audio = utils.MAX_WAV_VALUE * audio
            print(np.max(audio))
            print(np.min(audio))
            wavdata = audio.astype('int16')
            print(np.max(wavdata))
            print(np.min(wavdata))
            write("{}/{}_infer.wav".format(output_dir, file_name), 16000,
                  wavdata)
        # Debug: stop after the first batch.
        exit()
import torch
from scipy.io.wavfile import write
import numpy as np
import nv_wavenet

MAX_WAV_VALUE = 32768.0


def mu_law_decode_numpy(x, mu_quantization=256):
    assert np.max(x) <= mu_quantization
    assert np.min(x) >= 0
    mu = mu_quantization - 1.
    # Map values back to [-1, 1].
    signal = 2 * (x / mu) - 1
    # Perform inverse of mu-law transformation.
    magnitude = (1 / mu) * ((1 + mu)**np.abs(signal) - 1)
    return np.sign(signal) * magnitude


if __name__ == '__main__':
    model = torch.load("model.pt")
    wavenet = nv_wavenet.NVWaveNet(**model)
    cond_input = torch.load("cond_input.pt")
    samples = wavenet.infer(cond_input, nv_wavenet.Impl.PERSISTENT)
    audio = mu_law_decode_numpy(samples.cpu().numpy(), 256)
    audio = MAX_WAV_VALUE * audio
    wavdata = audio.astype('int16')
    write('audio.wav', 16000, wavdata)
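# For reference, a matching encoder for the decoder above. This is a sketch of
# what utils.mu_law_encode is assumed to do (standard mu-law companding
# followed by quantization to integers in [0, mu_quantization - 1]); the exact
# signature and tensor type in the repository may differ.
def mu_law_encode_numpy(signal, mu_quantization=256):
    # signal is expected to be a float array in [-1, 1]
    mu = mu_quantization - 1.
    # Compress: sign(x) * ln(1 + mu*|x|) / ln(1 + mu)
    magnitude = np.log1p(mu * np.abs(signal)) / np.log1p(mu)
    signal = np.sign(signal) * magnitude
    # Map from [-1, 1] to integer bins [0, mu]
    return ((signal + 1) / 2 * mu + 0.5).astype('int64')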
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
"""
Tests that the NV-WaveNet class is producing audio
"""
import torch
from scipy.io.wavfile import write

import nv_wavenet
from wavenet import WaveNet
import utils
import json

if __name__ == '__main__':
    config = json.loads(open('config.json').read())
    wavenet_config = config["wavenet_config"]
    model = WaveNet(**wavenet_config).cuda()
    weights = model.export_weights()
    wavenet = nv_wavenet.NVWaveNet(**weights)

    num_samples = 10 * 1000
    batch_size = config['train_config']['batch_size']
    cond_input = torch.zeros([2 * wavenet_config['n_residual_channels'],
                              batch_size,
                              wavenet_config['n_layers'],
                              num_samples]).cuda()
    samples = wavenet.infer(cond_input, nv_wavenet.Impl.PERSISTENT)[0]

    audio = utils.mu_law_decode_numpy(samples.cpu().numpy(), 256)
    audio = utils.MAX_WAV_VALUE * audio
    wavdata = audio.astype('int16')
    write('audio.wav', 16000, wavdata)
def load_wav_model(checkpoint_path):
    model = torch.load(checkpoint_path)['model']
    wavenet = nv_wavenet.NVWaveNet(**(model.export_weights()))
    return model, wavenet
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          iters_per_checkpoint, iters_per_eval, batch_size, seed,
          checkpoint_path, log_dir, ema_decay=0.9999):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END: ADDED FOR DISTRIBUTED======

    if train_data_config["no_chunks"]:
        criterion = MaskedCrossEntropyLoss()
    else:
        criterion = CrossEntropyLoss()
    model = WaveNet(**wavenet_config).cuda()
    ema = ExponentialMovingAverage(ema_decay)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=200000, gamma=0.5)

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, scheduler, iteration, ema = load_checkpoint(
            checkpoint_path, model, optimizer, scheduler, ema)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2SampOnehot(audio_config=audio_config, verbose=True,
                              **train_data_config)
    validset = Mel2SampOnehot(audio_config=audio_config, verbose=False,
                              **valid_data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    valid_sampler = DistributedSampler(validset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    print(train_data_config)
    if train_data_config["no_chunks"]:
        collate_fn = utils.collate_fn
    else:
        collate_fn = torch.utils.data.dataloader.default_collate
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              collate_fn=collate_fn,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=True,
                              drop_last=True)
    valid_loader = DataLoader(validset, num_workers=1, shuffle=False,
                              sampler=valid_sampler, batch_size=1,
                              pin_memory=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    writer = SummaryWriter(log_dir)
    print("Checkpoints writing to: {}".format(log_dir))

    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            if low_memory:
                torch.cuda.empty_cache()
            scheduler.step()
            model.zero_grad()

            if train_data_config["no_chunks"]:
                x, y, seq_lens = batch
                seq_lens = to_gpu(seq_lens)
            else:
                x, y = batch
            x = to_gpu(x).float()
            y = to_gpu(y)
            x = (x, y)  # auto-regressive takes outputs as inputs
            y_pred = model(x)
            if train_data_config["no_chunks"]:
                loss = criterion(y_pred, y, seq_lens)
            else:
                loss = criterion(y_pred, y)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus)[0]
            else:
                reduced_loss = loss.data[0]
            loss.backward()
            optimizer.step()

            # Update the exponential moving average of the weights
            for name, param in model.named_parameters():
                if name in ema.shadow:
                    ema.update(name, param.data)

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if rank == 0:
                writer.add_scalar('loss', reduced_loss, iteration)

            if (iteration % iters_per_checkpoint == 0 and iteration):
                if rank == 0:
                    checkpoint_path = "{}/wavenet_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, scheduler, learning_rate,
                                    iteration, checkpoint_path, ema,
                                    wavenet_config)

            if (iteration % iters_per_eval == 0 and iteration > 0 and
                    not config["no_validation"]):
                if low_memory:
                    torch.cuda.empty_cache()
                if rank == 0:
                    model_eval = nv_wavenet.NVWaveNet(
                        **(model.export_weights()))
                    for j, valid_batch in enumerate(valid_loader):
                        mel, audio = valid_batch
                        mel = to_gpu(mel).float()
                        cond_input = model.get_cond_input(mel)
                        predicted_audio = model_eval.infer(
                            cond_input, nv_wavenet.Impl.AUTO)
                        predicted_audio = utils.mu_law_decode_numpy(
                            predicted_audio[0, :].cpu().numpy(), 256)
                        writer.add_audio(
                            "valid/predicted_audio_{}".format(j),
                            predicted_audio, iteration, 22050)
                        audio = utils.mu_law_decode_numpy(
                            audio[0, :].cpu().numpy(), 256)
                        writer.add_audio("valid_true/audio_{}".format(j),
                                         audio, iteration, 22050)
                if low_memory:
                    torch.cuda.empty_cache()
            iteration += 1
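# The ExponentialMovingAverage class used for weight averaging in train() is
# not included in these excerpts. A minimal sketch consistent with the
# register()/update()/shadow usage above, i.e.
# shadow = decay * shadow + (1 - decay) * param; the details are assumptions,
# not the repository's exact class.
class ExponentialMovingAverage:
    def __init__(self, decay):
        self.decay = decay
        self.shadow = {}

    def register(self, name, val):
        # Keep an independent copy of the parameter as the running average.
        self.shadow[name] = val.clone()

    def update(self, name, val):
        assert name in self.shadow
        new_average = self.decay * self.shadow[name] + (1.0 - self.decay) * val
        self.shadow[name] = new_average.clone()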
wavenet_path = 'checkpoints/shelby_retrain/wavenet_135000'
tts_file = '/var/pylon/data/speech/pylon/tts/shelby/tts-train.txt'

utterances = load_utterances(tts_file)

tf.reset_default_graph()
sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())
graph = load_graph(sess, sushi_path)
sushibot_inputs = graph.get_tensor_by_name("data/inputs:0")
sushibot_lengths = graph.get_tensor_by_name("data/input_lengths:0")
prediction = graph.get_tensor_by_name("sushibot/prediction:0")

model = torch.load(wavenet_path)['model'].cuda(1)
wavenet = nv_wavenet.NVWaveNet(**model.export_weights())

for i, utterance in enumerate(tqdm(utterances)):
    input_vector = [[
        SUSHIBOT_CHARSET.index(c) if c in SUSHIBOT_CHARSET else 0
        for c in utterance
    ] + [SUSHIBOT_CHARSET.index('~')]]
    feed_dict = {
        sushibot_inputs: input_vector,
        sushibot_lengths: [len(input_vector[0])]
    }
    mels = sess.run(prediction, feed_dict=feed_dict)
    mels = mels.reshape(-1, 80)
    np.save(os.path.join(outdir, 'mels/sushi-mel-{:05d}.npy'.format(i)),
            mels)
def load_wavenet(self):
    self.wavenet = torch.load(self.wavenet_path)['model']
    self.nv_wavenet = nv_wavenet.NVWaveNet(
        **(self.wavenet.export_weights()))