def run_inference_on_tier(source, tier, text, timestep):
    # Returns a tuple (next_tier, inference):
    #   inference is the conditional inference on the current tier
    #   next_tier interleaves the inference with the input to form the source for the next tier
    args = parse_inference_args([
        '-c', 'config/blizzard_compressed_experiments.yaml',
        '-p', 'config/inference.yaml',
        '-t', str(timestep),
        '-n', 'hw_blizzard_compressed',
        '-i', text
    ])
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    inference_hp = HParam(args.infer_config)

    assert timestep % t_div[hp.model.tier] == 0, \
        "timestep should be divisible by %d, got %d" % (t_div[hp.model.tier], timestep)

    model = MelNet(hp, args, inference_hp).cuda()
    model.load_tiers()
    model.eval()

    # Tiers above the first condition on the number of frames generated so far.
    audio_lengths = torch.LongTensor([0]).cuda()
    if tier > 1:
        for t in tqdm(range(model.args.timestep // model.t_div)):
            audio_lengths += 1

    # Run the current tier on the source, then interleave the sampled output with
    # the input to produce the next tier's source.
    x = torch.unsqueeze(torch.from_numpy(source), 0).cuda()
    mu, std, pi = model.tiers[tier](x, audio_lengths)
    temp = sample_gmm(mu, std, pi)
    next_tier = model.tierutil.interleave(x, temp, tier + 1)
    return next_tier[0].cpu().detach().numpy(), temp[0].cpu().detach().numpy()
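# sample_gmm is used throughout but defined elsewhere in the repo. For reference, a
# minimal sketch of sampling from a Gaussian mixture with the same call shape; the
# real implementation may differ, and the trailing mixture dimension K and the name
# sample_gmm_sketch are assumptions:
def sample_gmm_sketch(mu, std, pi):
    # Pick one of the K components per element from the mixture weights...
    k = torch.distributions.Categorical(logits=pi).sample().unsqueeze(-1)
    mu_k = torch.gather(mu, -1, k).squeeze(-1)
    std_k = torch.gather(std, -1, k).squeeze(-1)
    # ...then draw from the selected Gaussian via the reparameterized form.
    return mu_k + std_k * torch.randn_like(std_k)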
def inference(text, timestep=64):
    args = parse_inference_args([
        '-c', 'config/blizzard_compressed_experiments.yaml',
        '-p', 'config/inference.yaml',
        '-t', str(timestep),
        '-n', 'hw_blizzard_compressed',
        '-i', text
    ])
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    inference_hp = HParam(args.infer_config)

    assert timestep % t_div[hp.model.tier] == 0, \
        "timestep should be divisible by %d, got %d" % (t_div[hp.model.tier], timestep)

    model = MelNet(hp, args, inference_hp).cuda()
    model.load_tiers()
    model.eval()

    with torch.no_grad():
        # generated = model.sample(args.input)
        breakdown, generated = sample_model_with_breakdown(model, args.input)

    melspec = generated[0].cpu().detach().numpy()
    return breakdown, melspec
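# A usage sketch for inference(): one short conditional sample plus its per-tier
# breakdown. The sentence is illustrative, and plt (matplotlib.pyplot) is assumed to
# be imported as in the CLI script below.
# breakdown, melspec = inference("The quick brown fox jumps over the lazy dog.", timestep=64)
# plt.imsave('inference_melspec.png', melspec, origin='lower')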
def run_inference(source, timestep, tier_to_breakdown):
    # First load in the model
    hp = HParam('./config/blizzard_alldata_v5.yaml')
    infer_hp = HParam('./config/inference.yaml')
    args = parse_inference_args([
        '-c', 'config/blizzard_alldata_v5.yaml',
        '-p', 'config/inference.yaml',
        '-t', str(timestep),
        '-n', 'test_tiers',
        '-i', SENTENCE
    ])
    model = MelNet(hp, args, infer_hp).cuda()
    model.load_tiers()
    model.eval()

    audio_lengths = torch.LongTensor([0]).cuda()
    for t in tqdm(range(model.args.timestep // model.t_div)):
        audio_lengths += 1

    ## Tier 2~N ##
    x = torch.unsqueeze(torch.from_numpy(source), 0).cuda()
    for tier in tqdm(range(model.hp.model.tier + 1 - TESTING_TIERS, model.hp.model.tier + 1)):
        tqdm.write('Tier %d' % tier)
        # Save the inference source alongside the ground-truth source/target for comparison
        actual_source = tier_to_breakdown[tier][0]
        actual_target = tier_to_breakdown[tier][1]
        actual_interleaved = tier_to_breakdown[tier + 1][0]
        save_image(x[0].cpu().detach().numpy(), 'tier_%d_inference_source' % tier)
        save_image(actual_source, 'tier_%d_actual_source' % tier)
        save_image(actual_target, 'tier_%d_actual_target' % tier)

        # Sample this tier, then interleave with the input to build the next tier's source.
        mu, std, pi = model.tiers[tier](x, audio_lengths)
        temp = sample_gmm(mu, std, pi)
        save_image(temp[0].cpu().detach().numpy(), 'tier_%d_inference_target' % tier)

        x = model.tierutil.interleave(x, temp, tier + 1)
        save_image(x[0].cpu().detach().numpy(), 'tier_%d_inference_interleaved' % tier)
        save_image(actual_interleaved, 'tier_%d_actual_interleaved' % tier)

    reconstructed_mel_tensor = x.cpu().detach().numpy()
    return reconstructed_mel_tensor[0]
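# save_image is a small helper defined elsewhere in the repo; a minimal sketch with
# the same call signature, for reference only (the _sketch suffix, output directory,
# and colormap handling are assumptions):
def save_image_sketch(mel, name):
    # Dump a 2-D mel array as a PNG under temp/ for side-by-side comparison.
    os.makedirs('temp', exist_ok=True)
    plt.imsave(os.path.join('temp', name + '.png'), mel, origin='lower')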
help="yaml file for inference configuration") parser.add_argument('-t', '--timestep', type=int, default=240, help="timestep of mel-spectrogram to generate") parser.add_argument('-n', '--name', type=str, default="result", required=False, help="Name for sample") parser.add_argument('-i', '--input', type=str, default=None, required=False, help="Input for conditional generation, leave empty for unconditional") args = parser.parse_args() hp = HParam(args.config) infer_hp = HParam(args.infer_config) assert args.timestep % t_div[hp.model.tier] == 0, \ "timestep should be divisible by %d, got %d" % (t_div[hp.model.tier], args.timestep) model = MelNet(hp, args, infer_hp).cuda() model.load_tiers() model.eval() with torch.no_grad(): generated = model.sample(args.input) os.makedirs('temp', exist_ok=True) torch.save(generated, os.path.join('temp', args.name + '.pt')) spectrogram = plot_spectrogram_to_numpy(generated[0].cpu().detach().numpy()) plt.imsave(os.path.join('temp', args.name + '.png'), spectrogram.transpose((1, 2, 0))) waveform, wavespec = Reconstruct(hp).inverse(generated[0]) wavespec = plot_spectrogram_to_numpy(wavespec.cpu().detach().numpy()) plt.imsave(os.path.join('temp', 'Final ' + args.name + '.png'), wavespec.transpose((1, 2, 0)))
def run_inference(sentence, timestep, tier_to_breakdown):
    # First load in the model
    hp = HParam('./config/blizzard_alldata_v5.yaml')
    infer_hp = HParam('./config/inference.yaml')
    args = parse_inference_args([
        '-c', 'config/blizzard_alldata_v5.yaml',
        '-p', 'config/inference.yaml',
        '-t', str(timestep),
        '-n', 'test_tiers',
        '-i', SENTENCE
    ])
    model = MelNet(hp, args, infer_hp).cuda()
    model.load_tiers()
    model.eval()

    x = None
    seq = torch.from_numpy(process_blizzard(sentence)).long().unsqueeze(0).cuda()
    input_lengths = torch.LongTensor([seq[0].shape[0]]).cuda()
    audio_lengths = torch.LongTensor([0]).cuda()
    actual_target = tier_to_breakdown[1][1]
    # save_image(seq.detach().numpy(), 'tier_1_seq_source')

    ## Tier 1 ##
    tqdm.write('Tier 1')
    for t in tqdm(range(model.args.timestep // model.t_div)):
        audio_lengths += 1
        # Grow the canvas by one empty frame...
        if x is None:
            x = torch.zeros((1, model.n_mels // model.f_div, 1)).cuda()
        else:
            x = torch.cat(
                [x, torch.zeros((1, model.n_mels // model.f_div, 1)).cuda()],
                dim=-1)
        # ...then fill the new frame in autoregressively, one mel bin at a time.
        for m in tqdm(range(model.n_mels // model.f_div)):
            torch.cuda.synchronize()
            if model.infer_hp.conditional:
                mu, std, pi, _ = model.tiers[1](x, seq, input_lengths, audio_lengths)
            else:
                mu, std, pi = model.tiers[1](x, audio_lengths)
            temp = sample_gmm(mu, std, pi)
            x[:, m, t] = temp[:, m, t]

    save_image(x[0].cpu().detach().numpy(), 'tier_1_inference_target')
    save_image(actual_target, 'tier_1_actual_target')

    # Tiers 2~N are upsampled from this tier-1 output; see the tier-loop variant of
    # run_inference above.
    reconstructed_mel_tensor = x.cpu().detach().numpy()
    return reconstructed_mel_tensor[0]
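# A usage sketch for the tier-1 loop above. SENTENCE and tier_to_breakdown are
# assumed to come from an earlier breakdown pass such as inference() above; a small
# timestep keeps the O(frames x mel bins) sampling loop short:
# tier_to_breakdown, _melspec = inference(SENTENCE, timestep=64)
# mel_tier1 = run_inference(SENTENCE, 64, tier_to_breakdown)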
parser.add_argument('-i', '--input', type=str, default=None, required=False,
                    help="Input for conditional generation, leave empty for unconditional")
args = parser.parse_args()

hp = HParam(args.config)
infer_hp = HParam(args.infer_config)

assert args.timestep % t_div[hp.model.tier] == 0, \
    "timestep should be divisible by %d, got %d" % (t_div[hp.model.tier], args.timestep)

model = MelNet(hp, args, infer_hp).cuda()
model.load_tiers()
model.eval()

# Evaluate against the test split, tier 1 only, one sample per batch.
args.tts = False
args.tier = 1
args.batch_size = 1
testloader = create_dataloader(hp, args, train=False)

gaussian_filter = get_gaussian_fileter(3, 1, 1)
gaussian_filter = gaussian_filter.cuda()
dependence_length = 62
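# get_gaussian_fileter (sic) is defined elsewhere in the repo; below is a sketch of a
# builder with the signature implied by the call above, (kernel_size, sigma, channels):
# a depthwise Conv2d whose fixed weights form a normalized 2-D Gaussian, usable for
# smoothing spectrograms. The name and layout are assumptions, not the repo's code.
def get_gaussian_filter_sketch(kernel_size, sigma, channels):
    # Build a 1-D Gaussian and take its outer product to get a separable 2-D kernel.
    coords = torch.arange(kernel_size, dtype=torch.float32) - (kernel_size - 1) / 2
    g = torch.exp(-coords ** 2 / (2 * sigma ** 2))
    g = g / g.sum()
    kernel_2d = torch.outer(g, g)
    # Depthwise convolution: one fixed Gaussian kernel per channel, no gradient.
    conv = torch.nn.Conv2d(channels, channels, kernel_size, groups=channels,
                           padding=kernel_size // 2, bias=False)
    conv.weight.data.copy_(kernel_2d.expand(channels, 1, kernel_size, kernel_size))
    conv.weight.requires_grad = False
    return conv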