Example #1
def run_inference_on_tier(source, tier, text, timestep):
  # Returns a tuple (next_tier, inference):
  #   inference is the conditional inference on the current tier
  #   next_tier interleaves that inference with the input to form the next tier's input
  args = parse_inference_args([
      '-c', 'config/blizzard_compressed_experiments.yaml',
      '-p', 'config/inference.yaml',
      '-t', str(timestep),
      '-n', 'hw_blizzard_compressed',
      '-i', text
  ])
  hp = HParam('./config/blizzard_compressed_experiments.yaml')
  inference_hp = HParam(args.infer_config)

  assert timestep % t_div[hp.model.tier] == 0, \
      "timestep should be divisible by %d, got %d" % (t_div[hp.model.tier], timestep)

  model = MelNet(hp, args, inference_hp).cuda()
  model.load_tiers()
  model.eval()
  audio_lengths = torch.LongTensor([0]).cuda()
  if tier > 1:
    # Upper tiers condition on all generated frames at once
    audio_lengths += model.args.timestep // model.t_div

  # source = breakdown[tier][0]
  x = torch.unsqueeze(torch.from_numpy(source), 0).cuda()  # the model lives on the GPU
  mu, std, pi = model.tiers[tier](x, audio_lengths)
  temp = sample_gmm(mu, std, pi)
  next_tier = model.tierutil.interleave(x, temp, tier + 1)
  return next_tier[0].cpu().detach().numpy(), temp[0].cpu().detach().numpy()
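
# Usage sketch (an assumption, not from the source): chain run_inference_on_tier
# so each tier's interleaved output conditions the next tier. `initial_source`
# and `num_tiers` are hypothetical placeholders.
def run_all_tiers(initial_source, text, timestep=64, num_tiers=6):
  source = initial_source  # e.g. a tier-1 sample shaped as tier-2 input
  for tier in range(2, num_tiers + 1):
    # run_inference_on_tier returns (next_tier_input, current_tier_inference)
    source, inference = run_inference_on_tier(source, tier, text, timestep)
  return source  # the fully interleaved mel-spectrogram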
Example #2
def inference(text, timestep=64):
  args = parse_inference_args([
      '-c', 'config/blizzard_compressed_experiments.yaml',
      '-p', 'config/inference.yaml',
      '-t', str(timestep),
      '-n', 'hw_blizzard_compressed',
      '-i', text
  ])
  hp = HParam('./config/blizzard_compressed_experiments.yaml')
  inference_hp = HParam(args.infer_config)

  assert timestep % t_div[hp.model.tier] == 0, \
      "timestep should be divisible by %d, got %d" % (t_div[hp.model.tier], timestep)

  model = MelNet(hp, args, inference_hp).cuda()
  model.load_tiers()
  model.eval()

  with torch.no_grad():
      # generated = model.sample(args.input)
      breakdown, generated = sample_model_with_breakdown(model, args.input)

  melspec = generated[0].cpu().detach().numpy()
  return breakdown, melspec
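
# Usage sketch (an assumption): run end-to-end inference and inspect each
# tier's (source, target) pair. The breakdown structure is inferred from how
# tier_to_breakdown is indexed in Example #3; the sentence is illustrative.
breakdown, melspec = inference('It was a dark and stormy night.', timestep=64)
print('mel-spectrogram shape:', melspec.shape)
for tier, (tier_source, tier_target) in sorted(breakdown.items()):
  print('tier %d: source %s, target %s'
        % (tier, tier_source.shape, tier_target.shape))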
Example #3
def run_inference(source, timestep, tier_to_breakdown):
    # First load in the model
    hp = HParam('./config/blizzard_alldata_v5.yaml')
    infer_hp = HParam('./config/inference.yaml')
    args = parse_inference_args([
        '-c', 'config/blizzard_alldata_v5.yaml',
        '-p', 'config/inference.yaml',
        '-t', str(timestep),
        '-n', 'test_tiers',
        '-i', SENTENCE
    ])
    model = MelNet(hp, args, infer_hp).cuda()
    model.load_tiers()
    model.eval()
    # All frames are available to condition on for tiers 2..N
    audio_lengths = torch.LongTensor([model.args.timestep // model.t_div]).cuda()
    ## Tier 2~N ##
    x = torch.unsqueeze(torch.from_numpy(source), 0).cuda()  # the model lives on the GPU
    for tier in tqdm(
            range(model.hp.model.tier + 1 - TESTING_TIERS,
                  model.hp.model.tier + 1)):
        tqdm.write('Tier %d' % tier)
        # Save original source and inference source
        actual_source = tier_to_breakdown[tier][0]
        actual_target = tier_to_breakdown[tier][1]
        actual_interleaved = tier_to_breakdown[tier + 1][0]
        save_image(x.cpu().detach().numpy()[0], 'tier_%d_inference_source' % tier)
        save_image(actual_source, 'tier_%d_actual_source' % tier)
        save_image(actual_target, 'tier_%d_actual_target' % tier)
        mu, std, pi = model.tiers[tier](x, audio_lengths)
        temp = sample_gmm(mu, std, pi)
        save_image(temp[0].cpu().detach().numpy(),
                   'tier_%d_inference_target' % tier)
        x = model.tierutil.interleave(x, temp, tier + 1)
        save_image(x.cpu().detach().numpy()[0],
                   'tier_%d_inference_interleaved' % tier)
        save_image(actual_interleaved, 'tier_%d_actual_interleaved' % tier)
    reconstructed_mel_tensor = x.cpu().detach().numpy()
    return reconstructed_mel_tensor[0]
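
# sample_gmm() is used above but not shown in these snippets. A minimal sketch
# of GMM sampling (an assumption, not necessarily the repo's implementation),
# where the last dimension of mu/std/pi indexes the K mixture components and
# pi holds the mixture weights:
import torch

def sample_gmm(mu, std, pi):
    # Draw a component index per element from the categorical mixture weights
    k = torch.distributions.Categorical(pi).sample().unsqueeze(-1)
    mu_k = torch.gather(mu, -1, k).squeeze(-1)    # mean of the chosen component
    std_k = torch.gather(std, -1, k).squeeze(-1)  # std of the chosen component
    return torch.normal(mu_k, std_k)              # one Gaussian draw per element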
Example #4
                        help="yaml file for inference configuration")
    parser.add_argument('-t', '--timestep', type=int, default=240,
                        help="timestep of mel-spectrogram to generate")
    parser.add_argument('-n', '--name', type=str, default="result", required=False,
                        help="Name for sample")
    parser.add_argument('-i', '--input', type=str, default=None, required=False,
                        help="Input for conditional generation, leave empty for unconditional")
    args = parser.parse_args()

    hp = HParam(args.config)
    infer_hp = HParam(args.infer_config)

    assert args.timestep % t_div[hp.model.tier] == 0, \
        "timestep should be divisible by %d, got %d" % (t_div[hp.model.tier], args.timestep)

    model = MelNet(hp, args, infer_hp).cuda()
    model.load_tiers()
    model.eval()

    with torch.no_grad():
        generated = model.sample(args.input)

    os.makedirs('temp', exist_ok=True)
    torch.save(generated, os.path.join('temp', args.name + '.pt'))
    spectrogram = plot_spectrogram_to_numpy(generated[0].cpu().detach().numpy())
    plt.imsave(os.path.join('temp', args.name + '.png'), spectrogram.transpose((1, 2, 0)))

    waveform, wavespec = Reconstruct(hp).inverse(generated[0])
    wavespec = plot_spectrogram_to_numpy(wavespec.cpu().detach().numpy())
    plt.imsave(os.path.join('temp', 'Final ' + args.name + '.png'), wavespec.transpose((1, 2, 0)))
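
    # The script above saves spectrogram plots but never writes audio; a small
    # follow-up sketch (an assumption) using the soundfile package, assuming
    # `waveform` is a torch tensor and that a sample-rate field like the
    # hypothetical hp.audio.sr exists in the config:
    import soundfile as sf
    sf.write(os.path.join('temp', args.name + '.wav'),
             waveform.cpu().detach().numpy(), samplerate=hp.audio.sr)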
Example #5
def run_inference(sentence, timestep, tier_to_breakdown):
    # First load in the model
    hp = HParam('./config/blizzard_alldata_v5.yaml')
    infer_hp = HParam('./config/inference.yaml')
    args = parse_inference_args([
        '-c', 'config/blizzard_alldata_v5.yaml',
        '-p', 'config/inference.yaml',
        '-t', str(timestep),
        '-n', 'test_tiers',
        '-i', sentence  # pass the function argument, not a global SENTENCE
    ])
    model = MelNet(hp, args, infer_hp).cuda()
    model.load_tiers()
    model.eval()

    x = None
    seq = torch.from_numpy(process_blizzard(sentence)).long().unsqueeze(0)
    input_lengths = torch.LongTensor([seq[0].shape[0]]).cuda()
    audio_lengths = torch.LongTensor([0]).cuda()

    actual_target = tier_to_breakdown[1][1]
    # save_image(seq.detach().numpy(), 'tier_1_seq_source')

    ## Tier 1 ##
    tqdm.write('Tier 1')
    for t in tqdm(range(model.args.timestep // model.t_div)):
        audio_lengths += 1
        if x is None:
            x = torch.zeros((1, model.n_mels // model.f_div, 1)).cuda()
        else:
            x = torch.cat(
                [x, torch.zeros((1, model.n_mels // model.f_div, 1)).cuda()],
                dim=-1)
        for m in tqdm(range(model.n_mels // model.f_div)):
            torch.cuda.synchronize()
            if model.infer_hp.conditional:
                mu, std, pi, _ = model.tiers[1](x, seq, input_lengths,
                                                audio_lengths)
            else:
                mu, std, pi = model.tiers[1](x, audio_lengths)
            temp = sample_gmm(mu, std, pi)
            x[:, m, t] = temp[:, m, t]

    save_image(x[0].cpu().detach().numpy(), 'tier_1_inference_target')
    save_image(actual_target, 'tier_1_actual_target')

    # (Tiers 2..N would follow here, exactly as in Example #3.)
    reconstructed_mel_tensor = x.cpu().detach().numpy()
    return reconstructed_mel_tensor[0]
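
# save_image() is called throughout Examples #3 and #5 but never defined. A
# minimal stand-in (an assumption) that renders a 2-D array to a PNG with
# matplotlib; `out_dir` is a hypothetical default:
import os
import matplotlib.pyplot as plt

def save_image(array, name, out_dir='temp'):
    os.makedirs(out_dir, exist_ok=True)
    # origin='lower' keeps low mel bins at the bottom of the image
    plt.imsave(os.path.join(out_dir, name + '.png'), array, origin='lower')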
Example #6
    # (Snippet truncated above; the parser setup is reconstructed as in
    # Example #4, from the args fields referenced below.)
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str, required=True,
                        help="yaml file for configuration")
    parser.add_argument('-p', '--infer_config', type=str, required=True,
                        help="yaml file for inference configuration")
    parser.add_argument('-t', '--timestep', type=int, default=240,
                        help="timestep of mel-spectrogram to generate")
    parser.add_argument(
        '-i',
        '--input',
        type=str,
        default=None,
        required=False,
        help="Input for conditional generation, leave empty for unconditional")
    args = parser.parse_args()

    hp = HParam(args.config)
    infer_hp = HParam(args.infer_config)

    assert args.timestep % t_div[hp.model.tier] == 0, \
        "timestep should be divisible by %d, got %d" % (t_div[hp.model.tier], args.timestep)

    model = MelNet(hp, args, infer_hp).cuda()
    model.load_tiers()
    model.eval()

    args.tts = False
    args.tier = 1
    args.batch_size = 1

    testloader = create_dataloader(hp, args, train=False)

    # Build a 3x3 Gaussian smoothing filter (sigma=1, single channel) on the GPU
    gaussian_filter = get_gaussian_fileter(3, 1, 1).cuda()

    dependence_length = 62
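
# get_gaussian_fileter() (sic) is called above but not defined in the snippet.
# A minimal stand-in (an assumption) that builds a depthwise 2-D Gaussian
# smoothing convolution for the given kernel size, sigma, and channel count:
import torch
import torch.nn as nn

def get_gaussian_fileter(kernel_size, sigma, channels):
    coords = torch.arange(kernel_size, dtype=torch.float32) - (kernel_size - 1) / 2
    g = torch.exp(-coords ** 2 / (2 * sigma ** 2))
    g = g / g.sum()                      # normalized 1-D Gaussian
    kernel_2d = torch.outer(g, g)        # separable product -> 2-D kernel
    conv = nn.Conv2d(channels, channels, kernel_size,
                     padding=kernel_size // 2, groups=channels, bias=False)
    conv.weight.data = kernel_2d.expand(channels, 1, -1, -1).clone()
    conv.weight.requires_grad = False
    return conv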