Example no. 1
def save_states(global_step, writer, y_hat, y, input_lengths, checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))
    idx = np.random.randint(0, len(y_hat))
    length = input_lengths[idx].data.cpu().numpy()[0]

    # (B, C, T)
    y_hat = y_hat.squeeze(-1)
    # (B, T)
    y_hat = F.softmax(y_hat, dim=1).max(1)[1]

    # (T,)
    y_hat = y_hat[idx].data.cpu().long().numpy()
    y = y[idx].view(-1).data.cpu().long().numpy()

    y_hat = P.inv_mulaw_quantize(y_hat)
    y = P.inv_mulaw_quantize(y)

    # Mask by length
    y_hat[length:] = 0
    y[length:] = 0

    # Save audio
    audio_dir = join(checkpoint_dir, "audio")
    os.makedirs(audio_dir, exist_ok=True)
    path = join(audio_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = join(audio_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y, sr=hparams.sample_rate)
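Note: librosa.output.write_wav, used above and in several of the examples below, was removed in librosa 0.8 (Example no. 3 switches to the soundfile package for this reason). A minimal drop-in sketch, assuming the waveform is a float array in [-1, 1]; the helper name write_wav is illustrative only:

import numpy as np
import soundfile as sf

def write_wav(path, wav, sr):
    # Drop-in for librosa.output.write_wav (removed in librosa 0.8):
    # write a float waveform in [-1, 1] at the given sample rate.
    sf.write(path, np.asarray(wav, dtype=np.float32), samplerate=sr)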
Example no. 2
def save_states(global_step,
                writer,
                y_hat,
                student_hat,
                y,
                input_lengths,
                checkpoint_dir=None):

    print("Save intermediate states at step {}".format(global_step))
    idx = np.random.randint(0, len(y_hat))
    length = input_lengths[idx].data.cpu().item()

    # (B, C, T)
    if y_hat.dim() == 4:
        y_hat = y_hat.squeeze(-1)

    if is_mulaw_quantize(hparams.input_type):
        # (B, T)
        y_hat = F.softmax(y_hat, dim=1).max(1)[1]

        # (T,)
        y_hat = y_hat[idx].data.cpu().long().numpy()
        y = y[idx].view(-1).data.cpu().long().numpy()

        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
        y = P.inv_mulaw_quantize(y, hparams.quantize_channels)
    else:
        # (B, T)
        if hparams.use_gaussian:
            y_hat = y_hat.transpose(1, 2)
            y_hat = sample_from_gaussian(y_hat,
                                         log_scale_min=hparams.log_scale_min)
        else:
            y_hat = sample_from_discretized_mix_logistic(
                y_hat, log_scale_min=hparams.log_scale_min)

        # (T,)
        y_hat = y_hat[idx].view(-1).data.cpu().numpy()
        y = y[idx].view(-1).data.cpu().numpy()
        student_hat = student_hat[idx].view(-1).data.cpu().numpy()

        if is_mulaw(hparams.input_type):
            y_hat = P.inv_mulaw(y_hat, hparams.quantize_channels)
            y = P.inv_mulaw(y, hparams.quantize_channels)
            student_hat = P.inv_mulaw(student_hat, hparams.quantize_channels)

    # Mask by length
    y_hat[length:] = 0
    y[length:] = 0
    student_hat[length:] = 0

    # Save audio
    audio_dir = join(checkpoint_dir, "audio")
    os.makedirs(audio_dir, exist_ok=True)
    path = join(audio_dir, "step{:09d}_teacher.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = join(audio_dir, "step{:09d}_student.wav".format(global_step))
    librosa.output.write_wav(path, student_hat, sr=hparams.sample_rate)
    path = join(audio_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y, sr=hparams.sample_rate)
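The sample_from_gaussian helper called above is not shown in these examples. A rough sketch of what such a helper typically does, assuming y_hat has shape (B, T, 2) with mean and log-scale channels (the transpose before the call above suggests this layout); the actual helper may clamp or parameterize differently:

import torch

def sample_from_gaussian_sketch(y_hat, log_scale_min=-7.0):
    # Assumed layout: y_hat[..., 0] = mean, y_hat[..., 1] = log standard deviation.
    mean = y_hat[..., 0]
    log_scale = y_hat[..., 1].clamp(min=log_scale_min)
    return mean + torch.exp(log_scale) * torch.randn_like(mean)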
Example no. 3
def save_states(global_step,
                writer,
                y_hat,
                y,
                input_lengths,
                checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))
    idx = np.random.randint(0, len(y_hat))
    length = input_lengths[idx].data.cpu().item()

    # (B, C, T)
    if y_hat.dim() == 4:
        y_hat = y_hat.squeeze(-1)

    if is_mulaw_quantize(wavenet_hparams.input_type):
        # (B, T)
        y_hat = F.softmax(y_hat, dim=1).max(1)[1]

        # (T,)
        y_hat = y_hat[idx].data.cpu().long().numpy()
        y = y[idx].view(-1).data.cpu().long().numpy()

        y_hat = P.inv_mulaw_quantize(y_hat,
                                     wavenet_hparams.quantize_channels - 1)
        y = P.inv_mulaw_quantize(y, wavenet_hparams.quantize_channels - 1)
    else:
        # (B, T)
        if wavenet_hparams.output_distribution == "Logistic":
            y_hat = sample_from_discretized_mix_logistic(
                y_hat, log_scale_min=wavenet_hparams.log_scale_min)
        elif wavenet_hparams.output_distribution == "Normal":
            y_hat = sample_from_mix_gaussian(
                y_hat, log_scale_min=wavenet_hparams.log_scale_min)
        else:
            assert False

        # (T,)
        y_hat = y_hat[idx].view(-1).data.cpu().numpy()
        y = y[idx].view(-1).data.cpu().numpy()

        if is_mulaw(wavenet_hparams.input_type):
            y_hat = P.inv_mulaw(y_hat, wavenet_hparams.quantize_channels)
            y = P.inv_mulaw(y, wavenet_hparams.quantize_channels)

    # Mask by length
    y_hat[length:] = 0
    y[length:] = 0

    # Save audio
    audio_dir = join(checkpoint_dir, "intermediate", "audio")
    os.makedirs(audio_dir, exist_ok=True)
    path = join(audio_dir, "step{:09d}_predicted.wav".format(global_step))
    # librosa.output.write_wav(path, y_hat, sr=wavenet_hparams.sample_rate)
    sf.write(path, y_hat, samplerate=wavenet_hparams.sample_rate)
    path = join(audio_dir, "step{:09d}_target.wav".format(global_step))
    # librosa.output.write_wav(path, y, sr=wavenet_hparams.sample_rate)
    sf.write(path, y, samplerate=wavenet_hparams.sample_rate)
Example no. 4
def save_states(global_step, writer, y_hat, y, y_student, scale_tot, input_lengths, checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))
    idx = np.random.randint(0, len(y_hat))
    length = input_lengths[idx].data.cpu().numpy()

    # (B, C, T)
    if y_hat.dim() == 4:
        y_hat = y_hat.squeeze(-1)

    if is_mulaw_quantize(hparams.input_type):
        # (B, T)
        y_hat = F.softmax(y_hat, dim=1).max(1)[1]

        # (T,)
        y_hat = y_hat[idx].data.cpu().long().numpy()
        y = y[idx].view(-1).data.cpu().long().numpy()

        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
        y = P.inv_mulaw_quantize(y, hparams.quantize_channels)
    else:
        # (B, T)
        scale = y_hat[:,1:,:]
        teacher_log_scale = scale.data.cpu().numpy()
        student_log_scale = torch.log(scale_tot).data.cpu().numpy()
        writer.add_histogram('log_teacher_scale', teacher_log_scale, global_step)
        writer.add_histogram('log_student_scale', student_log_scale, global_step)
        y_hat = sample_from_discretized_gaussian(
            y_hat, log_scale_min=hparams.log_scale_min)

        # (T,)
        y_hat = y_hat[idx].view(-1).data.cpu().numpy()
        y = y[idx].view(-1).data.cpu().numpy()

        if is_mulaw(hparams.input_type):
            y_hat = P.inv_mulaw(y_hat, hparams.quantize_channels)
            y = P.inv_mulaw(y, hparams.quantize_channels)

    # Mask by length
    y_hat[length:] = 0
    y[length:] = 0

    y_student = y_student[idx].view(-1).data.cpu().numpy()
    y_student[length:] = 0

    # Save audio
    audio_dir = join(checkpoint_dir, "audio")
    os.makedirs(audio_dir, exist_ok=True)
    path = join(audio_dir, "step{:09d}_teacher_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = join(audio_dir, "step{:09d}_student_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_student, sr=hparams.sample_rate)
    path = join(audio_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y, sr=hparams.sample_rate)
    path = join(audio_dir, "step{:09d}.jpg".format(global_step))
    save_waveplot(path, y_teacher=y_hat, y_student=y_student, y_target=y, writer=writer, global_step=global_step)
Example no. 5
def eval_model(global_step, writer, model, y, c, g, input_lengths, eval_dir):
    model.eval()
    idx = np.random.randint(0, len(y))
    length = input_lengths[idx].data.cpu().numpy()[0]

    # (T,)
    y_target = y[idx].view(-1).data.cpu().long().numpy()[:length]

    if c is not None:
        c = c[idx, :, :length].unsqueeze(0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))
    if g is not None:
        # TODO: test
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))

    # Dummy silence
    initial_value = P.mulaw_quantize(0)
    print("Intial value:", initial_value)

    # (C,)
    initial_input = np_utils.to_categorical(initial_value,
                                            num_classes=256).astype(np.float32)
    initial_input = Variable(torch.from_numpy(initial_input),
                             volatile=True).view(1, 1, 256)
    initial_input = initial_input.cuda() if use_cuda else initial_input
    y_hat = model.incremental_forward(initial_input,
                                      c=c,
                                      g=g,
                                      T=length,
                                      tqdm=tqdm,
                                      softmax=True,
                                      quantize=True)
    y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
    y_hat = P.inv_mulaw_quantize(y_hat)

    y_target = P.inv_mulaw_quantize(y_target)

    # Save audio
    os.makedirs(eval_dir, exist_ok=True)
    path = join(eval_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = join(eval_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y_target, sr=hparams.sample_rate)

    # save figure
    path = join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
    save_waveplot(path, y_hat, y_target)
Example no. 6
def test_mulaw():
    # Check corner cases
    assert P.mulaw_quantize(-1.0, 2) == 0
    assert P.mulaw_quantize(-0.5, 2) == 0
    assert P.mulaw_quantize(-0.001, 2) == 0
    assert P.mulaw_quantize(0.0, 2) == 1
    assert P.mulaw_quantize(0.0001, 2) == 1
    assert P.mulaw_quantize(0.5, 2) == 1
    assert P.mulaw_quantize(0.99999, 2) == 1
    assert P.mulaw_quantize(1.0, 2) == 2

    np.random.seed(1234)
    # forward/backward correctness
    for mu in [128, 256, 512]:
        for x in np.random.rand(100):
            y = P.mulaw(x, mu)
            assert y >= 0 and y <= 1
            x_hat = P.inv_mulaw(y, mu)
            assert np.allclose(x, x_hat)

    # forward/backward correctness for quantize
    for mu in [128, 256, 512]:
        for x, y in [(-1.0, 0), (0.0, mu // 2), (0.99999, mu - 1)]:
            y_hat = P.mulaw_quantize(x, mu)
            err = np.abs(x - P.inv_mulaw_quantize(y_hat, mu))
            print(y, y_hat, err)
            assert np.allclose(y, y_hat)
            # have small quantize error
            assert err <= 0.1

    # ndarray input
    for mu in [128, 256, 512]:
        x = np.random.rand(10)
        y = P.mulaw(x, mu)
        x_hat = P.inv_mulaw(y, mu)
        assert np.allclose(x, x_hat)
        P.inv_mulaw_quantize(P.mulaw_quantize(x))

    # torch array input
    from warnings import warn
    import torch
    torch.manual_seed(1234)
    for mu in [128, 256, 512]:
        x = torch.rand(10)
        y = P.mulaw(x, mu)
        x_hat = P.inv_mulaw(y, mu)
        assert np.allclose(x, x_hat)
        P.inv_mulaw_quantize(P.mulaw_quantize(x))
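The P.mulaw* functions exercised by this test are not defined in these snippets. A self-contained NumPy sketch of the standard mu-law companding and quantization they implement (the library's real implementation may handle edge cases differently):

import numpy as np

def mulaw(x, mu=256):
    # Compand x in [-1, 1] non-linearly into [-1, 1].
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def inv_mulaw(y, mu=256):
    # Invert the companding.
    return np.sign(y) * (1.0 / mu) * ((1.0 + mu) ** np.abs(y) - 1.0)

def mulaw_quantize(x, mu=256):
    # Compand, then map [-1, 1] onto integer bins 0..mu.
    return ((mulaw(x, mu) + 1) / 2 * mu).astype(np.int64)

def inv_mulaw_quantize(y, mu=256):
    # Map integer bins back to a waveform in [-1, 1].
    return inv_mulaw(2 * np.asarray(y, dtype=np.float64) / mu - 1.0, mu)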
Example no. 7
def save_log(sess, step, model, plot_dir, audio_dir, hp):
    predicts, targets = sess.run([model.log_outputs, model.targets])

    y_hat = P.inv_mulaw_quantize(predicts[0], hp.quantize_channels)
    y = P.inv_mulaw_quantize(targets[0], hp.quantize_channels)

    pred_wav_path = os.path.join(audio_dir, 'step-{}-pred.wav'.format(step))
    target_wav_path = os.path.join(audio_dir, 'step-{}-real.wav'.format(step))
    plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(step))

    # Save audio
    librosa.output.write_wav(pred_wav_path, y_hat, sr=hp.sample_rate)
    librosa.output.write_wav(target_wav_path, y, sr=hp.sample_rate)

    # Save figure
    waveplot(plot_path, y_hat, y, hparams)
Example no. 8
    def synthesize(self, sess, n_samples, lc, gc):
        sess.run(tf.variables_initializer(self.var_q))

        if self.net.scalar_input:
            seeds = [0]
        else:
            seeds = [128]

        seeds = [seeds]
        seeds = np.repeat(seeds, self.batch_size, axis=0)
        generated = [seeds]

        if type(n_samples) == list:
            n_sample = max(n_samples)
        else:
            n_sample = n_samples

        for j in tqdm(range(n_sample)):
            sample = generated[-1]
            current_lc = lc[:, j, :]

            # Generation phase
            feed_dict = {
                self.sample_placeholder: sample,
                self.lc_placeholder: current_lc,
                self.gen_num: j
            }

            if self.gc_placeholder is not None:
                feed_dict.update({self.gc_placeholder: gc})

            prob, _layers = sess.run([self.next_sample_prob, self.layers_out],
                                     feed_dict=feed_dict)

            # Update phase
            feed_dict = {
                self.initial: _layers[0],
                self.others: np.array(_layers[1:]),
                self.gen_num: j
            }

            sess.run(self.update_q_ops, feed_dict=feed_dict)

            if self.net.scalar_input:
                generated_sample = prob
            else:
                # TODO: random choice
                generated_sample = np.argmax(prob, axis=-1)

            generated.append(generated_sample)

        result = np.hstack(generated)
        if not self.net.scalar_input:
            result = P.inv_mulaw_quantize(result.astype(np.int16),
                                          self.net.quantization_channels)

        if type(n_samples) == list:
            result = [x[:n_samples[i]] for i, x in enumerate(result)]

        return result
Example no. 9
    def generate(self, sess, n_samples, lc, gc):
        sess.run(tf.variables_initializer(self.var_q))
        receptive_field = self.vocoder.net.receptive_field

        if self.vocoder.net.scalar_input:
            seeds = [0]
        else:
            seeds = [128]

        seeds = [seeds]
        seeds = np.repeat(seeds, self.batch_size, axis=0)
        #         generated = []
        generated = [seeds]

        #         for j in tqdm(range(receptive_field + n_samples)):
        #             if j < receptive_field:
        #                 sample = seeds
        #                 current_lc = np.zeros((self.batch_size, hparams.num_mels))
        #             else:
        #                 sample = generated[-1]
        #                 current_lc = lc[:, j - receptive_field, :]
        for j in tqdm(range(n_samples)):
            sample = generated[-1]
            current_lc = lc[:, j, :]

            # Generation phase
            feed_dict = {
                self.sample_placeholder: sample,
                self.lc_placeholder: current_lc,
                self.gen_num: j}

            if self.gc_placeholder is not None:
                feed_dict.update({self.gc_placeholder: gc})

            prob, _layers = sess.run([self.next_sample_prob, self.layers_out], feed_dict=feed_dict)

            # Update phase
            feed_dict = {
                self.initial: _layers[0],
                self.others: np.array(_layers[1:]),
                self.gen_num: j}

            sess.run(self.update_q_ops, feed_dict=feed_dict)

            if self.vocoder.net.scalar_input:
                generated_sample = prob
            else:
                # TODO: random choice
                generated_sample = np.argmax(prob, axis=-1)

            generated.append(generated_sample)

        # result = np.hstack(generated)[:, receptive_field:]
        result = np.hstack(generated)
        if not self.vocoder.net.scalar_input:
            result = P.inv_mulaw_quantize(result.astype(np.int16), self.vocoder.net.quantization_channels)

        return result
Example no. 10
def batch_wavegen(model, c=None, g=None, fast=True, tqdm=tqdm, length=None, writing_dir=None):
    from train import sanity_check
    sanity_check(model, c, g)
    # assert c is not None
    if c is not None:
        B = c.shape[0]
    else:
        B = 1 #c.shape[0]
    model.eval()
    if fast:
        model.make_generation_fast_()

    # Transform data to GPU
    g = None if g is None else g.to(device)
    c = None if c is None else c.to(device)

    if hparams.upsample_conditional_features and length is None:
        length = (c.shape[-1] - hparams.cin_pad * 2) * audio.get_hop_size()

    with torch.no_grad():
        y_hat = model.incremental_forward(
            c=c, g=g, T=length, tqdm=tqdm, softmax=True, quantize=True,
            log_scale_min=hparams.log_scale_min)


        y_hat_sample = y_hat.max(1)[1].view(B, -1).float()
        cross_entropy = model.binary_softmax_loss(y_hat_sample.unsqueeze(1), c)

    # Write the output
    with open(join(writing_dir, "info.json"), "w") as f:
        data = {"0.244" : float(cross_entropy.detach().cpu().numpy())}
        json.dump(data, f, indent=4)

    if is_mulaw_quantize(hparams.input_type):
        # needs to be float since mulaw_inv returns in range of [-1, 1]
        y_hat = y_hat.max(1)[1].view(B, -1).float().cpu().data.numpy()
        for i in range(B):
            y_hat[i] = P.inv_mulaw_quantize(y_hat[i], hparams.quantize_channels - 1)
    elif is_linear_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(B, -1).float().cpu().data.numpy()
        for i in range(B):
            y_hat[i] = inv_linear_quantize(y_hat[i], hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        y_hat = y_hat.view(B, -1).cpu().data.numpy()
        for i in range(B):
            y_hat[i] = P.inv_mulaw(y_hat[i], hparams.quantize_channels - 1)
    else:
        y_hat = y_hat.view(B, -1).cpu().data.numpy()

    if hparams.postprocess is not None and hparams.postprocess not in ["", "none"]:
        for i in range(B):
            y_hat[i] = getattr(audio, hparams.postprocess)(y_hat[i])

    if hparams.global_gain_scale > 0:
        for i in range(B):
            y_hat[i] /= hparams.global_gain_scale

    return y_hat
Example no. 11
def batch_wavegen(model, c=None, g=None, fast=True, tqdm=tqdm, length=None):
    from train import sanity_check
    sanity_check(model, c, g)
    # assert c is not None
    if c is not None:
        B = c.shape[0]
    else:
        B = 1  #c.shape[0]
    model.eval()
    if fast:
        model.make_generation_fast_()

    # Transform data to GPU
    g = None if g is None else g.to(device)
    c = None if c is None else c.to(device)

    if hparams.upsample_conditional_features and length is None:
        length = (c.shape[-1] - hparams.cin_pad * 2) * audio.get_hop_size()

    with torch.no_grad():
        y_hat = model.incremental_forward(c=c,
                                          g=g,
                                          T=length,
                                          tqdm=tqdm,
                                          softmax=True,
                                          quantize=True,
                                          log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        # needs to be float since mulaw_inv returns in range of [-1, 1]
        y_hat = y_hat.max(1)[1].view(B, -1).float().cpu().data.numpy()
        for i in range(B):
            y_hat[i] = P.inv_mulaw_quantize(y_hat[i],
                                            hparams.quantize_channels - 1)
    elif is_linear_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(B, -1).float().cpu().data.numpy()
        for i in range(B):
            y_hat[i] = inv_linear_quantize(y_hat[i],
                                           hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        y_hat = y_hat.view(B, -1).cpu().data.numpy()
        for i in range(B):
            y_hat[i] = P.inv_mulaw(y_hat[i], hparams.quantize_channels - 1)
    else:
        y_hat = y_hat.view(B, -1).cpu().data.numpy()

    if hparams.postprocess is not None and hparams.postprocess not in [
            "", "none"
    ]:
        for i in range(B):
            y_hat[i] = getattr(audio, hparams.postprocess)(y_hat[i])

    if hparams.global_gain_scale > 0:
        for i in range(B):
            y_hat[i] /= hparams.global_gain_scale

    return y_hat
Example no. 12
def test_mulaw_real():
    fs, x = wavfile.read(example_audio_file())
    x = (x / 32768.0).astype(np.float32)
    mu = 256
    y = P.mulaw_quantize(x, mu)
    assert y.min() >= 0 and y.max() < mu
    assert y.dtype == np.int
    x = P.inv_mulaw_quantize(y, mu) * 32768
    assert x.dtype == np.float32
    x = x.astype(np.int16)
Example no. 13
def batch_wavegen(hparam,
                  net,
                  c_input=None,
                  g_input=None,
                  tqdm_=None,
                  is_numpy=True):
    """
    generate audio
    """
    assert c_input is not None
    B = c_input.shape[0]
    net.set_train(False)

    if hparam.upsample_conditional_features:
        length = (c_input.shape[-1] -
                  hparam.cin_pad * 2) * audio.get_hop_size()
    else:
        # already duplicated
        length = c_input.shape[-1]

    y_hat = net.incremental_forward(c=c_input,
                                    g=g_input,
                                    T=length,
                                    tqdm=tqdm_,
                                    softmax=True,
                                    quantize=True,
                                    log_scale_min=hparam.log_scale_min,
                                    is_numpy=is_numpy)

    if is_mulaw_quantize(hparam.input_type):
        # needs to be float since mulaw_inv returns in range of [-1, 1]
        y_hat = np.reshape(np.argmax(y_hat, 1), (B, -1))
        y_hat = y_hat.astype(np.float32)
        for k in range(B):
            y_hat[k] = P.inv_mulaw_quantize(y_hat[k],
                                            hparam.quantize_channels - 1)
    elif is_mulaw(hparam.input_type):
        y_hat = np.reshape(y_hat, (B, -1))
        for k in range(B):
            y_hat[k] = P.inv_mulaw(y_hat[k], hparam.quantize_channels - 1)
    else:
        y_hat = np.reshape(y_hat, (B, -1))

    if hparam.postprocess is not None and hparam.postprocess not in [
            "", "none"
    ]:
        for k in range(B):
            y_hat[k] = getattr(audio, hparam.postprocess)(y_hat[k])

    if hparam.global_gain_scale > 0:
        for k in range(B):
            y_hat[k] /= hparam.global_gain_scale

    return y_hat
Example no. 14
def save_ref_audio(hparam, ref, length, target_wav_path_):
    """
    save reference audio
    """
    if is_mulaw_quantize(hparam.input_type):
        ref = np.reshape(np.argmax(ref, 0), (-1))[:length]
        ref = ref.astype(np.float32)
    else:
        ref = np.reshape(ref, (-1))[:length]

    if is_mulaw_quantize(hparam.input_type):
        ref = P.inv_mulaw_quantize(ref, hparam.quantize_channels - 1)
    elif is_mulaw(hparam.input_type):
        ref = P.inv_mulaw(ref, hparam.quantize_channels - 1)
    if hparam.postprocess is not None and hparam.postprocess not in ["", "none"]:
        ref = getattr(audio, hparam.postprocess)(ref)
    if hparam.global_gain_scale > 0:
        ref /= hparam.global_gain_scale

    ref = np.clip(ref, -1.0, 1.0)

    wavfile.write(target_wav_path_, hparam.sample_rate, to_int16(ref))
Example no. 15
def _test_data(sr=4000, N=3000, returns_power=False, mulaw=True):
    x, _ = librosa.load(example_audio_file(), sr=sr)
    x, _ = librosa.effects.trim(x, top_db=15)

    # To save computational cost
    x = x[:N]

    # For power conditioning wavenet
    if returns_power:
        # (1 x N')
        p = librosa.feature.rmse(x, frame_length=256, hop_length=128)
        upsample_factor = x.size // p.size
        # (1 x N)
        p = np.repeat(p, upsample_factor, axis=-1)
        if p.size < x.size:
            # pad against time axis
            p = np.pad(p, [(0, 0), (0, x.size - p.size)],
                       mode="constant",
                       constant_values=0)

        # shape adjust
        p = p.reshape(1, 1, -1)

    # (T,)
    if mulaw:
        x = P.mulaw_quantize(x)
        x_org = P.inv_mulaw_quantize(x)
        # (C, T)
        x = to_categorical(x, num_classes=256).T
        # (1, C, T)
        x = x.reshape(1, 256, -1).astype(np.float32)
    else:
        x_org = x
        x = x.reshape(1, 1, -1)

    if returns_power:
        return x, x_org, p

    return x, x_org
Example no. 16
def wavegen(model, length=None, c=None, g=None, initial_value=None,
            fast=False, tqdm=tqdm):
    """Generate waveform samples by WaveNet.
       Multiple waveforms can be generated in single batch

    Args:
        model (nn.Module) : WaveNet decoder
        length (int): Time steps to generate. If conditional features are given,
          then this is determined by the feature size.
        c (numpy.ndarray or list): Conditional features, of shape T x C
        g (scalar or list): Speaker ID
        initial_value (int) : initial_value for the WaveNet decoder.
        fast (Bool): Whether to remove weight normalization or not.
        tqdm (lambda): tqdm

    Returns:
        numpy.ndarray or list : Generated waveform samples
    """
    from train import sanity_check
    sanity_check(model, c, g)

    model.eval()
    if fast:
        model.make_generation_fast_()

    # Prepare Local Condition
    batch_size = 1
    output_should_be_list = False
    
    if c is None:
        assert length is not None
    else:
        if type(c)==list :
            output_should_be_list = True
            
            c = [_to_numpy(x) for x in c]
            for x in c :
                if x.ndim != 2:
                    raise RuntimeError(
                        "Expected 2-dim shape (T, {}) for the conditional feature, but {} was actually given.".format(hparams.cin_channels, x.shape))
                    assert x.ndim == 2
                    
            batch_size = len(c)
            batch = np.zeros([batch_size, max([x.shape[0] for x in c]), c[0].shape[1]])
            for i in range(batch_size) :
                batch[i,:c[i].shape[0],:] = c[i][:,:]
                
            upsample_factor = audio.get_hop_size()
            # length_list : used to cut silence when batch_size > 1
            length_list = [x.shape[0]*upsample_factor for x in c]
            length = max(length_list)
            
            if not hparams.upsample_conditional_features:
                batch = np.repeat(batch, upsample_factor, axis=1)
                
            c = torch.FloatTensor(np.transpose(batch, [0, 2, 1]))
        else :
            c = _to_numpy(c)
            # (Tc, D)
            if c.ndim != 2:
                raise RuntimeError(
                    "Expected 2-dim shape (T, {}) for the conditional feature, but {} was actually given.".format(hparams.cin_channels, c.shape))
                assert c.ndim == 2
            Tc = c.shape[0]
            upsample_factor = audio.get_hop_size()
            # Overwrite length according to feature size
            length = Tc * upsample_factor
            # (Tc, D) -> (Tc', D)
            # Repeat features before feeding it to the network
            if not hparams.upsample_conditional_features:
                c = np.repeat(c, upsample_factor, axis=0)

            # B x C x T
            c = torch.FloatTensor(c.T).unsqueeze(0)

        
    # Prepare initial_input
    if initial_value is None:
        if is_mulaw_quantize(hparams.input_type):
            initial_value = P.mulaw_quantize(0, hparams.quantize_channels)
        else:
            initial_value = 0.0
    if is_mulaw_quantize(hparams.input_type):
        assert initial_value >= 0 and initial_value < hparams.quantize_channels
        initial_input = np_utils.to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = torch.from_numpy(initial_input).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = torch.zeros(1, 1, 1).fill_(initial_value)
    initial_input = initial_input.repeat(batch_size, 1, 1)
        
    # Prepare Global Condition
    if type(g)==list :
        g = [_to_numpy(x) for x in g]
        g = torch.LongTensor(g)
    elif g is not None :
        g = _to_numpy(g)
        g = torch.LongTensor([g])
        
    
    # Transform data to GPU
    initial_input = initial_input.to(device)
    g = None if g is None else g.to(device)
    c = None if c is None else c.to(device)

    
    with torch.no_grad():
        y_hat = model.incremental_forward(
            initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True, quantize=True,
            log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(batch_size, -1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(y_hat.view(batch_size, -1).cpu().data.numpy(), hparams.quantize_channels)
    else:
        y_hat = y_hat.view(batch_size, -1).cpu().data.numpy()

    if output_should_be_list :
        return [y_hat[i, :length_list[i]] for i in range(batch_size)]
    else :
        return y_hat[0, :]
Example no. 17
def wavegen(model,
            length=None,
            c=None,
            g=None,
            initial_value=None,
            fast=False,
            tqdm=tqdm):
    """Generate waveform samples by WaveNet.

    Args:
        model (nn.Module) : WaveNet decoder
        length (int): Time steps to generate. If conditional features are given,
          then this is determined by the feature size.
        c (numpy.ndarray): Conditional features, of shape T x C
        g (scalar): Speaker ID
        initial_value (int) : initial_value for the WaveNet decoder.
        fast (Bool): Whether to remove weight normalization or not.
        tqdm (lambda): tqdm

    Returns:
        numpy.ndarray : Generated waveform samples
    """
    from train import sanity_check
    sanity_check(model, c, g)

    c = _to_numpy(c)
    g = _to_numpy(g)

    model.eval()
    if fast:
        model.make_generation_fast_()

    if c is None:
        assert length is not None
    else:
        # (Tc, D)
        if c.ndim != 2:
            raise RuntimeError(
                "Expected 2-dim shape (T, {}) for the conditional feature, but {} was actually given."
                .format(hparams.cin_channels, c.shape))
            assert c.ndim == 2
        Tc = c.shape[0]
        upsample_factor = audio.get_hop_size()
        # Overwrite length according to feature size
        length = Tc * upsample_factor
        # (Tc, D) -> (Tc', D)
        # Repeat features before feeding it to the network
        if not hparams.upsample_conditional_features:
            c = np.repeat(c, upsample_factor, axis=0)

        # B x C x T
        c = torch.FloatTensor(c.T).unsqueeze(0)

    if initial_value is None:
        if is_mulaw_quantize(hparams.input_type):
            initial_value = P.mulaw_quantize(0, hparams.quantize_channels - 1)
        else:
            initial_value = 0.0

    if is_mulaw_quantize(hparams.input_type):
        assert initial_value >= 0 and initial_value < hparams.quantize_channels
        initial_input = np_utils.to_categorical(
            initial_value,
            num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = torch.from_numpy(initial_input).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = torch.zeros(1, 1, 1).fill_(initial_value)

    g = None if g is None else torch.LongTensor([g])

    # Transform data to GPU
    initial_input = initial_input.to(device)
    g = None if g is None else g.to(device)
    c = None if c is None else c.to(device)

    with torch.no_grad():
        y_hat = model.incremental_forward(initial_input,
                                          c=c,
                                          g=g,
                                          T=length,
                                          tqdm=tqdm,
                                          softmax=True,
                                          quantize=True,
                                          log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(
            y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels)
    else:
        y_hat = y_hat.view(-1).cpu().data.numpy()

    if hparams.postprocess is not None and hparams.postprocess not in [
            "", "none"
    ]:
        y_hat = getattr(audio, hparams.postprocess)(y_hat)

    if hparams.global_gain_scale > 0:
        y_hat /= hparams.global_gain_scale

    return y_hat
Example no. 18
def save_states(global_step,
                writer,
                y_hat,
                y,
                y_student,
                input_lengths,
                mu=None,
                checkpoint_dir=None):
    '''

    :param global_step:
    :param writer:
    :param y_hat: parameters output by the teacher (y_hat is the teacher's result)
    :param y: target
    :param y_student: student output
    :param input_lengths:
    :param mu: student mu
    :param checkpoint_dir:
    :return:
    '''
    print("Save intermediate states at step {}".format(global_step))
    idx = np.random.randint(0, len(y_hat))
    length = input_lengths[idx].data.cpu().numpy()
    if mu is not None:
        mu = mu[idx]
    # (B, C, T)
    if y_hat.dim() == 4:
        y_hat = y_hat.squeeze(-1)

    if is_mulaw_quantize(hparams.input_type):
        # (B, T)
        y_hat = F.softmax(y_hat, dim=1).max(1)[1]

        # (T,)
        y_hat = y_hat[idx].data.cpu().long().numpy()
        y = y[idx].view(-1).data.cpu().long().numpy()

        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
        y = P.inv_mulaw_quantize(y, hparams.quantize_channels)
    else:
        # (B, T)
        y_hat = sample_from_discretized_mix_logistic(
            y_hat, log_scale_min=hparams.log_scale_min)
        # (T,)
        y_hat = y_hat[idx].view(-1).data.cpu().numpy()
        y = y[idx].view(-1).data.cpu().numpy()

        if is_mulaw(hparams.input_type):
            y_hat = P.inv_mulaw(y_hat, hparams.quantize_channels)
            y = P.inv_mulaw(y, hparams.quantize_channels)

    # Mask by length
    y_hat[length:] = 0
    y[length:] = 0
    y_student = y_student.data.cpu().numpy()
    y_student = y_student[idx].reshape(y_student.shape[-1])
    mu = to_numpy(mu)
    # Save audio
    audio_dir = join(checkpoint_dir, "audio")
    if global_step % 1000 == 0:
        audio_dir = join(checkpoint_dir, "audio")
        os.makedirs(audio_dir, exist_ok=True)
        path = join(audio_dir, "step{:09d}_teacher.wav".format(global_step))
        librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
        path = join(audio_dir, "step{:09d}_target.wav".format(global_step))
        librosa.output.write_wav(path, y, sr=hparams.sample_rate)
        path = join(audio_dir, "step{:09d}_student.wav".format(global_step))
        librosa.output.write_wav(path, y_student, sr=hparams.sample_rate)
    # TODO save every 200 step,
    if global_step % 200 == 0:
        path = join(audio_dir, "wave_step{:09d}.png".format(global_step))
        save_waveplot(path,
                      y_student=y_student,
                      y_target=y,
                      y_teacher=y_hat,
                      student_mu=mu)
Example no. 19
def main():
    args = get_arguments()
    if args.hparams is not None:
        hparams.parse(args.hparams)
    if not hparams.gc_enable:
        hparams.global_cardinality = None
        hparams.global_channel = None
    print(hparams_debug_string())

    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            gpu_options=tf.GPUOptions(
                                                allow_growth=True)))

    net = WaveNetModel(
        batch_size=1,
        dilations=hparams.dilations,
        filter_width=hparams.filter_width,
        residual_channels=hparams.residual_channels,
        dilation_channels=hparams.dilation_channels,
        skip_channels=hparams.skip_channels,
        quantization_channels=hparams.quantization_channels,
        use_biases=hparams.use_biases,
        scalar_input=hparams.scalar_input,
        initial_filter_width=hparams.initial_filter_width,
        local_condition_channel=hparams.num_mels,
        upsample_conditional_features=hparams.upsample_conditional_features,
        upsample_factor=hparams.upsample_factor,
        global_cardinality=hparams.global_cardinality,
        global_channel=hparams.global_channel)
    samples = tf.placeholder(tf.int32)
    local_ph = tf.placeholder(tf.float32, shape=(1, hparams.num_mels))

    sess.run(tf.global_variables_initializer())
    variables_to_restore = {
        var.name[:-2]: var
        for var in tf.global_variables()
        if not ('state_buffer' in var.name or 'pointer' in var.name)
    }
    saver = tf.train.Saver(variables_to_restore)

    print('Restoring model from {}'.format(args.checkpoint))
    saver.restore(sess, args.checkpoint)

    tmp_global_condition = None
    upsample_factor = audio.get_hop_size()

    generate_list = []
    with open(args.eval_txt, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            if line is not None:
                line = line.strip().split('|')
                npy_path = os.path.join(hparams.NPY_DATAROOT, line[1])
                tmp_local_condition = np.load(npy_path).astype(np.float32)
                if len(line) == 5:
                    tmp_global_condition = int(line[4])
                if hparams.global_channel is None:
                    tmp_global_condition = None
                generate_list.append(
                    (tmp_local_condition, tmp_global_condition, line[1]))

    for local_condition, global_condition, npy_path in generate_list:
        wav_id = npy_path.split('-mel')[0]
        wav_out_path = "wav/{}_gen.wav".format(wav_id)

        if not hparams.upsample_conditional_features:
            local_condition = np.repeat(local_condition,
                                        upsample_factor,
                                        axis=0)
        else:
            local_condition = np.expand_dims(local_condition, 0)
            local_condition = net.create_upsample(local_condition)
            local_condition = tf.squeeze(local_condition,
                                         [0]).eval(session=sess)
        next_sample = net.predict_proba_incremental(samples, local_ph,
                                                    global_condition)
        sess.run(net.init_ops)

        quantization_channels = hparams.quantization_channels

        # Silence with a single random sample at the end.
        waveform = [quantization_channels / 2] * (net.receptive_field - 1)
        waveform.append(np.random.randint(quantization_channels))

        sample_len = local_condition.shape[0]
        for step in tqdm(range(0, sample_len)):

            outputs = [next_sample]
            outputs.extend(net.push_ops)
            window = waveform[-1]

            # Run the WaveNet to predict the next sample.
            prediction = sess.run(outputs,
                                  feed_dict={
                                      samples: window,
                                      local_ph:
                                      local_condition[step:step + 1, :]
                                  })[0]

            # Scale prediction distribution using temperature.
            np.seterr(divide='ignore')
            scaled_prediction = np.log(prediction) / args.temperature
            scaled_prediction = (scaled_prediction -
                                 np.logaddexp.reduce(scaled_prediction))
            scaled_prediction = np.exp(scaled_prediction)
            np.seterr(divide='warn')
            # print(quantization_channels, scaled_prediction)
            sample = np.random.choice(np.arange(quantization_channels),
                                      p=scaled_prediction)
            waveform.append(sample)

            # If we have partial writing, save the result so far.
            if (wav_out_path and args.save_every
                    and (step + 1) % args.save_every == 0):
                out = P.inv_mulaw_quantize(np.array(waveform),
                                           quantization_channels)
                write_wav(out, hparams.sample_rate, wav_out_path)

                # Introduce a newline to clear the carriage return from the progress.
        print()
        # Save the result as a wav file.
        if wav_out_path:
            out = P.inv_mulaw_quantize(
                np.array(waveform).astype(np.int16), quantization_channels)
            # out = P.inv_mulaw_quantize(np.asarray(waveform), quantization_channels)
            write_wav(out, hparams.sample_rate, wav_out_path)
    print('Finished generating.')
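The generation loop above scales the predicted distribution by a temperature before sampling. A minimal standalone sketch of that step (the function name is illustrative only):

import numpy as np

def sample_with_temperature(prob, temperature=1.0, rng=np.random):
    # Divide log-probabilities by the temperature, renormalize in log space,
    # then draw one category from the rescaled distribution.
    scaled = np.log(prob) / temperature
    scaled = scaled - np.logaddexp.reduce(scaled)
    return rng.choice(np.arange(len(prob)), p=np.exp(scaled))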
Example no. 20
def wavegen(model,
            length=None,
            c=None,
            g=None,
            initial_value=None,
            fast=False,
            tqdm=tqdm):
    """Generate waveform samples by WaveNet.

    Args:
        model (nn.Module) : WaveNet decoder
        length (int): Time steps to generate. If conditional features are given,
          then this is determined by the feature size.
        c (numpy.ndarray): Conditional features, of shape T x C
        g (scalar): Speaker ID
        initial_value (int) : initial_value for the WaveNet decoder.
        fast (Bool): Whether to remove weight normalization or not.
        tqdm (lambda): tqdm

    Returns:
        numpy.ndarray : Generated waveform samples
    """
    c = _to_numpy(c)
    g = _to_numpy(g)

    if use_cuda:
        model = model.cuda()
    model.eval()
    if fast:
        model.make_generation_fast_()

    if c is None:
        assert length is not None
    else:
        # (N, D)
        assert c.ndim == 2
        # (T, D)
        if not hparams.upsample_conditional_features:
            upsample_factor = audio.get_hop_size()
            c = np.repeat(c, upsample_factor, axis=0)
        length = c.shape[0]
        # B x C x T
        c = c.T.reshape(1, -1, length)
        c = Variable(torch.FloatTensor(c))

    if initial_value is None:
        initial_value = P.mulaw_quantize(0)  # dummy silence
    assert initial_value >= 0 and initial_value < 256

    initial_input = np_utils.to_categorical(initial_value,
                                            num_classes=256).astype(np.float32)
    initial_input = Variable(torch.from_numpy(initial_input)).view(1, 1, 256)
    g = None if g is None else Variable(torch.LongTensor([g]))
    if use_cuda:
        initial_input = initial_input.cuda()
        g = None if g is None else g.cuda()
        c = None if c is None else c.cuda()

    y_hat = model.incremental_forward(initial_input,
                                      c=c,
                                      g=g,
                                      T=length,
                                      tqdm=tqdm,
                                      softmax=True,
                                      quantize=True)
    y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
    y_hat = P.inv_mulaw_quantize(y_hat)

    return y_hat
Example no. 21
        # Generate
        waveform = wavegen(model,
                           length,
                           c=c,
                           g=g,
                           initial_value=initial_value,
                           fast=True,
                           tqdm=_tqdm)

        # save
        librosa.output.write_wav(dst_wav_path,
                                 waveform,
                                 sr=hparams.sample_rate)
        librosa.output.write_wav(target_wav_path,
                                 P.inv_mulaw_quantize(x),
                                 sr=hparams.sample_rate)

        # log
        if output_html:
            print("""
<audio controls="controls" >
<source src="/{}/audio/{}/{}" autoplay/>
Your browser does not support the audio element.
</audio>
""".format(hparams.name, dst_dir_name, basename(dst_wav_path)))

    print(
        "Finished! Check out {} for generated audio samples.".format(dst_dir))
    sys.exit(0)
Example no. 22
            target_wav_path = join(dst_dir, "{}_{}{}_target.wav".format(
                idx, checkpoint_name, file_name_suffix))
        else:
            dst_wav_path = join(dst_dir, "speaker{}_{}_{}{}_predicted.wav".format(
                g, idx, checkpoint_name, file_name_suffix))
            target_wav_path = join(dst_dir, "speaker{}_{}_{}{}_target.wav".format(
                g, idx, checkpoint_name, file_name_suffix))

        # Generate
        waveform = wavegen(model, length, c=c, g=g, initial_value=initial_value,
                           fast=True, tqdm=_tqdm)

        # save
        librosa.output.write_wav(dst_wav_path, waveform, sr=hparams.sample_rate)
        if is_mulaw_quantize(hparams.input_type):
            x = P.inv_mulaw_quantize(x, hparams.quantize_channels)
        elif is_mulaw(hparams.input_type):
            x = P.inv_mulaw(x, hparams.quantize_channels)
        librosa.output.write_wav(target_wav_path, x, sr=hparams.sample_rate)

        # log
        if output_html:
            print("""
<audio controls="controls" >
<source src="/{}/audio/{}/{}" autoplay/>
Your browser does not support the audio element.
</audio>
""".format(hparams.name, dst_dir_name, basename(dst_wav_path)))

    print("Finished! Check out {} for generated audio samples.".format(dst_dir))
    del tee
Example no. 23
def test_incremental_forward_correctness():
    import librosa.display
    from matplotlib import pyplot as plt

    model = build_compact_model().to(device)

    checkpoint_path = join(dirname(__file__), "..", "foobar/checkpoint_step000058000.pth")
    if exists(checkpoint_path):
        print("Loading from:", checkpoint_path)
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint["state_dict"])

    sr = 4000
    x, x_org = _test_data(sr=sr, N=3000)
    x = torch.from_numpy(x).contiguous().to(device)

    model.eval()

    # Batch forward
    y_offline = model(x, softmax=True)

    # Test from zero start
    y_online = model.incremental_forward(initial_input=None, T=100, tqdm=tqdm, softmax=True)

    # Incremental forward with forced teaching
    y_online = model.incremental_forward(test_inputs=x, tqdm=tqdm, softmax=True, quantize=False)

    # (1 x C x T)
    c = (y_offline - y_online).abs()
    print(c.mean(), c.max())

    try:
        assert np.allclose(y_offline.cpu().data.numpy(),
                           y_online.cpu().data.numpy(), atol=1e-4)
    except Exception:
        from warnings import warn
        warn("oops! must be a bug!")

    # (1, T, C)
    xt = x.transpose(1, 2).contiguous()

    initial_input = xt[:, 0, :].unsqueeze(1).contiguous()
    print(initial_input.size())
    print("Inital value:", initial_input.view(-1).max(0)[1])

    # With zero start
    zerostart = True
    if zerostart:
        y_inference = model.incremental_forward(
            initial_input=initial_input, T=xt.size(1), tqdm=tqdm, softmax=True, quantize=True)
    else:
        # Feed a few samples as test_inputs and then generate auto-regressively
        N = 1000
        y_inference = model.incremental_forward(
            initial_input=None, test_inputs=xt[:, :N, :],
            T=xt.size(1), tqdm=tqdm, softmax=True, quantize=True)

    # Waveforms
    # (T,)
    y_offline = y_offline.max(1)[1].view(-1)
    y_online = y_online.max(1)[1].view(-1)
    y_inference = y_inference.max(1)[1].view(-1)

    y_offline = P.inv_mulaw_quantize(y_offline.cpu().data.long().numpy())
    y_online = P.inv_mulaw_quantize(y_online.cpu().data.long().numpy())
    y_inference = P.inv_mulaw_quantize(y_inference.cpu().data.long().numpy())

    plt.figure(figsize=(16, 10))
    plt.subplot(4, 1, 1)
    librosa.display.waveplot(x_org, sr=sr)
    plt.subplot(4, 1, 2)
    librosa.display.waveplot(y_offline, sr=sr)
    plt.subplot(4, 1, 3)
    librosa.display.waveplot(y_online, sr=sr)
    plt.subplot(4, 1, 4)
    librosa.display.waveplot(y_inference, sr=sr)
    plt.show()

    save_audio = False
    if save_audio:
        librosa.output.write_wav("target.wav", x_org, sr=sr)
        librosa.output.write_wav("online.wav", y_online, sr=sr)
        librosa.output.write_wav("inference.wav", y_inference, sr=sr)
Example no. 24
                    "-feats", "")

            # Paths
            if g is None:
                dst_wav_path = join(dst_dir, "{}_gen.wav".format(name))
                target_wav_path = join(dst_dir, "{}_ref.wav".format(name))
            else:
                dst_wav_path = join(dst_dir,
                                    "speaker{}_{}_gen.wav".format(g, name))
                target_wav_path = join(dst_dir,
                                       "speaker{}_{}_ref.wav".format(g, name))

            # save
            if has_ref_file:
                if is_mulaw_quantize(hparams.input_type):
                    ref = P.inv_mulaw_quantize(ref,
                                               hparams.quantize_channels - 1)
                elif is_mulaw(hparams.input_type):
                    ref = P.inv_mulaw(ref, hparams.quantize_channels - 1)
                if hparams.postprocess is not None and hparams.postprocess not in [
                        "", "none"
                ]:
                    ref = getattr(audio, hparams.postprocess)(ref)
                if hparams.global_gain_scale > 0:
                    ref /= hparams.global_gain_scale

            # clip (just in case)
            gen = np.clip(gen, -1.0, 1.0)
            if has_ref_file:
                ref = np.clip(ref, -1.0, 1.0)

            wavfile.write(dst_wav_path, hparams.sample_rate, to_int16(gen))
Example no. 25
def main(args):
    model = ModelWrapper()
    model.eval()

    if args["--downsample_interval"] is None:
        raise(ValueError("Must specify downsample fraction with --downsample_interval"))
    downsample_interval = int(args["--downsample_interval"])

    receptive_field = model.receptive_field

    # Change the output dir if you want
    writing_dir = args["<output-dir>"]
    os.makedirs(writing_dir, exist_ok=True)
    print("writing dir: {}".format(writing_dir))

    # Load up a samples
    x_original = librosa.core.load(args["<input-file>"], sr=hparams.sample_rate, mono=True)[0]

    # Hacky way to allow processing some or all of the file
    global SAMPLE_SIZE
    if SAMPLE_SIZE == -1:
        SAMPLE_SIZE = x_original.shape[0]

    x_original = x_original[:SAMPLE_SIZE]

    # Normalize to reduce encoding artifacts
    x_original /= abs(x_original).max()
    sf.write(os.path.join(writing_dir, "x_original.wav"), x_original, hparams.sample_rate)

    # Cut the sampling rate
    x_modified = x_original[::downsample_interval]
    x_modified_out = librosa.core.resample(x_modified,
                                           int(hparams.sample_rate / downsample_interval), hparams.sample_rate)
    sf.write(join(writing_dir, "x_modified.wav"), x_modified_out, hparams.sample_rate)
    x_modified = P.mulaw_quantize(x_modified, hparams.quantize_channels - 1)

    # Update constraint mask for super resolution. Masked spots don't update
    mask = np.ones_like(x_original)
    mask[::downsample_interval] = 0
    mask = torch.Tensor(mask).unsqueeze(0).to(device)

    # Initialize with noise for the samples we need to fill in, or x original for the samples
    # we are allowed to use
    noise = np.random.uniform(0, 256, size=x_original.shape)
    mask_np = mask[0].detach().cpu().numpy()
    x = P.mulaw_quantize(x_original, hparams.quantize_channels - 1) * (1 - mask_np) + noise * (mask_np)
    x = torch.FloatTensor(x).unsqueeze(0).to(device)
    x.requires_grad = True

    sigmas = [175.9, 110., 68.7, 54.3, 42.9, 34.0, 26.8, 21.2, 16.8, 13.3, 10.5, 8.29, 6.55, 5.18, 4.1, 3.24, 2.56, 1.6, 1.0, 0.625, 0.39, 0.244, 0.15, 0.1]

    for idx, sigma in enumerate(sigmas):
        # Make sure each sample is updated on average N_STEPS times
        n_steps_sgld = int((SAMPLE_SIZE/(SGLD_WINDOW*BATCH_SIZE)) * N_STEPS)
        print("Number of SGLD steps {}".format(n_steps_sgld))
        
        # Bump down a model
        checkpoint_path = join(args["<checkpoint>"], CHECKPOINTS[sigma], "checkpoint_latest_ema.pth")
        model.load_checkpoint(checkpoint_path)

        parmodel = torch.nn.DataParallel(model)
        parmodel.to(device)

        eta = .05 * (sigma ** 2)

        for i in range(n_steps_sgld):
            # need to get a good sampling of the beginning/end (boundary effects)
            # to understand this: think about how often we would update x[receptive_field] (first point)
            # if we only sampled U(receptive_field,x0.shape-receptive_field-SGLD_WINDOW)
            j = np.random.randint(-SGLD_WINDOW, x.shape[1], BATCH_SIZE)
            j = np.maximum(j, 0)
            j = np.minimum(j, x.shape[1]-(SGLD_WINDOW))

            patches = []
            for k in range(BATCH_SIZE):
                patches.append(x[:, j[k]:j[k] + SGLD_WINDOW])

            patches = torch.stack(patches, axis=0)


            # Forward pass
            log_prob, prediction = parmodel(patches, sigma=sigma)
            log_prob = torch.sum(log_prob)
            grad = torch.autograd.grad(log_prob, patches)[0]

            x_update = eta * grad

            # Langevin step
            epsilon = np.sqrt(2 * eta) * torch.normal(0, 1, size=x_update.shape, device=device)
            x_update += epsilon

            with torch.no_grad():
                for k in range(BATCH_SIZE):
                    x_update[k] *= mask[:, j[k] : j[k] + SGLD_WINDOW]
                    x[:, j[k] : j[k] + SGLD_WINDOW] += x_update[k]

            if (not i % 20) or (i == (n_steps_sgld - 1)): # debugging
                print("--------------")
                print('sigma = {}'.format(sigma))
                print('eta = {}'.format(eta))
                print("i {}".format(i))
                print("Max sample {}".format(
                    abs(x).max()))
                print('Mean sample logpx: {}'.format(log_prob / (BATCH_SIZE*SGLD_WINDOW)))
                print("Max gradient update: {}".format(eta * abs(grad).max()))
                t0 = time.time()


        out = P.inv_mulaw_quantize(x[0].detach().cpu().numpy(), hparams.quantize_channels - 1)
        out = np.clip(out, -1, 1)
        sf.write(os.path.join(writing_dir, "out_{}.wav".format(sigma)), out, hparams.sample_rate)
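The inner loop above performs annealed Langevin dynamics: at each noise level sigma, the sample is nudged along the model's score with step size eta, plus Gaussian noise of matching scale (here also masked so that the observed samples stay fixed). A minimal sketch of the unmasked update, with illustrative names:

import numpy as np

def langevin_step(x, grad_log_p, eta, rng=np.random):
    # x <- x + eta * grad(log p(x)) + sqrt(2 * eta) * N(0, I)
    noise = np.sqrt(2.0 * eta) * rng.standard_normal(x.shape)
    return x + eta * grad_log_p + noise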
Example no. 26
def main(args):
    model = ModelWrapper()
    model.eval()

    receptive_field = model.receptive_field
    hparams.max_time_steps = SAMPLE_SIZE

    test_data_loader = get_data_loader(args["<dump-root>"], collate_fn)

    # Change the output dir if you want
    writing_dir = args["<output-dir>"]
    if not exists(writing_dir):
        os.makedirs(writing_dir)
    print("writing dir: {}".format(writing_dir))

    (x_original, y, c, g, input_lengths) = next(iter(test_data_loader))
    c = c.to(device)
    sanity_check(model.model, c, g)

    # Write inputs
    x_original_out = P.inv_mulaw_quantize(x_original, hparams.quantize_channels - 1)
    sf.write(join(writing_dir, "original.wav"), x_original_out[0, 0,], hparams.sample_rate)

    # Initialize with noise
    x = torch.FloatTensor(np.random.uniform(0, 256, size=(1, x_original.shape[-1] + 1))).to(device)
    x.requires_grad = True

    sigmas = [175.9, 110., 68.7,  42.9, 26.8, 16.8, 10.5, 6.55, 4.1, 2.56, 1.6, 1.0, 0.625, 0.39, 0.244, 0.1]

    t0 = time.time()

    for idx, sigma in enumerate(sigmas):
        # Bump down a model
        checkpoint_path = join(args["<checkpoint>"], CHECKPOINTS[sigma], "checkpoint_latest_ema.pth")
        model.load_checkpoint(checkpoint_path)
        parmodel = torch.nn.DataParallel(model)
        parmodel.to(device)

        eta = .1 * (sigma ** 2)

        # Make sure each sample is updated on average N_STEPS times
        n_steps_sgld = int((SAMPLE_SIZE/(SGLD_WINDOW*BATCH_SIZE)) * N_STEPS)
        print("Number of SGLD steps {}".format(n_steps_sgld))
        for i in range(n_steps_sgld):
            # Sample a random chunk of the conditioning spectrogram, accounting for padding.
            # We need good coverage of the beginning/end (boundary effects): if we only drew j
            # from U(cin_pad, c.shape[-1] - cin_pad - SGLD_WINDOW // hop_size), the first frames
            # would be updated far less often than interior ones, so widen the range and clamp.
            j = np.random.randint(hparams.cin_pad - SGLD_WINDOW // hparams.hop_size,
                                  c.shape[-1] - hparams.cin_pad, 
                                  BATCH_SIZE)
            j = np.maximum(j, hparams.cin_pad)
            j = np.minimum(j, c.shape[-1] - hparams.cin_pad - (SGLD_WINDOW // hparams.hop_size))
            # Get the corresponding start of the waveform
            x_start = (j - hparams.cin_pad) * hparams.hop_size

            patches_c = []
            patches_x = []
            for k in range(BATCH_SIZE):
                patches_c.append(c[0, :, j[k] - hparams.cin_pad : j[k] + hparams.cin_pad + (SGLD_WINDOW // hparams.hop_size)])
                patches_x.append(x[:, x_start[k] : x_start[k] + SGLD_WINDOW + 1])

            patches_c = torch.stack(patches_c, dim=0)
            patches_x = torch.stack(patches_x, dim=0)

            # Forward pass
            log_prob, prediction0 = parmodel(patches_x, c=patches_c, sigma=sigma)
            log_prob = torch.sum(log_prob)

            grad = torch.autograd.grad(log_prob, patches_x)[0]

            x_update = eta * grad

            # Langevin step
            epsilon = np.sqrt(2 * eta) * torch.normal(0, 1, size=x_update.shape, device=device)
            x_update += epsilon

            with torch.no_grad():
                for k in range(BATCH_SIZE):
                    x[:, x_start[k] : x_start[k] + SGLD_WINDOW + 1] += x_update[k]

            if (not i % 20) or (i == (n_steps_sgld - 1)): # debugging
                print("--------------")
                print("i {}".format(i))
                print("Max sample {}".format(
                  abs(x).max()))
                print('Mean sample logpx: {}'.format(log_prob / (BATCH_SIZE * SGLD_WINDOW)))
                print("Max gradient update: {}".format(eta * abs(grad).max()))


        out = P.inv_mulaw_quantize(x[0, 1:].detach().cpu().numpy(), hparams.quantize_channels - 1)
        out = np.clip(out, -1, 1)
        sf.write(join(writing_dir, "out_{}.wav".format(sigma)), out, hparams.sample_rate)

    final_time = time.time()
    with open(join(writing_dir, "info.json"), "w") as f:
        json.dump({"time": float(final_time - t0)}, f, indent=4)
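# For reference, inverse mu-law quantization maps class indices in [0, mu] back to
# waveform values in [-1, 1]. A minimal NumPy sketch, assumed to mirror nnmnkwii's
# P.inv_mulaw_quantize (the helper name is illustrative; verify against the library
# before relying on it):
def inv_mulaw_quantize_sketch(x, mu=255):
    y = 2.0 * np.asarray(x, dtype=np.float64) / mu - 1.0   # [0, mu] -> [-1, 1]
    return np.sign(y) * (1.0 / mu) * ((1.0 + mu) ** np.abs(y) - 1.0)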
Exemplo n.º 27
0
def main(args):
    model = build_model().to(device)
    model.eval()

    receptive_field = model.receptive_field
    test_data_loader = get_data_loader(args["<dump-root>"], collate_fn)

    (x, y, c, g, input_lengths) = next(iter(test_data_loader))
    # cin_pad = hparams.cin_pad
    # if cin_pad > 0:
    #     c = F.pad(c, pad=(cin_pad, cin_pad), mode="replicate")
    c = c.to(device)
    sanity_check(model, c, g)
    # Write inputs
    # x_original_out = inv_linear_quantize(x, hparams.quantize_channels - 1)  # linear-quantize alternative (unused)
    x_original_out = P.inv_mulaw_quantize(x, hparams.quantize_channels - 1)
    sf.write("x_original.wav", x_original_out[0, 0], hparams.sample_rate)

    # Initialize with noise
    x = torch.FloatTensor(np.random.uniform(-512, 700, size=(1, x.shape[-1] + 1))).to(device)
    # x = F.pad(x, (receptive_field, 0), "constant", 127)
    x.requires_grad = True


    sigmas = [175.9, 110., 68.7,  42.9, 26.8, 16.8, 10.5, 6.55, 4.1, 2.56, 1.6, 1.0, 0.625, 0.39, 0.1]
    start_sigma = 256.
    end_sigma = 0.1

    for idx, sigma in enumerate(sigmas):
        n_steps = 200
        # Bump down a model
        checkpoint_path = join(args["<checkpoint>"], checkpoints[sigma], "checkpoint_latest.pth")
        print("Load checkpoint0 from {}".format(checkpoint_path))
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint["state_dict"])

        eta = .02 * (sigma ** 2)
        gamma = 15 * (1.0 / sigma) ** 2

        for i in range(n_steps):
            # Seed with noised up GT, good for unconditional generation
            # x0[0, :receptive_field] = torch.FloatTensor(x0_original[:receptive_field] + np.random.normal(0, sigma, x0_original[:receptive_field].shape)).to(device)
            # x1[0, :receptive_field] = torch.FloatTensor(x1_original[:receptive_field] + np.random.normal(0, sigma, x1_original[:receptive_field].shape)).to(device)

            # Seed with noised up silence
            # x0[0, :receptive_field] = torch.FloatTensor(np.random.normal(127, sigma, x0_original[:receptive_field].shape)).to(device)
            # x1[0, :receptive_field] = torch.FloatTensor(np.random.normal(127, sigma, x1_original[:receptive_field].shape)).to(device)

            # Forward pass
            log_prob, prediction = model.smoothed_loss(x, c=c, sigma=sigma)
            log_prob = torch.sum(log_prob)
            grad = torch.autograd.grad(log_prob, x)[0]
            x_update = eta * grad

            # Langevin step
            epsilon = np.sqrt(2 * eta) * torch.normal(0, 1, size=(1, x.shape[-1]), device=device)
            x_update += epsilon

            with torch.no_grad():
                x += x_update

            if (not i % 20) or (i == (n_steps - 1)): # debugging
                print("--------------")
                print('sigma = {}'.format(sigma))
                print('eta = {}'.format(eta))
                print("i {}".format(i))
                print("Max sample {}".format(
                    abs(x).max()))
                print('Mean sample logpx: {}'.format(log_prob / x.shape[-1]))
                print("Max gradient update: {}".format(eta * abs(grad).max()))

        out = P.inv_mulaw_quantize(x[0, 1:].detach().cpu().numpy(), hparams.quantize_channels - 1)
        # out = inv_linear_quantize(x[0].detach().cpu().numpy(), hparams.quantize_channels - 1)
        out = np.clip(out, -1, 1)
        sf.write("out_{}.wav".format(sigma), out, hparams.sample_rate)
Exemplo n.º 28
0
def eval_model(global_step,
               writer,
               model,
               y,
               c,
               g,
               input_lengths,
               eval_dir,
               ema=None):
    if ema is not None:
        print("Using averaged model for evaluation")
        model = clone_as_averaged_model(model, ema)

    model.eval()
    idx = np.random.randint(0, len(y))
    length = input_lengths[idx].data.cpu().numpy()[0]

    # (T,)
    y_target = y[idx].view(-1).data.cpu().numpy()[:length]

    if c is not None:
        c = c[idx, :, :length].unsqueeze(0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))
    if g is not None:
        # TODO: test
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))

    # Dummy silence
    if is_mulaw_quantize(hparams.input_type):
        initial_value = P.mulaw_quantize(0, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        initial_value = P.mulaw(0.0, hparams.quantize_channels)
    else:
        initial_value = 0.0
    print("Intial value:", initial_value)

    # (C,)
    if is_mulaw_quantize(hparams.input_type):
        initial_input = np_utils.to_categorical(
            initial_value,
            num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = Variable(torch.from_numpy(initial_input)).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = Variable(torch.zeros(1, 1, 1).fill_(initial_value))
    initial_input = initial_input.cuda() if use_cuda else initial_input

    # Run the model in fast eval mode
    y_hat = model.incremental_forward(initial_input,
                                      c=c,
                                      g=g,
                                      T=length,
                                      softmax=True,
                                      quantize=True,
                                      tqdm=tqdm,
                                      log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
        y_target = P.inv_mulaw_quantize(y_target, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(
            y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels)
        y_target = P.inv_mulaw(y_target, hparams.quantize_channels)
    else:
        y_hat = y_hat.view(-1).cpu().data.numpy()

    # Save audio
    os.makedirs(eval_dir, exist_ok=True)
    path = join(eval_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = join(eval_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y_target, sr=hparams.sample_rate)

    # save figure
    path = join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
    save_waveplot(path, y_hat, y_target)
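# If keras' np_utils is unavailable, the one-hot initial input built above can be
# reproduced with plain NumPy. A hedged sketch (the helper name is illustrative and
# not part of the original code):
def to_categorical_sketch(index, num_classes):
    one_hot = np.zeros(num_classes, dtype=np.float32)
    one_hot[index] = 1.0
    return one_hot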
Exemplo n.º 29
0
def wavegen(model,
            length=None,
            c=None,
            g=None,
            initial_value=None,
            fast=False,
            tqdm=tqdm):
    """Generate waveform samples by WaveNet.

    Args:
        model (nn.Module) : WaveNet decoder
        length (int): Time steps to generate. If conditional features are given,
          this is determined by the feature size instead.
        c (numpy.ndarray): Conditional features, of shape T x C.
        g (scalar): Speaker ID.
        initial_value (int): Initial value for the WaveNet decoder.
        fast (bool): Whether to remove weight normalization or not.
        tqdm (lambda): tqdm wrapper used for the generation progress bar.

    Returns:
        numpy.ndarray : Generated waveform samples
    """
    from train import sanity_check
    sanity_check(model, c, g)

    c = _to_numpy(c)
    g = _to_numpy(g)

    if use_cuda:
        model = model.cuda()
    model.eval()
    if fast:
        model.make_generation_fast_()

    if c is None:
        assert length is not None
    else:
        # (Tc, D)
        assert c.ndim == 2
        Tc = c.shape[0]
        upsample_factor = audio.get_hop_size()
        # Overwrite length according to feature size
        length = Tc * upsample_factor
        # (Tc, D) -> (Tc', D)
        # Repeat features before feeding it to the network
        if not hparams.upsample_conditional_features:
            c = np.repeat(c, upsample_factor, axis=0)

        # B x C x T
        c = Variable(torch.FloatTensor(c.T).unsqueeze(0))

    if initial_value is None:
        if is_mulaw_quantize(hparams.input_type):
            initial_value = P.mulaw_quantize(0, hparams.quantize_channels)
        else:
            initial_value = 0.0

    if is_mulaw_quantize(hparams.input_type):
        assert initial_value >= 0 and initial_value < hparams.quantize_channels
        initial_input = np_utils.to_categorical(
            initial_value,
            num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = Variable(torch.from_numpy(initial_input)).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = Variable(torch.zeros(1, 1, 1)).fill_(initial_value)

    g = None if g is None else Variable(torch.LongTensor([g]))
    if use_cuda:
        initial_input = initial_input.cuda()
        g = None if g is None else g.cuda()
        c = None if c is None else c.cuda()

    y_hat = model.incremental_forward(initial_input,
                                      c=c,
                                      g=g,
                                      T=length,
                                      tqdm=tqdm,
                                      softmax=True,
                                      quantize=True,
                                      log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(
            y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels)
    else:
        y_hat = y_hat.view(-1).cpu().data.numpy()

    return y_hat
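# Hedged usage sketch for `wavegen`; the feature path and shape below are illustrative
# assumptions, not part of the original code:
#
#     c = np.load("mel.npy")                # (T, num_mels) local conditioning features
#     waveform = wavegen(model, c=c, g=None, fast=True)
#     librosa.output.write_wav("generated.wav", waveform, sr=hparams.sample_rate)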
Exemplo n.º 30
0
def eval_model(global_step, writer, device, model, y, c, g, input_lengths, eval_dir, ema=None):
    if ema is not None:
        print("Using averaged model for evaluation")
        model = clone_as_averaged_model(device, model, ema)
        model.make_generation_fast_()

    model.eval()
    idx = np.random.randint(0, len(y))
    length = input_lengths[idx].data.cpu().item()

    # (T,)
    y_target = y[idx].view(-1).data.cpu().numpy()[:length]

    if c is not None:
        if hparams.upsample_conditional_features:
            c = c[idx, :, :length // audio.get_hop_size() + hparams.cin_pad * 2].unsqueeze(0)
        else:
            c = c[idx, :, :length].unsqueeze(0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))
    if g is not None:
        # TODO: test
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))

    # Dummy silence
    if is_mulaw_quantize(hparams.input_type):
        initial_value = P.mulaw_quantize(0, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        initial_value = P.mulaw(0.0, hparams.quantize_channels)
    else:
        initial_value = 0.0

    # (C,)
    if is_mulaw_quantize(hparams.input_type):
        initial_input = to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = torch.from_numpy(initial_input).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = torch.zeros(1, 1, 1).fill_(initial_value)
    initial_input = initial_input.to(device)

    # Run the model in fast eval mode
    with torch.no_grad():
        y_hat = model.incremental_forward(
            initial_input, c=c, g=g, T=length, softmax=True, quantize=True, tqdm=tqdm,
            log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels - 1)
        y_target = P.inv_mulaw_quantize(y_target, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels)
        y_target = P.inv_mulaw(y_target, hparams.quantize_channels)
    else:
        y_hat = y_hat.view(-1).cpu().data.numpy()

    # Save audio
    os.makedirs(eval_dir, exist_ok=True)
    path = join(eval_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = join(eval_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y_target, sr=hparams.sample_rate)

    # save figure
    path = join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
    save_waveplot(path, y_hat, y_target)

    # add audio and figures to tensorboard
    writer.add_audio('target_audio', y_target, global_step, hparams.sample_rate)
    writer.add_audio('generated_audio', y_hat, global_step, hparams.sample_rate)
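    # Note: SummaryWriter.add_audio expects waveform values in [-1, 1]; the inverse
    # mu-law transforms above already return values in that range.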