Example #1
def synthesizer(model, text, device):
    seq = text_to_sequence(text, [hparams.cleaners])
    # seq = torch.Tensor(seq).to(device)
    seq = np.stack([seq])
    # .long().to(device) covers both the CPU and CUDA cases in one call
    seq = torch.from_numpy(seq).long().to(device)
    # print(seq)

    # Provide [GO] Frame
    mel_input = np.zeros([np.shape(seq)[0], hparams.num_mels, 1],
                         dtype=np.float32)
    mel_input = torch.Tensor(mel_input).to(device)
    # print(np.shape(mel_input))

    model.eval()
    with torch.no_grad():
        _, linear_output = model(seq, mel_input)
        # print(np.shape(linear_output))

    # trans_linear = audio.trans(linear_output[0].cpu().numpy())
    wav = audio.inv_spectrogram(linear_output[0].cpu().numpy())
    # print(audio.find_endpoint(wav))
    # print(np.shape(wav))
    wav = wav[:audio.find_endpoint(wav)]
    # print(np.shape(wav))
    return wav
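
Every snippet on this page delegates waveform reconstruction to an audio.inv_spectrogram helper from its own repo. A minimal sketch of what such a helper typically does in Tacotron-style code, assuming librosa and illustrative parameter values (real repos read the STFT settings and dB convention from hparams):

import librosa

def inv_spectrogram_sketch(spec_db, hop_length=256, power=1.5, n_iter=60):
    """Griffin-Lim inversion of a (freq, time) magnitude spectrogram in dB.

    Illustrative only: the dB convention and every default here are
    assumptions, not the hparams of any repo quoted on this page.
    """
    mag = librosa.db_to_amplitude(spec_db)  # dB -> linear magnitude
    mag = mag ** power                      # sharpen to reduce Griffin-Lim artifacts
    return librosa.griffinlim(mag, n_iter=n_iter, hop_length=hop_length)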
def tts(model, text, spk, qF0s):
    """Convert text to speech waveform given a Tacotron model.
    """
    if use_cuda:
        model = model.cuda()

    model.encoder.eval()
    model.postnet.eval()

    sequence = np.array(text)
    spk = np.array([spk])
    #sequence = np.array(text_to_sequence(text, [hparams.cleaners]))
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    spk = Variable(torch.from_numpy(spk))
    qF0s = np.array(qF0s)
    qF0s = Variable(torch.from_numpy(qF0s)).unsqueeze(0)

    if use_cuda:
        sequence = sequence.cuda()
        spk = spk.cuda()
        qF0s = qF0s.cuda()
 
    # Greedy decoding
    mel_outputs, linear_outputs, alignments = model(sequence, spk, qF0s.long())

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio.denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram
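
A hypothetical driver for the function above; checkpoint loading and the text frontend are omitted, and none of the variable names below are defined in the snippet itself:

# Hypothetical usage; model, seq, spk_id and qf0 come from the surrounding
# project. save_wav's signature varies between the repos on this page.
waveform, alignment, spectrogram = tts(model, seq, spk_id, qf0)
audio.save_wav(waveform, 'sample.wav')
plot_alignment(alignment.T, 'sample_align.png')  # decoder/encoder attention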
Example #3
def save_and_plot_fn(args, log_dir, step, loss, prefix):
    idx, (seq, spec, align) = args

    audio_path = os.path.join(
        log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, idx))
    align_path = os.path.join(
        log_dir, '{}-step-{:09d}-align{:03d}.png'.format(prefix, step, idx))

    waveform = inv_spectrogram(spec.T, hparams)
    save_wav(waveform, audio_path, hparams.sample_rate)

    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)
    if 'korean_cleaners' in [x.strip() for x in hparams.cleaners.split(',')]:
        log('Training Korean: using jamo')
        plot.plot_alignment(align,
                            align_path,
                            info=info_text,
                            text=sequence_to_text(seq,
                                                  skip_eos_and_pad=True,
                                                  combine_jamo=True),
                            isKorean=True)
    else:
        log('Training non-Korean: not using jamo')
        plot.plot_alignment(align,
                            align_path,
                            info=info_text,
                            text=sequence_to_text(seq,
                                                  skip_eos_and_pad=True,
                                                  combine_jamo=False),
                            isKorean=False)
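
The (idx, (seq, spec, align)) unpacking suggests save_and_plot_fn is mapped over an enumerated batch. A hedged sketch of such a driver; every variable name below is an assumption:

from functools import partial

# Hypothetical caller; sequences/spectrograms/alignments are per-utterance
# lists assumed to be produced elsewhere in the training loop.
fn = partial(save_and_plot_fn, log_dir=log_dir, step=step,
             loss=loss, prefix='train')
for item in enumerate(zip(sequences, spectrograms, alignments)):
    fn(item)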
def tts(model, text, mel):
    """Convert text to speech waveform given a Tacotron model.
    """
    if use_cuda:
        model = model.cuda()

    model.encoder.eval()
    model.postnet.eval()

    sequence = np.array(text)
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    mel = Variable(torch.from_numpy(mel)).unsqueeze(0)

    if use_cuda:
        sequence = sequence.cuda()
        mel = mel.cuda()

    mel_outputs, linear_outputs, alignments = model.forward_generate_gst(
        sequence, mel)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio.denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()

    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram
def tts(model, text):
    """Convert text to speech waveform given a Tacotron model.
    """
    if use_cuda:
        model = model.cuda()
    # TODO: Turning off dropout of decoder's prenet causes serious performance
    # regression, not sure why.
    # model.decoder.eval()
    model.encoder.eval()
    model.postnet.eval()

    sequence = np.array(text)
    #sequence = np.array(text_to_sequence(text, [hparams.cleaners]))
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    if use_cuda:
        sequence = sequence.cuda()

    # Greedy decoding
    mel_outputs, linear_outputs, alignments = model(sequence)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio.denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram
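
Most of the tts variants above pass the raw network output through audio.denormalize before plotting. A sketch of what that usually undoes in Tacotron-style repos, assuming the common min_level_db scheme (the constant is an assumption, not any quoted repo's hparam):

import numpy as np

def denormalize_sketch(S, min_level_db=-100.0):
    # Inverse of: S_norm = clip((S_db - min_level_db) / -min_level_db, 0, 1)
    return (np.clip(S, 0.0, 1.0) * -min_level_db) + min_level_db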
Example #6
def save_states(global_step,
                mel_outputs,
                linear_outputs,
                attn,
                y,
                checkpoint_dir=None):

    idx = 1  # idx = np.random.randint(0, len(mel_outputs))

    # Alignment
    path = os.path.join(checkpoint_dir,
                        "step{}_alignment.png".format(global_step))
    # Optionally trim to the input length: attn[idx].cpu().data.numpy()[:, :input_length]
    alignment = attn[idx].cpu().data.numpy()
    plot_alignment(alignment.T,
                   path,
                   info="tacotron, step={}".format(global_step))

    # Predicted spectrogram
    path = os.path.join(checkpoint_dir,
                        "step{}_predicted_spectrogram.png".format(global_step))
    linear_output = linear_outputs[idx].cpu().data.numpy()
    plot_spectrogram(linear_output, path)

    # Predicted audio signal
    signal = audio.inv_spectrogram(linear_output.T)
    path = os.path.join(checkpoint_dir,
                        "step{}_predicted.wav".format(global_step))
    audio.save_wav(signal, path)

    # Target spectrogram
    path = os.path.join(checkpoint_dir,
                        "step{}_target_spectrogram.png".format(global_step))
    linear_output = y[idx].cpu().data.numpy()
    plot_spectrogram(linear_output, path)
def tts(model, text, tones):
    """Convert text to speech waveform given a Tacotron model.
    """
    if use_cuda:
        model = model.cuda()

    model.encoder.eval()
    model.postnet.eval()

    sequence = np.array(text)
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    tones = np.array(tones)
    tones = Variable(torch.from_numpy(tones)).unsqueeze(0)

    if use_cuda:
        sequence = sequence.cuda()
        tones = tones.cuda()

    mel_outputs, linear_outputs = model(sequence, tones)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio.denormalize(linear_output)

    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, spectrogram
Example #8
def synthesize(model, mspec, spk):
    """Convert text to speech waveform given a Tacotron model.
    """
    if use_cuda:
        model = model.cuda()

    model.eval()

    sequence = np.array(mspec)
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    spk = np.array(spk)
    spk = Variable(torch.from_numpy(spk)).unsqueeze(0)

    if use_cuda:
        sequence = sequence.cuda()
        spk = spk.cuda()

    with torch.no_grad():
        model.forward_getlatents(sequence)
        mel_outputs, linear_outputs = model.forward_eval(sequence, spk)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio.denormalize(linear_output)
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform
Example #9
def tts(model, text):
    """Convert text to speech waveform given a Tacotron model.
	"""
    if USE_CUDA:
        model = model.cuda()

    # NOTE: dropout in the decoder should be activated for generalization!
    # model.decoder.eval()
    model.encoder.eval()
    model.postnet.eval()

    sequence = np.array(text_to_sequence(text))
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    if USE_CUDA:
        sequence = sequence.cuda()

    # Greedy decoding
    mel_outputs, linear_outputs, gate_outputs, alignments = model(sequence)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram
def compare_inv_spectrogram(data_folder):
    # NOTE: this wrapper is assumed; the fragment arrived as a bare function
    # body that referenced an undefined (and misspelled) 'data_foler'.
    wavs = [
        os.path.join(data_folder, file[:-4])
        for file in os.listdir(data_folder) if file.endswith(".wav")
    ]
    outputs_py = [file + ".py.gen.wav" for file in wavs]
    outputs_tf = [file + ".tf.gen.wav" for file in wavs]
    wavs = [
        audio.load_wav(wav_path + ".wav", hparams.sample_rate)
        for wav_path in wavs
    ]
    spectrogram = [audio.spectrogram(wav).astype(np.float32) for wav in wavs]
    print("Linear spectrograms dim: ")
    print(spectrogram[0].shape)
    # --------------------------------- librosa Version ---------------------------------
    # convert back
    gens = [audio.inv_spectrogram(s) for s in spectrogram]

    for gen, output in zip(gens, outputs_py):
        audio.save_wav(gen, output)

    # --------------------------------- TensorFlow Version ---------------------------------

    samples = [inv_spectrogram(spec) for spec in spectrogram]

    with tf.Session() as sess:
        samples = [sess.run(sample) for sample in samples]

    for gen, output in zip(samples, outputs_tf):
        audio.save_wav(gen, output)

    print("Done!")
    def plot(self, with_head=False):
        '''Plot visualizations of the unsupervised end-to-end Mockingjay model.'''
        self.verbose('Testing set total ' + str(len(self.dataloader)) +
                     ' batches.')
        os.makedirs(self.dump_dir, exist_ok=True)
        with torch.no_grad():
            idx = 0
            for x in tqdm(self.dataloader, desc="Plotting"):
                spec_stacked, pos_enc, attn_mask = self.process_MAM_data(
                    spec=x)

                if with_head:
                    outputs = self.model(spec_stacked,
                                         pos_enc,
                                         attention_mask=attn_mask)
                    if self.output_attention:
                        _, pred_spec = outputs
                    else:
                        pred_spec, _ = outputs

                    # generate the model filled MAM spectrogram
                    spec_masked = copy.deepcopy(spec_stacked)
                    for i in range(len(spec_masked)):
                        sample_index = random.sample(
                            range(len(spec_masked[i])),
                            int(
                                len(spec_masked[i]) *
                                self.config['mockingjay']['mask_proportion']))
                        spec_masked[i][sample_index] = 0
                    outputs = self.model(spec_masked,
                                         pos_enc,
                                         attention_mask=attn_mask)
                    if self.output_attention:
                        _, fill_spec = outputs
                    else:
                        fill_spec, _ = outputs

                    # plot reconstructed / ground-truth / MAM filled spectrogram
                    for y_pred, y_true, y_fill in zip(pred_spec, spec_stacked,
                                                      fill_spec):

                        y_pred = self.up_sample_frames(y_pred,
                                                       return_first=True)
                        y_true = self.up_sample_frames(y_true,
                                                       return_first=True)
                        y_fill = self.up_sample_frames(y_fill,
                                                       return_first=True)

                        plot_spectrogram(y_pred.data.cpu().numpy(),
                                         path=os.path.join(
                                             self.dump_dir,
                                             str(idx) + '_pred.png'))
                        plot_spectrogram(y_true.data.cpu().numpy(),
                                         path=os.path.join(
                                             self.dump_dir,
                                             str(idx) + '_true.png'))
                        plot_spectrogram(y_fill.data.cpu().numpy(),
                                         path=os.path.join(
                                             self.dump_dir,
                                             str(idx) + '_fill.png'))

                        wave_pred = inv_spectrogram(
                            y_pred.data.cpu().numpy().T)
                        wave_fill = inv_spectrogram(
                            y_fill.data.cpu().numpy().T)
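                        # NOTE: librosa.output.write_wav exists only in
                        # librosa < 0.8; on newer versions use soundfile.write.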
                        librosa.output.write_wav(
                            os.path.join(self.dump_dir,
                                         str(idx) + '_pred.wav'), wave_pred,
                            sample_rate)
                        librosa.output.write_wav(
                            os.path.join(self.dump_dir,
                                         str(idx) + '_fill.wav'), wave_fill,
                            sample_rate)

                        idx += 1
                        if idx >= 10:
                            self.verbose(
                                'Spectrogram head generated samples are saved to: {}'
                                .format(self.dump_dir))
                            exit()  # visualize the first 10 testing samples
                elif self.output_attention:
                    all_attentions, _ = self.mockingjay(
                        spec_stacked,
                        pos_enc,
                        attention_mask=attn_mask,
                        output_all_encoded_layers=True)
                    all_attentions = torch.stack(all_attentions).transpose(
                        0, 1)
                    # all_attentions: (batch_size, num_layer, num_head, Q_seq_len, K_seq_len)

                    for attentions in all_attentions:
                        torch.save(
                            attentions.cpu(),
                            os.path.join(self.dump_dir, f'{idx}_attentions'))
                        idx += 1
                        if idx >= 10:
                            self.verbose(
                                f'Attention samples are saved to {self.dump_dir}'
                            )
                            exit()
                else:
                    encoded_layers = self.mockingjay(
                        spec_stacked,
                        pos_enc,
                        attention_mask=attn_mask,
                        output_all_encoded_layers=True)
                    encoded_layers = torch.stack(encoded_layers)

                    layer_num = encoded_layers.size(0)
                    batch_size = encoded_layers.size(1)
                    seq_len = encoded_layers.size(2)
                    feature_dim = encoded_layers.size(3)

                    dckpt = torch.load(self.paras.load_ws)
                    weights = dckpt['Classifier']['weight']

                    flatten = encoded_layers.reshape(layer_num, -1)
                    weighted_sum = torch.matmul(weights[:layer_num],
                                                flatten).reshape(
                                                    batch_size, seq_len,
                                                    feature_dim)
                    # embeddings: (batch_size, seq_len, feature_dim)

                    targets = [
                        encoded_layers[0], encoded_layers[-1], weighted_sum
                    ]
                    target_names = [
                        '_hidden_first.png', '_hidden_last.png',
                        '_hidden_weighted_sum.png'
                    ]
                    for target, name in zip(targets, target_names):
                        for index, rep in enumerate(target):
                            if idx + index >= 10:
                                break
                            png_name = os.path.join(self.dump_dir,
                                                    str(idx + index) + name)
                            self.verbose(f'Generating {png_name}')
                            plot_embedding(rep.data.cpu().numpy(),
                                           path=png_name)

                    idx += batch_size
                    if idx >= 10:
                        self.verbose(
                            'Mockingjay generated samples are saved to: {}'.
                            format(self.dump_dir))
                        break  # visualize the first 10 testing samples
Example #12
def train(log_dir, args):
    checkpoint_path = os.path.join(hdfs_ckpts, log_dir, 'model.ckpt')
    log(hp.to_string(), is_print=False)
    log('Loading training data from: %s' % args.tfr_dir)
    log('Checkpoint path: %s' % checkpoint_path)
    log('Using model: sygst tacotron2')

    tf_dset = TFDataSet(hp, args.tfr_dir)
    feats = tf_dset.get_train_next()
    # Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    training = tf.placeholder_with_default(True, shape=(), name='training')
    with tf.name_scope('model'):
        model = Tacotron2SYGST(hp)
        model(feats['inputs'],
              mel_inputs=feats['mel_targets'],
              spec_inputs=feats['linear_targets'],
              spec_lengths=feats['spec_lengths'],
              ref_inputs=feats['mel_targets'],
              ref_lengths=feats['spec_lengths'],
              arousal_labels=feats['soft_arousal_labels'],
              valence_labels=feats['soft_valance_labels'],
              training=training)
        """
        text_x, mel_x, spec_x, spec_len, aro, val = debug_data(2, 5, 10)
        model(text_x, mel_x, spec_x, spec_len, mel_x, spec_len, aro, val, training=training)
        """
        model.add_loss()
        model.add_optimizer(global_step)
        stats = model.add_stats()

    # Bookkeeping:
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=50, keep_checkpoint_every_n_hours=2)

    # Train!
    config = tf.ConfigProto(allow_soft_placement=True,
                            gpu_options=tf.GPUOptions(allow_growth=True))
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())
            if args.restore_step:
                # Restore from a checkpoint if the user requested it.
                restore_path = '%s-%s' % (checkpoint_path, args.restore_step)
                saver.restore(sess, restore_path)
                log('Resuming from checkpoint: %s' % restore_path, slack=True)
            else:
                log('Starting a new training run ...', slack=True)
            """
            fetches = [global_step, model.optimize, model.loss, model.mel_loss, model.spec_loss,
                       model.stop_loss, model.arousal_loss, model.valence_loss, model.mel_grad_norms_max,
                       model.spec_grad_norms_max, model.stop_grad_norms_max, model.aro_grad_norms_max, model.val_grad_norms_max]
            """
            fetches = [
                global_step, model.optimize, model.loss, model.mel_loss,
                model.spec_loss, model.stop_loss, model.arousal_loss,
                model.valence_loss
            ]
            for _ in range(_max_step):
                start_time = time.time()
                sess.run(debug.get_ops())
                # step, _, loss, mel_loss, spec_loss, stop_loss, aro_loss, val_loss, mel_g, spec_g, stop_g, aro_g, val_g = sess.run(fetches)
                step, _, loss, mel_loss, spec_loss, stop_loss, aro_loss, val_loss = sess.run(
                    fetches)
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                """
                message = 'Step %-7d [%.3f sec/step,ml=%.3f,spl=%.3f,sl=%.3f,al=%.3f,vl=%.3f,mg=%.4f,spg=%.4f,sg=%.4f,ag=%.4f,vg=%.4f]' % (
                    step, time_window.average, mel_loss, spec_loss, stop_loss, aro_loss, val_loss, mel_g, spec_g, stop_g, aro_g, val_g)
                """
                message = 'Step %-7d [%.3f sec/step,ml=%.3f,spl=%.3f,sl=%.3f,al=%.3f,vl=%.3f]' % (
                    step, time_window.average, mel_loss, spec_loss, stop_loss,
                    aro_loss, val_loss)
                log(message, slack=(step % args.checkpoint_interval == 0))

                if loss > 100 or math.isnan(loss):
                    log('Loss exploded to %.5f at step %d!' % (loss, step),
                        slack=True)
                    raise Exception('Loss Exploded')

                if step % args.summary_interval == 0:
                    log('Writing summary at step: %d' % step)
                    try:
                        summary_writer.add_summary(sess.run(stats), step)
                    except Exception as e:
                        log(f'summary failed and ignored: {str(e)}')

                if step % args.checkpoint_interval == 0:
                    log('Saving checkpoint to: %s-%d' %
                        (checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)
                    log('Saving audio and alignment...')
                    gt_mel, gt_spec, seq, mel, spec, align = sess.run([
                        model.mel_targets[0], model.spec_targets[0],
                        model.text_targets[0], model.mel_outputs[0],
                        model.spec_outputs[0], model.alignment_outputs[0]
                    ])
                    text = sequence_to_text(seq)
                    wav = audio.inv_spectrogram(hp, spec.T)
                    wav_path = os.path.join(log_dir,
                                            'step-%d-audio.wav' % step)
                    mel_path = os.path.join(log_dir, 'step-%d-mel.png' % step)
                    spec_path = os.path.join(log_dir,
                                             'step-%d-spec.png' % step)
                    align_path = os.path.join(log_dir,
                                              'step-%d-align.png' % step)
                    info = '%s, %s, step=%d, loss=%.5f\n %s' % (
                        args.model, time_string(), step, loss, text)
                    plot.plot_alignment(align, align_path, info=info)
                    plot.plot_mel(mel, mel_path, info=info, gt_mel=gt_mel)
                    plot.plot_mel(spec, spec_path, info=info, gt_mel=gt_spec)
                    audio.save_wav(hp, wav, wav_path)
                    log('Input: %s' % text)

        except Exception as e:
            log('Exiting due to exception: %s' % e, slack=True)
            traceback.print_exc()
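
A hypothetical entry point for train(); the flag names mirror the args attributes the function reads (tfr_dir, restore_step, summary_interval, checkpoint_interval, model), but this is not the repo's actual CLI:

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--tfr_dir', required=True)
    parser.add_argument('--restore_step', type=int, default=0)
    parser.add_argument('--summary_interval', type=int, default=100)
    parser.add_argument('--checkpoint_interval', type=int, default=1000)
    parser.add_argument('--model', default='sygst_tacotron2')
    args = parser.parse_args()
    train(os.path.join('logs', args.model), args)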