Exemplo n.º 1
0
def _eval_tgt(synth, args, checkpoint_path, output_dir, hparams, sentences,
              flag_to_wav, checkpoint_eal):
    """Synthesize ``sentences`` and write each result to its own file.

    Results are written to ``<output_dir>/eval/wav`` when ``flag_to_wav`` is
    true and to ``<output_dir>/eval/npy`` otherwise. The directory is created
    if it does not exist.

    Args:
        synth: object exposing ``synthesize(sentences, to_wav=..., mean_norm=...,
            std_norm=..., spec_type=...)``; its result is iterated and every
            item is written to a separate file.
        args: namespace providing ``base_dir`` and ``training_dir`` (used to
            locate ``pml_data/mean.dat`` and ``pml_data/std.dat``) and
            ``variant`` (selects how wav data is written).
        checkpoint_path: not used by this function; kept for interface
            compatibility.
        output_dir: root directory under which ``eval/wav`` or ``eval/npy``
            is created.
        hparams: hyper-parameters providing ``spec_type`` and, for wav
            output, ``sample_rate``.
        sentences: passed through unchanged to ``synth.synthesize``.
        flag_to_wav: if true write ``.wav`` files, otherwise ``.npy`` files.
        checkpoint_eal: not used by this function; kept for interface
            compatibility.
    """
    synth_dir = os.path.join(output_dir, 'eval',
                             'wav' if flag_to_wav else 'npy')
    os.makedirs(synth_dir, exist_ok=True)

    # Set up denormalisation parameters for synthesis
    mean_path = os.path.abspath(
        os.path.join(args.base_dir, args.training_dir, 'pml_data/mean.dat'))
    std_path = os.path.abspath(
        os.path.join(args.base_dir, args.training_dir, 'pml_data/std.dat'))
    mean_norm = None
    std_norm = None

    if os.path.isfile(mean_path) and os.path.isfile(std_path):
        mean_norm = np.fromfile(mean_path, 'float32')
        std_norm = np.fromfile(std_path, 'float32')
    else:
        # Synthesis still proceeds; mean_norm/std_norm are passed as None.
        warnings.warn(
            'No mean or standard deviation files found at locations {} and {}'.
            format(mean_path, std_path))

    print('Synthesizing to {}...'.format(synth_dir))
    if flag_to_wav:
        wavs = synth.synthesize(sentences,
                                to_wav=True,
                                mean_norm=mean_norm,
                                std_norm=std_norm,
                                spec_type=hparams.spec_type)
        for i, wav in enumerate(wavs):
            path = os.path.join(synth_dir, 'eval-%d.wav' % i)
            print('Writing {}...'.format(path))
            if args.variant not in ['tacotron_orig', 'tacotron_bk2orig']:
                sp.wavwrite(path,
                            wav,
                            hparams.sample_rate,
                            norm_max_ifneeded=True,
                            verbose=0)
            else:
                # For these variants the synthesizer output is written
                # verbatim as the file content.
                with open(path, 'wb') as f:
                    f.write(wav)

    else:
        # The interactive pdb breakpoint that used to sit here was removed:
        # it blocked every non-interactive run of this branch.
        tgt_features_matrix = synth.synthesize(sentences,
                                               to_wav=False,
                                               mean_norm=mean_norm,
                                               std_norm=std_norm,
                                               spec_type=hparams.spec_type)
        # NOTE(review): hard-coded, machine-specific file list and slice;
        # presumably these are the ids of the evaluation utterances - confirm
        # and consider passing them in through ``args``.
        name_list = get_file_list(
            '/home/dawna/tts/qd212/data/lj/merlinData/file_id_list.scp'
        )[13050:13050 + 50]
        for i, features in enumerate(tgt_features_matrix):
            # Use the file id when one is available; fall back to a numbered
            # name otherwise. Bounding on len(name_list) (at most 50) avoids
            # an IndexError when the file list is shorter than expected.
            if i < len(name_list):
                path = os.path.join(synth_dir, '%s.npy' % name_list[i])
            else:
                path = os.path.join(synth_dir, 'eval-%d.npy' % i)
            print('Writing {}...'.format(path))
            np.save(path, features, allow_pickle=False)
Exemplo n.º 2
0
#!/usr/bin/python
'''
Copyright(C) 2016 Engineering Department, University of Cambridge, UK.

License
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

Author
    Gilles Degottex <*****@*****.**>
'''

import sys

from lib import sigproc as sp

if __name__ == "__main__":
    # Fail with a readable message instead of an IndexError traceback when
    # the input file argument is missing.
    if len(sys.argv) < 2:
        sys.exit('Usage: {} <wavfile>'.format(sys.argv[0]))

    # ``from lib import sigproc as sp`` binds only the name ``sp``; the call
    # below goes through ``lib.sigproc.interfaces``, so that dotted name has
    # to be imported explicitly or the script dies with a NameError on ``lib``.
    import lib.sigproc.interfaces

    print('Normalise {}'.format(sys.argv[1]))
    # Read the file, run it through sv56demo, then overwrite the same file
    # using the original sampling rate and encoding.
    wav, fs, enc = sp.wavread(sys.argv[1])
    wav, meta = lib.sigproc.interfaces.sv56demo(wav, fs)
    sp.wavwrite(sys.argv[1], wav, fs, enc)
Exemplo n.º 3
0
def _read_float32_frames(path, nframes):
    '''
    Load a raw float32 file and reshape it to ``nframes`` rows.
    '''
    return np.fromfile(path, dtype=np.float32).reshape((nframes, -1))


def _pdd_to_noise_mask(PDD, thresh):
    '''
    Return a copy of PDD where values below ``thresh`` are set to 0.0 and
    values above it to 1.0 (values exactly equal to ``thresh`` are kept).
    '''
    NM = PDD.copy()
    NM[PDD < thresh] = 0.0
    NM[PDD > thresh] = 1.0
    return NM


def synthesizef(fs,
                shift=0.005,
                dftlen=4096,
                ff0=None,
                flf0=None,
                fspec=None,
                flspec=None,
                ffwlspec=None,
                ffwcep=None,
                fmcep=None,
                fpdd=None,
                fmpdd=None,
                fnm=None,
                ffwnm=None,
                nm_cont=False,
                fsyn=None,
                verbose=1):
    '''
    Call the synthesis from python using file inputs and outputs

    All input files are read as raw float32 data. Every per-frame matrix is
    reshaped so that it has as many rows as there are f0 values. When several
    alternatives of the same kind are given, the one handled last wins (the
    order is the order of the parameters below).

    fs       : sampling frequency, forwarded to the conversions and synthesis.
    shift    : time step between frames; frame i gets the time i*shift.
    dftlen   : DFT length forwarded to the spectral conversions.
    ff0      : file of f0 values.
    flf0     : file of f0 values where strictly positive entries are passed
               through exp(); other entries are left as they are.
    fspec    : file of the spectral envelope, used as is.
    flspec   : file of the spectral envelope, passed through exp().
    ffwlspec : file converted with sp.fwbnd2linbnd(smooth=True), then exp().
    ffwcep   : file converted with sp.fwcep2loghspec, then exp().
    fmcep    : file converted with sp.mcep2spec using sp.bark_alpha(fs).
    fpdd     : file of PDD values, thresholded at 0.75 into a noise mask.
    fmpdd    : file converted with sp.mcep2spec, then thresholded likewise.
    fnm      : file of the noise mask, used as is.
    ffwnm    : file converted with sp.fwbnd2linbnd into the noise mask.
    nm_cont  : forwarded to synthesize().
    fsyn     : if given, the synthesized waveform is also written there.
    verbose  : forwarded to synthesize() and sp.wavwrite().

    Returns the waveform returned by synthesize().

    Raises ValueError if no f0 file (ff0 or flf0) or no spectral file
    (fspec, flspec, ffwlspec, ffwcep or fmcep) is given.
    '''
    f0 = None
    if ff0:
        f0 = np.fromfile(ff0, dtype=np.float32)
    if flf0:
        f0 = np.fromfile(flf0, dtype=np.float32)
        f0[f0 > 0] = np.exp(f0[f0 > 0])
    if f0 is None:
        # Without this check the code below failed with an opaque
        # UnboundLocalError on f0.
        raise ValueError('An f0 input file is required: provide ff0 or flf0')
    nframes = len(f0)
    ts = (shift) * np.arange(nframes)
    f0s = np.vstack((ts, f0)).T

    SPEC = None
    if fspec:
        SPEC = _read_float32_frames(fspec, nframes)
    if flspec:
        SPEC = np.exp(_read_float32_frames(flspec, nframes))
    if ffwlspec:
        FWLSPEC = _read_float32_frames(ffwlspec, nframes)
        SPEC = np.exp(sp.fwbnd2linbnd(FWLSPEC, fs, dftlen, smooth=True))
    if ffwcep:
        FWCEP = _read_float32_frames(ffwcep, nframes)
        SPEC = np.exp(sp.fwcep2loghspec(FWCEP, fs, dftlen))
    if fmcep:  # pragma: no cover
        # Cannot test this because it needs SPTK
        MCEP = _read_float32_frames(fmcep, nframes)
        SPEC = sp.mcep2spec(MCEP, sp.bark_alpha(fs), dftlen)
    if SPEC is None:
        # Same reasoning as for f0: fail early with a clear message.
        raise ValueError('A spectral envelope input file is required: provide '
                         'one of fspec, flspec, ffwlspec, ffwcep or fmcep')

    NM = None
    pdd_thresh = 0.75  # For this value, see:
    # G. Degottex and D. Erro, "A uniform phase representation for the harmonic model in speech synthesis applications," EURASIP, Journal on Audio, Speech, and Music Processing - Special Issue: Models of Speech - In Search of Better Representations, vol. 2014, iss. 1, p. 38, 2014.
    if fpdd:
        PDD = _read_float32_frames(fpdd, nframes)
        NM = _pdd_to_noise_mask(PDD, pdd_thresh)
    if fmpdd:  # pragma: no cover
        # Cannot test this because it needs SPTK
        MPDD = _read_float32_frames(fmpdd, nframes)
        PDD = sp.mcep2spec(MPDD, sp.bark_alpha(fs), dftlen)
        NM = _pdd_to_noise_mask(PDD, pdd_thresh)

    if fnm:
        NM = _read_float32_frames(fnm, nframes)
    if ffwnm:
        FWNM = _read_float32_frames(ffwnm, nframes)
        NM = sp.fwbnd2linbnd(FWNM, fs, dftlen)

    syn = synthesize(fs, f0s, SPEC, NM=NM, nm_cont=nm_cont, verbose=verbose)
    if fsyn:
        sp.wavwrite(fsyn, syn, fs, norm_max_ifneeded=True, verbose=verbose)

    return syn
Exemplo n.º 4
0
def train(log_dir, args, input):
    """Build the data feeder and model, then run the training loop.

    The loop runs inside a ``tf.Session`` until the coordinator requests a
    stop or ``step`` exceeds ``args.num_steps``. Every
    ``args.summary_interval`` steps the ``stats`` summary is written; every
    ``args.checkpoint_interval`` steps a checkpoint is saved together with
    output/target audio, alignment plots and the matching summaries.

    Any exception raised inside the session (including the 'Loss Exploded'
    exception raised by the loop itself) is caught at the bottom of the
    function, logged, printed, and forwarded to ``coord.request_stop``; it is
    not re-raised.

    Args:
        log_dir: directory receiving the checkpoints (``model.ckpt-<step>``),
            the summary events, and the per-step wav/png files.
        args: namespace; the attributes read here are ``git``, ``base_dir``,
            ``variant``, ``eal_dir``, ``eal_ckpt``, ``eal_ft``,
            ``eal_trainAlign``, ``eal_trainJoint``, ``eal_alignScale``,
            ``restore_step``, ``num_steps``, ``summary_interval`` and
            ``checkpoint_interval``.
        input: path of the training data, joined onto ``args.base_dir``.
            (The name shadows the ``input`` builtin inside this function.)
    """
    commit = get_git_commit() if args.git else 'None'
    checkpoint_path = os.path.join(log_dir, 'model.ckpt')
    input_path = os.path.join(args.base_dir, input)
    log('Checkpoint path: %s' % checkpoint_path)
    log('Loading training data from: %s' % input_path)
    log('Using model: %s' % args.variant)
    log(hparams_debug_string())

    # Set up DataFeeder:
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        # The EAL feeder additionally receives args.eal_dir and exposes
        # locked_alignments, which the model consumes below.
        if args.eal_dir:
            from tacotron.datafeeder import DataFeeder_EAL
            feeder = DataFeeder_EAL(coord, input_path, hparams, args.eal_dir)
        else:
            from tacotron.datafeeder import DataFeeder
            feeder = DataFeeder(coord, input_path, hparams)

    # Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    with tf.variable_scope('model') as scope:
        model = create_model(args.variant, hparams)
        if args.eal_dir:
            model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets,
                             feeder.linear_targets, feeder.pml_targets, is_training=True, 
                             eal=True, locked_alignments=feeder.locked_alignments, 
                             flag_trainAlign=args.eal_trainAlign, flag_trainJoint=args.eal_trainJoint, alignScale=args.eal_alignScale)
        else:
            model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets,
                             feeder.linear_targets, feeder.pml_targets, is_training=True, 
                             gta=True)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = add_stats(model, eal_dir=args.eal_dir)

    # Bookkeeping:
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2)

    # Set up fixed alignment synthesizer
    alignment_synth = AlignmentSynthesizer()

    # Set up text for synthesis
    fixed_sentence = 'Scientists at the CERN laboratory say they have discovered a new particle.'

    # Set up denormalisation parameters for synthesis
    mean_path = os.path.abspath(os.path.join(args.base_dir, input, '..', 'pml_data/mean.dat'))
    std_path = os.path.abspath(os.path.join(args.base_dir, input, '..', 'pml_data/std.dat'))
    log('Loading normalisation mean from: {}'.format(mean_path))
    log('Loading normalisation standard deviation from: {}'.format(std_path))
    mean_norm = None
    std_norm = None

    # If either file is missing, both stay None and are passed as such to
    # pml_to_wav below (no warning is emitted here).
    if os.path.isfile(mean_path) and os.path.isfile(std_path):
        mean_norm = np.fromfile(mean_path, 'float32')
        std_norm = np.fromfile(std_path, 'float32')

    # Train!
#     import pdb
#     flag_pdb = False
#     pdb.set_trace()
#     args.checkpoint_interval = 2
#     args.num_steps = 5

    with tf.Session() as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

#             pdb.set_trace()

            # Three ways to start: resume this run from one of its own
            # checkpoints, initialise from an external EAL checkpoint, or
            # start from the freshly initialised variables.
            if args.restore_step:
                # Restore from a checkpoint if the user requested it.
                restore_path = '%s-%d' % (checkpoint_path, args.restore_step)
                saver.restore(sess, restore_path)
                log('Resuming from checkpoint: %s at commit: %s' % (restore_path, commit), slack=True)
            elif args.eal_dir and args.eal_ckpt:
                if args.eal_trainAlign or args.eal_trainJoint:
                    # Restore only the trainable variables plus the global
                    # variables whose name contains 'moving'.
                    list_var = tf.trainable_variables() + [v for v in tf.global_variables() if 'moving' in v.name]
                    saver_eal = tf.train.Saver(list_var)
                    saver_eal.restore(sess, args.eal_ckpt)
                    log('Loaded weights and batchNorm cache of checkpoint: %s at commit: %s' % (args.eal_ckpt, commit), slack=True)
                elif args.eal_ft:
                    # Restore everything the main saver tracks.
                    saver.restore(sess, args.eal_ckpt)
                    log('Refining the model from checkpoint: %s at commit: %s' % (args.eal_ckpt, commit), slack=True)
                else:
                    # Restore every global variable except those whose name
                    # contains 'optimizer'.
                    list_var = [var for var in tf.global_variables() if 'optimizer' not in var.name]
                    saver_eal = tf.train.Saver(list_var)
                    saver_eal.restore(sess, args.eal_ckpt)
                    log('Initializing the weights from checkpoint: %s at commit: %s' % (args.eal_ckpt, commit), slack=True)
#                 args.num_steps *= 2
#                 sess.run(global_step.assign(0))
            else:
                log('Starting new training run at commit: %s' % commit, slack=True)

            feeder.start_in_session(sess)
            # NOTE(review): step was already set to 0 in the bookkeeping
            # section above; this second assignment is redundant.
            step = 0  # initialise step variable so can use in while condition

            while not coord.should_stop() and step <= args.num_steps:

#                 pdb.set_trace()

                start_time = time.time()
                # Each branch runs one optimisation step; they differ in which
                # losses are fetched and which loss feeds the running average.
                if args.eal_trainAlign:
                    step, loss, loss_align, opt = sess.run([global_step, model.loss, model.loss_align, model.optimize])
#                     try:
#                         step, loss, loss_align, opt, tmp_a, tmp_ar = sess.run([global_step, model.loss, model.loss_align, model.optimize, 
#                                                                                model.alignments, model.alignments_ref])
#                     except:
#                         print("Oops!",sys.exc_info()[0],"occured.")
#                         flag_pdb = True
#                     if flag_pdb or np.isnan(loss_align):
#                         pdb.set_trace()
#                         flag_pdb = False
                    time_window.append(time.time() - start_time)
                    loss_window.append(loss_align)
                    message = 'Step %-7d [%.03f sec/step, loss=%.05f, loss_align=%.05f, avg_loss_align=%.05f]' % (
                        step, time_window.average, loss, loss_align, loss_window.average)
                elif args.eal_trainJoint:
                    step, loss, loss_align, loss_joint, opt = sess.run([global_step, model.loss, model.loss_align, 
                                                                        model.loss_joint, model.optimize])
                    time_window.append(time.time() - start_time)
                    loss_window.append(loss_joint)
                    message = 'Step %-7d [%.03f sec/step, loss=%.05f, loss_align=%.05f, avg_loss_joint=%.05f]' % (
                        step, time_window.average, loss, loss_align, loss_window.average)
                else:
                    step, loss, opt = sess.run([global_step, model.loss, model.optimize])
                    time_window.append(time.time() - start_time)
                    loss_window.append(loss)
                    message = 'Step %-7d [%.03f sec/step, loss=%.05f, avg_loss=%.05f]' % (
                        step, time_window.average, loss, loss_window.average)
                log(message, slack=(step % args.checkpoint_interval == 0))

                # Abort on divergence; the exception is caught by the handler
                # at the bottom of this function. Only model.loss is checked,
                # whichever training mode is active.
                if loss > 100 or math.isnan(loss):
                    log('Loss exploded to %.05f at step %d!' % (loss, step), slack=True)
                    raise Exception('Loss Exploded')

                if step % args.summary_interval == 0:
                    log('Writing summary at step: %d' % step)
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.checkpoint_interval == 0:
                    log('Saving checkpoint to: %s-%d' % (checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)
                    log('Saving audio and alignment...')
                    summary_elements = []

                    # if the model has linear spectrogram features, use them to synthesize audio
                    if hasattr(model, 'linear_targets'):
                        input_seq, alignment, target_spectrogram, spectrogram = sess.run([
                            model.inputs[0], model.alignments[0], model.linear_targets[0], model.linear_outputs[0]])

                        output_waveform = audio.inv_spectrogram(spectrogram.T)
                        target_waveform = audio.inv_spectrogram(target_spectrogram.T)
                        audio.save_wav(output_waveform, os.path.join(log_dir, 'step-%d-audio.wav' % step))
                        audio.save_wav(target_waveform, os.path.join(log_dir, 'step-%d-target-audio.wav' % step))
                    # otherwise, synthesize audio from PML vocoder features
                    elif hasattr(model, 'pml_targets'):
                        input_seq, alignment, target_pml_features, pml_features = sess.run([
                            model.inputs[0], model.alignments[0], model.pml_targets[0], model.pml_outputs[0]])

                        cfg = Configuration(hparams.sample_rate, hparams.pml_dimension)
                        synth = PMLSynthesizer(cfg)
                        output_waveform = synth.pml_to_wav(pml_features, mean_norm=mean_norm, std_norm=std_norm,
                                                           spec_type=hparams.spec_type)
                        target_waveform = synth.pml_to_wav(target_pml_features, mean_norm=mean_norm, std_norm=std_norm,
                                                           spec_type=hparams.spec_type)

                        sp.wavwrite(os.path.join(log_dir, 'step-%d-target-audio.wav' % step), target_waveform,
                                    hparams.sample_rate, norm_max_ifneeded=True)
                        sp.wavwrite(os.path.join(log_dir, 'step-%d-audio.wav' % step), output_waveform,
                                    hparams.sample_rate, norm_max_ifneeded=True)

                    # NOTE(review): if the model has neither linear_targets
                    # nor pml_targets, output_waveform, target_waveform,
                    # alignment and input_seq are never assigned and the
                    # lines below raise NameError. An all-zero waveform
                    # would also divide by zero here - confirm whether that
                    # can happen.
                    # we need to adjust the output and target waveforms so the values lie in the interval [-1.0, 1.0]
                    output_waveform /= 1.05 * np.max(np.abs(output_waveform))
                    target_waveform /= 1.05 * np.max(np.abs(target_waveform))

                    # NOTE(review): the summary ops below are created anew
                    # at every checkpoint; presumably this keeps adding
                    # nodes to the graph over a long run - confirm.
                    summary_elements.append(
                        tf.summary.audio('ideal-%d' % step, np.expand_dims(target_waveform, 0), hparams.sample_rate),
                    )

                    summary_elements.append(
                        tf.summary.audio('sample-%d' % step, np.expand_dims(output_waveform, 0), hparams.sample_rate),
                    )

                    # get the alignment for the top sentence in the batch
                    random_attention_plot = plot.plot_alignment(alignment, os.path.join(log_dir,
                                                                                        'step-%d-random-align.png' % step),
                                                                info='%s, %s, %s, step=%d, loss=%.5f' % (
                                                                args.variant, commit, time_string(), step, loss))

                    summary_elements.append(
                        tf.summary.image('attention-%d' % step, random_attention_plot),
                    )

                    # also process the alignment for a fixed sentence for comparison
                    # (loaded from the checkpoint that was saved just above)
                    alignment_synth.load('%s-%d' % (checkpoint_path, step), hparams, model_name=args.variant)
                    fixed_alignment = alignment_synth.synthesize(fixed_sentence)
                    fixed_attention_plot = plot.plot_alignment(fixed_alignment,
                                                               os.path.join(log_dir, 'step-%d-fixed-align.png' % step),
                                                               info='%s, %s, %s, step=%d, loss=%.5f' % (
                                                               args.variant, commit, time_string(), step, loss))

                    summary_elements.append(
                        tf.summary.image('fixed-attention-%d' % step, fixed_attention_plot),
                    )

                    # save the audio and alignment to tensorboard (audio sample rate is hyperparameter)
                    merged = sess.run(tf.summary.merge(summary_elements))

                    summary_writer.add_summary(merged, step)

                    log('Input: %s' % sequence_to_text(input_seq))

        except Exception as e:
            # Log, print the traceback and ask the coordinator to stop; the
            # exception is not propagated to the caller.
            log('Exiting due to exception: %s' % e, slack=True)
            traceback.print_exc()
            coord.request_stop(e)