Example #1
def run_eval(args, checkpoint, hparams, sentences):
    try:
        checkpoint_path = tf.train.get_checkpoint_state(
            checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except Exception:
        raise RuntimeError(
            'Failed to load checkpoint at {}'.format(checkpoint))

    if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
        raise ValueError(
            'Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! Please verify your synthesis batch size choice.'
            .format(hparams.tacotron_synthesis_batch_size,
                    hparams.tacotron_num_gpus))

    if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
        raise ValueError(
            'Defined synthesis batch size {} is not a multiple of {} (num_gpus)! Please verify your synthesis batch size choice!'
            .format(hparams.tacotron_synthesis_batch_size,
                    hparams.tacotron_num_gpus))

    output_dir = get_synthesis_output_dir(args.caching_dir)

    eval_dir = get_evals_dir(args.caching_dir)
    log_dir = os.path.join(output_dir, 'logs-eval')

    #if args.model == 'Tacotron-2':
    #assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)

    #Create output path if it doesn't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer(args.caching_dir)
    synth.load(checkpoint_path, hparams)

    #Set inputs batch wise
    sentences = [
        sentences[i:i + hparams.tacotron_synthesis_batch_size] for i in range(
            0, len(sentences), hparams.tacotron_synthesis_batch_size)
    ]

    log('Starting Synthesis')
    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, texts in enumerate(tqdm(sentences)):
            start = time.time()
            basenames = [
                'batch_{}_sentence_{}'.format(i, j) for j in range(len(texts))
            ]
            mel_filenames, speaker_ids = synth.synthesize(
                texts, basenames, eval_dir, log_dir, None)

            for elems in zip(texts, mel_filenames, speaker_ids):
                file.write('|'.join([str(x) for x in elems]) + '\n')
    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
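
# --- Illustrative aside (not part of the original code) ---------------------
# run_eval above splits `sentences` into fixed-size batches via list slicing.
# A minimal, self-contained sketch of that idiom with hypothetical values:
_example_items = list(range(10))
_example_batch_size = 4
_example_batches = [
    _example_items[i:i + _example_batch_size]
    for i in range(0, len(_example_items), _example_batch_size)
]
assert _example_batches == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
# -----------------------------------------------------------------------------
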
def run_synthesis(args, checkpoint, hparams):

  try:
    checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
    log('loaded model at {}'.format(checkpoint_path))
  except Exception:
    raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

  if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
    raise ValueError('Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! Please verify your synthesis batch size choice.'.format(
      hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

  if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
    raise ValueError('Defined synthesis batch size {} is not a multiple of {} (num_gpus)! Please verify your synthesis batch size choice!'.format(hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

  gta = args.GTA == 'True'
  synth_dir = get_synth_dir(args.caching_dir, gta)
  gta_map_file = get_gta_map_file(synth_dir)
  #Create output path if it doesn't exist
  os.makedirs(synth_dir, exist_ok=True)

  metadata_path = get_train_txt(args.caching_dir)
  metadata = load_meta(metadata_path)
  log(hparams_debug_string())
  synth = Synthesizer(args.caching_dir)
  synth.load(checkpoint_path, hparams, gta=gta)
  frame_shift_ms = hparams.hop_size / hparams.sample_rate
  hours = sum([int(x[2]) for x in metadata]) * frame_shift_ms / (3600)
  log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours))

  #Set inputs batch wise
  metadata = [metadata[i: i+hparams.tacotron_synthesis_batch_size] for i in range(0, len(metadata), hparams.tacotron_synthesis_batch_size)]

  log('Starting Synthesis')

  txt_dir = get_txt_dir(args.caching_dir)
  mel_dir = get_mel_dir(args.caching_dir)
  wav_dir = get_wav_dir(args.caching_dir)

  symbol_file = get_symbols_file(args.caching_dir)
  conv = get_from_file(symbol_file)
  with open(gta_map_file, 'w') as file:
    for i, meta in enumerate(tqdm(metadata)):
      if i % 10 == 0:
        text_paths = [os.path.join(txt_dir, "{}.npy".format(m[0])) for m in meta]
        text_symbols = [np.load(pth) for pth in text_paths]
        # trim ~ at the end
        texts = [conv.sequence_to_original_text(x) for x in text_symbols]
        #texts = [m[5] for m in meta]
        mel_filenames = [os.path.join(mel_dir, "{}.npy".format(m[0])) for m in meta]
        wav_filenames = [os.path.join(wav_dir, "{}.npy".format(m[0])) for m in meta]
        basenames = [m[0] for m in meta]
        mel_output_filenames, speaker_ids = synth.synthesize(texts, basenames, synth_dir, None, mel_filenames)

        for elems in zip(wav_filenames, mel_filenames, mel_output_filenames, speaker_ids, texts):
          file.write('|'.join([str(x) for x in elems]) + '\n')

  log('synthesized mel spectrograms at {}'.format(synth_dir))
  return gta_map_file
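
# --- Illustrative aside (not part of the original code) ---------------------
# run_synthesis above estimates total audio duration from per-utterance mel
# frame counts: each frame spans hop_size / sample_rate seconds.  A
# self-contained sketch with hypothetical numbers:
_example_hop_size = 275
_example_sample_rate = 22050
_example_frame_counts = [800, 1200, 950]  # mel frames per utterance
_example_frame_shift_s = _example_hop_size / _example_sample_rate
_example_hours = sum(_example_frame_counts) * _example_frame_shift_s / 3600
# -> roughly 0.010 hours (about 37 seconds) of audio for these three clips
# -----------------------------------------------------------------------------
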
def run_live(args, checkpoint, hparams):
    # if args.mode != eval or synthesis
    try:
        checkpoint_path = tf.train.get_checkpoint_state(
            checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except Exception:
        raise RuntimeError(
            'Failed to load checkpoint at {}'.format(checkpoint))

    if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
        raise ValueError(
            'Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! Please verify your synthesis batch size choice.'
            .format(hparams.tacotron_synthesis_batch_size,
                    hparams.tacotron_num_gpus))

    if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
        raise ValueError(
            'Defined synthesis batch size {} is not a multiple of {} (num_gpus)! Please verify your synthesis batch size choice!'
            .format(hparams.tacotron_synthesis_batch_size,
                    hparams.tacotron_num_gpus))

    #Log to Terminal without keeping any records in files
    log(hparams_debug_string())
    synth = Synthesizer(args.caching_dir)
    synth.load(checkpoint_path, hparams)

    #Generate fast greeting message
    greetings = 'Hello, Welcome to the Live testing tool. Please type a message and I will try to read it!'
    log(greetings)
    generate_fast(synth, greetings)

    #Interaction loop
    while True:
        try:
            text = input()
            generate_fast(synth, text)

        except KeyboardInterrupt:
            leave = 'Thank you for testing our features. See you soon.'
            log(leave)
            generate_fast(synth, leave)
            sleep(2)
            break
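
# --- Illustrative aside (not part of the original code) ---------------------
# run_eval and the GTA run_synthesis above both record their outputs in
# pipe-delimited map files, one record per utterance.  A minimal sketch of
# reading such a file back, assuming the eval-style layout of
# text|mel_filename|speaker_id; the helper name is hypothetical:
def read_map_file(map_path):
    """Parse a pipe-delimited map file into a list of field tuples."""
    records = []
    with open(map_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(tuple(line.split('|')))
    return records
# e.g. read_map_file(os.path.join(eval_dir, 'map.txt')) yields tuples of the
# form (text, mel_filename, speaker_id).
# -----------------------------------------------------------------------------
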
Example #4
def train(log_dir, args, hparams):
    symbol_file = get_symbols_file(args.caching_dir)
    symbol_converter = get_from_file(symbol_file)
    symbols_count = symbol_converter.get_symbols_count()

    save_dir = get_save_dir(log_dir)
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    mel_dir = os.path.join(log_dir, 'mel-spectrograms')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    tensorboard_dir = os.path.join(log_dir, 'tacotron_events')
    meta_folder = os.path.join(log_dir, 'metas')
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(meta_folder, exist_ok=True)

    checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
    caching_dir = args.caching_dir

    if hparams.predict_linear:
        linear_dir = os.path.join(log_dir, 'linear-spectrograms')
        os.makedirs(linear_dir, exist_ok=True)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(caching_dir))
    log('Using model: {}'.format("tacotron"))
    log(hparams_debug_string())

    #Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    #Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, caching_dir, hparams)

    #Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step,
                                    symbols_count)
    eval_model = model_test_mode(args, feeder, hparams, global_step,
                                 symbols_count)

    #Potential Griffin-Lim GPU setup
    if hparams.GL_on_GPU:
        GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, hparams.num_mels),
                                          name='GLGPU_mel_inputs')
        GLGPU_lin_inputs = tf.placeholder(tf.float32, (None, hparams.num_freq),
                                          name='GLGPU_lin_inputs')

        GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(
            GLGPU_mel_inputs, hparams)
        GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow(
            GLGPU_lin_inputs, hparams)

    #Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=20)

    log('Tacotron training set to a maximum of {} steps'.format(
        args.tacotron_train_steps))

    #Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    #Train
    sess = tf.Session(config=config)

    summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)

    sess.run(tf.global_variables_initializer())

    #saved model restoring
    if args.restore:
        # Restore saved model if the user requested it, default = True
        try:
            checkpoint_state = tf.train.get_checkpoint_state(save_dir)

            if (checkpoint_state and checkpoint_state.model_checkpoint_path):
                log('Loading checkpoint {}'.format(
                    checkpoint_state.model_checkpoint_path),
                    slack=True)
                saver.restore(sess, checkpoint_state.model_checkpoint_path)

            else:
                log('No model to load at {}'.format(save_dir), slack=True)
                saver.save(sess, checkpoint_path, global_step=global_step)

        except tf.errors.OutOfRangeError as e:
            log('Cannot restore checkpoint: {}'.format(e), slack=True)
    else:
        log('Starting new training!', slack=True)
        saver.save(sess, checkpoint_path, global_step=global_step)

    #initializing feeder
    threads = feeder.start_threads(sess)

    #Training loop
    while not coord.should_stop() and step < args.tacotron_train_steps:
        start_time = time.time()
        step, loss, opt = sess.run([global_step, model.loss, model.optimize])
        time_window.append(time.time() - start_time)
        loss_window.append(loss)
        message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
            step, time_window.average, loss, loss_window.average)
        log(message, end='\r', slack=(step % args.checkpoint_interval == 0))

        if np.isnan(loss) or loss > 100.:
            log('Loss exploded to {:.5f} at step {}'.format(loss, step))
            raise Exception('Loss exploded')

        if step % args.summary_interval == 0:
            log('\nWriting summary at step {}'.format(step))
            summary_writer.add_summary(sess.run(stats), step)

        if step % args.eval_interval == 0:
            #Run eval and save eval stats
            log('\nRunning evaluation at step {}'.format(step))
            if feeder.test_steps == 0:
                log('zero test steps, skipping...')
            else:
                eval_losses = []
                before_losses = []
                after_losses = []
                stop_token_losses = []
                linear_losses = []
                linear_loss = None

                if hparams.predict_linear:
                    for i in tqdm(range(feeder.test_steps)):
                        eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, mel_t, t_len, align, lin_p, lin_t = sess.run(
                            [
                                eval_model.tower_loss[0],
                                eval_model.tower_before_loss[0],
                                eval_model.tower_after_loss[0],
                                eval_model.tower_stop_token_loss[0],
                                eval_model.tower_linear_loss[0],
                                eval_model.tower_mel_outputs[0][0],
                                eval_model.tower_mel_targets[0][0],
                                eval_model.tower_targets_lengths[0][0],
                                eval_model.tower_alignments[0][0],
                                eval_model.tower_linear_outputs[0][0],
                                eval_model.tower_linear_targets[0][0],
                            ])
                        eval_losses.append(eloss)
                        before_losses.append(before_loss)
                        after_losses.append(after_loss)
                        stop_token_losses.append(stop_token_loss)
                        linear_losses.append(linear_loss)

                    if (len(linear_losses) != 0):
                        linear_loss = sum(linear_losses) / len(linear_losses)
                    else:
                        linear_loss = 0
                        log('len(linear_losses) was 0')

                    if hparams.GL_on_GPU:
                        wav = sess.run(GLGPU_lin_outputs,
                                       feed_dict={GLGPU_lin_inputs: lin_p})
                        wav = audio.inv_preemphasis(wav, hparams.preemphasis,
                                                    hparams.preemphasize)
                    else:
                        wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(
                            eval_wav_dir,
                            'step-{}-eval-wave-from-linear.wav'.format(step)),
                        sr=hparams.sample_rate)

                else:
                    for i in tqdm(range(feeder.test_steps)):
                        eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len, align = sess.run(
                            [
                                eval_model.tower_loss[0],
                                eval_model.tower_before_loss[0],
                                eval_model.tower_after_loss[0],
                                eval_model.tower_stop_token_loss[0],
                                eval_model.tower_mel_outputs[0][0],
                                eval_model.tower_mel_targets[0][0],
                                eval_model.tower_targets_lengths[0][0],
                                eval_model.tower_alignments[0][0]
                            ])
                        eval_losses.append(eloss)
                        before_losses.append(before_loss)
                        after_losses.append(after_loss)
                        stop_token_losses.append(stop_token_loss)

                if (len(eval_losses) != 0):
                    eval_loss = sum(eval_losses) / len(eval_losses)
                else:
                    eval_loss = 0
                    log('len(eval_losses) was 0')

                if (len(before_losses) != 0):
                    before_loss = sum(before_losses) / len(before_losses)
                else:
                    before_loss = 0
                    log('len(before_losses) was 0')

                if (len(after_losses) != 0):
                    after_loss = sum(after_losses) / len(after_losses)
                else:
                    after_loss = 0
                    log('len(after_losses) was 0')

                if (len(stop_token_losses) != 0):
                    stop_token_loss = sum(stop_token_losses) / len(
                        stop_token_losses)
                else:
                    stop_token_loss = 0
                    log('len(stop_token_losses) was 0')

                log('Saving eval log to {}..'.format(eval_dir))
                #Save some log to monitor model improvement on same unseen sequence
                if hparams.GL_on_GPU:
                    wav = sess.run(GLGPU_mel_outputs,
                                   feed_dict={GLGPU_mel_inputs: mel_p})
                    wav = audio.inv_preemphasis(wav, hparams.preemphasis,
                                                hparams.preemphasize)
                else:
                    wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
                audio.save_wav(
                    wav,
                    os.path.join(
                        eval_wav_dir,
                        'step-{}-eval-wave-from-mel.wav'.format(step)),
                    sr=hparams.sample_rate)

                plot.plot_alignment(
                    align,
                    os.path.join(eval_plot_dir,
                                 'step-{}-eval-align.png'.format(step)),
                    title='{}, {}, step={}, loss={:.5f}'.format(
                        "tacotron", time_string(), step, eval_loss),
                    max_len=t_len // hparams.outputs_per_step)
                plot.plot_spectrogram(
                    mel_p,
                    os.path.join(
                        eval_plot_dir,
                        'step-{}-eval-mel-spectrogram.png'.format(step)),
                    title='{}, {}, step={}, loss={:.5f}'.format(
                        "tacotron", time_string(), step, eval_loss),
                    target_spectrogram=mel_t,
                    max_len=t_len)

                if hparams.predict_linear:
                    plot.plot_spectrogram(
                        lin_p,
                        os.path.join(
                            eval_plot_dir,
                            'step-{}-eval-linear-spectrogram.png'.format(
                                step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            "tacotron", time_string(), step, eval_loss),
                        target_spectrogram=lin_t,
                        max_len=t_len,
                        auto_aspect=True)

                log('Eval loss for global step {}: {:.3f}'.format(
                    step, eval_loss))
                log('Writing eval summary!')
                add_eval_stats(summary_writer, step, linear_loss, before_loss,
                               after_loss, stop_token_loss, eval_loss)

        if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or step == 300:
            #Save model and current global step
            saver.save(sess, checkpoint_path, global_step=global_step)

            log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..')
            if hparams.predict_linear:
                input_seq, mel_prediction, linear_prediction, alignment, target, target_length, linear_target = sess.run(
                    [
                        model.tower_inputs[0][0],
                        model.tower_mel_outputs[0][0],
                        model.tower_linear_outputs[0][0],
                        model.tower_alignments[0][0],
                        model.tower_mel_targets[0][0],
                        model.tower_targets_lengths[0][0],
                        model.tower_linear_targets[0][0],
                    ])

                #save predicted linear spectrogram to disk (debug)
                linear_filename = 'linear-prediction-step-{}.npy'.format(step)
                np.save(os.path.join(linear_dir, linear_filename),
                        linear_prediction.T,
                        allow_pickle=False)

                #save griffin lim inverted wav for debug (linear -> wav)
                if hparams.GL_on_GPU:
                    wav = sess.run(
                        GLGPU_lin_outputs,
                        feed_dict={GLGPU_lin_inputs: linear_prediction})
                    wav = audio.inv_preemphasis(wav, hparams.preemphasis,
                                                hparams.preemphasize)
                else:
                    wav = audio.inv_linear_spectrogram(linear_prediction.T,
                                                       hparams)
                audio.save_wav(
                    wav,
                    os.path.join(wav_dir,
                                 'step-{}-wave-from-linear.wav'.format(step)),
                    sr=hparams.sample_rate)

                #Save real and predicted linear-spectrogram plot to disk (control purposes)
                plot.plot_spectrogram(
                    linear_prediction,
                    os.path.join(
                        plot_dir,
                        'step-{}-linear-spectrogram.png'.format(step)),
                    title='{}, {}, step={}, loss={:.5f}'.format(
                        "tacotron", time_string(), step, loss),
                    target_spectrogram=linear_target,
                    max_len=target_length,
                    auto_aspect=True)

            else:
                input_seq, mel_prediction, alignment, target, target_length = sess.run(
                    [
                        model.tower_inputs[0][0],
                        model.tower_mel_outputs[0][0],
                        model.tower_alignments[0][0],
                        model.tower_mel_targets[0][0],
                        model.tower_targets_lengths[0][0],
                    ])

            #save predicted mel spectrogram to disk (debug)
            mel_filename = 'mel-prediction-step-{}.npy'.format(step)
            np.save(os.path.join(mel_dir, mel_filename),
                    mel_prediction.T,
                    allow_pickle=False)

            #save griffin lim inverted wav for debug (mel -> wav)
            if hparams.GL_on_GPU:
                wav = sess.run(GLGPU_mel_outputs,
                               feed_dict={GLGPU_mel_inputs: mel_prediction})
                wav = audio.inv_preemphasis(wav, hparams.preemphasis,
                                            hparams.preemphasize)
            else:
                wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
            audio.save_wav(wav,
                           os.path.join(
                               wav_dir,
                               'step-{}-wave-from-mel.wav'.format(step)),
                           sr=hparams.sample_rate)

            #save alignment plot to disk (control purposes)
            plot.plot_alignment(
                alignment,
                os.path.join(plot_dir, 'step-{}-align.png'.format(step)),
                title='{}, {}, step={}, loss={:.5f}'.format(
                    "tacotron", time_string(), step, loss),
                max_len=target_length // hparams.outputs_per_step)
            #save real and predicted mel-spectrogram plot to disk (control purposes)
            plot.plot_spectrogram(
                mel_prediction,
                os.path.join(plot_dir,
                             'step-{}-mel-spectrogram.png'.format(step)),
                title='{}, {}, step={}, loss={:.5f}'.format(
                    "tacotron", time_string(), step, loss),
                target_spectrogram=target,
                max_len=target_length)
            original_text = symbol_converter.sequence_to_text(input_seq)
            log('Input at step {}: {}'.format(step, original_text))

        if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
            #Get current checkpoint state
            checkpoint_state = tf.train.get_checkpoint_state(save_dir)

            #Update Projector
            log('\nSaving Model Character Embeddings visualization..')
            add_embedding_stats(summary_writer, [model.embedding_table.name],
                                checkpoint_state.model_checkpoint_path)
            log('Tacotron Character embeddings have been updated on tensorboard!')

        log("train it finished.")

    log("request stop.")
    coord.request_stop()
    log("waitfor stop.")
    coord.wait_for_stop()
    log("close queue.")
    #feeder.close_queue()
    # do not finish.
    log("wait threads exit.")
    #coord.join(threads)

    try:
        log("close session.")
        sess.close()
        log("reset graph.")
        tf.reset_default_graph()
    except Exception as e:
        log("Session bug occured.")
        #log('Exiting due to exception: {}'.format(e), slack=True)
        #traceback.print_exc()
        #coord.request_stop(e)
        #coord.wait_for_stop()
        #raise Exception('Exception occured.')

    sleep(0.5)

    log('Tacotron training complete after {} global steps!'.format(
        args.tacotron_train_steps),
        slack=True)
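
# --- Illustrative aside (not part of the original code) ---------------------
# ValueWindow is imported from the repository's training utilities and is not
# shown here; the loop above only uses its append() method and average
# property.  A minimal sketch of an equivalent sliding-window average (an
# assumption about its behavior, not the repository's implementation):
from collections import deque


class SlidingWindowAverage:
    """Running mean over the most recent `size` values."""

    def __init__(self, size=100):
        self._values = deque(maxlen=size)

    def append(self, value):
        self._values.append(value)

    @property
    def average(self):
        return sum(self._values) / max(len(self._values), 1)
# Usage mirroring the loop above: loss_window = SlidingWindowAverage(100);
# loss_window.append(loss); loss_window.average
# -----------------------------------------------------------------------------
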
Example #5
def train(log_dir, args, hparams):
    save_dir = get_save_dir(log_dir)
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    tensorboard_dir = os.path.join(log_dir, 'wavenet_events')
    meta_folder = os.path.join(log_dir, 'metas')
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(meta_folder, exist_ok=True)

    checkpoint_path = os.path.join(save_dir, 'wavenet_model.ckpt')

    gta = args.GTA == 'True'
    synth_dir = get_synth_dir(args.caching_dir, gta)
    gta_map_file = get_gta_map_file(synth_dir)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(gta_map_file))
    log('Using model: {}'.format('WaveNet'))
    log(hparams_debug_string())

    #Start by setting a seed for repeatability
    tf.set_random_seed(hparams.wavenet_random_seed)

    #Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, gta_map_file, hparams)

    #Set up model
    global_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, feeder, hparams, global_step)

    #Speaker Embeddings metadata
    if hparams.speakers_path is not None:
        speaker_embedding_meta = hparams.speakers_path

    else:
        speaker_embedding_meta = os.path.join(meta_folder,
                                              'SpeakerEmbeddings.tsv')
        if not os.path.isfile(speaker_embedding_meta):
            with open(speaker_embedding_meta, 'w', encoding='utf-8') as f:
                for speaker in hparams.speakers:
                    f.write('{}\n'.format(speaker))

        speaker_embedding_meta = speaker_embedding_meta.replace(log_dir, '..')

    #book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    sh_saver = create_shadow_saver(model, global_step)

    log('Wavenet training set to a maximum of {} steps'.format(
        args.wavenet_train_steps))

    #Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    run_init = False

    #Train
    sess = tf.Session(config=config)

    summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
    sess.run(tf.global_variables_initializer())

    #saved model restoring
    if args.restore:
        # Restore saved model if the user requested it, default = True
        try:
            checkpoint_state = tf.train.get_checkpoint_state(save_dir)

            if (checkpoint_state and checkpoint_state.model_checkpoint_path):
                log('Loading checkpoint {}'.format(
                    checkpoint_state.model_checkpoint_path),
                    slack=True)
                load_averaged_model(sess, sh_saver,
                                    checkpoint_state.model_checkpoint_path)
            else:
                log('No model to load at {}'.format(save_dir), slack=True)
                if hparams.wavenet_weight_normalization:
                    run_init = True

        except tf.errors.OutOfRangeError as e:
            log('Cannot restore checkpoint: {}'.format(e), slack=True)
    else:
        log('Starting new training!', slack=True)
        if hparams.wavenet_weight_normalization:
            run_init = True

    if run_init:
        log('\nApplying Weight normalization in fresh training. Applying data dependent initialization forward pass..')
        #Create init_model
        init_model, _ = model_train_mode(args,
                                         feeder,
                                         hparams,
                                         global_step,
                                         init=True)

    #initializing feeder
    feeder.start_threads(sess)

    if run_init:
        #Run one forward pass for model parameters initialization (make prediction on init_batch)
        _ = sess.run(init_model.tower_y_hat)
        log('Data dependent initialization done. Starting training!')

    #Training loop
    while not coord.should_stop() and step < args.wavenet_train_steps:
        start_time = time.time()
        step, loss, opt = sess.run([global_step, model.loss, model.optimize])
        time_window.append(time.time() - start_time)
        loss_window.append(loss)

        message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
            step, time_window.average, loss, loss_window.average)
        log(message, end='\r', slack=(step % args.checkpoint_interval == 0))

        if np.isnan(loss) or loss > 100:
            log('Loss exploded to {:.5f} at step {}'.format(loss, step))
            raise Exception('Loss exploded')

        if step % args.summary_interval == 0:
            log('\nWriting summary at step {}'.format(step))
            summary_writer.add_summary(sess.run(stats), step)

        if step % args.checkpoint_interval == 0 or step == args.wavenet_train_steps:
            save_log(sess,
                     step,
                     model,
                     plot_dir,
                     wav_dir,
                     hparams=hparams,
                     model_name='WaveNet')
            save_checkpoint(sess, sh_saver, checkpoint_path, global_step)

        if step % args.eval_interval == 0:
            log('\nEvaluating at step {}'.format(step))
            eval_step(sess,
                      step,
                      eval_model,
                      eval_plot_dir,
                      eval_wav_dir,
                      summary_writer=summary_writer,
                      hparams=model._hparams,
                      model_name='WaveNet')

        if hparams.gin_channels > 0 and (step % args.embedding_interval == 0
                                         or step == args.wavenet_train_steps
                                         or step == 1):
            #Get current checkpoint state
            checkpoint_state = tf.train.get_checkpoint_state(save_dir)

            #Update Projector
            log('\nSaving Model Speaker Embeddings visualization..')
            add_embedding_stats(summary_writer, [model.embedding_table.name],
                                [speaker_embedding_meta],
                                checkpoint_state.model_checkpoint_path)
            log('WaveNet Speaker embeddings have been updated on tensorboard!')

    log('Wavenet training complete after {} global steps'.format(
        args.wavenet_train_steps),
        slack=True)
    coord.request_stop()
    coord.wait_for_stop()

    try:
        sess.close()
        tf.reset_default_graph()
    except Exception:
        log("Session bug occurred.")
        # except Exception as e:
        #   log('Exiting due to exception: {}'.format(e), slack=True)
        #   traceback.print_exc()
        #   coord.request_stop(e)
        #   coord.wait_for_stop()
        #   raise Exception('Exception occured.')

    sleep(0.5)
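
# --- Illustrative aside (not part of the original code) ---------------------
# create_shadow_saver, load_averaged_model, save_checkpoint, save_log and
# eval_step come from the repository's WaveNet utilities and are not shown
# here.  A minimal sketch of the idea behind a "shadow" saver, assuming it is
# built on tf.train.ExponentialMovingAverage (an assumption, not necessarily
# the repository's implementation):
def make_shadow_saver_sketch(variables, global_step, decay=0.9999):
    """Track EMAs of `variables`; return an op that updates them plus a saver
    that restores the averaged ("shadow") values instead of the raw ones."""
    ema = tf.train.ExponentialMovingAverage(decay, num_updates=global_step)
    maintain_averages_op = ema.apply(variables)
    shadow_saver = tf.train.Saver(ema.variables_to_restore(), max_to_keep=20)
    return maintain_averages_op, shadow_saver
# The update op would be grouped with the optimizer step, and the saver used
# at checkpoint time, analogous to sh_saver above.
# -----------------------------------------------------------------------------
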
Example #6
def run_synthesis(args, checkpoint, caching_dir, hparams):
    output_dir = get_output_dir(caching_dir)

    try:
        checkpoint_path = tf.train.get_checkpoint_state(
            checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except Exception:
        raise RuntimeError(
            'Failed to load checkpoint at {}'.format(checkpoint))

    log_dir = os.path.join(output_dir, 'plots')
    wav_dir = os.path.join(output_dir, 'wavs')

    #We assume the user provides the correct folder for the chosen training method
    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    #if args.model == 'Tacotron-2':
    #If running all Tacotron-2, synthesize audio from evaluated mels
    evals_dir = get_evals_dir(args.caching_dir)
    metadata_filename = os.path.join(evals_dir, 'map.txt')

    with open(metadata_filename, encoding='utf-8') as f:
        metadata = np.array([line.strip().split('|') for line in f])

    speaker_ids = metadata[:, 2]
    mel_files = metadata[:, 1]
    texts = metadata[:, 0]

    speaker_ids = None if (speaker_ids == '<no_g>').all() else speaker_ids
    # else:
    #   #else Get all npy files in input_dir (supposing they are mels)
    #   mel_files  = sorted([os.path.join(args.mels_dir, f) for f in os.listdir(args.mels_dir) if f.split('.')[-1] == 'npy'])
    #   speaker_ids = None if args.speaker_id is None else args.speaker_id.replace(' ', '').split(',')
    #   if speaker_ids is not None:
    #     assert len(speaker_ids) == len(mel_files)

    #  texts = None

    log('Starting synthesis! (this will take a while..)')
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)

    mel_files = [
        mel_files[i:i + hparams.wavenet_synthesis_batch_size]
        for i in range(0, len(mel_files), hparams.wavenet_synthesis_batch_size)
    ]
    speaker_ids = None if speaker_ids is None else [
        speaker_ids[i:i + hparams.wavenet_synthesis_batch_size] for i in range(
            0, len(speaker_ids), hparams.wavenet_synthesis_batch_size)
    ]
    texts = None if texts is None else [
        texts[i:i + hparams.wavenet_synthesis_batch_size]
        for i in range(0, len(texts), hparams.wavenet_synthesis_batch_size)
    ]

    with open(os.path.join(wav_dir, 'map.txt'), 'w') as file:
        for i, mel_batch in enumerate(tqdm(mel_files)):
            mel_spectros = [np.load(mel_file) for mel_file in mel_batch]

            basenames = [
                os.path.basename(mel_file).replace('.npy', '')
                for mel_file in mel_batch
            ]
            speaker_id_batch = None if speaker_ids is None else speaker_ids[i]
            audio_files = synth.synthesize(mel_spectros, speaker_id_batch,
                                           basenames, wav_dir, log_dir)

            speaker_logs = ['<no_g>'] * len(
                mel_batch) if speaker_id_batch is None else speaker_id_batch

            for j, mel_file in enumerate(mel_batch):
                if texts is None:
                    file.write('{}|{}|{}\n'.format(mel_file, audio_files[j],
                                                   speaker_logs[j]))
                else:
                    file.write('{}|{}|{}|{}\n'.format(texts[i][j], mel_file,
                                                      audio_files[j],
                                                      speaker_logs[j]))

    log('synthesized audio waveforms at {}'.format(wav_dir))
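
# --- Illustrative aside (not part of the original code) ---------------------
# The wav-level map written above can be loaded the same way the evaluation
# map is loaded at the top of this function.  A self-contained sketch of that
# column-slicing step with in-memory sample records (values are illustrative;
# every line must have the same number of fields for 2-D indexing to work):
_sample_lines = [
    'hello world|mels/0.npy|wavs/0.wav|<no_g>',
    'good morning|mels/1.npy|wavs/1.wav|<no_g>',
]
_rows = np.array([line.split('|') for line in _sample_lines])
_texts, _mels, _wavs, _speakers = (
    _rows[:, 0], _rows[:, 1], _rows[:, 2], _rows[:, 3])
# (_speakers == '<no_g>').all() is True here, mirroring the speaker_ids check
# in run_synthesis above.
# -----------------------------------------------------------------------------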