Example #1
def _run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    mel_dir = os.path.join(output_dir, 'mel')
    wav_dir = os.path.join(output_dir, 'wav')
    plot_dir = os.path.join(output_dir, 'plot')

    #Create output path if it doesn't exist
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    #Set inputs batch wise
    sentences = [
        sentences[i:i + hparams.tacotron_synthesis_batch_size] for i in range(
            0, len(sentences), hparams.tacotron_synthesis_batch_size)
    ]

    log('Starting Synthesis')

    for i, texts in enumerate(tqdm(sentences)):
        basenames = [
            'batch_{}_sentence_{}'.format(i, j) for j in range(len(texts))
        ]
        synth.synthesize(texts, basenames, mel_dir, wav_dir, plot_dir, None)

    log('synthesized mel spectrograms at {}'.format(mel_dir))
    log('saved mel spectrogram plots at {}'.format(plot_dir))
    log('synthesized wavs at {}'.format(wav_dir))
    return mel_dir, wav_dir
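
The batch-slicing idiom above (also used in Example #5) can be factored into a small helper; a minimal sketch, where the name chunk is ours and not from the repositories above:

def chunk(items, size):
    """Split a list into consecutive batches of at most `size` elements."""
    return [items[i:i + size] for i in range(0, len(items), size)]

# e.g. chunk(['a', 'b', 'c', 'd', 'e'], 2) -> [['a', 'b'], ['c', 'd'], ['e']]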
Example #2
    def make_test_batches(self):
        start = time.time()

        # Read a group of examples
        n = self._hparams.tacotron_batch_size
        r = self._hparams.outputs_per_step

        #Use the entire test set unless --test_max_len or --TEST caps it at n * 2
        test_batches_per_group = (n * 2 if self._args.test_max_len
                                  or self._args.TEST else len(self._test_meta))

        examples = [
            self._get_test_groups() for i in range(test_batches_per_group)
        ]

        # Bucket examples based on similar output sequence length for efficiency
        examples.sort(key=lambda x: x[-1])
        batches = [examples[i:i + n] for i in range(0, len(examples), n)]

        if self._args.test_max_len:
            batches = batches[::-1]
        else:
            np.random.shuffle(batches)

        log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(
            len(batches), n,
            time.time() - start))
        return batches, r
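
The sort-then-batch pattern shared by these feeders can be expressed standalone; a minimal sketch, assuming each example carries its output length in the last position (the helper name bucket_batches is ours):

import numpy as np

def bucket_batches(examples, batch_size, length_key=lambda x: x[-1]):
    # Sort by output length so each batch pads to a similar maximum,
    # then shuffle whole batches to keep training order random.
    examples = sorted(examples, key=length_key)
    batches = [examples[i:i + batch_size]
               for i in range(0, len(examples), batch_size)]
    np.random.shuffle(batches)
    return batches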
Example #3
def preprocess(args, audio_dir, taco_dir, hparams):
    output_dir = os.path.join(args.base_dir, 'wavernn_data')
    quant_dir = os.path.join(output_dir, 'quant')
    mels_dir = os.path.join(output_dir, 'mels')
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(quant_dir, exist_ok=True)
    os.makedirs(mels_dir, exist_ok=True)

    audio_files = get_files(audio_dir)
    mels_files = get_files(taco_dir)

    # This will take a while depending on size of dataset
    dataset_ids = []
    for i, (audio_path, mel_path) in enumerate(zip(audio_files, mels_files)):
        #Strip the fixed filename prefixes and the '.npy' extension to get ids
        audio_id = os.path.basename(audio_path)[6:-4]
        mels_id = os.path.basename(mel_path)[4:-4]

        assert mels_id == audio_id

        dataset_ids.append(audio_id)

        np.save(f'{quant_dir}/{audio_id}.npy', convert_gta_audio(audio_path))
        np.save(f'{mels_dir}/{mels_id}.npy', convert_gta_mels(mel_path))

        log('%i/%i : audio: %s mel: %s' %
            (i + 1, len(audio_files), audio_id, mels_id))

    dataset_ids_unique = list(set(dataset_ids))

    with open(f'{output_dir}/dataset_ids.pkl', 'wb') as file:
        pickle.dump(dataset_ids_unique, file)
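
get_files is not shown in this snippet; a plausible stand-in (an assumption, not the repository's actual implementation) returns a sorted file list so that zip(audio_files, mels_files) pairs matching utterances:

import glob
import os

def get_files(path, extension='.npy'):
    # Hypothetical sketch: sorted so audio and mel lists line up by id.
    return sorted(glob.glob(os.path.join(path, '*' + extension)))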
Example #4
def tacotron_synthesize(args,
                        hparams,
                        checkpoint,
                        ppgs=None,
                        speakers=None,
                        Lf0s=None):
    output_dir = args.output_dir

    try:
        checkpoint_path = tf.train.get_checkpoint_state(
            checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        raise RuntimeError(
            'Failed to load checkpoint at {}'.format(checkpoint))

    if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
        raise ValueError(
            'Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! Please verify your synthesis batch size choice.'
            .format(hparams.tacotron_synthesis_batch_size,
                    hparams.tacotron_num_gpus))

    if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
        raise ValueError(
            'Defined synthesis batch size {} is not a multiple of {} (num_gpus)! Please verify your synthesis batch size choice!'
            .format(hparams.tacotron_synthesis_batch_size,
                    hparams.tacotron_num_gpus))

    return run_eval(args, checkpoint_path, output_dir, hparams, ppgs, speakers,
                    Lf0s)
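
The checkpoint guard used throughout these examples works because tf.train.get_checkpoint_state returns None when no checkpoint file is found, so the .model_checkpoint_path access raises AttributeError. A minimal sketch of the same pattern as a reusable helper (TF 1.x assumed; the name resolve_checkpoint is ours):

import tensorflow as tf

def resolve_checkpoint(checkpoint_dir):
    # Return the latest checkpoint path, or raise a clear error.
    state = tf.train.get_checkpoint_state(checkpoint_dir)
    if state is None or not state.model_checkpoint_path:
        raise RuntimeError('No checkpoint found in {}'.format(checkpoint_dir))
    return state.model_checkpoint_path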
Example #5
def tacotron_synthesize(logId, sentences):
    # Set inputs batch wise
    sentences = [
        sentences[i:i + hparams.tacotron_synthesis_batch_size] for i in range(
            0, len(sentences), hparams.tacotron_synthesis_batch_size)
    ]
    log("logId={} , sentences={}".format(logId, sentences))
    t1 = time.time()
    for i, texts in enumerate(tqdm(sentences)):

        basenames = [logId]
        wavPaths = synth.synthesizev1(texts, basenames, eval_dir, log_dir,
                                      None)
    t2 = time.time()
    log('logId={} , synthesized mel spectrograms at {} cost time={}'.format(
        logId, eval_dir, (t2 - t1)))

    return wavPaths[0]
Example #6
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    #Create output path if it doesn't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams, reference_mel=args.reference_audio)
    if args.reference_audio is None:
        raise ValueError(
            'Evaluation requires reference audio. Please provide a path to a reference audio file.'
        )
    ref_wav = audio.load_wav(args.reference_audio)
    reference_mel = audio.melspectrogram(ref_wav).astype(np.float32).T
    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, text in enumerate(tqdm(sentences)):
            start = time.time()
            mel_filename = synth.synthesize(text,
                                            i + 1,
                                            eval_dir,
                                            log_dir,
                                            None,
                                            reference_mel=reference_mel)
            file.write('{}|{}\n'.format(text, mel_filename))

    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--base_dir', default=os.path.expanduser('~/tacotron'))
    parser.add_argument('--output', default='training')
    parser.add_argument('--dataset',
                        required=True,
                        choices=['blizzard', 'ljspeech', 'nick'])
    parser.add_argument('--num_workers', type=int, default=cpu_count())
    parser.add_argument(
        '--hparams',
        default='',
        help=
        'Hyperparameter overrides as a comma-separated list of name=value pairs'
    )
    parser.add_argument('--validation_size', type=int, default=0)
    parser.add_argument('--test_size', type=int, default=0)

    args = parser.parse_args()
    hparams.parse(args.hparams)
    log(hparams_debug_string())

    if args.dataset == 'blizzard':
        preprocess_blizzard(args, hparams)
    elif args.dataset == 'ljspeech':
        preprocess_ljspeech(args, hparams)
    elif args.dataset == 'nick':
        preprocess_nick(args, hparams)
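
The if/elif dispatch at the end could equally be written as a table lookup; a small sketch reusing the names from this example (dispatch_preprocess is a name we introduce):

def dispatch_preprocess(args, hparams):
    preprocessors = {
        'blizzard': preprocess_blizzard,
        'ljspeech': preprocess_ljspeech,
        'nick': preprocess_nick,
    }
    # argparse's `choices` already guarantees args.dataset is a valid key.
    preprocessors[args.dataset](args, hparams)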
Example #8
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model in ('Both', 'Tacotron-2'):
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir) #mels_dir = wavenet_input_dir
    
    #Create output path if it doesn't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    
    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, text in enumerate(tqdm(sentences)):
            start = time.time()
            mel_filename = synth.synthesize(text, i+1, eval_dir, log_dir, None)

            file.write('{}|{}\n'.format(text, mel_filename))
            npy_data = np.load(mel_filename)
            npy_data = npy_data.reshape((-1,))
            #Note: this overwrites the file on every iteration, so only the
            #last sentence's mel ends up in f32_for_lpcnet.f32
            npy_data.tofile("f32_for_lpcnet.f32")

    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
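
If every sentence's features are wanted in the .f32 dump (LPCNet-style raw float32), the file has to be opened once and appended to; a hedged sketch of that variant, where mel_filenames stands for the paths written to map.txt above:

import numpy as np

with open('f32_for_lpcnet.f32', 'wb') as f32_file:
    for mel_filename in mel_filenames:
        # ndarray.tofile accepts an open file object, so frames accumulate.
        np.load(mel_filename).astype(np.float32).reshape((-1,)).tofile(f32_file)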
Example #9
def run_synthesis_sytle_transfer(args, synth_metadata_filename,
                                 checkpoint_path, output_dir, hparams):

    synth_dir = os.path.join(output_dir, 'natural')

    #Create output path if it doesn't exist
    os.makedirs(synth_dir, exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(args, checkpoint_path, hparams)

    (texts, basenames, basenames_refs, mel_filenames, mel_ref_filenames_emt,
     mel_ref_filenames_spk, emt_labels, spk_labels) = get_filenames_from_metadata(
         synth_metadata_filename, args.input_dir, args.flip_spk_emt)

    synth.synthesize(texts,
                     basenames,
                     synth_dir,
                     synth_dir,
                     mel_filenames,
                     mel_ref_filenames_emt=mel_ref_filenames_emt,
                     mel_ref_filenames_spk=mel_ref_filenames_spk,
                     emt_labels_synth=emt_labels,
                     spk_labels_synth=spk_labels)
Example #10
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
	eval_dir = os.path.join(output_dir, 'eval')
	log_dir = os.path.join(output_dir, 'logs-eval')

	if args.model in ('Both', 'Tacotron-2'):
		assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir) #mels_dir = wavenet_input_dir
	
	#Create output path if it doesn't exist
	os.makedirs(eval_dir, exist_ok=True)
	os.makedirs(log_dir, exist_ok=True)
	os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
	os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

	log(hparams_debug_string())
	synth = Synthesizer()
	synth.load(checkpoint_path, hparams)

	
	with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
		for i, text in enumerate(tqdm(sentences)):
			start = time.time()
			mel_filename = synth.synthesize(text, i+1, eval_dir, log_dir, None)

			file.write('{}|{}\n'.format(text, mel_filename))
	log('synthesized mel spectrograms at {}'.format(eval_dir))
	return eval_dir
Example #11
def wavenet_synthesize(args, hparams, checkpoint):
    output_dir = 'wavenet_' + args.output_dir

    try:
        checkpoint_path = tf.train.get_checkpoint_state(
            checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        #Swap logs dir name in case user used Tacotron-2 for train and Both for test (and vice versa)
        if 'Both' in checkpoint:
            checkpoint = checkpoint.replace('Both', 'Tacotron-2')
        elif 'Tacotron-2' in checkpoint:
            checkpoint = checkpoint.replace('Tacotron-2', 'Both')
        else:  #Synthesizing separately
            raise AssertionError(
                'Cannot restore checkpoint: {}, did you train a model?'.format(
                    checkpoint))

        try:
            #Try loading again
            checkpoint_path = tf.train.get_checkpoint_state(
                checkpoint).model_checkpoint_path
            log('loaded model at {}'.format(checkpoint_path))
        except AttributeError:
            raise RuntimeError(
                'Failed to load checkpoint at {}'.format(checkpoint))

    run_synthesis(args, checkpoint_path, output_dir, hparams)
Example #12
    def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
        log('Constructing model: %s' % model_name)
        inputs = tf.placeholder(tf.int32, [None, None], 'inputs')
        input_lengths = tf.placeholder(tf.int32, [None], 'input_lengths')
        with tf.variable_scope('model') as scope:
            self.model = create_model(model_name, hparams)
            self.model.initialize(inputs, input_lengths)
            self.final_outputs = self.model.final_outputs
            self.alignments = self.model.alignments
            self.stop_token_outputs = self.model.stop_token_outputs

        self.gta = gta
        self._hparams = hparams
        #pad input sequences with the <pad_token> 0 ( _ )
        self._pad = 0

        log('Loading checkpoint: %s' % checkpoint_path)
        #Memory allocation on the GPU as needed
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        self.session = tf.Session(config=config)
        self.session.run(tf.global_variables_initializer())

        saver = tf.train.Saver()
        saver.restore(self.session, checkpoint_path)
Example #13
    def load_checkpoint(self, args, hparams, checkpoint):
        # ./Tacotron-2/tacotron/synthesize.py:tacotron_synthesize
        output_dir = 'tacotron_' + args.output_dir

        try:
            checkpoint_path = tf.train.get_checkpoint_state(
                checkpoint).model_checkpoint_path
            log('loaded model at {}'.format(checkpoint_path))
        except AttributeError:
            raise RuntimeError(
                'Failed to load checkpoint at {}'.format(checkpoint))

        if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
            raise ValueError(
                'Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! Please verify your synthesis batch size choice.'
                .format(hparams.tacotron_synthesis_batch_size,
                        hparams.tacotron_num_gpus))

        if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
            raise ValueError(
                'Defined synthesis batch size {} is not a multiple of {} (num_gpus)! Please verify your synthesis batch size choice!'
                .format(hparams.tacotron_synthesis_batch_size,
                        hparams.tacotron_num_gpus))

        # ./Tacotron-2/tacotron/synthesize.py:run_live
        log(hparams_debug_string())
        synth = Synthesizer()
        synth.load(checkpoint_path, hparams)

        self.model = synth
Example #14
    def _enqueue_next_train_group(self):
        while not self._coord.should_stop():
            start = time.time()

            # Read a group of examples
            n = self._hparams.tacotron_batch_size
            r = self._hparams.outputs_per_step
            examples = [
                self._get_next_example() for i in range(n * _batches_per_group)
            ]

            # Bucket examples based on similar output sequence length for efficiency
            examples.sort(key=lambda x: x[-1])
            batches = [examples[i:i + n] for i in range(0, len(examples), n)]
            np.random.shuffle(batches)

            log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(
                len(batches), n,
                time.time() - start))
            for batch in batches:
                feed_dict = dict(
                    zip(self._placeholders, self._prepare_batch(batch, r)))
                self._session.run(self._enqueue_op, feed_dict=feed_dict)
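
_prepare_batch is not shown here, but its core job is right-padding every sequence in a batch to a common length. A minimal sketch of that step, assuming 1-D integer inputs (the helper name pad_batch is ours):

import numpy as np

def pad_batch(sequences, pad_value=0):
    # Right-pad each sequence to the longest one in the batch.
    max_len = max(len(s) for s in sequences)
    return np.stack([
        np.pad(np.asarray(s), (0, max_len - len(s)),
               constant_values=pad_value) for s in sequences
    ])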
Example #15
def setup_log(log_path, checkpoint_path, input_path):
    infolog.init(log_path, 'emt4_disc', None)
    log('hi')
    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format('emt4_disc'))
    log(hparams_debug_string())
Example #16
    def make_test_batches(self):
        start = time.time()

        # Read a group of examples
        n = self._hparams.vad_batch_size * _num_per_batch

        # Test on entire test set
        examples = [
            self._get_test_groups() for i in range(len(self._test_meta))
        ]

        # Bucket examples based on similar output sequence length for efficiency
        examples.sort(key=lambda x: len(x[-1]))
        examples = (np.vstack([ex[0] for ex in examples]),
                    np.vstack([ex[1] for ex in examples]))
        batches = [(examples[0][i:i + n], examples[1][i:i + n])
                   for i in range(0, len(examples[-1]) + 1 - n, n)]
        if len(examples[-1]) % n != 0:
            batches.append((examples[0][-(len(examples[-1]) % n):],
                            examples[1][-(len(examples[-1]) % n):]))
        self.test_steps = len(batches)
        log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(
            len(batches), n,
            time.time() - start))
        return batches
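
The slice arithmetic above (full batches plus one short remainder batch) is easier to verify in isolation; a sketch with a name of our choosing:

def batch_with_remainder(x, y, n):
    # Full batches of size n, then a final short batch for the leftover
    # len(y) % n items, mirroring the feeder logic above.
    batches = [(x[i:i + n], y[i:i + n]) for i in range(0, len(y) + 1 - n, n)]
    if len(y) % n != 0:
        batches.append((x[-(len(y) % n):], y[-(len(y) % n):]))
    return batches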
Example #17
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
	eval_dir = os.path.join(output_dir, 'eval')
	log_dir = os.path.join(output_dir, 'logs-eval')

	if args.model == 'Tacotron-2':
		assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir) #mels_dir = wavenet_input_dir

	#Create output path if it doesn't exist
	os.makedirs(eval_dir, exist_ok=True)
	os.makedirs(log_dir, exist_ok=True)
	os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
	os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

	log(hparams_debug_string())
	synth = Synthesizer()
	synth.load(checkpoint_path, hparams, speaker_id=args.speaker_id)

	with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
		for i, text in enumerate(tqdm(sentences)):
			start = time.time()
			if args.speaker_id is not None:
				mel_filename, speaker_id = synth.synthesize([text], [i+1], eval_dir, log_dir, None, speaker_id=[args.speaker_id[i]])
			else:
				mel_filename, speaker_id = synth.synthesize([text], [i+1], eval_dir, log_dir, None, speaker_id=None)

			file.write('{}|{}|{}\n'.format(text, mel_filename[0], speaker_id[0]))
	log('synthesized mel spectrograms at {}'.format(eval_dir))
	return eval_dir
Example #18
def tacotron_synthesize(args, hparams, checkpoint, sentences=None):
    output_dir = 'tacotron_' + args.output_dir

    try:
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        #Swap logs dir name in case user used Tacotron-2 for train and Both for test (and vice versa)
        if 'Both' in checkpoint:
            checkpoint = checkpoint.replace('Both', 'Tacotron-2')
        elif 'Tacotron-2' in checkpoint:
            checkpoint = checkpoint.replace('Tacotron-2', 'Both')
        else:
            raise AssertionError('Cannot restore checkpoint: {}, did you train a model?'.format(checkpoint))

        try:
            #Try loading again
            checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
            log('loaded model at {}'.format(checkpoint_path))
        except AttributeError:
            raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

    if args.mode == 'eval':
        return run_eval(args, checkpoint_path, output_dir, hparams, sentences)
    elif args.mode == 'synthesis':
        return run_synthesis(args, checkpoint_path, output_dir, hparams)
    else:
        run_live(args, checkpoint_path, hparams)
Example #19
    def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
        log('Constructing model: %s' % model_name)
        inputs = tf.placeholder(tf.int32, [1, None], 'inputs')
        input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
        targets = tf.placeholder(tf.float32, [1, None, hparams.num_mels],
                                 'mel_targets')
        with tf.variable_scope('model') as scope:
            self.model = create_model(model_name, hparams)
            if gta:
                self.model.initialize(inputs, input_lengths, targets, gta=gta)
            else:
                self.model.initialize(inputs, input_lengths)
            self.mel_outputs = self.model.mel_outputs
            self.linear_outputs = self.model.linear_outputs if (
                hparams.predict_linear and not gta) else None
            self.alignment = self.model.alignments[0]

        self.gta = gta
        self._hparams = hparams

        log('Loading checkpoint: %s' % checkpoint_path)
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(self.session, checkpoint_path)
Example #20
def synthesize(args, input_dir, output_dir, checkpoint_path, hparams):
    # device
    device = torch.device('cuda' if args.use_cuda else 'cpu')

    # Initialize Model
    model = Model(rnn_dims=hparams.rnn_dims,
                  fc_dims=hparams.fc_dims,
                  bits=hparams.wavernn_bits,
                  pad=hparams.wavernn_pad,
                  upsample_factors=hparams.upsample_scales,
                  feat_dims=hparams.feat_dims,
                  compute_dims=hparams.compute_dims,
                  res_out_dims=hparams.res_out_dims,
                  res_blocks=hparams.res_blocks,
                  hop_length=hparams.hop_size,
                  sample_rate=hparams.sample_rate).to(device)

    # Load Model
    if args.use_cuda:
        checkpoint = torch.load(checkpoint_path)
    else:
        checkpoint = torch.load(checkpoint_path,
                                map_location=lambda storage, loc: storage)

    log('Loading model from {}'.format(checkpoint_path))
    model.load_state_dict(checkpoint['state_dict'])

    # Synth from Mels to Wave
    filenames = [
        f for f in sorted(os.listdir(input_dir)) if f.endswith('.npy')
    ]
    for i, filename in enumerate(tqdm(filenames)):
        mel = np.load(os.path.join(input_dir, filename)).T
        save_wavernn_wav(model.generate(mel),
                         '{}/{}_generated.wav'.format(output_dir, i),
                         hparams.sample_rate)
Example #21
    def load(self, checkpoint_path, hparams, model_name='WaveNet'):
        log('Constructing model: {}'.format(model_name))
        self._hparams = hparams
        local_cond, global_cond = self._check_conditions()

        self.local_conditions = tf.placeholder(
            tf.float32,
            shape=[1, None, hparams.num_mels],
            name='local_condition_features') if local_cond else None
        self.global_conditions = tf.placeholder(
            tf.int32, shape=(1, 1),
            name='global_condition_features') if global_cond else None
        self.synthesis_length = tf.placeholder(
            tf.int32, shape=(),
            name='synthesis_length') if not local_cond else None

        with tf.variable_scope('model') as scope:
            self.model = create_model(model_name, hparams)
            self.model.initialize(y=None,
                                  c=self.local_conditions,
                                  g=self.global_conditions,
                                  input_lengths=None,
                                  synthesis_length=self.synthesis_length)

            sh_saver = create_shadow_saver(self.model)

            log('Loading checkpoint: {}'.format(checkpoint_path))
            self.session = tf.Session()
            self.session.run(tf.global_variables_initializer())
            load_averaged_model(self.session, sh_saver, checkpoint_path)
Example #22
def synthesize(args, input_dir, output_dir, checkpoint_path, hparams):
    # device
    device = torch.device('cuda' if args.use_cuda else 'cpu')

    # Initialize Model
    model = WaveRNN(hparams.wavernn_bits, hparams.hop_size, hparams.num_mels,
                    device).to(device)

    # Load Model
    if args.use_cuda:
        checkpoint = torch.load(checkpoint_path)
    else:
        checkpoint = torch.load(checkpoint_path,
                                map_location=lambda storage, loc: storage)

    log('Loading model from {}'.format(checkpoint_path))
    model.load_state_dict(checkpoint['state_dict'])

    # Synth from Mels to Wave
    filenames = [
        f for f in sorted(os.listdir(input_dir)) if f.endswith('.npy')
    ]
    for i, filename in enumerate(tqdm(filenames)):
        mel = np.load(os.path.join(input_dir, filename)).T
        save_wavernn_wav(model.generate(mel),
                         f'{output_dir}/{i}_generated.wav',
                         hparams.sample_rate)
Example #23
def tacotron_synthesize(args, hparams, checkpoint, sentences=None):
	output_dir = 'tacotron_' + args.output_dir

	try:
		checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
		log('loaded model at {}'.format(checkpoint_path))
	except AttributeError:
		#Swap logs dir name in case user used Tacotron-2 for train and Both for test (and vice versa)
		if 'Both' in checkpoint:
			checkpoint = checkpoint.replace('Both', 'Tacotron-2')
		elif 'Tacotron-2' in checkpoint:
			checkpoint = checkpoint.replace('Tacotron-2', 'Both')
		else:
			raise AssertionError('Cannot restore checkpoint: {}, did you train a model?'.format(checkpoint))

		try:
			#Try loading again
			checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
			log('loaded model at {}'.format(checkpoint_path))
		except AttributeError:
			raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

	wavenet_in_dir = None

	if args.mode == 'eval':
		wavenet_in_dir = run_eval(args, checkpoint_path, output_dir, hparams, sentences)
	elif args.mode == 'synthesis':
		run_synthesis(args, checkpoint_path, output_dir, hparams)
	else:
		run_live(args, checkpoint_path, hparams)

	return wavenet_in_dir
Example #24
def init_tacotron2(args):
    # t2
    print('\n#####################################')
    if args.model == 'Tacotron':
        print('\nInitialising Tacotron Model...\n')
        t2_hparams = hparams.parse(args.hparams)
        try:
            checkpoint_path = tf.train.get_checkpoint_state(
                args.taco_checkpoint).model_checkpoint_path
            log('loaded model at {}'.format(checkpoint_path))
        except AttributeError:
            raise RuntimeError('Failed to load checkpoint at {}'.format(
                args.taco_checkpoint))

        output_dir = 'tacotron_' + args.output_dir
        eval_dir = os.path.join(output_dir, 'eval')
        log_dir = os.path.join(output_dir, 'logs-eval')
        print('eval_dir:', eval_dir)
        print('args.mels_dir:', args.mels_dir)

        # Create output path if it doesn't exist
        os.makedirs(eval_dir, exist_ok=True)
        os.makedirs(log_dir, exist_ok=True)
        os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
        os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)
        log(hparams_debug_string())
        synth = Synthesizer()
        synth.load(checkpoint_path, t2_hparams)

    else:
        raise ValueError(
            'init_tacotron2 only supports args.model == "Tacotron", got {}'.format(
                args.model))

    return synth, eval_dir, log_dir
Example #25
    def _enqueue_next_train_group(self):
        while not self._coord.should_stop():
            start = time.time()

            # Read a group of examples
            n = self._hparams.tacotron_batch_size
            r = self._hparams.outputs_per_step
            log("feeder.py:_enqueue_next_train_group():row162:before examples")
            examples = [
                self._get_next_example() for i in range(n * _batches_per_group)
            ]
            log("feeder.py:_enqueue_next_train_group():row164:after examples")

            # Bucket examples based on similar output sequence length for efficiency
            examples.sort(key=lambda x: x[-1])
            batches = [examples[i:i + n] for i in range(0, len(examples), n)]
            np.random.shuffle(batches)
            print(strftime("---%a, %d %b %Y %H:%M:%S +0000", localtime()))
            print(
                'feeder.py:_enqueue_next_train_group():row168:Generated {} train batches of size {} in {:.3f} sec'
                .format(len(batches), n,
                        time.time() - start))
            for batch in batches:
                #print('\nfeeder.py:_enqueue_next_train_group():row171')
                feed_dict = dict(
                    zip(self._placeholders, self._prepare_batch(batch, r)))
                self._session.run(self._enqueue_op, feed_dict=feed_dict)
Example #26
def publish(args, hparams, checkpoint_path):
    log(hparams_debug_string())
    if not os.path.exists(args.book):
        raise ValueError('{}: {}'.format('No such file or directory', args.book))

    speaker_id = args.speaker_id
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    with open_file(args.book) as f:
        text = f.read()
        if args.lang == 'kr':
            kkma = Kkma()
            sents = kkma.sentences(text)
        else:
            sents = nltk.sent_tokenize(text)

        full_mels = None
        silence = np.full((100, hparams.num_mels), hparams.min_level_db, np.float32)
        for i, line in enumerate(tqdm(sents)):
            text = line.strip()
            if text:
                mels = generate_fast(synth, text, speaker_id, play=False)
                if full_mels is not None:
                    full_mels = np.concatenate((full_mels, silence), axis=0)  # padding silence between sents
                    full_mels = np.concatenate((full_mels, mels), axis=0)
                else:
                    full_mels = mels

        save_path = change_file_ext(args.book, '.wav')
        log('saving to wav file...')
        wav = audio.inv_mel_spectrogram(full_mels.T, hparams)
        audio.save_wav(wav, save_path, sr=hparams.sample_rate)
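
The silence-joining loop above can be phrased without the index check; a minimal sketch, assuming mel_chunks is a non-empty list of [frames, num_mels] arrays and silence is the gap defined above:

import numpy as np

def join_with_silence(mel_chunks, silence):
    out = mel_chunks[0]
    for mels in mel_chunks[1:]:
        # Insert the fixed silence gap between consecutive sentences.
        out = np.concatenate((out, silence, mels), axis=0)
    return out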
Example #27
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model == 'Tacotron-2':
        assert os.path.normpath(eval_dir) == os.path.normpath(
            args.mels_dir)  #mels_dir = wavenet_input_dir

    #Create output path if it doesn't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    delta_size = min(hparams.tacotron_synthesis_batch_size, len(sentences))
    batch_sentences = [
        sentences[i:i + delta_size]
        for i in range(0, len(sentences), delta_size)
    ]
    start = time.time()
    for i, batch in enumerate(tqdm(batch_sentences)):
        audio.save_wav(
            synth.eval(batch),
            os.path.join(log_dir, 'wavs', 'eval_batch_{:03}.wav'.format(i)),
            hparams)
    log('\nGenerated total batch of {} in {:.3f} sec'.format(
        delta_size,
        time.time() - start))

    return eval_dir
Example #28
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model in ('Both', 'Tacotron-2'):
        assert os.path.normpath(eval_dir) == os.path.normpath(
            args.mels_dir)  # mels_dir = wavenet_input_dir

    # Create output path if it doesn't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, text in enumerate(tqdm(sentences)):
            if is_korean_text(text):
                text = normalize_number(text)
                # Split the Hangul text into individual jamo (letter units).
                text = split_to_jamo(text, hparams.cleaners)
            mel_filename = synth.synthesize(text, i + 1, eval_dir, log_dir,
                                            None)

            file.write('{}|{}\n'.format(text, mel_filename))
    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
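
split_to_jamo is repository-specific and not shown here; as a rough stand-in, Unicode NFD normalization decomposes precomposed Hangul syllables into their conjoining jamo (the actual helper may emit compatibility jamo or apply extra cleaning instead):

import unicodedata

def split_hangul_to_jamo(text):
    # NFD decomposes each Hangul syllable block into its component jamo.
    return unicodedata.normalize('NFD', text)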
Example #29
    def _enqueue_next_train_group(self):
        while not self._coord.should_stop():
            start = time.time()

            # Read a group of examples
            n = self._hparams.vad_batch_size
            examples = [
                self._get_next_example() for _ in range(_num_per_batch)
            ]

            # Bucket examples based on similar output sequence length for efficiency
            examples.sort(key=lambda x: len(x[-1]))
            examples = (np.vstack([ex[0] for ex in examples]),
                        np.vstack([ex[1] for ex in examples]))
            batches = [(examples[0][i:i + n], examples[1][i:i + n])
                       for i in range(0, len(examples[-1]) + 1 - n, n)]
            if len(examples[-1]) % n != 0:
                batches.append((examples[0][-(len(examples[-1]) % n):],
                                examples[1][-(len(examples[-1]) % n):]))

            log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(
                len(batches), n,
                time.time() - start))
            for batch in batches:
                feed_dict = dict(
                    zip(self._placeholders, self._prepare_batch(batch)))
                self._session.run(self._enqueue_op, feed_dict=feed_dict)
Example #30
def tacotron_synthesize(args, hparams, checkpoint, sentences=None):
    output_dir = 'tacotron_' + args.output_dir

    try:
        checkpoint_path = tf.train.get_checkpoint_state(
            checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        raise RuntimeError(
            'Failed to load checkpoint at {}'.format(checkpoint))

    if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
        raise ValueError(
            'Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! Please verify your synthesis batch size choice.'
            .format(hparams.tacotron_synthesis_batch_size,
                    hparams.tacotron_num_gpus))

    if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
        raise ValueError(
            'Defined synthesis batch size {} is not a multiple of {} (num_gpus)! Please verify your synthesis batch size choice!'
            .format(hparams.tacotron_synthesis_batch_size,
                    hparams.tacotron_num_gpus))

    if args.mode == 'eval':
        return run_eval(args, checkpoint_path, output_dir, hparams, sentences)
    elif args.mode == 'synthesis':
        return run_synthesis(args, checkpoint_path, output_dir, hparams)
    else:
        run_live(args, checkpoint_path, hparams)
Example #31
def get_filenames_from_metadata(synth_metadata_filename,
                                input_dir,
                                flip_spk_emt=False):

    with open(synth_metadata_filename, encoding='utf-8') as f:
        metadata = [
            line.strip().split('|') for line in f if not (line.startswith('#'))
        ]
        frame_shift_ms = hparams.hop_size / hparams.sample_rate
        hours = sum([int(x[6]) for x in metadata]) * frame_shift_ms / (3600)
        log('Synthesis - Loaded metadata for {} examples ({:.2f} hours)'.
            format(len(metadata), hours))

    # log('Starting Synthesis')
    texts = [m[7] for m in metadata]
    mel_filenames = [
        os.path.join(input_dir, m[0], 'mels', m[2]) for m in metadata
    ]
    basenames = [
        os.path.basename(m).replace('.npy', '').replace('mel-', '')
        for m in mel_filenames
    ]
    basenames_refs = [m[11] + '_' + m[13] for m in metadata]

    mel_ref_filenames_emt = []
    mel_ref_filenames_spk = []
    emt_labels = []
    spk_labels = []
    for m in metadata:
        dataset = m[0]
        if m[12] == 'same':
            mel_ref_filenames_emt.append(
                os.path.join(input_dir, dataset, 'mels', m[2]))
        else:
            if 'accent' in synth_metadata_filename:
                dataset_emt = 'vctk'
            else:
                dataset_emt = 'emth' if m[12][0] == 'h' else 'emt4'
                if m[12][0] == 'h':
                    m[12] = m[12][1:]
            mel_ref_filenames_emt.append(
                os.path.join(input_dir, dataset_emt, 'mels', m[12]))

        if m[14] == 'same':
            mel_ref_filenames_spk.append(
                os.path.join(input_dir, dataset, 'mels', m[2]))
        else:
            mel_ref_filenames_spk.append(
                os.path.join(input_dir, 'jessa', 'mels', m[14]))
        emt_labels.append(m[8])
        spk_labels.append(m[9])
    if flip_spk_emt:
        mel_ref_filenames_emt, mel_ref_filenames_spk = (
            mel_ref_filenames_spk, mel_ref_filenames_emt)

    return (texts, basenames, basenames_refs, mel_filenames,
            mel_ref_filenames_emt, mel_ref_filenames_spk, emt_labels,
            spk_labels)
Example #32
    def __init__(self, coordinator, metadata_filename, hparams):
        super(DataFeeder, self).__init__()
        self._coord = coordinator
        self._hparams = hparams
        self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        self._offset = 0

        # Load metadata:
        self._datadir = os.path.dirname(metadata_filename)
        with open(metadata_filename, encoding='utf-8') as f:
            self._metadata = [line.strip().split('|') for line in f]
            hours = (sum(int(x[2]) for x in self._metadata) *
                     hparams.frame_shift_ms / (3600 * 1000))
            log('Loaded metadata for %d examples (%.2f hours)' %
                (len(self._metadata), hours))

        # Create placeholders for inputs and targets. Don't specify batch size because we want to
        # be able to feed different sized batches at eval time.
        self._placeholders = [
            tf.placeholder(tf.int32, [None, None], 'inputs'),
            tf.placeholder(tf.int32, [None], 'input_lengths'),
            tf.placeholder(tf.float32, [None, None, hparams.num_mels],
                           'mel_targets'),
            tf.placeholder(tf.float32, [None, None, hparams.num_freq],
                           'linear_targets'),
            tf.placeholder(tf.float32, [None, None, hparams.pml_dimension],
                           'pml_targets'),
        ]

        # Create queue for buffering data:
        queue = tf.FIFOQueue(
            8, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32],
            name='input_queue')
        self._enqueue_op = queue.enqueue(self._placeholders)
        (self.inputs, self.input_lengths, self.mel_targets,
         self.linear_targets, self.pml_targets) = queue.dequeue()
        self.inputs.set_shape(self._placeholders[0].shape)
        self.input_lengths.set_shape(self._placeholders[1].shape)
        self.mel_targets.set_shape(self._placeholders[2].shape)
        self.linear_targets.set_shape(self._placeholders[3].shape)
        self.pml_targets.set_shape(self._placeholders[4].shape)

        # Load CMUDict: If enabled, this will randomly substitute some words in the training data with
        # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for
        # synthesis (useful for proper nouns, etc.)
        if hparams.use_cmudict:
            cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b')
            if not os.path.isfile(cmudict_path):
                raise Exception(
                    'If use_cmudict=True, you must download ' +
                    'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s'
                    % cmudict_path)
            self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False)
            log('Loaded CMUDict with %d unambiguous entries' %
                len(self._cmudict))
        else:
            self._cmudict = None
Example #33
def synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences):
	log('Running End-to-End TTS Evaluation. Model: {}'.format(args.name or args.model))
	log('Synthesizing mel-spectrograms from text..')
	wavenet_in_dir = tacotron_synthesize(args, hparams, taco_checkpoint, sentences)
	log('Synthesizing audio from mel-spectrograms.. (This may take a while)')
	wavenet_synthesize(args, hparams, wave_checkpoint)
	log('Tacotron-2 TTS synthesis complete!')
Example #34
	def make_test_batches(self):
		start = time.time()

		# Read a group of examples
		n = self._hparams.tacotron_batch_size
		r = self._hparams.outputs_per_step

		#Test on entire test set
		examples = [self._get_test_groups() for i in range(len(self._test_meta))]

		# Bucket examples based on similar output sequence length for efficiency
		examples.sort(key=lambda x: x[-1])
		batches = [examples[i: i+n] for i in range(0, len(examples), n)]
		np.random.shuffle(batches)

		log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
		return batches, r
Example #35
	def _enqueue_next_train_group(self):
		while not self._coord.should_stop():
			start = time.time()

			# Read a group of examples
			n = self._hparams.tacotron_batch_size
			r = self._hparams.outputs_per_step
			examples = [self._get_next_example() for i in range(n * _batches_per_group)]

			# Bucket examples based on similar output sequence length for efficiency
			examples.sort(key=lambda x: x[-1])
			batches = [examples[i: i+n] for i in range(0, len(examples), n)]
			np.random.shuffle(batches)

			log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
			for batch in batches:
				feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r)))
				self._session.run(self._enqueue_op, feed_dict=feed_dict)
Example #36
	def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
		log('Constructing model: %s' % model_name)
		inputs = tf.placeholder(tf.int32, [1, None], 'inputs')
		input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
		targets = tf.placeholder(tf.float32, [1, None, hparams.num_mels], 'mel_targets')
		with tf.variable_scope('model') as scope:
			self.model = create_model(model_name, hparams)
			if gta:
				self.model.initialize(inputs, input_lengths, targets, gta=gta)
			else:		
				self.model.initialize(inputs, input_lengths)
			self.mel_outputs = self.model.mel_outputs
			self.alignment = self.model.alignments[0]

		self.gta = gta
		self._hparams = hparams

		log('Loading checkpoint: %s' % checkpoint_path)
		self.session = tf.Session()
		self.session.run(tf.global_variables_initializer())
		saver = tf.train.Saver()
		saver.restore(self.session, checkpoint_path)
Example #37
def run_synthesis(args, checkpoint_path, output_dir, hparams):
	GTA = (args.GTA == 'True')
	synth_dir = os.path.join(output_dir, 'gta' if GTA else 'natural')

	#Create output path if it doesn't exist
	os.makedirs(synth_dir, exist_ok=True)


	metadata_filename = os.path.join(args.input_dir, 'train.txt')
	log(hparams_debug_string())
	synth = Synthesizer()
	synth.load(checkpoint_path, hparams, gta=GTA)
	with open(metadata_filename, encoding='utf-8') as f:
		metadata = [line.strip().split('|') for line in f]
		frame_shift_ms = hparams.hop_size / hparams.sample_rate
		hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / (3600)
		log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours))

	log('starting synthesis')
	mel_dir = os.path.join(args.input_dir, 'mels')
	wav_dir = os.path.join(args.input_dir, 'audio')
	with open(os.path.join(synth_dir, 'map.txt'), 'w') as file:
		for i, meta in enumerate(tqdm(metadata)):
			text = meta[5]
			mel_filename = os.path.join(mel_dir, meta[1])
			wav_filename = os.path.join(wav_dir, meta[0])
			mel_output_filename = synth.synthesize(text, i+1, synth_dir, None, mel_filename)

			file.write('{}|{}|{}|{}\n'.format(wav_filename, mel_filename, mel_output_filename, text))
	log('synthesized mel spectrograms at {}'.format(synth_dir))
	return os.path.join(synth_dir, 'map.txt')
Example #38
def run_live(args, checkpoint_path, hparams):
	#Log to Terminal without keeping any records in files
	log(hparams_debug_string())
	synth = Synthesizer()
	synth.load(checkpoint_path, hparams)

	#Generate fast greeting message
	greetings = 'Hello, Welcome to the Live testing tool. Please type a message and I will try to read it!'
	log(greetings)
	generate_fast(synth, greetings)

	#Interaction loop
	while True:
		try:
			text = input()
			generate_fast(synth, text)

		except KeyboardInterrupt:
			leave = 'Thank you for testing our features. See you soon.'
			log(leave)
			generate_fast(synth, leave)
			sleep(2)
			break
Example #39
	def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False,
			global_step=None, is_training=False, is_evaluating=False):
		"""
		Initializes the model for inference

		sets "mel_outputs" and "alignments" fields.

		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
		if mel_targets is None and stop_token_targets is not None:
			raise ValueError('no mel targets were provided but token_targets were given')
		if mel_targets is not None and stop_token_targets is None and not gta:
			raise ValueError('Mel targets are provided without corresponding token_targets')
		if not gta and self._hparams.predict_linear and linear_targets is None and is_training:
			raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!')
		if gta and linear_targets is not None:
			raise ValueError('Linear spectrogram prediction is not supported in GTA mode!')
		if is_training and self._hparams.mask_decoder and targets_lengths is None:
			raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
		if is_training and is_evaluating:
			raise RuntimeError('Model can not be in training and evaluation modes at the same time!')

		with tf.variable_scope('inference') as scope:
			batch_size = tf.shape(inputs)[0]
			hp = self._hparams
			assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
			if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
				assert global_step is not None

			#GTA is only used for predicting mels to train the Wavenet vocoder, so we omit post processing when doing GTA synthesis
			post_condition = hp.predict_linear and not gta

			# Embeddings ==> [batch_size, sequence_length, embedding_dim]
			embedding_table = tf.get_variable(
				'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
			embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)


			#Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
			encoder_cell = TacotronEncoderCell(
				EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'),
				EncoderRNN(is_training, size=hp.encoder_lstm_units,
					zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM'))

			encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

			#For shape visualization purpose
			enc_conv_output_shape = encoder_cell.conv_output_shape


			#Decoder Parts
			#Attention Decoder Prenet
			prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet')
			#Attention Mechanism
			attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, hparams=hp,
				mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths, smoothing=hp.smoothing, 
				cumulate_weights=hp.cumulative_weights)
			#Decoder LSTM Cells
			decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
				size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm')
			#Frames Projection layer
			frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform')
			#<stop_token> projection layer
			stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection')


			#Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
			decoder_cell = TacotronDecoderCell(
				prenet,
				attention_mechanism,
				decoder_lstm,
				frame_projection,
				stop_projection)


			#Define the helper for our decoder
			if is_training or is_evaluating or gta:
				self.helper = TacoTrainingHelper(batch_size, mel_targets, stop_token_targets, hp, gta, is_evaluating, global_step)
			else:
				self.helper = TacoTestHelper(batch_size, hp)


			#initial decoder state
			decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

			#Only use max iterations at synthesis time
			max_iters = hp.max_iters if not (is_training or is_evaluating) else None

			#Decode
			(frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
				CustomDecoder(decoder_cell, self.helper, decoder_init_state),
				impute_finished=False,
				maximum_iterations=max_iters,
				swap_memory=hp.tacotron_swap_with_cpu)


			# Reshape outputs to be one output per entry 
			#==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
			decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
			stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

		
			#Postnet
			postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions')

			#Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
			residual = postnet(decoder_output)

			#Project residual to same dimension as mel spectrogram 
			#==> [batch_size, decoder_steps * r, num_mels]
			residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
			projected_residual = residual_projection(residual)


			#Compute the mel spectrogram
			mel_outputs = decoder_output + projected_residual


			if post_condition:
				#Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
				#Post-processing Network to map mels to linear spectrograms using same architecture as the encoder
				post_processing_cell = TacotronEncoderCell(
				EncoderConvolutions(is_training, hparams=hp, scope='post_processing_convolutions'),
				EncoderRNN(is_training, size=hp.encoder_lstm_units,
					zoneout=hp.tacotron_zoneout_rate, scope='post_processing_LSTM'))

				expand_outputs = post_processing_cell(mel_outputs)
				linear_outputs = FrameProjection(hp.num_freq, scope='post_processing_projection')(expand_outputs)

			#Grab alignments from the final decoder state
			alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

			if is_training:
				self.ratio = self.helper._ratio
			self.inputs = inputs
			self.input_lengths = input_lengths
			self.decoder_output = decoder_output
			self.alignments = alignments
			self.stop_token_prediction = stop_token_prediction
			self.stop_token_targets = stop_token_targets
			self.mel_outputs = mel_outputs
			if post_condition:
				self.linear_outputs = linear_outputs
				self.linear_targets = linear_targets
			self.mel_targets = mel_targets
			self.targets_lengths = targets_lengths
			log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
			log('  Train mode:               {}'.format(is_training))
			log('  Eval mode:                {}'.format(is_evaluating))
			log('  GTA mode:                 {}'.format(gta))
			log('  Synthesis mode:           {}'.format(not (is_training or is_evaluating)))
			log('  embedding:                {}'.format(embedded_inputs.shape))
			log('  enc conv out:             {}'.format(enc_conv_output_shape))
			log('  encoder out:              {}'.format(encoder_outputs.shape))
			log('  decoder out:              {}'.format(decoder_output.shape))
			log('  residual out:             {}'.format(residual.shape))
			log('  projected residual out:   {}'.format(projected_residual.shape))
			log('  mel out:                  {}'.format(mel_outputs.shape))
			if post_condition:
				log('  linear out:               {}'.format(linear_outputs.shape))
			log('  <stop_token> out:         {}'.format(stop_token_prediction.shape))
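
The reshape after dynamic_decode is the reduction-factor trick: the decoder emits r = outputs_per_step mel frames per step, packed along the channel axis, and the reshape unpacks them into the time axis. A toy numpy check of the shape arithmetic (illustration only, not repository code):

import numpy as np

batch_size, decoder_steps, num_mels, r = 2, 5, 80, 2
frames_prediction = np.zeros((batch_size, decoder_steps, num_mels * r))
decoder_output = frames_prediction.reshape(batch_size, decoder_steps * r, num_mels)
assert decoder_output.shape == (2, 10, 80)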
Example #40
	def __init__(self, coordinator, metadata_filename, hparams):
		super(Feeder, self).__init__()
		self._coord = coordinator
		self._hparams = hparams
		self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
		self._train_offset = 0
		self._test_offset = 0

		# Load metadata
		self._mel_dir = os.path.join(os.path.dirname(metadata_filename), 'mels')
		self._linear_dir = os.path.join(os.path.dirname(metadata_filename), 'linear')
		with open(metadata_filename, encoding='utf-8') as f:
			self._metadata = [line.strip().split('|') for line in f]
			frame_shift_ms = hparams.hop_size / hparams.sample_rate
			hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / (3600)
			log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(self._metadata), hours))

		#Train test split
		if hparams.tacotron_test_size is None:
			assert hparams.tacotron_test_batches is not None

		test_size = (hparams.tacotron_test_size if hparams.tacotron_test_size is not None 
			else hparams.tacotron_test_batches * hparams.tacotron_batch_size)
		indices = np.arange(len(self._metadata))
		train_indices, test_indices = train_test_split(indices,
			test_size=test_size, random_state=hparams.tacotron_data_random_state)

		#Make sure test_indices is a multiple of batch_size else round up
		len_test_indices = self._round_up(len(test_indices), hparams.tacotron_batch_size)
		extra_test = test_indices[len_test_indices:]
		test_indices = test_indices[:len_test_indices]
		train_indices = np.concatenate([train_indices, extra_test])

		self._train_meta = list(np.array(self._metadata)[train_indices])
		self._test_meta = list(np.array(self._metadata)[test_indices])

		self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size

		if hparams.tacotron_test_size is None:
			assert hparams.tacotron_test_batches == self.test_steps

		#pad input sequences with the <pad_token> 0 ( _ )
		self._pad = 0
		#explicitly setting the padding to a value that doesn't originally exist in the spectrogram
		#to avoid any possible conflicts, without affecting the output range of the model too much
		if hparams.symmetric_mels:
			self._target_pad = -(hparams.max_abs_value + .1)
		else:
			self._target_pad = -0.1
		#Mark finished sequences with 1s
		self._token_pad = 1.

		with tf.device('/cpu:0'):
			# Create placeholders for inputs and targets. Don't specify batch size because we want
			# to be able to feed different batch sizes at eval time.
			self._placeholders = [
				tf.placeholder(tf.int32, shape=(None, None), name='inputs'),
				tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'),
				tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name='mel_targets'),
				tf.placeholder(tf.float32, shape=(None, None), name='token_targets'),
				tf.placeholder(tf.float32, shape=(None, None, hparams.num_freq), name='linear_targets'),
				tf.placeholder(tf.int32, shape=(None, ), name='targets_lengths'),
			]

			# Create queue for buffering data
			queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32], name='input_queue')
			self._enqueue_op = queue.enqueue(self._placeholders)
			self.inputs, self.input_lengths, self.mel_targets, self.token_targets, self.linear_targets, self.targets_lengths = queue.dequeue()

			self.inputs.set_shape(self._placeholders[0].shape)
			self.input_lengths.set_shape(self._placeholders[1].shape)
			self.mel_targets.set_shape(self._placeholders[2].shape)
			self.token_targets.set_shape(self._placeholders[3].shape)
			self.linear_targets.set_shape(self._placeholders[4].shape)
			self.targets_lengths.set_shape(self._placeholders[5].shape)

			# Create eval queue for buffering eval data
			eval_queue = tf.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32], name='eval_queue')
			self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
			self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, self.eval_token_targets, \
				self.eval_linear_targets, self.eval_targets_lengths = eval_queue.dequeue()

			self.eval_inputs.set_shape(self._placeholders[0].shape)
			self.eval_input_lengths.set_shape(self._placeholders[1].shape)
			self.eval_mel_targets.set_shape(self._placeholders[2].shape)
			self.eval_token_targets.set_shape(self._placeholders[3].shape)
			self.eval_linear_targets.set_shape(self._placeholders[4].shape)
			self.eval_targets_lengths.set_shape(self._placeholders[5].shape)
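
The padding constants initialized above (`self._pad`, `self._target_pad`, `self._token_pad`) are consumed when batches are assembled, which this excerpt does not show. Below is a minimal numpy sketch of the kind of helpers involved, including `_round_down`, which the train/test split above relies on; the names `_round_up`, `_round_down`, `_pad_input`, and `_prepare_targets` follow this Feeder's conventions but are assumptions here:

import numpy as np

def _round_up(x, multiple):
	#Smallest multiple of `multiple` that is >= x
	remainder = x % multiple
	return x if remainder == 0 else x + multiple - remainder

def _round_down(x, multiple):
	#Largest multiple of `multiple` that is <= x
	remainder = x % multiple
	return x if remainder == 0 else x - remainder

def _pad_input(x, length, pad=0):
	#Right-pad a 1-D token sequence with the <pad_token>
	return np.pad(x, (0, length - len(x)), mode='constant', constant_values=pad)

def _prepare_targets(targets, alignment, target_pad):
	#Pad every [T, num_mels] target to a common length that is a multiple of the
	#reduction factor r (`alignment`), using the out-of-range pad value
	max_len = max(len(t) for t in targets)
	data_len = _round_up(max_len, alignment)
	padded = [np.pad(t, [(0, data_len - len(t)), (0, 0)], mode='constant',
		constant_values=target_pad) for t in targets]
	return np.stack(padded), data_len
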
Example No. 41
	def initialize(self, y, c, g, input_lengths, x=None, synthesis_length=None):
		'''Initialize wavenet graph for train, eval and test cases.
		'''
		hparams = self._hparams
		self.is_training = x is not None
		self.is_evaluating = not self.is_training and y is not None
		#Set all convolutions to corresponding mode
		self.set_mode(self.is_training)

		log('Initializing Wavenet model. Dimensions (? = dynamic shape): ')
		log('  Train mode:                {}'.format(self.is_training))
		log('  Eval mode:                 {}'.format(self.is_evaluating))
		log('  Synthesis mode:            {}'.format(not (self.is_training or self.is_evaluating)))
		with tf.variable_scope('inference') as scope:
			#Training
			if self.is_training:
				batch_size = tf.shape(x)[0]
				#[batch_size, time_length, 1]
				self.mask = self.get_mask(input_lengths, maxlen=tf.shape(x)[-1]) #To be used in loss computation
				#[batch_size, channels, time_length]
				y_hat = self.step(x, c, g, softmax=False) #softmax is automatically computed inside softmax_cross_entropy if needed

				if is_mulaw_quantize(hparams.input_type):
					#[batch_size, time_length, channels]
					self.y_hat_q = tf.transpose(y_hat, [0, 2, 1])

				self.y_hat = y_hat
				self.y = y
				self.input_lengths = input_lengths

				#Graph extension for log saving
				#[batch_size, time_length]
				shape_control = (batch_size, tf.shape(x)[-1], 1)
				with tf.control_dependencies([tf.assert_equal(tf.shape(y), shape_control)]):
					y_log = tf.squeeze(y, [-1])
					if is_mulaw_quantize(hparams.input_type):
						self.y = y_log

				#The conv stack may emit y_hat with a trailing singleton dimension; squeeze it when present
				y_hat_log = tf.cond(tf.equal(tf.rank(y_hat), 4),
					lambda: tf.squeeze(y_hat, [-1]),
					lambda: y_hat)
				y_hat_log = tf.reshape(y_hat_log, [batch_size, hparams.out_channels, -1])

				if is_mulaw_quantize(hparams.input_type):
					#[batch_size, time_length]
					#Take argmax class indices (not the max probability) so the inverse
					#mu-law quantization below receives integer bins
					y_hat_log = tf.argmax(tf.nn.softmax(y_hat_log, axis=1), 1)

					y_hat_log = util.inv_mulaw_quantize(y_hat_log, hparams.quantize_channels)
					y_log = util.inv_mulaw_quantize(y_log, hparams.quantize_channels)

				else:
					#[batch_size, time_length]
					y_hat_log = sample_from_discretized_mix_logistic(
						y_hat_log, log_scale_min=hparams.log_scale_min)

					if is_mulaw(hparams.input_type):
						y_hat_log = util.inv_mulaw(y_hat_log, hparams.quantize_channels)
						y_log = util.inv_mulaw(y_log, hparams.quantize_channels)

				self.y_hat_log = y_hat_log
				self.y_log = y_log
				
				log('  inputs:                    {}'.format(x.shape))
				if self.local_conditioning_enabled():
					log('  local_condition:           {}'.format(c.shape))
				if self.has_speaker_embedding():
					log('  global_condition:          {}'.format(g.shape))
				log('  targets:                   {}'.format(y_log.shape))
				log('  outputs:                   {}'.format(y_hat_log.shape))


			#evaluating
			elif self.is_evaluating: 
				#[time_length, ]
				idx = 0
				length = input_lengths[idx]
				y_target = tf.reshape(y[idx], [-1])[:length]

				if c is not None:
					c = tf.expand_dims(c[idx, :, :length], axis=0)
					with tf.control_dependencies([tf.assert_equal(tf.rank(c), 3)]):
						c = tf.identity(c, name='eval_assert_c_rank_op')
				if g is not None:
					g = g[idx]

				#Start silence frame
				if is_mulaw_quantize(hparams.input_type):
					initial_value = mulaw_quantize(0, hparams.quantize_channels)
				elif is_mulaw(hparams.input_type):
					initial_value = mulaw(0.0, hparams.quantize_channels)
				else:
					initial_value = 0.0

				#[channels, ]
				if is_mulaw_quantize(hparams.input_type):
					initial_input = tf.one_hot(indices=initial_value, depth=hparams.quantize_channels, dtype=tf.float32)
					initial_input = tf.reshape(initial_input, [1, 1, hparams.quantize_channels])
				else:
					initial_input = tf.ones([1, 1, 1], tf.float32) * initial_value

				#Fast eval
				y_hat = self.incremental(initial_input, c=c, g=g, time_length=length,
					softmax=True, quantize=True, log_scale_min=hparams.log_scale_min)

				#Save targets and length for eval loss computation
				if is_mulaw_quantize(hparams.input_type):
					self.y_eval = tf.reshape(y[idx], [1, -1])[:, :length]
				else:
					self.y_eval = tf.expand_dims(y[idx], axis=0)[:, :length, :]
				self.eval_length = length

				if is_mulaw_quantize(hparams.input_type):
					#Recover integer class indices with argmax before inverting the quantization
					y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [-1])
					y_hat = inv_mulaw_quantize(y_hat, hparams.quantize_channels)
					y_target = inv_mulaw_quantize(y_target, hparams.quantize_channels)
				elif is_mulaw(hparams.input_type):
					y_hat = inv_mulaw(tf.reshape(y_hat, [-1]), hparams.quantize_channels)
					y_target = inv_mulaw(y_target, hparams.quantize_channels)
				else:
					y_hat = tf.reshape(y_hat, [-1])

				self.y_hat = y_hat
				self.y_target = y_target

				if self.local_conditioning_enabled():
					log('  local_condition:           {}'.format(c.shape))
				if self.has_speaker_embedding():
					log('  global_condition:          {}'.format(g.shape))
				log('  targets:                   {}'.format(y_target.shape))
				log('  outputs:                   {}'.format(y_hat.shape))

			#synthesizing
			else:
				if c is None:
					assert synthesis_length is not None
				else:
					#[batch_size, local_condition_time, local_condition_dimension(num_mels)]
					message = ('Expected 3-dimensional shape [batch_size(1), time_length, {}] for local condition features but found {}'.format(
							hparams.cin_channels, c.shape))
					with tf.control_dependencies([tf.assert_equal(tf.rank(c), 3, message=message)]):
						c = tf.identity(c, name='synthesis_assert_c_rank_op')

					Tc = tf.shape(c)[1]
					upsample_factor = audio.get_hop_size(self._hparams)

					#Overwrite length with respect to local condition features
					synthesis_length = Tc * upsample_factor

					#[batch_size, local_condition_dimension, local_condition_time]
					#time_length will be corrected using the upsample network
					c = tf.transpose(c, [0, 2, 1])

				#Start silence frame
				if is_mulaw_quantize(hparams.input_type):
					initial_value = mulaw_quantize(0, hparams.quantize_channels)
				elif is_mulaw(hparams.input_type):
					initial_value = mulaw(0.0, hparams.quantize_channels)
				else:
					initial_value = 0.0

				if is_mulaw_quantize(hparams.input_type):
					assert initial_value >= 0 and initial_value < hparams.quantize_channels
					initial_input = tf.one_hot(indices=initial_value, depth=hparams.quantize_channels, dtype=tf.float32)
					initial_input = tf.reshape(initial_input, [1, 1, hparams.quantize_channels])
				else:
					initial_input = tf.ones([1, 1, 1], tf.float32) * initial_value

				y_hat = self.incremental(initial_input, c=c, g=g, time_length=synthesis_length,
					softmax=True, quantize=True, log_scale_min=hparams.log_scale_min)

				if is_mulaw_quantize(hparams.input_type):
					#Recover integer class indices with argmax before inverting the quantization
					y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [-1])
					y_hat = util.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
				elif is_mulaw(hparams.input_type):
					y_hat = util.inv_mulaw(tf.reshape(y_hat, [-1]), hparams.quantize_channels)
				else:
					y_hat = tf.reshape(y_hat, [-1])

				self.y_hat = y_hat

				if self.local_conditioning_enabled():
					log('  local_condition:            {}'.format(c.shape))
				if self.has_speaker_embedding():
					log('  global_condition:           {}'.format(g.shape))
				log('  outputs:                    {}'.format(y_hat.shape))

		self.variables = tf.trainable_variables()
		self.ema = tf.train.ExponentialMovingAverage(decay=hparams.wavenet_ema_decay)
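
The `mulaw`, `inv_mulaw`, `mulaw_quantize`, and `inv_mulaw_quantize` helpers used throughout this example implement standard mu-law companding. Below is a minimal numpy sketch, under the assumption that mu = quantize_channels - 1 so quantized indices land in [0, quantize_channels); the project's actual `util` module may use a slightly different mu convention and operate on tensors as well:

import numpy as np

def mulaw(x, quantize_channels=256):
	#Compress x in [-1, 1]: sign(x) * log(1 + mu*|x|) / log(1 + mu)
	mu = quantize_channels - 1
	return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def inv_mulaw(y, quantize_channels=256):
	#Expand a compressed signal in [-1, 1] back to the linear domain
	mu = quantize_channels - 1
	return np.sign(y) * ((1.0 + mu) ** np.abs(y) - 1.0) / mu

def mulaw_quantize(x, quantize_channels=256):
	#Compress, then map [-1, 1] onto integer bins [0, quantize_channels)
	mu = quantize_channels - 1
	y = mulaw(x, quantize_channels)
	return np.rint((y + 1.0) / 2.0 * mu).astype(np.int64)

def inv_mulaw_quantize(y, quantize_channels=256):
	#Map integer bins back to [-1, 1], then expand
	mu = quantize_channels - 1
	return inv_mulaw(2.0 * np.asarray(y, dtype=np.float64) / mu - 1.0, quantize_channels)
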
Example No. 42
def train(args, log_dir, hparams):
	log('\n#############################################################\n')
	log('Tacotron Train\n')
	log('#############################################################\n')
	checkpoint = tacotron_train(args, log_dir, hparams)
	tf.reset_default_graph()
	if checkpoint is None:
		raise RuntimeError('Error occurred while training Tacotron, exiting!')
	log('\n#############################################################\n')
	log('Tacotron GTA Synthesis\n')
	log('#############################################################\n')
	input_path = tacotron_synthesize(args, hparams, checkpoint)
	log('\n#############################################################\n')
	log('Wavenet Train\n')
	log('#############################################################\n')
	wavenet_train(args, log_dir, hparams, input_path)
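
A sketch of how this end-to-end entry point is typically invoked from the command line. The flag names and log directory are assumptions, and `hparams` is assumed to be the project's default tf.contrib.training.HParams instance (whose `parse` method accepts comma-separated name=value overrides):

import argparse
import os

def main():
	parser = argparse.ArgumentParser()
	parser.add_argument('--base_dir', default='')
	parser.add_argument('--hparams', default='',
		help='Comma-separated name=value pairs overriding the default hyper parameters')
	args = parser.parse_args()

	log_dir = os.path.join(args.base_dir, 'logs-Tacotron-2')
	os.makedirs(log_dir, exist_ok=True)

	#Apply command-line overrides to the default hyper parameters, then run the full pipeline
	modified_hparams = hparams.parse(args.hparams)
	train(args, log_dir, modified_hparams)

if __name__ == '__main__':
	main()
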