Example #1
    def synthesize(self,
                   texts,
                   is_sequence=False,
                   pml_filenames=None,
                   tgt_filenames=None):
        if tgt_filenames:
            pml_filenames = tgt_filenames  # hacky way to handle tgts other than pml
        hp = self._hparams
        cleaner_names = [x.strip() for x in hp.cleaners.split(',')]

        if isinstance(texts, str):
            seqs = [
                np.asarray(text_to_sequence(texts, cleaner_names),
                           dtype=np.int32)
            ]
        elif is_sequence:
            seqs = [np.asarray(texts, dtype=np.int32)]
        else:
            seqs = [
                np.asarray(text_to_sequence(text, cleaner_names),
                           dtype=np.int32) for text in texts
            ]

        input_seqs = self._prepare_inputs(seqs)

        feed_dict = {
            self.model.inputs:
            np.asarray(input_seqs, dtype=np.int32),
            self.model.input_lengths:
            np.asarray([len(seq) for seq in seqs], dtype=np.int32)
        }

        if self.gta:
            np_targets = [
                np.load(pml_filename) for pml_filename in pml_filenames
            ]
            prepared_targets = self._prepare_targets(np_targets,
                                                     hp.outputs_per_step)
            feed_dict[self.targets] = prepared_targets
            assert len(np_targets) == len(texts)

        alignments, = self.session.run([self.alignments], feed_dict=feed_dict)

        if not self.cut_lengths:
            max_length = hp.max_iters
            alignments = self.pad_along_axis(alignments, max_length, axis=2)

        if len(alignments) == 1:
            return alignments[0]

        return alignments
    def eval(self, batch):
        hparams = self._hparams
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seqs = [
            np.asarray(text_to_sequence(text, cleaner_names)) for text in batch
        ]
        input_lengths = [len(seq) for seq in seqs]
        seqs = self._prepare_inputs(seqs)
        feed_dict = {
            self.model.inputs: seqs,
            self.model.input_lengths: np.asarray(input_lengths,
                                                 dtype=np.int32),
        }
        features, stop_tokens = self.session.run(
            [self.model.final_outputs, self.stop_token_outputs],
            feed_dict=feed_dict)

        #Get feature output lengths for the entire batch from stop_tokens outputs
        output_lengths = self._get_output_lengths(stop_tokens)
        features = [
            feature[:output_length, :]
            for feature, output_length in zip(features, output_lengths)
        ]
        assert len(features) == len(batch)

        wavs = []
        for i, feature in enumerate(features):
            np.save('tacotron_output/{}.npy'.format(i + 1), feature)
            wavs.append(audio.synthesize(feature, hparams))
        return np.concatenate(wavs)
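
Most examples on this page share the same front end: clean the text, map it to an integer ID sequence with text_to_sequence, then zero-pad the batch before building the feed_dict. A minimal standalone sketch of that pattern (the import path and the pad-with-zeros behaviour are assumptions, not taken from the example above):

import numpy as np
from tacotron.utils.text import text_to_sequence  # assumed import path

def texts_to_batch(texts, cleaners='english_cleaners'):
    """Convert raw strings into a zero-padded int32 batch plus true lengths."""
    cleaner_names = [x.strip() for x in cleaners.split(',')]
    seqs = [np.asarray(text_to_sequence(t, cleaner_names), dtype=np.int32)
            for t in texts]
    lengths = np.asarray([len(s) for s in seqs], dtype=np.int32)
    max_len = int(lengths.max())
    batch = np.stack([np.pad(s, (0, max_len - len(s))) for s in seqs])
    return batch, lengths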
Example #3
    def _get_next_example(self):
        """Gets a single example (input, mel_target, token_target, linear_target, mel_length) from_ disk
		"""
        if self._train_offset >= len(self._train_meta):
            self._train_offset = 0
            np.random.shuffle(self._train_meta)

        meta = self._train_meta[self._train_offset]
        self._train_offset += 1

        text = meta[5]

        input_data = np.asarray(text_to_sequence(text, self._cleaner_names),
                                dtype=np.int32)

        num = int((meta[1].split('-')[2]).split('.')[0])
        folder_num = str(num // 1000)
        sub_folder_num = str((num % 1000) // 100)

        mel_target = np.load(
            os.path.join(self._mel_dir, folder_num, sub_folder_num, meta[1]))
        #Create parallel sequences containing zeros to represent a non finished sequence
        token_target = np.asarray([0.] * (len(mel_target) - 1))
        linear_target = np.load(
            os.path.join(self._linear_dir, folder_num, sub_folder_num,
                         meta[2]))
        return (input_data, mel_target, token_target, linear_target,
                len(mel_target))
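
The folder arithmetic above shards the feature files into buckets of 1000 with sub-buckets of 100, so no single directory grows too large. A quick illustration of the mapping (for illustration only, not code from the repository):

def shard_path(num):
    """File index -> (folder, sub_folder) under the bucketing scheme above."""
    return str(num // 1000), str((num % 1000) // 100)

assert shard_path(0) == ('0', '0')
assert shard_path(999) == ('0', '9')
assert shard_path(1234) == ('1', '2')  # bucket 1, sub-bucket 2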
Example #4
	def _get_test_groups(self):
		meta = self._test_meta[self._test_offset]
		self._test_offset += 1

		text = meta[5]

		if self._hparams.preload_spectrogram:
			if meta[1] in self._mel_target:
				mel_target = self._mel_target[meta[1]]
			else:
				mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
				self._mel_target[meta[1]] = mel_target
			if meta[2] in self._linear_target:
				linear_target = self._linear_target[meta[2]]
			else:
				linear_target = np.load(os.path.join(self._linear_dir, meta[2]))
				self._linear_target[meta[2]] = linear_target
		else:
			mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
			linear_target = np.load(os.path.join(self._linear_dir, meta[2]))

		input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
		#Create parallel sequences containing zeros to represent a non finished sequence
		token_target = np.asarray([0.] * (len(mel_target) - 1))
		return (input_data, mel_target, token_target, linear_target, len(mel_target))
	def eval(self, batch):
		hparams = self._hparams
		cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
		seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in batch]
		input_lengths = [len(seq) for seq in seqs]
		seqs = self._prepare_inputs(seqs)
		feed_dict = {
			self.model.inputs: seqs,
			self.model.input_lengths: np.asarray(input_lengths, dtype=np.int32),
		}

		linears, stop_tokens = self.session.run([self.linear_outputs, self.stop_token_prediction], feed_dict=feed_dict)

		#Get Mel/Linear lengths for the entire batch from stop_tokens predictions
		target_lengths = self._get_output_lengths(stop_tokens)

		#Take off the batch wise padding
		linears = [linear[:target_length, :] for linear, target_length in zip(linears, target_lengths)]
		assert len(linears) == len(batch)

		#save wav (linear -> wav)
		results = []
		for i, linear in enumerate(linears):
			linear_wav = self.session.run(self.linear_wav_outputs, feed_dict={self.linear_spectrograms: linear})
			wav = audio.inv_preemphasis(linear_wav, hparams.preemphasis)
			results.append(wav)
		return np.concatenate(results)
Example #6
    def inference(self, texts):
        cleaner_names = [x.strip() for x in self._hp.cleaners.split(',')]
        seqs = [
            np.asarray(text_to_sequence(text, cleaner_names)) for text in texts
        ]
        input_lengths = [len(seq) for seq in seqs]
        size_per_device = len(seqs) // self._hp.tacotron_num_gpus

        input_seqs = None
        split_infos = []
        for i in range(self._hp.tacotron_num_gpus):
            device_input = seqs[size_per_device * i:size_per_device * (i + 1)]
            device_input, max_seq_len = self._prepare_inputs(device_input)
            input_seqs = np.concatenate(
                (input_seqs, device_input),
                axis=1) if input_seqs is not None else device_input
            split_infos.append([max_seq_len, 0, 0, 0])

        feed_dict = {
            self.inputs: input_seqs,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
            self.split_infos: np.asarray(split_infos, dtype=np.int32)
        }

        self.sess.run([
            tf.global_variables_initializer(),
            tf.local_variables_initializer()
        ])
        linear = self.sess.run(self.linear_outputs, feed_dict=feed_dict)
        wav = self.sess.run(self.GLGPU_lin_outputs,
                            feed_dict={self.GLGPU_lin_inputs: linear[0]})
        return wav
Example #7
	def synthesize(self, text, index, out_dir, log_dir, mel_filename):
		cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
		seq = text_to_sequence(text, cleaner_names)
		feed_dict = {
			self.model.inputs: [np.asarray(seq, dtype=np.int32)],
			self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
		}

		if self.gta:
			feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(1, -1, 80)

		mels, alignment = self.session.run([self.mel_outputs, self.alignment], feed_dict=feed_dict)

		mels = mels.reshape(-1, 80) #Thanks to @imdatsolak for pointing this out

		# Write the spectrogram to disk
		# Note: outputs mel-spectrogram files and target ones have same names, just different folders
		mel_filename = os.path.join(out_dir, 'ljspeech-mel-{:05d}.npy'.format(index))
		np.save(mel_filename, mels, allow_pickle=False)

		if log_dir is not None:
			#save wav
			wav = audio.inv_mel_spectrogram(mels.T)
			audio.save_wav(wav, os.path.join(log_dir, 'wavs/ljspeech-wav-{:05d}.wav'.format(index)))

			#save alignments
			plot.plot_alignment(alignment, os.path.join(log_dir, 'plots/ljspeech-alignment-{:05d}.png'.format(index)),
				info='{}'.format(text), split_title=True)

			#save mel spectrogram plot
			plot.plot_spectrogram(mels, os.path.join(log_dir, 'plots/ljspeech-mel-{:05d}.png'.format(index)),
				info='{}'.format(text), split_title=True)

		return mel_filename
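
In GTA (ground-truth aligned) mode the decoder is teacher-forced on the reference mel, so the target fed to model.mel_targets needs a batch axis: (1, frames, num_mels), with 80 being this repository's num_mels. A hedged sketch of that preparation (the file name is hypothetical):

import numpy as np

num_mels = 80  # assumed to equal hparams.num_mels
mel = np.load('ljspeech-mel-00001.npy')    # hypothetical target, shape (frames, 80)
mel_target = mel.reshape(1, -1, num_mels)  # add the batch axis for the feed_dict
assert mel_target.shape == (1, mel.shape[0], num_mels)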
Example #8
    def run(self, text, speaker_id, play=True):
        hparams = self._hparams
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq = text_to_sequence(text, cleaner_names)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
            self.model.speaker_ids: np.asarray([speaker_id], dtype=np.int32)
        }

        mels, alignment = self.session.run([self.mel_outputs, self.alignment], feed_dict=feed_dict)
        mels = mels.reshape(-1, hparams.num_mels)  # Thanks to @imdatsolak for pointing this out

        if play:
            # Generate wav and read it
            wav = audio.inv_mel_spectrogram(mels.T, hparams)
            audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way

            chunk = 512
            f = wave.open('temp.wav', 'rb')
            p = pyaudio.PyAudio()
            stream = p.open(format=p.get_format_from_width(f.getsampwidth()),
                            channels=f.getnchannels(),
                            rate=f.getframerate(),
                            output=True)
            data = f.readframes(chunk)
            while data:
                stream.write(data)
                data = f.readframes(chunk)

            stream.stop_stream()
            stream.close()
            p.terminate()

        return mels
Example #9
    def synthesize(self, text, idx, out_dir, mel_filename):
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq = text_to_sequence(text, cleaner_names)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
        }

        mels, alignment = self.session.run([self.mel_outputs, self.alignment],
                                           feed_dict=feed_dict)

        mels = mels.reshape(-1, hparams.num_mels)

        wav = audio.inv_mel_spectrogram(mels.T)
        audio.save_wav(wav,
                       os.path.join(out_dir, 'audio-{:02d}.wav'.format(idx)))

        # save mel spectrogram plot
        plot.plot_spectrogram(mels,
                              os.path.join(out_dir,
                                           'mel-{:02d}.png'.format(idx)),
                              info='{}'.format(text),
                              split_title=True)

        return 1
Example #10
    def _get_next_example(self):
        """Gets a single example (input, mel_target, token_target, linear_target, mel_length) from_ disk
		"""
        if self._train_offset >= len(self._train_meta):
            self._train_offset = 0
            np.random.shuffle(self._train_meta)

        meta = self._train_meta[self._train_offset]
        self._train_offset += 1
        # Strip the literal 'mel-' prefix (str.lstrip strips characters, not a prefix)
        dur_file = meta[1][len('mel-'):]
        dur_file = os.path.join(self.duration_dir, dur_file)
        dur = np.squeeze(np.load(dur_file))
        alignment = convert_dur2alignment(dur)
        text = meta[5]

        input_data = np.asarray(text_to_sequence(text), dtype=np.int32)
        mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
        # Create parallel sequences containing zeros to represent a non finished sequence
        token_target = np.asarray([0.] * (len(mel_target) - 1))
        linear_target = np.load(os.path.join(self._linear_dir, meta[2]))
        if len(dur) != len(input_data):
            raise RuntimeError('wrong dur')

        return (input_data, mel_target, token_target, linear_target, dur,
                alignment, len(mel_target))
Example #11
def preprocess_text(texts):
    seqs = [
        np.array(text_to_sequence(text, ['english_cleaners']))
        for text in texts
    ]
    input_lengths = [seq.shape[0] for seq in seqs]
    max_len = max(input_lengths)
    seqs = np.stack([_pad_input(x, max_len) for x in seqs])
    return seqs, input_lengths
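
A possible invocation of preprocess_text, assuming _pad_input right-pads each sequence to max_len with the padding ID (that behaviour is an assumption; _pad_input is not shown here):

seqs, input_lengths = preprocess_text(['Hello world.', 'Hi.'])
print(seqs.shape)      # (2, max_len): both rows padded to the longer sequence
print(input_lengths)   # true lengths before padding, e.g. [13, 4]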
Example #12
    def synthesize(self,
                   texts,
                   pml_filenames=None,
                   to_wav=False,
                   num_workers=4,
                   **kwargs):
        hp = self._hparams

        kwargs.setdefault('pp_mcep', self.cfg.pp_mcep)
        kwargs.setdefault('spec_type', hp.spec_type)

        cleaner_names = [x.strip() for x in hp.cleaners.split(',')]
        seqs = [
            np.asarray(text_to_sequence(text, cleaner_names), dtype=np.int32)
            for text in texts
        ]
        input_seqs = self._prepare_inputs(seqs)

        feed_dict = {
            self.model.inputs:
            np.asarray(input_seqs, dtype=np.int32),
            self.model.input_lengths:
            np.asarray([len(seq) for seq in seqs], dtype=np.int32)
        }

        if self.gta or self.eal:
            np_targets = [
                np.load(pml_filename) for pml_filename in pml_filenames
            ]
            prepared_targets = self._prepare_targets(np_targets,
                                                     hp.outputs_per_step)
            feed_dict[self.targets] = prepared_targets
            assert len(np_targets) == len(texts)

        if self.flag_online:
            pml_features_matrix = self.session.run(self.pml_outputs_eal,
                                                   feed_dict=feed_dict)
        else:
            pml_features_matrix = self.session.run(self.pml_outputs,
                                                   feed_dict=feed_dict)

        if to_wav:
            executor = ProcessPoolExecutor(max_workers=num_workers)
            futures = []

            for pml_features in pml_features_matrix:
                futures.append(
                    executor.submit(
                        partial(_pml_to_wav, pml_features, self.cfg,
                                **kwargs)))

            wavs = [future.result() for future in futures]
            return wavs

        return pml_features_matrix
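
The ProcessPoolExecutor fan-out at the end is a generic pattern for CPU-bound vocoding: submit one job per utterance, then collect the results in submission order. A self-contained sketch of the same pattern, with slow_vocode standing in for _pml_to_wav:

from concurrent.futures import ProcessPoolExecutor
from functools import partial

def slow_vocode(features, gain=1.0):
    # placeholder for a CPU-heavy feature-to-waveform conversion
    return [x * gain for x in features]

if __name__ == '__main__':
    batch = [[0.1, 0.2], [0.3], [0.4, 0.5, 0.6]]
    with ProcessPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(partial(slow_vocode, f, gain=2.0)) for f in batch]
        wavs = [future.result() for future in futures]  # submission order is preserved
    print(wavs)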
Example #13
    def synthesize(self, text, mel, out_dir, idx):
        hparams = self._hparams
        r = hparams.outputs_per_step

        T2_output_range = ((-hparams.max_abs_value, hparams.max_abs_value)
                           if hparams.symmetric_mels else
                           (0, hparams.max_abs_value))

        target = np.load(mel)
        target = np.clip(target, T2_output_range[0], T2_output_range[1])
        target_length = target.shape[0]

        targets = padding_targets(target, r, T2_output_range[0])
        new_target_length = targets.shape[0]

        pyin, text = get_pyin(text)
        print(text)

        inputs = [np.asarray(text_to_sequence(pyin.split(' ')))]
        print(inputs)
        input_lengths = [len(inputs[0])]

        feed_dict = {
            self.inputs: np.asarray(inputs, dtype=np.int32),
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
            self.targets: np.asarray([targets], dtype=np.float32),
            self.target_lengths: np.asarray([new_target_length],
                                            dtype=np.int32),
        }

        mels, alignments = self.session.run(
            [self.mel_outputs, self.alignments], feed_dict=feed_dict)

        mel = mels[0]
        print('pred_mel.shape', mel.shape)
        mel = np.clip(mel, T2_output_range[0], T2_output_range[1])
        mel = mel[:target_length, :]
        mel = (mel + T2_output_range[1]) / (2 * T2_output_range[1])
        mel = np.clip(mel, 0.0, 1.0)  # 0~1.0
        print(target_length, new_target_length)

        pred_mel_path = os.path.join(out_dir, 'mel-{}-pred.npy'.format(idx))
        np.save(pred_mel_path, mel, allow_pickle=False)
        plot.plot_spectrogram(mel,
                              pred_mel_path.replace('.npy', '.png'),
                              title='')

        alignment = alignments[0]
        alignment_path = os.path.join(out_dir, 'align-{}.png'.format(idx))
        plot.plot_alignment(alignment, alignment_path, title='')
        #alignment_path = os.path.join(out_dir, 'align-{}.npy'.format(idx))
        #np.save(alignment_path, alignment, allow_pickle=False)

        return pred_mel_path, alignment_path
Example #14
	def _get_test_groups(self):
		meta = self._test_meta[self._test_offset]
		self._test_offset += 1

		text = meta[5]

		input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
		mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
		#Create parallel sequences containing zeros to represent a non finished sequence
		token_target = np.asarray([0.] * (len(mel_target) - 1))
		linear_target = np.load(os.path.join(self._linear_dir, meta[2]))
		return (input_data, mel_target, token_target, linear_target, len(mel_target))
Example #15
    def synthesize(self, text, mel_filename=None):  # mel_filename is required when self.gta is True
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq = text_to_sequence(text, cleaner_names)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
        }

        if self.gta:
            feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(
                1, -1, 80)

        if self.gta or not hparams.predict_linear:
            mels, alignment = self.session.run(
                [self.mel_outputs, self.alignment], feed_dict=feed_dict)

        else:
            linear, mels, alignment = self.session.run(
                [self.linear_outputs, self.mel_outputs, self.alignment],
                feed_dict=feed_dict)
            linear = linear.reshape(-1, hparams.num_freq)

        mels = mels.reshape(
            -1, hparams.num_mels)  #Thanks to @imdatsolak for pointing this out

        wav = audio.inv_mel_spectrogram(mels.T)
        out = io.BytesIO()
        audio.save_wav(wav, out)
        return out.getvalue()
Example #16
    def _get_test_groups(self):
        meta = self._test_meta[self._test_offset]
        self._test_offset += 1

        text = meta[4]

        input_data = np.asarray(text_to_sequence(text, self._cleaner_names),
                                dtype=np.int32)
        mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
        #Create parallel sequences containing zeros to represent a non finished sequence
        token_target = np.asarray([0.] * (len(mel_target) - 1))
        return (input_data, mel_target, token_target, len(mel_target))
Example #17
def reverse_equal_original(text):
    print('text:', text, 'len:', len(text))
    seq_text = text_to_sequence(text, cleaner_names)
    print('seq text:', seq_text, 'len:', len(seq_text))
    textReverse_seq_text = sequence_to_text(seq_text)
    print('textReverse_seq_text:', textReverse_seq_text, 'len:',
          len(textReverse_seq_text))
    if len(textReverse_seq_text) - 1 != len(text) or textReverse_seq_text[-1] != '~':
        return False
    if len(text) != len(seq_text) - 1:
        return False
    return True
    return True
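
reverse_equal_original checks the encode/decode round trip: sequence_to_text(text_to_sequence(text)) should reproduce the input plus a trailing '~' end-of-sequence marker (cleaner_names is a module-level global in this snippet). The same property as a tiny test, assuming the classic Tacotron text module layout:

from tacotron.utils.text import text_to_sequence, sequence_to_text  # assumed path

def round_trips(text, cleaner_names=('english_cleaners',)):
    decoded = sequence_to_text(text_to_sequence(text, list(cleaner_names)))
    return decoded == text + '~'  # '~' is the EOS/padding symbol

assert round_trips('hello world')  # holds only for text the cleaners leave unchanged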
Example #18
    def _get_next_example(self):
        """Gets a single example (input, mel_target, token_target, linear_target, mel_length) from_ disk
        """
        if self._train_offset >= len(self._train_meta):
            self._train_offset = 0
            np.random.shuffle(self._train_meta)

        meta = self._train_meta[self._train_offset]
        self._train_offset += 1

        if not self._hparams.tacotron_phoneme_transcription:
            text = meta[5]
            input_data = np.asarray(text_to_sequence(text,
                                                     self._cleaner_names),
                                    dtype=np.int32)

        # Phoneme transcription
        else:
            '''
            text_as_words = meta[5].split(' ')
            text_as_phonemes = meta[6].split(' ')
            assert len(text_as_words) == len(text_as_phonemes)
            for i in range(0, len(text_as_words)):
                random_number = np.random.random()
                if random_number < self._proba_phoneme:
                    text_as_words[i] = text_as_phonemes[i]
            text = " ".join(text_as_words)
            '''
            text = meta[6]
            input_data = np.asarray(ipa_to_articulatory_sequence(text),
                                    dtype=np.int32)

        if self._hparams.tacotron_multi_speaker:
            speaker_id = [0 for i in range(int(self._nb_speaker))]
            speaker_id[int(meta[7])] = 1
        mel_reference = np.load(os.path.join(self._mel_dir, meta[2]))

        # input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
        mel_target = np.load(os.path.join(self._mel_dir, meta[8]))
        # Create parallel sequences containing zeros to represent a non finished sequence
        token_target = np.asarray([0.] * (len(mel_target) - 1))
        linear_target = np.load(os.path.join(self._linear_dir, meta[2]))

        if self._hparams.tacotron_multi_speaker:
            return (input_data, mel_target, token_target, linear_target,
                    len(mel_target), speaker_id, mel_reference)
        else:
            return (input_data, mel_target, token_target, linear_target,
                    len(mel_target), mel_reference)
Example #19
	def eval(self, text):
		hparams = self._hparams
		cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
		seqs = [np.asarray(text_to_sequence(text, cleaner_names))]
		input_lengths = [len(seq) for seq in seqs]
		feed_dict = {
			self.model.inputs: seqs,
			self.model.input_lengths: np.asarray(input_lengths, dtype=np.int32),
		}
		linear_wavs = self.session.run(self.linear_wav_outputs, feed_dict=feed_dict)
		wav = audio.inv_preemphasis(linear_wavs, hparams.preemphasis)
		out = io.BytesIO()
		audio.save_wav(wav, out, sr=hparams.sample_rate)
		return out.getvalue()
Example #20
    def synthesize_check(self,
                         texts,
                         pml_filenames=None,
                         tgt_filenames=None,
                         to_wav=False,
                         num_workers=4,
                         **kwargs):
        if tgt_filenames is None:
            tgt_filenames = pml_filenames
        hp = self._hparams

        kwargs.setdefault('pp_mcep', self.cfg.pp_mcep)
        kwargs.setdefault('spec_type', hp.spec_type)

        cleaner_names = [x.strip() for x in hp.cleaners.split(',')]
        seqs = [
            np.asarray(text_to_sequence(text, cleaner_names), dtype=np.int32)
            for text in texts
        ]
        input_seqs = self._prepare_inputs(seqs)

        feed_dict = {
            self.model.inputs:
            np.asarray(input_seqs, dtype=np.int32),
            self.model.input_lengths:
            np.asarray([len(seq) for seq in seqs], dtype=np.int32)
        }

        if self.gta or self.eal:
            np_targets = [
                np.load(tgt_filename) for tgt_filename in tgt_filenames
            ]
            prepared_targets = self._prepare_targets(np_targets,
                                                     hp.outputs_per_step)
            feed_dict[self.targets] = prepared_targets
            assert len(np_targets) == len(texts)

        alignments, = self.session.run([self.model.alignments],
                                       feed_dict=feed_dict)
        #         alignments, pml_intermediates = self.session.run([self.model.alignments, self.model.pml_intermediates], feed_dict=feed_dict)

        if True:  # not self.cut_lengths
            max_length = hp.max_iters
            alignments = self.pad_along_axis(alignments, max_length, axis=2)

        if len(alignments) == 1:
            return alignments[0]

        return alignments
    def synthesize(self, texts, basenames, out_dir, log_dir):
        hparams = self._hparams
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seqs = [
            np.asarray(text_to_sequence(text, cleaner_names)) for text in texts
        ]
        input_lengths = [len(seq) for seq in seqs]
        seqs = self._prepare_inputs(seqs)
        feed_dict = {
            self.model.inputs: seqs,
            self.model.input_lengths: np.asarray(input_lengths,
                                                 dtype=np.int32),
        }

        features, alignments, stop_tokens = self.session.run(
            [self.final_outputs, self.alignments, self.stop_token_outputs],
            feed_dict=feed_dict)

        #Get feature output lengths for the entire batch from stop_tokens outputs
        output_lengths = self._get_output_lengths(stop_tokens)
        features = [
            feature[:output_length, :]
            for feature, output_length in zip(features, output_lengths)
        ]
        assert len(features) == len(texts)

        for i, feature in enumerate(features):
            # Write the predicted features to disk
            # Note: outputs files and target ones have same names, just different folders
            np.save(os.path.join(out_dir, 'feature-{:03d}.npy'.format(i + 1)),
                    feature,
                    allow_pickle=False)

            if log_dir is not None:
                #save alignments
                plot.plot_alignment(
                    alignments[i],
                    os.path.join(log_dir,
                                 'plots/alignment-{:03d}.png'.format(i + 1)),
                    info='{}'.format(texts[i]),
                    split_title=True)

                #save wav
                wav = audio.synthesize(feature, hparams)
                audio.save_wav(
                    wav,
                    os.path.join(log_dir, 'wavs/wav-{:03d}.wav'.format(i + 1)),
                    hparams)
Example #22
    def _get_next_example(self):
        if self._offset >= len(self._metadata):
            self._offset = 0
            np.random.shuffle(self._metadata)
        meta = self._metadata[self._offset]
        self._offset += 1

        text = meta[5]

        input_data = np.asarray(text_to_sequence(text, self._cleaner_names),
                                dtype=np.int32)
        mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
        token_target = np.asarray([0.] * len(mel_target))
        linear_target = np.load(os.path.join(self._linear_dir, meta[2]))
        return (input_data, mel_target, token_target, linear_target,
                len(mel_target))
Example #23
    def _get_test_groups(self):
        meta = self._test_meta[self._test_offset]
        self._test_offset += 1
        # Strip the literal 'mel-' prefix (str.lstrip strips characters, not a prefix)
        dur_file = meta[1][len('mel-'):]
        dur_file = os.path.join(self.duration_dir, dur_file)
        dur = np.squeeze(np.load(dur_file))
        alignment = convert_dur2alignment(dur)
        text = meta[5]

        input_data = np.asarray(text_to_sequence(text), dtype=np.int32)
        mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
        # Create parallel sequences containing zeros to represent a non finished sequence
        token_target = np.asarray([0.] * (len(mel_target) - 1))
        linear_target = np.load(os.path.join(self._linear_dir, meta[2]))

        return (input_data, mel_target, token_target, linear_target, dur,
                alignment, len(mel_target))
Example #24
    def _get_test_groups(self):
        meta = self._test_meta[self._test_offset]
        self._test_offset += 1

        # In train.txt: meta[0] = audio (.f32), meta[1] = mel, meta[2] = linear, meta[5] = text sentence
        text = meta[5]

        input_data = np.asarray(text_to_sequence(text, self._cleaner_names),
                                dtype=np.int32)
        mel_target = np.fromfile(os.path.join(self._audio_dir, meta[0]),
                                 dtype='float32')
        # reshape, not np.resize: reinterpret the flat f32 buffer as (frames, num_mels)
        mel_target = mel_target.reshape(-1, self._hparams.num_mels)
        #Create parallel sequences containing zeros to represent a non finished sequence
        token_target = np.asarray([0.] * (len(mel_target) - 1))
        linear_target = np.load(os.path.join(self._linear_dir, meta[2]))
        return (input_data, mel_target, token_target, linear_target,
                len(mel_target))
Example #25
    def predict(self, text, out_dir, speaker_id):
        hparams = self._hparams
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq = text_to_sequence(text, cleaner_names)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
            self.model.speaker_ids: np.asarray([speaker_id], dtype=np.int32)
        }

        mels, alignment = self.session.run([self.mel_outputs, self.alignment], feed_dict=feed_dict)
        mels = mels.reshape(-1, hparams.num_mels)  # Thanks to @imdatsolak for pointing this out

        wav = audio.inv_mel_spectrogram(mels.T, hparams)
        audio.save_wav(wav, out_dir, sr=hparams.sample_rate)

        return out_dir
Example #26
    def _get_test_groups(self):
        meta = self._test_meta[self._test_offset]
        self._test_offset += 1

        dataset = meta[0]
        text = meta[7]
        emt_label = meta[8]
        spk_label = meta[9]

        input_data = np.asarray(text_to_sequence(text, self._cleaner_names),
                                dtype=np.int32)
        mel_target = np.load(
            os.path.join(self.data_folder, dataset, 'mels', meta[2]))
        #Create parallel sequences containing zeros to represent a non finished sequence
        token_target = np.asarray([0.] * (len(mel_target) - 1))

        # linear_target_path = os.path.join(self.data_folder, dataset, 'linear', meta[3])
        # if hparams.predict_linear:
        # 	if os.path.exists(linear_target_path):
        # 		linear_target = np.load(linear_target_path)
        # 	else:
        # 		raise ("linear target does not exist -", linear_target_path)
        # else:
        # 	linear_target = np.zeros((1,hparams.num_freq))

        #check for speaker embedding
        # spk_emb_path = os.path.join(self.data_folder,dataset,'spkemb', meta[4])
        # if os.path.exists(spk_emb_path):
        # 	spk_emb = np.load(spk_emb_path)
        # else:
        # 	spk_emb = np.zeros(hparams.tacotron_spk_emb_dim)
        # assert spk_emb.shape[0] == hparams.tacotron_spk_emb_dim

        #just use the same sample for the reference when testing
        ref_mel_emt = mel_target
        ref_mel_spk = mel_target

        # Using the mel target lengths as the lengths for attention; must adjust accordingly
        assert (ref_mel_emt == mel_target).all()

        # print("in gen", dataset, input_data[0:5], mel_target[0][0:5], emt_label, spk_label)

        # return (input_data, mel_target, token_target, linear_target, spk_emb, emt_label, spk_label, ref_mel_emt, ref_mel_spk, len(mel_target))
        return (input_data, mel_target, token_target, emt_label, spk_label,
                ref_mel_emt, ref_mel_spk, len(mel_target))
Example #27
    def _get_next_example(self):
        """Gets a single example (input, mel_target, token_target, linear_target, mel_length) from_ disk
        """
        if self._train_offset >= len(self._train_meta):
            self._train_offset = 0
            np.random.shuffle(self._train_meta)

        meta = self._train_meta[self._train_offset]
        self._train_offset += 1

        text = meta[-1].strip().split(' ')

        input_data = np.asarray(text_to_sequence(text), dtype=np.int32)
        mel_target = np.load(os.path.join(self._mel_dir, meta[0]))
        #Create parallel sequences containing zeros to represent a non finished sequence
        token_target = np.asarray([0.] * (len(mel_target) - 1))
        return (input_data, mel_target, token_target, len(mel_target))
Example #28
    def eval(self, text):
        hparams = self._hparams
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seqs = [np.asarray(text_to_sequence(text, cleaner_names))]

        input_lengths = [len(seq) for seq in seqs]

        size_per_device = len(seqs) // self._hparams.tacotron_num_gpus

        #Pad inputs according to each GPU max length
        input_seqs = None
        split_infos = []
        for i in range(self._hparams.tacotron_num_gpus):
            device_input = seqs[size_per_device * i:size_per_device * (i + 1)]
            device_input, max_seq_len = self._prepare_inputs(device_input)
            input_seqs = np.concatenate(
                (input_seqs, device_input),
                axis=1) if input_seqs is not None else device_input
            split_infos.append([max_seq_len, 0, 0, 0])

        feed_dict = {
            self.inputs: input_seqs,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
        }

        feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)

        #linear_wavs = self.session.run(self.linear_wav_outputs, feed_dict=feed_dict)
        linear_wavs, linears, mels, alignments, stop_tokens = self.session.run(
            [
                self.linear_wav_outputs, self.linear_outputs, self.mel_outputs,
                self.alignments, self.stop_token_prediction
            ],
            feed_dict=feed_dict)
        linear_wavs = [
            linear_wav for gpu_linear_wav in linear_wavs
            for linear_wav in gpu_linear_wav
        ]

        wav = audio.inv_preemphasis(linear_wavs, hparams.preemphasis)
        #audio.save_wav(wav, 'wavs/wav-1-linear.wav', sr=hparams.sample_rate)

        out = io.BytesIO()
        audio.save_wav(wav, out, sr=hparams.sample_rate)
        return out.getvalue(), wav
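
The multi-GPU path pads each device's slice to that slice's own max length and records it in split_infos, so the graph can split the concatenated batch back apart per device. The bookkeeping in isolation (shapes only, no TensorFlow; a sketch, not the repository's helper):

import numpy as np

def pad_device_slices(seqs, num_gpus):
    """Pad each device slice to its own max length; return the batch and split_infos."""
    size_per_device = len(seqs) // num_gpus
    input_seqs, split_infos = None, []
    for i in range(num_gpus):
        device = seqs[size_per_device * i:size_per_device * (i + 1)]
        max_len = max(len(s) for s in device)
        padded = np.stack([np.pad(s, (0, max_len - len(s))) for s in device])
        input_seqs = padded if input_seqs is None else np.concatenate(
            (input_seqs, padded), axis=1)
        split_infos.append([max_len, 0, 0, 0])
    return input_seqs, np.asarray(split_infos, dtype=np.int32)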
Example #29
    def _get_test_groups(self):
        meta = self._test_meta[self._test_offset]
        self._test_offset += 1

        text = meta[5]

        input_data = np.asarray(text_to_sequence(text, self._cleaner_names),
                                dtype=np.int32)
        mel_target = np.fromfile(os.path.join(
            '/mnt/training_data/audio',
            meta[0].replace('audio-', '').replace('/mels/', '/audio/')),
                                 dtype='float32')
        # reshape, not np.resize: reinterpret the flat f32 buffer as (frames, num_mels)
        mel_target = mel_target.reshape(-1, self._hparams.num_mels)
        #Create parallel sequences containing zeros to represent a non finished sequence
        token_target = np.asarray([0.] * (len(mel_target) - 1))
        linear_target = np.load(os.path.join(self._linear_dir, meta[2]))
        return (input_data, mel_target, token_target, linear_target,
                len(mel_target))
Example #30
	def _get_next_example(self):
		"""Gets a single example (input, mel_target, token_target, linear_target, mel_length) from_ disk
		"""
		if self._train_offset >= len(self._train_meta):
			self._train_offset = 0
			np.random.shuffle(self._train_meta)

		meta = self._train_meta[self._train_offset]
		self._train_offset += 1

		text = meta[5]

		input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
		mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
		#Create parallel sequences containing zeros to represent a non finished sequence
		token_target = np.asarray([0.] * (len(mel_target) - 1))
		linear_target = np.load(os.path.join(self._linear_dir, meta[2]))
		return (input_data, mel_target, token_target, linear_target, len(mel_target))
Example #31
    def _get_next_example(self):
        """
		Gets a single example (input, mel_target, token_target) from disk
		"""
        if self._offset >= len(self._metadata):
            self._offset = 0
            np.random.shuffle(self._metadata)
        meta = self._metadata[self._offset]
        self._offset += 1

        text = meta[4]

        input_data = np.asarray(text_to_sequence(text, self._cleaner_names),
                                dtype=np.int32)
        mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
        #Create parallel sequences containing zeros to represent a non finished sequence
        token_target = np.asarray([0.] * len(mel_target))
        return (input_data, mel_target, token_target, len(mel_target))
Example #32
    def synthesize(self, text, index, out_dir, log_dir, mel_filename):
        hparams = self._hparams
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq = text_to_sequence(text, cleaner_names)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
        }

        if self.gta:
            feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(
                1, -1, 80)

        if self.gta or not hparams.predict_linear:
            mels, alignment = self.session.run(
                [self.mel_outputs, self.alignment], feed_dict=feed_dict)

        else:
            linear, mels, alignment = self.session.run(
                [self.linear_outputs, self.mel_outputs, self.alignment],
                feed_dict=feed_dict)
            linear = linear.reshape(-1, hparams.num_freq)

        mels = mels.reshape(
            -1, hparams.num_mels)  #Thanks to @imdatsolak for pointing this out

        #convert checkpoint to frozen model
        minimal_graph = tf.graph_util.convert_variables_to_constants(
            self.session, self.session.graph_def, ["model/inference/add"])
        tf.train.write_graph(minimal_graph,
                             '.',
                             'inference_model.pb',
                             as_text=False)

        npy_data = mels.reshape((-1, ))
        print(mels)
        print("==============================================")
        print(npy_data)
        text = text.replace(" ", "_")
        text = text.replace("?", ".")
        filename = text + '.f32'
        npy_data.tofile(filename)

        return
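
Example #32 also freezes the session into inference_model.pb with tf.graph_util.convert_variables_to_constants. Loading such a frozen TF1 graph back is the mirror operation; a sketch using the standard TF1 API (the tensor name extends the model/inference/add node frozen above and is otherwise an assumption):

import tensorflow as tf  # TF1.x API, as used throughout these examples

with tf.gfile.GFile('inference_model.pb', 'rb') as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())

with tf.Graph().as_default() as graph:
    tf.import_graph_def(graph_def, name='')
    output = graph.get_tensor_by_name('model/inference/add:0')
    # feed the model's input placeholders by name and run `output` in a tf.Session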
Example #33
	def synthesize(self, text, index, out_dir, log_dir, mel_filename):
		hparams = self._hparams
		cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
		seq = text_to_sequence(text, cleaner_names)
		feed_dict = {
			self.model.inputs: [np.asarray(seq, dtype=np.int32)],
			self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
		}

		if self.gta:
			feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(1, -1, 80)

		if self.gta or not hparams.predict_linear:
			mels, alignment = self.session.run([self.mel_outputs, self.alignment], feed_dict=feed_dict)

		else:
			linear, mels, alignment = self.session.run([self.linear_outputs, self.mel_outputs, self.alignment], feed_dict=feed_dict)
			linear = linear.reshape(-1, hparams.num_freq)

		mels = mels.reshape(-1, hparams.num_mels) #Thanks to @imdatsolak for pointing this out


		if index is None:
			#Generate wav and read it
			wav = audio.inv_mel_spectrogram(mels.T, hparams)
			audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate) #Find a better way

			chunk = 512
			f = wave.open('temp.wav', 'rb')
			p = pyaudio.PyAudio()
			stream = p.open(format=p.get_format_from_width(f.getsampwidth()),
				channels=f.getnchannels(),
				rate=f.getframerate(),
				output=True)
			data = f.readframes(chunk)
			while data:
				stream.write(data)
				data=f.readframes(chunk)

			stream.stop_stream()
			stream.close()

			p.terminate()
			return


		# Write the spectrogram to disk
		# Note: outputs mel-spectrogram files and target ones have same names, just different folders
		mel_filename = os.path.join(out_dir, 'speech-mel-{:05d}.npy'.format(index))
		np.save(mel_filename, mels, allow_pickle=False)

		if log_dir is not None:
			#save wav (mel -> wav)
			wav = audio.inv_mel_spectrogram(mels.T, hparams)
			audio.save_wav(wav, os.path.join(log_dir, 'wavs/speech-wav-{:05d}-mel.wav'.format(index)), sr=hparams.sample_rate)

			if hparams.predict_linear:
				#save wav (linear -> wav)
				wav = audio.inv_linear_spectrogram(linear.T, hparams)
				audio.save_wav(wav, os.path.join(log_dir, 'wavs/speech-wav-{:05d}-linear.wav'.format(index)), sr=hparams.sample_rate)

			#save alignments
			plot.plot_alignment(alignment, os.path.join(log_dir, 'plots/speech-alignment-{:05d}.png'.format(index)),
				info='{}'.format(text), split_title=True)

			#save mel spectrogram plot
			plot.plot_spectrogram(mels, os.path.join(log_dir, 'plots/speech-mel-{:05d}.png'.format(index)),
				info='{}'.format(text), split_title=True)

		return mel_filename