Example #1
def train(log_dir, args, hparams):
	save_dir = os.path.join(log_dir, 'taco_pretrained/')
	checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
	input_path = os.path.join(args.base_dir, args.tacotron_input)
	plot_dir = os.path.join(log_dir, 'plots')
	wav_dir = os.path.join(log_dir, 'wavs')
	mel_dir = os.path.join(log_dir, 'mel-spectrograms')
	eval_dir = os.path.join(log_dir, 'eval-dir')
	eval_plot_dir = os.path.join(eval_dir, 'plots')
	eval_wav_dir = os.path.join(eval_dir, 'wavs')
	os.makedirs(eval_dir, exist_ok=True)
	os.makedirs(plot_dir, exist_ok=True)
	os.makedirs(wav_dir, exist_ok=True)
	os.makedirs(mel_dir, exist_ok=True)
	os.makedirs(eval_plot_dir, exist_ok=True)
	os.makedirs(eval_wav_dir, exist_ok=True)

	if hparams.predict_linear:
		linear_dir = os.path.join(log_dir, 'linear-spectrograms')
		os.makedirs(linear_dir, exist_ok=True)

	log('Checkpoint path: {}'.format(checkpoint_path))
	log('Loading training data from: {}'.format(input_path))
	log('Using model: {}'.format(args.model))
	log(hparams_debug_string())

	#Start by setting a seed for repeatability
	tf.set_random_seed(hparams.tacotron_random_seed)

	#Set up data feeder
	coord = tf.train.Coordinator()
	with tf.variable_scope('datafeeder') as scope:
		feeder = Feeder(coord, input_path, hparams)

	#Set up model:
	global_step = tf.Variable(0, name='global_step', trainable=False)
	model, stats = model_train_mode(args, feeder, hparams, global_step)
	eval_model = model_test_mode(args, feeder, hparams, global_step)

	#Book keeping
	step = 0
	time_window = ValueWindow(100)
	loss_window = ValueWindow(100)
	saver = tf.train.Saver(max_to_keep=5)

	log('Tacotron training set to a maximum of {} steps'.format(args.tacotron_train_steps))

	#Memory allocation on the GPU as needed
	config = tf.ConfigProto()
	config.gpu_options.allow_growth = True

	#Train
	with tf.Session(config=config) as sess:
		try:
			summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
			sess.run(tf.global_variables_initializer())

			#saved model restoring
			if args.restore:
				#Restore saved model if the user requested it, default = True
				try:
					checkpoint_state = tf.train.get_checkpoint_state(save_dir)

					if checkpoint_state and checkpoint_state.model_checkpoint_path:
						log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path))
						saver.restore(sess, checkpoint_state.model_checkpoint_path)
					else:
						log('No model to load at {}'.format(save_dir))

				except tf.errors.OutOfRangeError as e:
					log('Cannot restore checkpoint: {}'.format(e))
			else:
				log('Starting new training!')

			#initializing feeder
			feeder.start_threads(sess)

			#Training loop
			while not coord.should_stop() and step < args.tacotron_train_steps:
				start_time = time.time()
				step, loss, opt = sess.run([global_step, model.loss, model.optimize])
				time_window.append(time.time() - start_time)
				loss_window.append(loss)
				message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
					step, time_window.average, loss, loss_window.average)
				log(message, end='\r')

				if np.isnan(loss):
					log('Loss exploded to {:.5f} at step {}'.format(loss, step))
					raise Exception('Loss exploded')

				if step % args.summary_interval == 0:
					log('\nWriting summary at step {}'.format(step))
					summary_writer.add_summary(sess.run(stats), step)

				if step % args.eval_interval == 0:
					#Run eval and save eval stats
					log('\nRunning evaluation at step {}'.format(step))

					eval_losses = []
					before_losses = []
					after_losses = []
					stop_token_losses = []
					linear_losses = []
					linear_loss = None

					if hparams.predict_linear:
						for i in tqdm(range(feeder.test_steps)):
							eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, mel_t, t_len, align, lin_p = sess.run(
								[eval_model.loss, eval_model.before_loss, eval_model.after_loss,
								eval_model.stop_token_loss, eval_model.linear_loss, eval_model.mel_outputs[0], 
								eval_model.mel_targets[0], eval_model.targets_lengths[0], 
								eval_model.alignments[0], eval_model.linear_outputs[0]])
							eval_losses.append(eloss)
							before_losses.append(before_loss)
							after_losses.append(after_loss)
							stop_token_losses.append(stop_token_loss)
							linear_losses.append(linear_loss)
						linear_loss = sum(linear_losses) / len(linear_losses)

						wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
						audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-waveform-linear.wav'.format(step)), sr=hparams.sample_rate)
					else:
						for i in tqdm(range(feeder.test_steps)):
							eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len, align = sess.run(
								[eval_model.loss, eval_model.before_loss, eval_model.after_loss,
								eval_model.stop_token_loss, eval_model.mel_outputs[0], eval_model.mel_targets[0],
								eval_model.targets_lengths[0], eval_model.alignments[0]])
							eval_losses.append(eloss)
							before_losses.append(before_loss)
							after_losses.append(after_loss)
							stop_token_losses.append(stop_token_loss)

					eval_loss = sum(eval_losses) / len(eval_losses)
					before_loss = sum(before_losses) / len(before_losses)
					after_loss = sum(after_losses) / len(after_losses)
					stop_token_loss = sum(stop_token_losses) / len(stop_token_losses)

					log('Saving eval log to {}..'.format(eval_dir))
					#Save some log to monitor model improvement on same unseen sequence
					wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
					audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-waveform-mel.wav'.format(step)), sr=hparams.sample_rate)

					plot.plot_alignment(align, os.path.join(eval_plot_dir, 'step-{}-eval-align.png'.format(step)),
						info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss),
						max_len=t_len // hparams.outputs_per_step)
					plot.plot_spectrogram(mel_p, os.path.join(eval_plot_dir, 'step-{}-eval-mel-spectrogram.png'.format(step)),
						info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss), target_spectrogram=mel_t,
						max_len=t_len)

					log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss))
					log('Writing eval summary!')
					add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, eval_loss)

				
				if step % args.checkpoint_interval == 0:
					#Save model and current global step
					saver.save(sess, checkpoint_path, global_step=global_step)
					
					log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..')
					if hparams.predict_linear:
						input_seq, mel_prediction, linear_prediction, alignment, target, target_length = sess.run([
							model.inputs[0],
							model.mel_outputs[0],
							model.linear_outputs[0],
							model.alignments[0],
							model.mel_targets[0],
							model.targets_lengths[0],
							])

						#save predicted linear spectrogram to disk (debug)
						linear_filename = 'linear-prediction-step-{}.npy'.format(step)
						np.save(os.path.join(linear_dir, linear_filename), linear_prediction.T, allow_pickle=False)

						#save griffin lim inverted wav for debug (linear -> wav)
						wav = audio.inv_linear_spectrogram(linear_prediction.T, hparams)
						audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-linear.wav'.format(step)), sr=hparams.sample_rate)

					else:
						input_seq, mel_prediction, alignment, target, target_length = sess.run([model.inputs[0],
							model.mel_outputs[0],
							model.alignments[0],
							model.mel_targets[0],
							model.targets_lengths[0],
							])

					#save predicted mel spectrogram to disk (debug)
					mel_filename = 'mel-prediction-step-{}.npy'.format(step)
					np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False)

					#save griffin lim inverted wav for debug (mel -> wav)
					wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
					audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate)

					#save alignment plot to disk (control purposes)
					plot.plot_alignment(alignment, os.path.join(plot_dir, 'step-{}-align.png'.format(step)),
						info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss),
						max_len=target_length // hparams.outputs_per_step)
					#save real and predicted mel-spectrogram plot to disk (control purposes)
					plot.plot_spectrogram(mel_prediction, os.path.join(plot_dir, 'step-{}-mel-spectrogram.png'.format(step)),
						info='{}, {}, step={}, loss={:.5}'.format(args.model, time_string(), step, loss), target_spectrogram=target,
						max_len=target_length)
					log('Input at step {}: {}'.format(step, sequence_to_text(input_seq)))

			log('Tacotron training complete after {} global steps!'.format(args.tacotron_train_steps))
			return save_dir

		except Exception as e:
			log('Exiting due to exception: {}'.format(e))
			traceback.print_exc()
			coord.request_stop(e)
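
Both training loops on this page track sec/step and avg_loss through a small ValueWindow helper that is not shown here. A minimal sketch of what it is assumed to do (the class lives in the project's utility module):

class ValueWindow:
    """Keeps the most recent `window_size` values and their running average."""
    def __init__(self, window_size=100):
        self._window_size = window_size
        self._values = []

    def append(self, x):
        # Keep at most window_size values, discarding the oldest first
        self._values = self._values[-(self._window_size - 1):] + [x]

    @property
    def average(self):
        return sum(self._values) / max(len(self._values), 1)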
Example #2
	def synthesize(self, text, index, out_dir, log_dir, mel_filename):
		hparams = self._hparams
		cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
		seq = text_to_sequence(text, cleaner_names)
		feed_dict = {
			self.model.inputs: [np.asarray(seq, dtype=np.int32)],
			self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
		}

		if self.gta:
			feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(1, -1, hparams.num_mels)

		if self.gta or not hparams.predict_linear:
			mels, alignment = self.session.run([self.mel_outputs, self.alignment], feed_dict=feed_dict)

		else:
			linear, mels, alignment = self.session.run([self.linear_outputs, self.mel_outputs, self.alignment], feed_dict=feed_dict)
			linear = linear.reshape(-1, hparams.num_freq)

		mels = mels.reshape(-1, hparams.num_mels) #Thanks to @imdatsolak for pointing this out


		if index is None:
			#Generate wav and read it
			wav = audio.inv_mel_spectrogram(mels.T, hparams)
			audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate) #Find a better way

			chunk = 512
			f = wave.open('temp.wav', 'rb')
			p = pyaudio.PyAudio()
			stream = p.open(format=p.get_format_from_width(f.getsampwidth()),
				channels=f.getnchannels(),
				rate=f.getframerate(),
				output=True)
			data = f.readframes(chunk)
			while data:
				stream.write(data)
				data=f.readframes(chunk)

			stream.stop_stream()
			stream.close()
			f.close()

			p.terminate()
			return


		# Write the spectrogram to disk
		# Note: outputs mel-spectrogram files and target ones have same names, just different folders
		mel_filename = os.path.join(out_dir, 'speech-mel-{:05d}.npy'.format(index))
		np.save(mel_filename, mels, allow_pickle=False)

		if log_dir is not None:
			#save wav (mel -> wav)
			wav = audio.inv_mel_spectrogram(mels.T, hparams)
			audio.save_wav(wav, os.path.join(log_dir, 'wavs/speech-wav-{:05d}-mel.wav'.format(index)), sr=hparams.sample_rate)

			if hparams.predict_linear:
				#save wav (linear -> wav)
				wav = audio.inv_linear_spectrogram(linear.T, hparams)
				audio.save_wav(wav, os.path.join(log_dir, 'wavs/speech-wav-{:05d}-linear.wav'.format(index)), sr=hparams.sample_rate)

			#save alignments
			plot.plot_alignment(alignment, os.path.join(log_dir, 'plots/speech-alignment-{:05d}.png'.format(index)),
				info='{}'.format(text), split_title=True)

			#save mel spectrogram plot
			plot.plot_spectrogram(mels, os.path.join(log_dir, 'plots/speech-mel-{:05d}.png'.format(index)),
				info='{}'.format(text), split_title=True)

		return mel_filename
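
The single-utterance path above writes a fixed 'temp.wav' before playback (flagged with a '#Find a better way' comment). A hedged alternative sketch using a unique temp file so concurrent calls do not clobber each other, assuming the same audio module and hparams as above:

import os
import tempfile
import wave

import pyaudio

def play_mel(mels, hparams):
    # Invert the mel spectrogram and play it back without a fixed temp path
    wav = audio.inv_mel_spectrogram(mels.T, hparams)  # `audio` as in the examples above
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        path = tmp.name
    audio.save_wav(wav, path, sr=hparams.sample_rate)
    try:
        f = wave.open(path, 'rb')
        p = pyaudio.PyAudio()
        stream = p.open(format=p.get_format_from_width(f.getsampwidth()),
            channels=f.getnchannels(), rate=f.getframerate(), output=True)
        data = f.readframes(512)
        while data:
            stream.write(data)
            data = f.readframes(512)
        stream.stop_stream()
        stream.close()
        f.close()
        p.terminate()
    finally:
        os.remove(path)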
Example #3
    def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
        hparams = self._hparams
        # [-max, max] or [0,max]
        t2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else (
            0, hparams.max_abs_value)

        # Repeat last sample until number of samples is dividable by the number of GPUs (last run scenario)
        while len(texts) % hparams.synthesis_batch_size != 0:
            texts.append(texts[-1])
            basenames.append(basenames[-1])
            if mel_filenames is not None:
                mel_filenames.append(mel_filenames[-1])

        seqs = [np.asarray(text_to_sequence(text)) for text in texts]
        input_lengths = [len(seq) for seq in seqs]
        input_seqs, max_seq_len = self._prepare_inputs(seqs)

        feed_dict = {
            self.inputs: input_seqs,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
        }

        if self.gta:
            np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
            target_lengths = [len(np_target) for np_target in np_targets]
            target_seqs, max_target_len = self._prepare_targets(np_targets, self._hparams.outputs_per_step)
            feed_dict[self.targets] = target_seqs
            assert len(np_targets) == len(texts)
        linears = None
        if self.gta or not hparams.predict_linear:
            mels, alignments, stop_tokens = self.session.run(
                [self.mel_outputs, self.alignments, self.stop_token_prediction], feed_dict=feed_dict)

            if not self.gta:
                # Natural batch synthesis
                # Get Mel lengths for the entire batch from stop_tokens predictions
                # (in GTA mode, keep the ground-truth target lengths computed above)
                target_lengths = self._get_output_lengths(stop_tokens)

            # Take off the batch wise padding
            mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
            assert len(mels) == len(texts)

        else:
            linears, mels, alignments, stop_tokens = self.session.run(
                [self.linear_outputs, self.mel_outputs, self.alignments, self.stop_token_prediction],
                feed_dict=feed_dict)

            # Natural batch synthesis
            # Get Mel/Linear lengths for the entire batch from stop_tokens predictions
            target_lengths = self._get_output_lengths(stop_tokens)

            # Take off the batch wise padding
            mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
            linears = [linear[:target_length, :] for linear, target_length in zip(linears, target_lengths)]
            linears = np.clip(linears, t2_output_range[0], t2_output_range[1])
            assert len(mels) == len(linears) == len(texts)

        mels = np.clip(mels, t2_output_range[0], t2_output_range[1])

        if basenames is None:
            # Generate wav and read it
            wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
            audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way

            if platform.system() == 'Linux':
                # Linux wav reader
                os.system('aplay temp.wav')

            elif platform.system() == 'Windows':
                # windows wav reader
                os.system('start /min mplay32 /play /close temp.wav')

            else:
                raise RuntimeError(
                    'Your OS type is not supported yet, please add it to "synthesizer.py, line-165" and feel free to make a Pull Request ;) Thanks!')

            return

        saved_mels_paths = []
        for i, mel in enumerate(mels):
            # Write the spectrogram to disk
            # Note: outputs mel-spectrogram files and target ones have same names, just different folders
            mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(basenames[i]))
            np.save(mel_filename, mel, allow_pickle=False)
            saved_mels_paths.append(mel_filename)

            if log_dir is not None:
                # save wav (mel -> wav)
                wav = audio.inv_mel_spectrogram(mel.T, hparams)
                audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])),
                               sr=hparams.sample_rate)

                # save alignments
                plot.plot_alignment(alignments[i], os.path.join(log_dir, 'plots/alignment-{}.png'.format(basenames[i])),
                                    title='{}'.format(texts[i]), split_title=True, max_len=target_lengths[i])

                # save mel spectrogram plot
                plot.plot_spectrogram(mel, os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
                                      title='{}'.format(texts[i]), split_title=True)

                if linears is not None:
                    # save wav (linear -> wav)
                    wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                    audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])),
                                   sr=hparams.sample_rate)

                    # save linear spectrogram plot
                    plot.plot_spectrogram(linears[i], os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
                                          title='{}'.format(texts[i]), split_title=True, auto_aspect=True)

        return saved_mels_paths
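
Examples #3 through #5 call self._get_output_lengths(stop_tokens) to find where each utterance ends. The helper is not shown on this page; a plausible sketch, assuming sigmoid stop-token outputs where the first rounded value of 1 marks the stop frame:

import numpy as np

def get_output_lengths(stop_tokens):
    # stop_tokens: [batch, max_decoder_steps] predictions in [0, 1]
    lengths = []
    for row in np.round(stop_tokens).tolist():
        # First frame predicted as "stop", or the full length if none fires
        lengths.append(row.index(1) if 1 in row else len(row))
    return lengths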
Example #4
    def synthesize(self, texts, speaker_labels, language_labels, basenames,
                   out_dir, log_dir, mel_filenames):
        hparams = self._hparams
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]

        #Repeat last sample until number of samples is dividable by the number of GPUs (last run scenario)
        while len(texts) % hparams.tacotron_synthesis_batch_size != 0:
            texts.append(texts[-1])
            basenames.append(basenames[-1])
            if mel_filenames is not None:
                mel_filenames.append(mel_filenames[-1])

        assert 0 == len(texts) % self._hparams.tacotron_num_gpus
        seqs = [
            np.asarray(text_to_sequence(text, cleaner_names)) for text in texts
        ]
        input_lengths = [len(seq) for seq in seqs]

        size_per_device = len(seqs) // self._hparams.tacotron_num_gpus

        #Pad inputs according to each GPU max length
        input_seqs = None
        input_speaker_labels = None
        input_language_labels = None
        split_infos = []
        for i in range(self._hparams.tacotron_num_gpus):
            device_input = seqs[size_per_device * i:size_per_device * (i + 1)]
            device_input, max_seq_len = self._prepare_inputs(device_input)
            input_seqs = np.concatenate(
                (input_seqs, device_input),
                axis=1) if input_seqs is not None else device_input

            device_speaker_label = speaker_labels[size_per_device *
                                                  i:size_per_device * (i + 1)]
            input_speaker_labels = np.concatenate(
                (input_speaker_labels, device_speaker_label), axis=0
            ) if input_speaker_labels is not None else device_speaker_label

            device_language_label = language_labels[size_per_device *
                                                    i:size_per_device *
                                                    (i + 1)]
            input_language_labels = np.concatenate(
                (input_language_labels, device_language_label), axis=0
            ) if input_language_labels is not None else device_language_label
            split_infos.append([max_seq_len, 0, 0, 0])

        feed_dict = {
            self.inputs: input_seqs,
            self.speaker_labels: input_speaker_labels,
            self.language_labels: input_language_labels,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
        }

        if self.gta:
            np_targets = [
                np.load(mel_filename) for mel_filename in mel_filenames
            ]
            target_lengths = [len(np_target) for np_target in np_targets]

            #pad targets according to each GPU max length
            target_seqs = None
            for i in range(self._hparams.tacotron_num_gpus):
                device_target = np_targets[size_per_device *
                                           i:size_per_device * (i + 1)]
                device_target, max_target_len = self._prepare_targets(
                    device_target, self._hparams.outputs_per_step)
                target_seqs = np.concatenate(
                    (target_seqs, device_target),
                    axis=1) if target_seqs is not None else device_target
                split_infos[i][
                    1] = max_target_len  #Not really used but setting it in case for future development maybe?

            feed_dict[self.targets] = target_seqs
            assert len(np_targets) == len(texts)

        feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)
        if self.gta or not hparams.predict_linear:
            mels, alignments, stop_tokens = self.session.run(
                [
                    self.mel_outputs, self.alignments,
                    self.stop_token_prediction
                ],
                feed_dict=feed_dict)
            #Linearize outputs (1D arrays)
            mels = [mel for gpu_mels in mels for mel in gpu_mels]
            alignments = [
                align for gpu_aligns in alignments for align in gpu_aligns
            ]
            stop_tokens = [
                token for gpu_token in stop_tokens for token in gpu_token
            ]

            if not self.gta:
                #Natural batch synthesis
                #Get Mel lengths for the entire batch from stop_tokens predictions
                target_lengths = self._get_output_lengths(stop_tokens)

            #Take off the batch wise padding
            mels = [
                mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)
            ]
            assert len(mels) == len(texts)

        else:
            linears, mels, alignments, stop_tokens = self.session.run(
                [
                    self.linear_outputs, self.mel_outputs, self.alignments,
                    self.stop_token_prediction
                ],
                feed_dict=feed_dict)
            #Linearize outputs (1D arrays)
            linears = [
                linear for gpu_linear in linears for linear in gpu_linear
            ]
            mels = [mel for gpu_mels in mels for mel in gpu_mels]
            alignments = [
                align for gpu_aligns in alignments for align in gpu_aligns
            ]
            stop_tokens = [
                token for gpu_token in stop_tokens for token in gpu_token
            ]

            #Natural batch synthesis
            #Get Mel/Linear lengths for the entire batch from stop_tokens predictions
            # target_lengths = self._get_output_lengths(stop_tokens)
            #stop-token trimming disabled here; keep one (full) length per batch item
            target_lengths = [9999] * len(mels)

            #Take off the batch wise padding
            mels = [
                mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)
            ]
            linears = [
                linear[:target_length, :]
                for linear, target_length in zip(linears, target_lengths)
            ]
            assert len(mels) == len(linears) == len(texts)

        if basenames is None:
            #Generate wav and read it
            wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
            audio.save_wav(wav, 'temp.wav',
                           sr=hparams.sample_rate)  #Find a better way

            chunk = 512
            f = wave.open('temp.wav', 'rb')
            p = pyaudio.PyAudio()
            stream = p.open(format=p.get_format_from_width(f.getsampwidth()),
                            channels=f.getnchannels(),
                            rate=f.getframerate(),
                            output=True)
            data = f.readframes(chunk)
            while data:
                stream.write(data)
                data = f.readframes(chunk)

            stream.stop_stream()
            stream.close()

            p.terminate()
            return

        saved_mels_paths = []
        speaker_ids = []
        for i, mel in enumerate(mels):
            #Get speaker id for global conditioning (only used with GTA generally)
            if hparams.gin_channels > 0:
                raise RuntimeError(
                    'Please set the speaker_id rule in line 99 of tacotron/synthesizer.py to allow for global condition usage later.'
                )
                speaker_id = '<no_g>'  #set the rule to determine speaker id. By using the file basename maybe? (basenames are inside "basenames" variable)
                speaker_ids.append(
                    speaker_id
                )  #finish by appending the speaker id. (allows for different speakers per batch if your model is multispeaker)
            else:
                speaker_id = '<no_g>'
                speaker_ids.append(speaker_id)

            # Write the spectrogram to disk
            # Note: outputs mel-spectrogram files and target ones have same names, just different folders
            mel_filename = os.path.join(out_dir,
                                        'mel-{}.npy'.format(basenames[i]))
            np.save(mel_filename, mel, allow_pickle=False)
            saved_mels_paths.append(mel_filename)

            if log_dir is not None:
                #save wav (mel -> wav)
                wav = audio.inv_mel_spectrogram(mel.T, hparams)
                audio.save_wav(wav,
                               os.path.join(
                                   log_dir,
                                   'wavs/wav-{}-mel.wav'.format(basenames[i])),
                               sr=hparams.sample_rate)

                #save alignments
                plot.plot_alignment(alignments[i],
                                    os.path.join(
                                        log_dir,
                                        'plots/alignment-{}.png'.format(
                                            basenames[i])),
                                    title='{}'.format(texts[i]),
                                    split_title=True,
                                    max_len=target_lengths[i])

                #save mel spectrogram plot
                plot.plot_spectrogram(
                    mel,
                    os.path.join(log_dir,
                                 'plots/mel-{}.png'.format(basenames[i])),
                    title='{}'.format(texts[i]),
                    split_title=True)

                if hparams.predict_linear:
                    #save wav (linear -> wav)
                    wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                    audio.save_wav(wav,
                                   os.path.join(
                                       log_dir,
                                       'wavs/wav-{}-linear.wav'.format(
                                           basenames[i])),
                                   sr=hparams.sample_rate)

                    #save linear spectrogram plot
                    plot.plot_spectrogram(linears[i],
                                          os.path.join(
                                              log_dir,
                                              'plots/linear-{}.png'.format(
                                                  basenames[i])),
                                          title='{}'.format(texts[i]),
                                          split_title=True,
                                          auto_aspect=True)

        return saved_mels_paths, speaker_ids
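
The per-GPU slices above are padded by self._prepare_inputs / self._prepare_targets before being concatenated along the batch axis. Neither helper appears on this page; a minimal input-side sketch, assuming 0 is the padding id:

import numpy as np

def prepare_inputs(inputs, pad=0):
    # inputs: list of 1-D integer sequences of varying length
    max_len = max(len(x) for x in inputs)
    padded = np.stack([
        np.pad(x, (0, max_len - len(x)), mode='constant', constant_values=pad)
        for x in inputs])
    return padded, max_len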
Example #5
	def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
		hparams = self._hparams
		cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
		#[-max, max] or [0,max]
		T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else (0, hparams.max_abs_value)

		#Repeat last sample until number of samples is dividable by the number of GPUs (last run scenario)
		while len(texts) % hparams.tacotron_synthesis_batch_size != 0:
			texts.append(texts[-1])
			basenames.append(basenames[-1])
			if mel_filenames is not None:
				mel_filenames.append(mel_filenames[-1])

		assert 0 == len(texts) % self._hparams.tacotron_num_gpus
		seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts]
		input_lengths = [len(seq) for seq in seqs]

		size_per_device = len(seqs) // self._hparams.tacotron_num_gpus

		#Pad inputs according to each GPU max length
		input_seqs = None
		split_infos = []
		for i in range(self._hparams.tacotron_num_gpus):
			device_input = seqs[size_per_device*i: size_per_device*(i+1)]
			device_input, max_seq_len = self._prepare_inputs(device_input)
			input_seqs = np.concatenate((input_seqs, device_input), axis=1) if input_seqs is not None else device_input
			split_infos.append([max_seq_len, 0, 0, 0])

		feed_dict = {
			self.inputs: input_seqs,
			self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
		}

		if self.gta:
			np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
			target_lengths = [len(np_target) for np_target in np_targets]

			#pad targets according to each GPU max length
			target_seqs = None
			for i in range(self._hparams.tacotron_num_gpus):
				device_target = np_targets[size_per_device*i: size_per_device*(i+1)]
				device_target, max_target_len = self._prepare_targets(device_target, self._hparams.outputs_per_step)
				target_seqs = np.concatenate((target_seqs, device_target), axis=1) if target_seqs is not None else device_target
				split_infos[i][1] = max_target_len #Not really used but setting it in case for future development maybe?

			feed_dict[self.targets] = target_seqs
			assert len(np_targets) == len(texts)

		feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)

		if self.gta or not hparams.predict_linear:
			mels, alignments, stop_tokens = self.session.run([self.mel_outputs, self.alignments, self.stop_token_prediction], feed_dict=feed_dict)

			#Linearize outputs (n_gpus -> 1D)
			mels = [mel for gpu_mels in mels for mel in gpu_mels]
			alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
			stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]

			if not self.gta:
				#Natural batch synthesis
				#Get Mel lengths for the entire batch from stop_tokens predictions
				target_lengths = self._get_output_lengths(stop_tokens)

			#Take off the batch wise padding
			mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
			assert len(mels) == len(texts)

		else:
			linears, mels, alignments, stop_tokens = self.session.run([self.linear_outputs, self.mel_outputs, self.alignments, self.stop_token_prediction], feed_dict=feed_dict)
			
			#Linearize outputs (1D arrays)
			linears = [linear for gpu_linear in linears for linear in gpu_linear]
			mels = [mel for gpu_mels in mels for mel in gpu_mels]
			alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
			stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]

			#Natural batch synthesis
			#Get Mel/Linear lengths for the entire batch from stop_tokens predictions
			target_lengths = self._get_output_lengths(stop_tokens)

			#Take off the batch wise padding
			mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
			linears = [linear[:target_length, :] for linear, target_length in zip(linears, target_lengths)]
			linears = np.clip(linears, T2_output_range[0], T2_output_range[1])
			assert len(mels) == len(linears) == len(texts)

		mels = np.clip(mels, T2_output_range[0], T2_output_range[1])

		if basenames is None:
			#Generate wav and read it
			if hparams.GL_on_GPU:
				wav = self.session.run(self.GLGPU_mel_outputs, feed_dict={self.GLGPU_mel_inputs: mels[0]})
				wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
			else:
				wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
			audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate) #Find a better way

			if platform.system() == 'Linux':
				#Linux wav reader
				os.system('aplay temp.wav')

			elif platform.system() == 'Windows':
				#windows wav reader
				os.system('start /min mplay32 /play /close temp.wav')

			else:
				raise RuntimeError('Your OS type is not supported yet, please add it to "tacotron/synthesizer.py, line-165" and feel free to make a Pull Request ;) Thanks!')

			return


		saved_mels_paths = []
		speaker_ids = []
		for i, mel in enumerate(mels):
			#Get speaker id for global conditioning (only used with GTA generally)
			if hparams.gin_channels > 0:
				raise RuntimeError('Please set the speaker_id rule in line 99 of tacotron/synthesizer.py to allow for global condition usage later.')
				speaker_id = '<no_g>' #set the rule to determine speaker id. By using the file basename maybe? (basenames are inside "basenames" variable)
				speaker_ids.append(speaker_id) #finish by appending the speaker id. (allows for different speakers per batch if your model is multispeaker)
			else:
				speaker_id = '<no_g>'
				speaker_ids.append(speaker_id)

			# Write the spectrogram to disk
			# Note: outputs mel-spectrogram files and target ones have same names, just different folders
			mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(basenames[i]))
			np.save(mel_filename, mel, allow_pickle=False)
			saved_mels_paths.append(mel_filename)

			if log_dir is not None:
				#save wav (mel -> wav)
				if hparams.GL_on_GPU:
					wav = self.session.run(self.GLGPU_mel_outputs, feed_dict={self.GLGPU_mel_inputs: mel})
					wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
				else:
					wav = audio.inv_mel_spectrogram(mel.T, hparams)
				audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])), sr=hparams.sample_rate)

				#save alignments
				plot.plot_alignment(alignments[i], os.path.join(log_dir, 'plots/alignment-{}.png'.format(basenames[i])),
					title='{}'.format(texts[i]), split_title=True, max_len=target_lengths[i])

				#save mel spectrogram plot
				plot.plot_spectrogram(mel, os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
					title='{}'.format(texts[i]), split_title=True)

				if hparams.predict_linear:
					#save wav (linear -> wav)
					if hparams.GL_on_GPU:
						wav = self.session.run(self.GLGPU_lin_outputs, feed_dict={self.GLGPU_lin_inputs: linears[i]})
						wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
					else:
						wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
					audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])), sr=hparams.sample_rate)

					#save linear spectrogram plot
					plot.plot_spectrogram(linears[i], os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
						title='{}'.format(texts[i]), split_title=True, auto_aspect=True)

		return saved_mels_paths, speaker_ids
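
With GL_on_GPU set, Griffin-Lim inversion runs inside the TF graph (self.GLGPU_mel_outputs / self.GLGPU_lin_outputs) and audio.inv_preemphasis then undoes the pre-emphasis filter on the CPU. A sketch of that inverse, assuming the usual first-order filter convention:

from scipy import signal

def inv_preemphasis(wav, k, preemphasize=True):
    # Invert y[t] = x[t] - k * x[t-1] with the IIR filter 1 / (1 - k z^-1)
    if preemphasize:
        return signal.lfilter([1], [1, -k], wav)
    return wav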
Example #6
File: train.py  Project: silifor/taco2swe
def train(log_dir, args, hparams):
    save_dir = os.path.join(log_dir, 'taco_pretrained')
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    mel_dir = os.path.join(log_dir, 'mel-spectrograms')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    tensorboard_dir = os.path.join(log_dir, 'tacotron_events')
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)

    checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
    input_path = os.path.join(args.base_dir, args.tacotron_input)

    if hparams.predict_linear:
        linear_dir = os.path.join(log_dir, 'linear-spectrograms')
        os.makedirs(linear_dir, exist_ok=True)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))
    log(hparams_debug_string())

    #Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    #Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, input_path, hparams)

    #Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, feeder, hparams, global_step)

    #Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=100)

    log('Tacotron training set to a maximum of {} steps'.format(
        args.tacotron_train_steps))

    #Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    #Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            #saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    if (checkpoint_state
                            and checkpoint_state.model_checkpoint_path):
                        log('Loading checkpoint {}'.format(
                            checkpoint_state.model_checkpoint_path),
                            slack=True)
                        saver.restore(sess,
                                      checkpoint_state.model_checkpoint_path)
                    else:
                        log('No model to load at {}'.format(save_dir),
                            slack=True)

                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e), slack=True)
            else:
                log('Starting new training!', slack=True)

            #initializing feeder
            feeder.start_threads(sess)

            #Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                step, loss, opt = sess.run(
                    [global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                    step, time_window.average, loss, loss_window.average)
                log(message,
                    end='\r',
                    slack=(step % args.checkpoint_interval == 0))

                if loss > 100 or np.isnan(loss):
                    log('Loss exploded to {:.5f} at step {}'.format(
                        loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step {}'.format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.eval_interval == 0:
                    #Run eval and save eval stats
                    log('\nRunning evaluation at step {}'.format(step))

                    eval_losses = []
                    before_losses = []
                    after_losses = []
                    stop_token_losses = []
                    linear_losses = []
                    linear_loss = None

                    if hparams.predict_linear:
                        for i in tqdm(range(feeder.test_steps)):
                            eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, mel_t, t_len, align, lin_p = sess.run(
                                [
                                    eval_model.loss, eval_model.before_loss,
                                    eval_model.after_loss,
                                    eval_model.stop_token_loss,
                                    eval_model.linear_loss,
                                    eval_model.mel_outputs[0],
                                    eval_model.mel_targets[0],
                                    eval_model.targets_lengths[0],
                                    eval_model.alignments[0],
                                    eval_model.linear_outputs[0]
                                ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)
                            linear_losses.append(linear_loss)
                        linear_loss = sum(linear_losses) / len(linear_losses)

                        wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
                        audio.save_wav(
                            wav,
                            os.path.join(
                                eval_wav_dir,
                                'step-{}-eval-waveform-linear.wav'.format(
                                    step)),
                            sr=hparams.sample_rate)
                    else:
                        for i in tqdm(range(feeder.test_steps)):
                            eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len, align = sess.run(
                                [
                                    eval_model.loss, eval_model.before_loss,
                                    eval_model.after_loss,
                                    eval_model.stop_token_loss,
                                    eval_model.mel_outputs[0],
                                    eval_model.mel_targets[0],
                                    eval_model.targets_lengths[0],
                                    eval_model.alignments[0]
                                ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)

                    eval_loss = sum(eval_losses) / len(eval_losses)
                    before_loss = sum(before_losses) / len(before_losses)
                    after_loss = sum(after_losses) / len(after_losses)
                    stop_token_loss = sum(stop_token_losses) / len(
                        stop_token_losses)

                    log('Saving eval log to {}..'.format(eval_dir))
                    #Save some log to monitor model improvement on same unseen sequence
                    wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(
                            eval_wav_dir,
                            'step-{}-eval-waveform-mel.wav'.format(step)),
                        sr=hparams.sample_rate)

                    plot.plot_alignment(
                        align,
                        os.path.join(eval_plot_dir,
                                     'step-{}-eval-align.png'.format(step)),
                        info='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, eval_loss),
                        max_len=t_len // hparams.outputs_per_step)
                    plot.plot_spectrogram(
                        mel_p,
                        os.path.join(
                            eval_plot_dir,
                            'step-{}-eval-mel-spectrogram.png'.format(step)),
                        info='{}, {}, step={}, loss={:.5}'.format(
                            args.model, time_string(), step, eval_loss),
                        target_spectrogram=mel_t,
                        max_len=t_len)

                    log('Eval loss for global step {}: {:.3f}'.format(
                        step, eval_loss))
                    log('Writing eval summary!')
                    add_eval_stats(summary_writer, step, linear_loss,
                                   before_loss, after_loss, stop_token_loss,
                                   eval_loss)

                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps:
                    #Save model and current global step
                    saver.save(sess, checkpoint_path, global_step=global_step)

                    log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..'
                        )
                    if hparams.predict_linear:
                        input_seq, mel_prediction, linear_prediction, alignment, target, target_length = sess.run(
                            [
                                model.inputs[0],
                                model.mel_outputs[0],
                                model.linear_outputs[0],
                                model.alignments[0],
                                model.mel_targets[0],
                                model.targets_lengths[0],
                            ])

                        #save predicted linear spectrogram to disk (debug)
                        linear_filename = 'linear-prediction-step-{}.npy'.format(
                            step)
                        np.save(os.path.join(linear_dir, linear_filename),
                                linear_prediction.T,
                                allow_pickle=False)

                        #save griffin lim inverted wav for debug (linear -> wav)
                        wav = audio.inv_linear_spectrogram(
                            linear_prediction.T, hparams)
                        audio.save_wav(
                            wav,
                            os.path.join(
                                wav_dir,
                                'step-{}-wave-from-linear.wav'.format(step)),
                            sr=hparams.sample_rate)

                    else:
                        input_seq, mel_prediction, alignment, target, target_length = sess.run(
                            [
                                model.inputs[0],
                                model.mel_outputs[0],
                                model.alignments[0],
                                model.mel_targets[0],
                                model.targets_lengths[0],
                            ])

                    #save predicted mel spectrogram to disk (debug)
                    mel_filename = 'mel-prediction-step-{}.npy'.format(step)
                    np.save(os.path.join(mel_dir, mel_filename),
                            mel_prediction.T,
                            allow_pickle=False)

                    #save griffin lim inverted wav for debug (mel -> wav)
                    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(wav_dir,
                                     'step-{}-wave-from-mel.wav'.format(step)),
                        sr=hparams.sample_rate)

                    #save alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(plot_dir,
                                     'step-{}-align.png'.format(step)),
                        info='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, loss),
                        max_len=target_length // hparams.outputs_per_step)
                    #save real and predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        mel_prediction,
                        os.path.join(
                            plot_dir,
                            'step-{}-mel-spectrogram.png'.format(step)),
                        info='{}, {}, step={}, loss={:.5}'.format(
                            args.model, time_string(), step, loss),
                        target_spectrogram=target,
                        max_len=target_length)
                    log('Input at step {}: {}'.format(
                        step, sequence_to_text(input_seq)))

            log('Tacotron training complete after {} global steps!'.format(
                args.tacotron_train_steps),
                slack=True)
            return save_dir

        except Exception as e:
            log('Exiting due to exception: {}'.format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
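
Every example on this page funnels audio through audio.save_wav(wav, path, sr=...). The helper belongs to each project's audio module and is not reproduced here; a minimal sketch of the common Tacotron-style implementation (the int16 rescaling convention is an assumption):

import numpy as np
from scipy.io import wavfile

def save_wav(wav, path, sr):
    # Rescale floats to int16 full scale; the 0.01 floor guards near-silent clips
    wav = wav * (32767 / max(0.01, np.max(np.abs(wav))))
    wavfile.write(path, sr, wav.astype(np.int16))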
Example #7
def gen(content, t):
    t1 = time.time()
    out = io.BytesIO()
    output = np.array([])
    mhash = hashlib.md5(content.encode(encoding='UTF-8')).hexdigest()
    print(content)
    content = cn2pinyin(content)
    print(len(content))
    ts = content.split("E")
    t2 = time.time()
    for text in ts:
        text = text.strip()
        if len(text) <= 0:
            continue
        text += " E"
        st1 = time.time()
        data, wav = synth.eval(text)
        st2 = time.time()
        print(">>>>>" + text, "cost=", st2 - st1)
        output = np.append(output, wav, axis=0)
    t3 = time.time()
    audio.save_wav(output, out, hparams.sample_rate)
    t4 = time.time()
    if t == "g1":
        mp3_path = "wavs/" + mhash + ".mp3"
        song = AudioSegment.from_file(out, format='wav')
        # pydub's setters return a new AudioSegment; reassign or the calls are no-ops
        song = song.set_frame_rate(hparams.sample_rate)
        song = song.set_channels(2)
        filters = "atempo=0.95,highpass=f=300,lowpass=f=3000,aecho=0.8:0.88:6:0.4"
        song.export(
            mp3_path,
            format="mp3",
            parameters=["-filter_complex", filters, "-q:a", "4", "-vol", "150"])
        t5 = time.time()
        out2 = io.BytesIO()
        song.export(
            out2,
            format="mp3",
            parameters=["-filter_complex", filters, "-q:a", "4", "-vol", "150"])
        data = out2.getvalue()
        t6 = time.time()
        print("gen cost", t2 - t1, t3 - t2, t4 - t3, t5 - t4, t6 - t5)
        return mp3_path, data
    else:
        effect = "-rate=-5 -pitch=+4"
        if t == "g3":
            effect = "-rate=+45 -pitch=+3"
        elif t == "b1":
            effect = "-pitch=-4"
        wav_file = "wavs/" + mhash + ".wav"
        audio.save_wav(output, wav_file, hparams.sample_rate)
        mp3_file = "wavs/" + mhash + ".mp3"
        out_file = "wavs/" + mhash + "1.wav"
        # effect = "-rate=-5 -pitch=+4"  #"-rate=-10 -pitch=+8" young female voice  #"-rate=+45 -pitch=+3" Tom Cat voice
        popen = Popen("soundstretch " + wav_file + " " + out_file + " " +
                      effect,
                      shell=True,
                      stdout=PIPE,
                      stderr=PIPE)
        popen.wait()
        if popen.returncode != 0:
            print("soundstretch failed with return code", popen.returncode)

        song = AudioSegment.from_wav(out_file)
        # As above, reassign the results of pydub's setters
        song = song.set_frame_rate(hparams.sample_rate)
        song = song.set_channels(1)
        filters = "atempo=0.95,highpass=f=200,lowpass=f=1000,aecho=0.8:0.88:6:0.4"
        song.export(
            mp3_file,
            format="mp3",
            parameters=["-filter_complex", filters, "-q:a", "4", "-vol", "200"])
        out2 = io.BytesIO()
        song.export(
            out2,
            format="mp3",
            parameters=["-filter_complex", filters, "-q:a", "4", "-vol", "200"])
        data = out2.getvalue()


    return mp3_file, data
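
A hypothetical call, assuming synth, hparams, and a wavs/ output directory are initialized elsewhere in the file; 'g1' keeps the plain voice, while 'g3' and 'b1' select the soundstretch presets above:

# 'g1' returns the MP3 path on disk (under wavs/) plus the same audio as in-memory bytes
mp3_path, mp3_bytes = gen("你好，世界", "g1")
with open("reply.mp3", "wb") as fp:
    fp.write(mp3_bytes)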