def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
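        """Synthesize mel spectrograms for a batch of texts, split across tacotron_num_gpus devices.

        Returns (saved_mels_paths, speaker_ids); if basenames is None, the first
        utterance is synthesized to 'temp.wav' and played back instead.
        """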
        
        hparams = self.hparams

        # Repeat the last sample until the number of samples is divisible by the number of GPUs (last-batch scenario)
        while len(texts) % hparams.tacotron_num_gpus != 0:
            texts.append(texts[-1])
            basenames.append(basenames[-1])
            if mel_filenames is not None:
                mel_filenames.append(mel_filenames[-1])
        assert 0 == len(texts) % self.hparams.tacotron_num_gpus
        seqs = [np.asarray(hangul_to_sequence(dir=hparams.base_dir, hangul_text=text, hangul_type=hparams.hangul_type)) for text in texts]
        input_lengths = [len(seq) for seq in seqs]
        # number of sequences assigned to each GPU device
        sequence_size_per_device = len(seqs) // hparams.tacotron_num_gpus

        # Pad the input sequences for each GPU shard, then concatenate the shards
        split_infos = []

        input_sequence = None  # padded inputs, concatenated across GPU shards
        for i in range(hparams.tacotron_num_gpus):
            on_device_input_sequence = seqs[sequence_size_per_device*i : sequence_size_per_device*(i+1)]
            on_device_input_sequence_padded, input_length = _prepare_inputs(inputs = on_device_input_sequence)
            input_sequence = np.concatenate((input_sequence, on_device_input_sequence_padded), axis=1) if input_sequence is not None else on_device_input_sequence_padded
            split_infos.append([input_length, 0, 0, 0])  # slot 1 (target length) is filled in GTA mode below

        # Add the inputs to the feed dict
        feed_dict = {
            self.inputs: input_sequence,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32)
        }

        if self.GTA:
            np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
            assert len(np_targets) == len(texts)

            # Build the padded target sequence from the mel targets on each GPU shard
            target_sequence = None  # padded targets, concatenated across GPU shards
            for i in range(hparams.tacotron_num_gpus):
                on_device_target_sequence = np_targets[sequence_size_per_device * i: sequence_size_per_device * (i + 1)]
                on_device_target_sequence_padded, target_length = _prepare_targets(targets=on_device_target_sequence, alignment=hparams.outputs_per_step)
                target_sequence = np.concatenate((target_sequence, on_device_target_sequence_padded), axis=1) if target_sequence is not None else on_device_target_sequence_padded
                split_infos[i][1] = target_length


            # Add the target mel sequence to the feed dict
            feed_dict[self.targets] = target_sequence
        feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)

        # Run synthesis
        mels, alignments, stop_tokens, encoder_outputs = self.session.run(
            [self.mel_outputs, self.alignment, self.stop_token, self.encoder_outputs],
            feed_dict=feed_dict)
        # Flatten the per-GPU outputs into flat lists
        mels = [mel for gpu_mels in mels for mel in gpu_mels]
        
        alignments = [align for gpu_aligns in alignments for align in gpu_aligns]

        stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]
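        # _get_output_lengths presumably returns, for each utterance, the index of the
        # first frame where the stop-token output crosses its threshold (an assumption
        # about the helper's contract); those lengths are used below to trim each mel.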
        target_lengths = _get_output_lengths(stop_tokens)
        # TODO: the trimming logic below needs more work.

        # Trim trailing silence (the frames after the stop token fires)
        mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]

        # Output range is [-max_abs_value, max_abs_value] for symmetric mels, else [0, max_abs_value]
        T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else (0, hparams.max_abs_value)
        
        mels = [np.clip(mel, T2_output_range[0], T2_output_range[1]) for mel in mels]
        
        if basenames is None:
            # Generate wav and read it
            wav = mel_to_audio_serie(mels[0].T, hparams)
            save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way
            if system() == 'Linux':
                # Linux wav reader
                os.system('aplay temp.wav')
            elif system() == 'Windows':
                # windows wav reader
                os.system('start /min mplay32 /play /close temp.wav')
            else:
                raise RuntimeError('Your OS type is not supported yet, please add it to "tacotron/synthesizer.py, line-150" and feel free to make a Pull Request ;) Thanks!')
            return
        
        saved_mels_paths = []
        speaker_ids = []
        for i, (mel, text) in enumerate(zip(mels, texts)):
            # Get speaker id for global conditioning (only used with GTA generally)
            if hparams.gin_channels > 0:
                # Multi-speaker case: this block still needs a rule for mapping each
                # utterance to its speaker id (e.g. derive it from the file basename
                # in basenames); appending per utterance allows a different speaker
                # for each batch item.
                speaker_id = '<no_g>'
                speaker_ids.append(speaker_id)
            else:
                # Single-speaker case
                speaker_id = '<no_g>'
                speaker_ids.append(speaker_id)
                
            # Write the spectrogram to disk
            # Note: outputs mel-spectrogram files and target ones have same names, just different folders
            mel_filename = os.path.join(out_dir, 'speech-mel-{}.npy'.format(basenames[i]))
            np.save(mel_filename, mel, allow_pickle=False)
            saved_mels_paths.append(mel_filename)

            if log_dir is not None:
                # save wav (mel -> wav)
                wav = mel_to_audio_serie(mel.T, hparams)
                wav = inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
                save_wav(wav, os.path.join(log_dir, 'wavs/speech-wav-{}-mel.wav'.format(basenames[i])),
                         sr=hparams.sample_rate)
                # save alignments
                plot_alignment(alignments[i], os.path.join(log_dir, 'plots/speech-alignment-{}.png'.format(basenames[i])),
                               info='{}'.format(texts[i]), split_title=True)
                # save mel spectrogram plot
                plot_spectrogram(mel, os.path.join(log_dir, 'plots/speech-mel-{}.png'.format(basenames[i])),
                                 info='{}'.format(texts[i]), split_title=True)
        return saved_mels_paths, speaker_ids
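
    # A minimal usage sketch for the batch path (hypothetical call site; assumes a
    # synthesizer object whose checkpoint was loaded elsewhere):
    #
    #   mel_paths, speaker_ids = synth.synthesize(
    #       texts=['...'], basenames=['sent-000'],
    #       out_dir='tacotron_output/eval', log_dir='logs-Tacotron',
    #       mel_filenames=None)  # pass real mel .npy paths when self.GTA is True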
    def synthesize(self, text, index, out_dir, log_dir, mel_filename):
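        """Synthesize a single utterance. Returns the saved mel .npy path, or plays
        the generated audio through PyAudio and returns None when index is None."""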
        hparams = self._hparams
        seq = hangul_to_sequence(dir=hparams.base_dir,
                                 hangul_text=text,
                                 hangul_type=hparams.hangul_type)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
        }

        if self.gta:
            feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(
                1, -1, hparams.num_mels)

        if self.gta or not hparams.predict_linear:
            mels, alignment = self.session.run(
                [self.mel_outputs, self.alignment], feed_dict=feed_dict)

        else:
            linear, mels, alignment = self.session.run(
                [self.linear_outputs, self.mel_outputs, self.alignment],
                feed_dict=feed_dict)
            linear = linear.reshape(-1, hparams.num_freq)

        mels = mels.reshape(
            -1,
            hparams.num_mels)  # Thanks to @imdatsolak for pointing this out

        if index is None:
            # Generate wav and read it
            wav = mel_to_audio_serie(mels.T, hparams)
            save_wav(wav, 'temp.wav',
                     sr=hparams.sample_rate)  # Find a better way

            chunk = 512
            f = wave.open('temp.wav', 'rb')
            p = pyaudio.PyAudio()
            stream = p.open(format=p.get_format_from_width(f.getsampwidth()),
                            channels=f.getnchannels(),
                            rate=f.getframerate(),
                            output=True)
            data = f.readframes(chunk)
            while data:
                stream.write(data)
                data = f.readframes(chunk)

            stream.stop_stream()
            stream.close()

            p.terminate()
            return

        # Write the spectrogram to disk
        # Note: outputs mel-spectrogram files and target ones have same names, just different folders
        mel_filename = os.path.join(out_dir,
                                    'speech-mel-{:05d}.npy'.format(index))
        np.save(mel_filename, mels, allow_pickle=False)

        if log_dir is not None:
            # save wav (mel -> wav)
            wav = mel_to_audio_serie(mels.T, hparams)
            save_wav(wav,
                     os.path.join(
                         log_dir,
                         'wavs/speech-wav-{:05d}-mel.wav'.format(index)),
                     sr=hparams.sample_rate)

            if hparams.predict_linear:
                # save wav (linear -> wav)
                wav = linear_to_audio_serie(linear.T, hparams)
                save_wav(
                    wav,
                    os.path.join(
                        log_dir,
                        'wavs/speech-wav-{:05d}-linear.wav'.format(index)),
                    sr=hparams.sample_rate)

            # save alignments
            plot_alignment(
                alignment,
                os.path.join(
                    log_dir,
                    'plots/speech-alignment-{:05d}.png'.format(index)),
                info='{}'.format(text),
                split_title=True)

            # save mel spectrogram plot
            plot_spectrogram(mels,
                             os.path.join(
                                 log_dir,
                                 'plots/speech-mel-{:05d}.png'.format(index)),
                             info='{}'.format(text),
                             split_title=True)

        return mel_filename
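
    # Usage sketch (hypothetical call site): synth.synthesize('text', index=3,
    # out_dir='tacotron_output', log_dir=None, mel_filename=None) returns the saved
    # .npy path; pass index=None to play the audio back instead of saving it.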
def train(log_dir, args, hparams):
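    """Run the Tacotron training loop: create the output directories, start the data
    feeder, build the model, train for up to args.tacotron_train_steps steps (writing
    summaries, checkpoints, and debug audio/plots along the way), and return save_dir."""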
    save_dir = os.path.join(log_dir, 'taco_pretrained/')
    checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
    input_path = os.path.join(args.base_dir, args.tacotron_input)
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    mel_dir = os.path.join(log_dir, 'mel-spectrograms')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    # Check whether the post-processing network will be used for linear spectrogram prediction
    if hparams.predict_linear:
        linear_dir = os.path.join(log_dir, 'linear-spectrograms')
        os.makedirs(linear_dir, exist_ok=True)
    # Log run configuration
    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))
    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)
    # Set up the data feeder: a Feeder object streams preprocessed data
    # (audio time series, mel spectrogram matrices, text sequences) to the training model
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, input_path, hparams)


    # Set up model: build the model selected by '--model'
    # ('Tacotron', 'Tacotron2', 'WaveNet', 'Both')
    global_step = tf.Variable(
        0, name='global_step', trainable=False
    )  # global step drives tf.train.cosine_decay() when teacher forcing is used
    model, stats = model_train_mode(args, feeder, hparams, global_step)

    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5)
    log('Tacotron training set to a maximum of {} steps'.format(
        args.tacotron_train_steps))

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            # Restore the saved model if the user requested it (default: True).
            checkpoint_state = None
            if args.restore:
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e))

            if checkpoint_state and checkpoint_state.model_checkpoint_path:
                log('Loading checkpoint {}'.format(
                    checkpoint_state.model_checkpoint_path))
                saver.restore(sess, checkpoint_state.model_checkpoint_path)

            else:
                if not args.restore:
                    log('Starting new training!')
                else:
                    log('No model to load at {}'.format(save_dir))

            # Start the feeder threads that stream preprocessed batches to the model
            feeder.start_threads(sess)
            # Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                step, loss, opt = sess.run(
                    [global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                    step, time_window.average, loss, loss_window.average)
                log(message, end='\r')
                if np.isnan(loss):
                    log('Loss exploded to {:.5f} at step {}'.format(
                        loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step {}'.format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                # Save a checkpoint when the checkpoint interval is reached (or at the final step)
                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps:
                    # Save model and current global step
                    saver.save(sess, checkpoint_path, global_step=global_step)

                    log('\nSaving Mel-Spectrograms and griffin-lim inverted waveform..')
                    if hparams.predict_linear:
                        input_seq, mel_prediction, linear_prediction, alignment, target, target_length = sess.run(
                            [
                                model.inputs[0],
                                model.mel_outputs[0],
                                model.linear_outputs[0],
                                model.alignments[0],
                                model.mel_targets[0],
                                model.targets_lengths[0],
                            ])

                        # save predicted linear spectrogram to disk (debug)
                        linear_filename = 'linear-prediction-step-{}.npy'.format(
                            step)
                        np.save(os.path.join(linear_dir, linear_filename),
                                linear_prediction.T,
                                allow_pickle=False)

                        # save griffin lim inverted wav for debug (linear -> wav)
                        wav = linear_to_audio_serie(linear_prediction.T,
                                                    hparams)
                        save_wav(
                            wav,
                            os.path.join(
                                wav_dir,
                                'step-{}-wave-from-linear.wav'.format(step)),
                            sr=hparams.sample_rate)

                    else:
                        input_seq, mel_prediction, alignment, target, target_length = sess.run(
                            [
                                model.inputs[0], model.mel_outputs[0],
                                model.alignments[0], model.mel_targets[0],
                                model.targets_lengths[0]
                            ])
                    # save predicted mel spectrogram to disk (debug)
                    mel_filename = 'mel-prediction-step-{}.npy'.format(step)
                    np.save(os.path.join(mel_dir, mel_filename),
                            mel_prediction.T,
                            allow_pickle=False)

                    # save griffin lim inverted wav for debug (mel -> wav)
                    wav = mel_to_audio_serie(mel_prediction.T, hparams)
                    save_wav(wav,
                             os.path.join(
                                 wav_dir,
                                 'step-{}-wave-from-mel.wav'.format(step)),
                             sr=hparams.sample_rate)

                    # save alignment plot to disk (control purposes)
                    plot_alignment(
                        alignment,
                        os.path.join(plot_dir,
                                     'step-{}-align.png'.format(step)),
                        info='{}, {}, step={}, loss={:.5f}'.format(
                            args.model,
                            datetime.now().strftime('%Y-%m-%d %H:%M'), step,
                            loss),
                        max_len=target_length // hparams.outputs_per_step)
                    # save real and predicted mel-spectrogram plot to disk (control purposes)
                    plot_spectrogram(
                        mel_prediction,
                        os.path.join(
                            plot_dir,
                            'step-{}-mel-spectrogram.png'.format(step)),
                        info='{}, {}, step={}, loss={:.5f}'.format(
                            args.model,
                            datetime.now().strftime('%Y-%m-%d %H:%M'), step,
                            loss),
                        target_spectrogram=target,
                        max_len=target_length)
                    log('Input at step {}: {}'.format(
                        step, sequence_to_text(input_seq)))
                    # TODO: run evaluation once step reaches args.tacotron_train_steps.
            return save_dir
        except Exception as e:
            log('Exiting due to exception: {}'.format(e))
            traceback.print_exc()
            coord.request_stop(e)
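

# A minimal driver sketch (an assumption, not part of the original module): it wires
# up only the CLI flags that train() reads above. The hparams object is assumed to be
# importable from the project's hparams module; all defaults here are placeholders.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--base_dir', default='')
    parser.add_argument('--tacotron_input', default='training_data/train.txt')
    parser.add_argument('--model', default='Tacotron')
    parser.add_argument('--no_restore', dest='restore', action='store_false',
                        help='start fresh instead of restoring the latest checkpoint')
    parser.add_argument('--summary_interval', type=int, default=250)
    parser.add_argument('--checkpoint_interval', type=int, default=5000)
    parser.add_argument('--tacotron_train_steps', type=int, default=200000)
    args = parser.parse_args()

    from hparams import hparams  # assumed location of the project's hyperparameters

    log_dir = os.path.join(args.base_dir, 'logs-{}'.format(args.model))
    os.makedirs(log_dir, exist_ok=True)
    train(log_dir, args, hparams)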