Example No. 1
 def synthesize(self, text):
     cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
     seq = text_to_sequence(text, cleaner_names)
     feed_dict = {
         self.model.inputs: [np.asarray(seq, dtype=np.int32)],
         self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
     }
     wav = self.session.run(self.wav_output, feed_dict=feed_dict)
     out = io.BytesIO()
     audio.save_wav(audio.inv_preemphasis(wav), out)
     return out.getvalue()
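The examples below all share the same shape: clean the text, run the TensorFlow session, post-process the waveform, and serialize it as WAV bytes. A minimal driver sketch, assuming a Synthesizer class that wraps these methods and exposes a load(checkpoint_path) step (the class and load method are assumptions, not shown above):

    import sys

    def demo(checkpoint_path, text, out_path='demo.wav'):
        synth = Synthesizer()        # hypothetical wrapper class
        synth.load(checkpoint_path)  # assumed: builds the graph, restores weights
        wav_bytes = synth.synthesize(text)
        with open(out_path, 'wb') as f:
            f.write(wav_bytes)

    if __name__ == '__main__':
        demo(sys.argv[1], sys.argv[2])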
Example No. 2
 def synthesize(self, text, out):
     cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
     seq = text_to_sequence(text, cleaner_names)
     feed_dict = {
         self.model.inputs: [np.asarray(seq, dtype=np.int32)],
         self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
     }
     wav = self.session.run(self.wav_output, feed_dict=feed_dict)
     wav = audio.inv_preemphasis(wav)
     wav = wav[:audio.find_endpoint(wav)]
     audio.save_wav(wav, out)
Example No. 3
 def on_get(self, req, res):
     text = req.params.get('text')
     if not text:
         raise falcon.HTTPBadRequest()
     speaker = int(req.params.get('speaker') or 5)
     wav = tts(model, text, 0, speaker)
     #    wav = audio.inv_preemphasis(wav)
     #    wav = wav[:audio.find_endpoint(wav)]
     out = io.BytesIO()
     audio.save_wav(wav, out)
     res.data = out.getvalue()
     res.content_type = 'audio/wav'
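A hedged sketch of how a handler like this might be mounted, assuming falcon 2.x (where the application object is falcon.API; falcon 3 renamed it to falcon.App) and a resource class named SynthesisResource that defines the on_get above (both names are assumptions):

    import falcon
    from wsgiref import simple_server

    api = falcon.API()
    api.add_route('/synthesize', SynthesisResource())

    if __name__ == '__main__':
        # GET /synthesize?text=hello&speaker=5 should return audio/wav bytes.
        simple_server.make_server('0.0.0.0', 9000, api).serve_forever()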
Example No. 4
 def synthesize(self, text):
   cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
   seq = text_to_sequence(text, cleaner_names)
   feed_dict = {
     self.model.inputs: [np.asarray(seq, dtype=np.int32)],
     self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
   }
   wav = self.session.run(self.wav_output, feed_dict=feed_dict)
   wav = audio.inv_preemphasis(wav)
   wav = wav[:audio.find_endpoint(wav)]
   out = io.BytesIO()
   audio.save_wav(wav, out)
   return out.getvalue()
Example No. 5
 def synthesize(self, text):
   # Convert Chinese text to pinyin with tone numbers
   text = Pinyin().get_pinyin(text, " ", tone_marks='numbers')
   cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
   # Convert the pinyin string to a sequence of symbol IDs
   seq = text_to_sequence(text, cleaner_names)
   feed_dict = {
     self.model.inputs: [np.asarray(seq, dtype=np.int32)],
     self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)}
   wav = self.session.run(self.wav_output, feed_dict=feed_dict)
   wav = audio.inv_preemphasis(wav)
   out = io.BytesIO()
   audio.save_wav(wav, out)
   return out.getvalue()
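The Pinyin class here appears to come from the xpinyin package, whose get_pinyin accepts a splitter and a tone_marks option. A quick check of the conversion, under that assumption:

    from xpinyin import Pinyin

    p = Pinyin()
    # With tone_marks='numbers', tones are appended as digits,
    # e.g. roughly "ni3 hao3" for the input below.
    print(p.get_pinyin(u'你好', ' ', tone_marks='numbers'))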
Example No. 6
 def synthesize(self, text):
     cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
     seq = text_to_sequence(text, cleaner_names)
     feed_dict = {
         self.model.inputs: [np.asarray(seq, dtype=np.int32)],
         self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
     }
     linears = self.session.run(self.model.linear_outputs[0],
                                feed_dict=feed_dict)
     wav = audio.inv_spectrogram(linears.T)
     wav = wav[:audio.find_endpoint(wav)]
     out = io.BytesIO()
     audio.save_wav(wav, out)
     return out.getvalue()
Example No. 7
 def synthesize(self, in_file):
   src_spectrogram = audio.spectrogram(in_file,
                                       num_src_freq=hparams.num_src_freq,
                                       frame_length_ms=hparams.src_frame_length_ms).astype(np.float32)
   feed_dict = {
     self.model.inputs: [np.asarray(src_spectrogram, dtype=np.float32)],
     self.model.input_lengths: np.asarray([len(src_spectrogram)], dtype=np.int32)
   }
   wav = self.session.run(self.wav_output, feed_dict=feed_dict)
   wav = audio.inv_preemphasis(wav)
   wav = wav[:audio.find_endpoint(wav)]
   out = io.BytesIO()
   audio.save_wav(wav, out)
   return out.getvalue()
Example No. 8
 def synthesize(self, text, reference_mel):
     cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
     seq = text_to_sequence(text, cleaner_names)
     feed_dict = {
         self.model.inputs: [np.asarray(seq, dtype=np.int32)],
         self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
         self.model.mel_targets:
         [np.asarray(reference_mel, dtype=np.float32)]
     }
     wav = self.session.run(self.wav_output, feed_dict=feed_dict)
     wav = audio.inv_preemphasis(wav)
     wav = wav[:audio.find_endpoint(wav)]
     out = io.BytesIO()
     audio.save_wav(wav, out)
     return out.getvalue()
Example No. 9
    def synthesize(self, input_path):
        s, sr = sf.read(input_path)
        spec = audio.melspectrogram(s).astype(np.float32).T

        feed_dict = {
            self.model.inputs: [np.asarray(spec, dtype=np.float32)],
            self.model.input_lengths: np.asarray([spec.shape[0]],
                                                 dtype=np.int32)
        }
        wav = self.session.run(self.wav_output, feed_dict=feed_dict)
        wav = audio.inv_preemphasis(wav)
        wav = wav[:audio.find_endpoint(wav)]
        out = io.BytesIO()
        audio.save_wav(wav, out)
        return out.getvalue()
Example No. 10
 def synthesize(self, text):
     seq = textinput.to_sequence(
         text,
         force_lowercase=hparams.force_lowercase,
         expand_abbreviations=hparams.expand_abbreviations)
     feed_dict = {
         self.model.inputs: [np.asarray(seq, dtype=np.int32)],
         self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
     }
     spec, alignments = self.session.run(
         [self.model.linear_outputs[0], self.model.alignments[0]],
         feed_dict=feed_dict)
     out = io.BytesIO()
     audio.save_wav(audio.inv_spectrogram(spec.T), out)
     return out.getvalue(), alignments
Example No. 11
    def synthesize(self, text):  # for demo_server
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq = text_to_sequence(text, cleaner_names)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
        }
        wav = self.session.run(self.wav_output, feed_dict=feed_dict)
        wav = audio.inv_preemphasis(wav)
        wav = wav[:audio.find_endpoint(wav)]
        out = io.BytesIO()
        audio.save_wav(wav, out)
        return out.getvalue()  # returns a bytes object
Example No. 12
 def synthesize(self, text):
   text = arpa.to_arpa(text)
   cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
   seq = text_to_sequence(text, cleaner_names)
   feed_dict = {
     self.model.inputs: [np.asarray(seq, dtype=np.int32)],
     self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
   }
   wav = self.session.run(self.wav_output, feed_dict=feed_dict)
   wav = audio.inv_preemphasis(wav)
   wav = wav[:audio.find_endpoint(wav)]
   out = io.BytesIO()
   audio.save_wav(wav, out)
   # Also keep a copy on disk for inspection.
   audio.save_wav(wav, "/content/drive/MyDrive/voice_cloning/out_sample")
   print("finished")
   return out.getvalue()
Example No. 13
 def synthesize(self, text, base_path, idx):
     seq = text_to_sequence(text)
     feed_dict = {
         self.model.inputs: [np.asarray(seq, dtype=np.int32)],
         self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
     }
     input_seq, wav, alignment = self.session.run(
         [self.inputs, self.wav_output, self.alignments],
         feed_dict=feed_dict)
     wav = audio.inv_preemphasis(wav)
     wav = wav[:audio.find_endpoint(wav)]
     out = io.BytesIO()
     audio.save_wav(wav, out)
     input_seq = sequence_to_text(input_seq)
     plot.plot_alignment(alignment, '%s-%d-align.png' % (base_path, idx),
                         input_seq)
     return out.getvalue()
Example No. 14
def main(args):
    os.makedirs(args.model_dir, exist_ok=True)
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=args.model_dir,
        params=hparams,
        config=RunConfig(
            save_summary_steps=args.summary_interval,
            save_checkpoints_steps=args.checkpoint_interval,
            session_config=SESS_CFG,
            # log_step_count_steps=100,
            keep_checkpoint_max=2))
    if args.mode == 'train':
        os.makedirs(args.data_dir, exist_ok=True)
        estimator.train(input_fn=lambda: train_input_fn(args.data_dir))
    elif args.mode == 'predict':
        assert len(args.texts), "No text to predict"
        results = estimator.predict(
            input_fn=lambda: predict_input_fn(args.texts))
        for idx, wav in enumerate(results):
            wav = inv_preemphasis(wav)
            # wav = wav[:find_endpoint(wav)]
            # sp.save('wav_{}.npy'.format(idx), wav, allow_pickle=False)
            save_wav(wav, 'output_{}.wav'.format(idx))
            # break
    elif args.mode == 'export':
        os.makedirs(args.export_dir, exist_ok=True)
        estimator.export_saved_model(
            args.export_dir,
            tf.estimator.export.build_raw_serving_input_receiver_fn(
                {
                    'inputs':
                    tf.placeholder(
                        dtype=tf.int32, shape=(None, None), name='inputs'),
                    'lengths':
                    tf.placeholder(
                        dtype=tf.int32, shape=(None, ), name='lengths'),
                },
                default_batch_size=None),
            # assets_extra=None,
            # as_text=False,
            # checkpoint_path=None,
            # experimental_mode=ModeKeys.PREDICT
        )
    else:
        raise KeyError('Unknown Mode <{}>'.format(args.mode))
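A plausible argument parser for this main(); every flag name is inferred from the args.* attributes the function reads, and the defaults are illustrative assumptions:

    import argparse

    if __name__ == '__main__':
        parser = argparse.ArgumentParser()
        parser.add_argument('--mode', choices=['train', 'predict', 'export'], default='train')
        parser.add_argument('--model_dir', default='model')
        parser.add_argument('--data_dir', default='data')
        parser.add_argument('--export_dir', default='export')
        parser.add_argument('--summary_interval', type=int, default=100)
        parser.add_argument('--checkpoint_interval', type=int, default=1000)
        parser.add_argument('--texts', nargs='*', default=[])
        main(parser.parse_args())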
Example No. 15
 def save_audio():
     # Relies on `model`, `trainer`, `alignment`, `input_seq`, `args`,
     # `commit`, and `loss` from the enclosing training scope.
     # The model instance holds the spectrogram that was processed last.
     spectrogram = model.spectrogram  # TODO: change this specification
     waveform = audio.inv_spectrogram(spectrogram.T)
     audio.save_wav(
         waveform,
         os.path.join(
             log_dir,
             'iteration_{.updater.iteration}-audio.wav'.format(trainer)))
     plot.plot_alignment(
         alignment,
         os.path.join(
             log_dir,
             'iteration_{.updater.iteration}-align.png'.format(trainer)),
         info='%s, %s, %s, iteration_{.updater.iteration}, loss=%.5f'.format(
             trainer) % (args.model, commit, time_string(), loss))
     log('Input: %s' % textinput.to_string(input_seq))
Example No. 16
    def synthesize(self,
                   path_in,
                   path_re,
                   mel_targets=None,
                   reference_mel=None,
                   alignment_path=None):
        wav_in = audio.load_wav(path_in)
        wav_re = audio.load_wav(path_re)
        mel_in = audio.melspectrogram(wav_in).astype(np.float32)
        mel_re = audio.melspectrogram(wav_re).astype(np.float32)
        # print(mel_jp)
        feed_dict = {
            self.model.inputs: [mel_in.T],
            # mel_in is (n_mels, frames); the model is fed the time-major
            # transpose, so the length is the frame count, not n_mels.
            self.model.input_lengths: np.asarray([mel_in.shape[1]],
                                                 dtype=np.int32),
            self.model.inputs_jp: [mel_re.T],
        }
        # if mel_targets is not None:
        #   mel_targets = np.expand_dims(mel_targets, 0)
        #   print(reference_mel.shapex)
        #   feed_dict.update({self.model.mel_targets: np.asarray(mel_targets, dtype=np.float32)})
        # if reference_mel is not None:
        #   reference_mel = np.expand_dims(reference_mel, 0)
        #   print(reference_mel.shapex)
        #   feed_dict.update({self.model.reference_mel: np.asarray(reference_mel, dtype=np.float32)})

        wav_out, alignments = self.session.run(
            [self.wav_output, self.alignments], feed_dict=feed_dict)
        wav = audio.inv_preemphasis(wav_out)
        end_point = audio.find_endpoint(wav)
        wav = wav[:end_point]
        nowTime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")  # current timestamp
        randomNum = str(random.randint(0, 100)).zfill(2)  # random suffix, zero-padded to two digits
        uniqueNum = nowTime + randomNum
        out_dir = os.path.join("static", "out", uniqueNum + ".wav")
        out_name = uniqueNum + ".wav"

        audio.save_wav(wav, out_dir)
        out = io.BytesIO()
        audio.save_wav(wav, out)
        # n_frame = int(end_point / (hparams.frame_shift_ms / 1000* hparams.sample_rate)) + 1
        # plot.plot_alignment(alignments[:,:n_frame], alignment_path, info='%s' % (path))
        return out_dir, out_name
Example No. 17
 def synthesize(self, text):
     #print('synthesize:',text)
     cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
     #text=sentence_to_pinyin(text)
     #print('text:',text)
     #print('cleaner_names:',cleaner_names)
     seq = text_to_sequence_zh(text, cleaner_names)
     print(seq)
     feed_dict = {
         self.model.inputs: [np.asarray(seq, dtype=np.int32)],
         self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
     }
     wav = self.session.run(self.wav_output, feed_dict=feed_dict)
     wav = audio.inv_preemphasis(wav)
     wav = wav[:audio.find_endpoint(wav)]
     out = io.BytesIO()
     audio.save_wav(wav, out)
     return out.getvalue()
Example No. 18
 def synthesize(self, text):
     cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
     # g2p = G2p()
     c_text=text.split('|')[0]
     p_text=text.split('|')[1]
     c_seq = text_to_sequence(c_text, cleaner_names)
     p_seq = text_to_sequence(p_text, cleaner_names)
     feed_dict = {
         self.model.c_inputs: [np.asarray(c_seq, dtype=np.int32)],
         self.model.p_inputs: [np.asarray(p_seq, dtype=np.int32)],
         self.model.c_input_lengths: np.asarray([len(c_seq)], dtype=np.int32),
         self.model.p_input_lengths: np.asarray([len(p_seq)], dtype=np.int32)
     }
     wav = self.session.run(self.wav_output, feed_dict=feed_dict)
     wav = audio.inv_preemphasis(wav)
     wav = wav[:audio.find_endpoint(wav)]
     out = io.BytesIO()
     audio.save_wav(wav, out)
     return out.getvalue()
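This variant feeds separate character and phoneme encoders, so the input is a single string with a '|' separator between the text and its phoneme rendering. A hypothetical call (the synthesizer name and the ARPAbet string are illustrative):

    wav_bytes = synthesizer.synthesize('printing|P R IH1 N T IH0 NG')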
Example No. 19
 def synthesize(self, text, title):
     cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
     seq = text_to_sequence(text, cleaner_names)
     feed_dict = {
         self.model.inputs: [np.asarray(seq, dtype=np.int32)],
         self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
     }
     wav = self.session.run(self.wav_output, feed_dict=feed_dict)
     wav = audio.inv_preemphasis(wav)
     wav = wav[:audio.find_endpoint(wav)]
     out = io.BytesIO()
     audio.save_wav(wav, out)
     audio_path = os.path.join(os.getcwd(), "narration", "saved_audio", title + ".wav")
     print(audio_path)
     with open(audio_path, "wb") as f:
         f.write(out.getvalue())
     # Note: title is interpolated into a shell command; prefer
     # subprocess.run(['aplay', audio_path]) for untrusted input.
     os.system("aplay " + audio_path)
     return out.getvalue()
Example No. 20
 def synthesize(self, text):
     cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
     seq = text_to_sequence(text, cleaner_names)
     pprint('Text: ' + text)
     #pprint('Seq')
     #pprint(seq)
     feed_dict = {
         self.model.inputs: [np.asarray(seq, dtype=np.int32)],
         self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
     }
     pprint(self.wav_output)
     pprint('>>> Getting wav')
     wav = self.session.run(self.wav_output, feed_dict=feed_dict)
     pprint('>>> Gotten wav')
     #wav = audio.inv_preemphasis(wav)
     # The audio is typically ~13 seconds unless truncated:
     #wav = wav[:audio.find_endpoint(wav)]
     out = io.BytesIO()
     audio.save_wav(wav, out)
     return out.getvalue()
Example No. 21
 def synthesize(self, text):
     with Synthesizer.mutex:
         if Synthesizer.processing:
             return None
         Synthesizer.processing = True
         try:
             cleaner_names = [
                 x.strip() for x in hparams.cleaners.split(',')
             ]
             seq = text_to_sequence(text, cleaner_names)
             feed_dict = {
                 self.model.inputs: [np.asarray(seq, dtype=np.int32)],
                 self.model.input_lengths:
                 np.asarray([len(seq)], dtype=np.int32)
             }
             wav = self.session.run(self.wav_output, feed_dict=feed_dict)
             wav = audio.inv_preemphasis(wav)
             wav = wav[:audio.find_endpoint(wav)]
             out = io.BytesIO()
             audio.save_wav(wav, out)
             return out.getvalue()
         finally:
             # Reset the flag even if synthesis raises, so later
             # requests are not rejected forever.
             Synthesizer.processing = False
Example No. 22
    def synthesize(self,
                   text,
                   mel_targets=None,
                   reference_mel=None,
                   alignment_path=None):
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq = text_to_sequence(text, cleaner_names)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
        }
        if mel_targets is not None:
            mel_targets = np.expand_dims(mel_targets, 0)
            feed_dict.update({
                self.model.mel_targets:
                np.asarray(mel_targets, dtype=np.float32)
            })
        if reference_mel is not None:
            reference_mel = np.expand_dims(reference_mel, 0)
            feed_dict.update({
                self.model.reference_mel:
                np.asarray(reference_mel, dtype=np.float32)
            })

        wav, alignments = self.session.run([self.wav_output, self.alignments],
                                           feed_dict=feed_dict)
        wav = audio.inv_preemphasis(wav)
        end_point = audio.find_endpoint(wav)
        wav = wav[:end_point]
        out = io.BytesIO()
        audio.save_wav(wav, out)
        n_frame = int(
            end_point /
            (hparams.frame_shift_ms / 1000 * hparams.sample_rate)) + 1
        text = '\n'.join(textwrap.wrap(text, 70, break_long_words=False))
        plot.plot_alignment(alignments[:, :n_frame],
                            alignment_path,
                            info='%s' % (text))
        return out.getvalue()
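The n_frame arithmetic above converts an endpoint measured in samples into a spectrogram frame count: the hop length in samples is frame_shift_ms / 1000 * sample_rate. A small worked sketch with illustrative hparams values:

    frame_shift_ms = 12.5
    sample_rate = 22050
    hop_samples = frame_shift_ms / 1000 * sample_rate  # 275.625 samples per frame
    end_point = 55125                                  # 2.5 s of trimmed audio
    n_frame = int(end_point / hop_samples) + 1         # 201 frames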
Example No. 23
    def synthesize(self, images_dir, output_wav_dir):
        for path, _, filenames in os.walk(images_dir):
            for i in trange(len(filenames)):
                test_file = filenames[i]
                if test_file.endswith('.png'):
                    base_file_name, _ = os.path.splitext(test_file)
                    raw_image = imread(os.path.join(path, test_file),
                                       mode='RGB')
                    processed_image = imresize(raw_image, (224, 224, 3))

                    feed_dict = {
                        self.model.inputs:
                        [np.asarray(processed_image, dtype=np.float32)],
                    }
                    wav = self.session.run(self.wav_output,
                                           feed_dict=feed_dict)
                    wav = audio.inv_preemphasis(wav)
                    wav = wav[:audio.find_endpoint(wav)]
                    audio_out_path = os.path.join(
                        output_wav_dir, 'eval-{}.wav'.format(base_file_name))
                    audio.save_wav(wav, audio_out_path)
                    print('Wav - {} generated successfully!'.format(
                        audio_out_path))
Example No. 24
 def synthesize(self, lab_name):
     lab = np.load(lab_name)
     lab = np.expand_dims(lab, axis=0)
     feed_dict = {
         self.model.inputs:
         lab,
         self.model.input_lengths:
         np.asarray([lab.shape[1]], dtype=np.int32),
         # change 0 to 1 or others based on the speaker
         self.model.speaker_ids:
         np.asarray([2], dtype=np.int32)
     }
     wav, mel_outputs = self.session.run(
         [self.wav_output, self.model.mel_outputs[0]], feed_dict=feed_dict)
     wav = audio.inv_preemphasis(wav)
     _len = audio.find_endpoint(wav)
     wav = wav[:_len]
     # Trim the mel output to the same endpoint; the frame count is
     # derived from the sample endpoint via the hop length, as in the
     # other examples here.
     frames = int(_len / (hparams.frame_shift_ms / 1000 * hparams.sample_rate)) + 1
     mel_outputs = mel_outputs[:frames, :]
     out = io.BytesIO()
     audio.save_wav(wav, out)
     return out.getvalue(), mel_outputs
Example No. 25
    def synthesize(self, text, identity, path=None, path_align=None):
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq = text_to_sequence2(text, cleaner_names)[:-1]
        print(seq)
        print(sequence_to_text2(seq))
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
            self.model.identities: np.asarray([identity], dtype=np.int32),
        }
        wav, alignment = self.session.run([self.wav_output, self.alignment],
                                          feed_dict=feed_dict)
        if path_align is not None:
            plot.plot_alignment(alignment, path_align)
        wav = audio.inv_preemphasis(wav)
        #wav = wav[:audio.find_endpoint(wav)]
        out = io.BytesIO()
        # Also write into the buffer so the returned bytes are non-empty.
        audio.save_wav(wav, out)
        if path is not None:
            audio.save_wav(wav, path)
        else:
            audio.save_wav(wav, './1.wav')

        return out.getvalue()
Example No. 26
    def synthesize(self, text, return_wav=False):
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq = text_to_sequence(text, cleaner_names)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
        }
        wav, alignment = self.session.run([self.wav_output, self.alignment],
                                          feed_dict=feed_dict)

        audio_endpoint = audio.find_endpoint(wav)
        alignment_endpoint = find_alignment_endpoint(alignment.shape,
                                                     audio_endpoint / len(wav))

        wav = wav[:audio_endpoint]
        alignment = alignment[:, :alignment_endpoint]

        if return_wav:
            return wav, alignment

        out = io.BytesIO()
        audio.save_wav(wav, out)
        return out.getvalue(), alignment
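find_alignment_endpoint is not defined in this example. A plausible, purely hypothetical implementation consistent with how it is called (it receives the alignment matrix shape and the fraction of audio kept after trimming):

    def find_alignment_endpoint(alignment_shape, trim_ratio):
        # alignment_shape is assumed to be (encoder_steps, decoder_frames);
        # scale the decoder axis by the same ratio the waveform was trimmed by.
        return max(1, int(alignment_shape[1] * trim_ratio))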
Example No. 27
 def synthesize(self, text1, text2):
     seq1 = textinput_fr.to_sequence(
         text1,
         force_lowercase=hparams.force_lowercase,
         expand_abbreviations=hparams.expand_abbreviations)
     seq2 = textinput_fr.to_sequence(
         text2,
         force_lowercase=hparams.force_lowercase,
         expand_abbreviations=False)
     feed_dict = {
         self.model.inputs1: [np.asarray(seq1, dtype=np.int32)],
         self.model.input_lengths1: np.asarray([len(seq1)], dtype=np.int32),
         self.model.inputs2: [np.asarray(seq2, dtype=np.int32)],
         self.model.input_lengths2: np.asarray([len(seq2)], dtype=np.int32)
     }
     spec, alignments1, alignments2 = self.session.run([
         self.model.linear_outputs[0], self.model.alignments1[0],
         self.model.alignments2[0]
     ],
                                                       feed_dict=feed_dict)
     out = io.BytesIO()
     audio.save_wav(audio.inv_spectrogram(spec.T), out)
     return out.getvalue(), alignments1, alignments2
Example No. 28
    def synthesize(self,
                   text,
                   reference_mel=None,
                   gst_index=None,
                   gst_scale=None):  # gst_index and gst_scale are accepted but unused below
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq = text_to_sequence(text, cleaner_names)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
        }
        if reference_mel is not None:
            reference_mel = np.expand_dims(reference_mel, 0)
            feed_dict.update({
                self.model.reference_mel:
                np.asarray(reference_mel, dtype=np.float32)
            })

        wav = self.session.run(self.wav_output, feed_dict=feed_dict)
        wav = audio.inv_preemphasis(wav)
        wav = wav[:audio.find_endpoint(wav)]
        out = io.BytesIO()
        audio.save_wav(wav, out)
        return out.getvalue()
Example No. 29
def train(log_dir, input_path, checkpoint_path, is_restore):
    # Log the info
    log('Checkpoint path: %s' % checkpoint_path)
    log('Loading training data from: %s' % input_path)
    log(hparams_debug_string())

    # Set up DataFeeder:
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = DataFeeder(coord, input_path, hparams)

    # Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    with tf.variable_scope('model') as scope:
        model = create_model('tacotron', hparams)
        model.initialize(feeder.inputs, feeder.input_lengths,
                         feeder.mel_targets, feeder.linear_targets)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = add_stats(model)

    # Bookkeeping:
    step = 0
    saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2)
    # Train!
    with tf.Session() as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            if is_restore:
                # Restore from a checkpoint if the user requested it.
                saver.restore(sess, checkpoint_path)
                log('Resuming from checkpoint')
            else:
                log('Starting new training')

            feeder.start_in_session(sess)

            while not coord.should_stop():
                start_time = time.time()
                step, loss, opt = sess.run(
                    [global_step, model.loss, model.optimize])
                time_interval = time.time() - start_time

                message = 'Step %d, %.03f sec, loss=%.05f' % (step, loss,
                                                              time_interval)
                log(message)

                if loss > 100 or math.isnan(loss):
                    log('Loss exploded to %.05f at step %d!' % (loss, step),
                        slack=True)
                    raise Exception('Loss Exploded')

                if step % hparams.summary_interval == 0:
                    log('Writing summary at step: %d' % step)
                    summary_writer.add_summary(sess.run(stats), step)

                if step % hparams.checkpoint_interval == 0:
                    log('Saving checkpoint to: %s-%d' %
                        (checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)
                    log('Saving audio and alignment...')
                    input_seq, spectrogram, alignment = sess.run([
                        model.inputs[0], model.linear_outputs[0],
                        model.alignments[0]
                    ])
                    waveform = audio.inv_spectrogram(spectrogram.T)
                    audio.save_wav(
                        waveform,
                        os.path.join(log_dir, 'step-%d-audio.wav' % step))
                    plot.plot_alignment(
                        alignment,
                        os.path.join(log_dir, 'step-%d-align.png' % step),
                        info='%s, %s, step=%d, loss=%.5f' %
                        ('tacotron', time_string(), step, loss))
                    log('Input: %s' % sequence_to_text(input_seq))

        except Exception as e:
            log('Exiting due to exception: %s' % e, slack=True)
            coord.request_stop(e)
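add_stats is referenced but not shown in this example. A hypothetical sketch of such a helper, merging a few scalar summaries for TensorBoard (the exact summaries are assumptions):

    def add_stats(model):
        with tf.variable_scope('stats'):
            tf.summary.scalar('loss', model.loss)
            tf.summary.histogram('linear_outputs', model.linear_outputs)
            return tf.summary.merge_all()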
Example No. 30
def train(log_dir, args):
    run_name = args.name or args.model

    # Note: the incoming log_dir parameter is overwritten here.
    log_dir = os.path.join(args.base_dir, 'logs-%s' % run_name)
    os.makedirs(log_dir, exist_ok=True)
    infolog.init(os.path.join(log_dir, 'train.log'), run_name, args.slack_url)
    checkpoint_path = os.path.join(log_dir, 'model.ckpt')

    with open(args.input, encoding='utf-8') as f:
        metadata = [row.strip().split('|') for row in f]
    metadata = sorted(metadata, key=lambda x: x[2])

    data_element = get_dataset(metadata, args.data_dir, hparams)

    global_step = tf.Variable(0, name='global_step', trainable=False)
    with tf.variable_scope('model') as scope:
        model = create_model(args.model, hparams)
        model.initialize(data_element['input'], data_element['input_lengths'],
                         data_element['mel_targets'],
                         data_element['linear_targets'])
        model.add_loss()
        model.add_optimizer(global_step)

    saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    for _ in range(int(args.max_iter)):

        start_time = time.time()
        step, mel_loss, lin_loss, loss, opt = sess.run([
            global_step, model.mel_loss, model.linear_loss, model.loss,
            model.optimize
        ])
        end_time = time.time()

        message = 'Step %7d [%.03f sec/step, loss = %.05f (mel : %.05f + lin : %.05f)]' % (
            step, end_time - start_time, loss, mel_loss, lin_loss)

        log(message)

        if loss > 100 or math.isnan(loss):
            log('Loss exploded to %.05f at step %d!' % (loss, step))
            raise Exception('Loss Exploded')

        if step % args.checkpoint_interval == 0:
            log('Saving checkpoint to: %s-%d' % (checkpoint_path, step))
            saver.save(sess, checkpoint_path, global_step=step)

            log('Saving audio and alignment...')
            input_seq, spectrogram, alignment = sess.run([
                model.inputs[0], model.linear_outputs[0], model.alignments[0]
            ])
            waveform = audio.inv_spectrogram(spectrogram.T)
            audio.save_wav(waveform,
                           os.path.join(log_dir, 'step-%d-audio.wav' % step))
            plot.plot_alignment(alignment,
                                os.path.join(log_dir,
                                             'step-%d-align.png' % step),
                                info='%s, %s, step=%d, loss=%.5f' %
                                (args.model, time_string(), step, loss))

            log('Input: %s' % sequence_to_text(input_seq))
Example No. 31
def train(log_dir, args):
  commit = get_git_commit() if args.git else 'None'
  checkpoint_path = os.path.join(log_dir, 'model.ckpt')
  input_path = os.path.join(args.base_dir, args.input)
  log('Checkpoint path: %s' % checkpoint_path)
  log('Loading training data from: %s' % input_path)
  log('Using model: %s' % args.model)
  log(hparams_debug_string())

  # Set up DataFeeder:
  coord = tf.train.Coordinator()
  with tf.variable_scope('datafeeder') as scope:
    feeder = DataFeeder(coord, input_path, hparams)

  # Set up model:
  global_step = tf.Variable(0, name='global_step', trainable=False)
  with tf.variable_scope('model') as scope:
    model = create_model(args.model, hparams)
    model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.linear_targets)
    model.add_loss()
    model.add_optimizer(global_step)
    stats = add_stats(model)

  # Bookkeeping:
  step = 0
  time_window = ValueWindow(100)
  loss_window = ValueWindow(100)
  saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2)

  # Train!
  with tf.Session() as sess:
    try:
      summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
      sess.run(tf.global_variables_initializer())

      if args.restore_step:
        # Restore from a checkpoint if the user requested it.
        restore_path = '%s-%d' % (checkpoint_path, args.restore_step)
        saver.restore(sess, restore_path)
        log('Resuming from checkpoint: %s at commit: %s' % (restore_path, commit), slack=True)
      else:
        log('Starting new training run at commit: %s' % commit, slack=True)

      feeder.start_in_session(sess)

      while not coord.should_stop():
        start_time = time.time()
        step, loss, opt = sess.run([global_step, model.loss, model.optimize])
        time_window.append(time.time() - start_time)
        loss_window.append(loss)
        message = 'Step %-7d [%.03f sec/step, loss=%.05f, avg_loss=%.05f]' % (
          step, time_window.average, loss, loss_window.average)
        log(message, slack=(step % args.checkpoint_interval == 0))

        if loss > 100 or math.isnan(loss):
          log('Loss exploded to %.05f at step %d!' % (loss, step), slack=True)
          raise Exception('Loss Exploded')

        if step % args.summary_interval == 0:
          log('Writing summary at step: %d' % step)
          summary_writer.add_summary(sess.run(stats), step)

        if step % args.checkpoint_interval == 0:
          log('Saving checkpoint to: %s-%d' % (checkpoint_path, step))
          saver.save(sess, checkpoint_path, global_step=step)
          log('Saving audio and alignment...')
          input_seq, spectrogram, alignment = sess.run([
            model.inputs[0], model.linear_outputs[0], model.alignments[0]])
          waveform = audio.inv_spectrogram(spectrogram.T)
          audio.save_wav(waveform, os.path.join(log_dir, 'step-%d-audio.wav' % step))
          plot.plot_alignment(alignment, os.path.join(log_dir, 'step-%d-align.png' % step),
            info='%s, %s, %s, step=%d, loss=%.5f' % (args.model, commit, time_string(), step, loss))
          log('Input: %s' % sequence_to_text(input_seq))

    except Exception as e:
      log('Exiting due to exception: %s' % e, slack=True)
      traceback.print_exc()
      coord.request_stop(e)
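ValueWindow is used here only through its constructor, append, and average; a minimal sketch assuming exactly that interface (a fixed-size window with a running mean):

    class ValueWindow:
        def __init__(self, window_size=100):
            self._window_size = window_size
            self._values = []

        def append(self, x):
            # Keep only the most recent window_size values.
            self._values = (self._values + [x])[-self._window_size:]

        @property
        def average(self):
            return sum(self._values) / max(1, len(self._values))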
Example No. 32
def train(log_dir, args, trans_ckpt_dir=None):
    commit = get_git_commit() if args.git else 'None'
    checkpoint_path = os.path.join(log_dir, 'model.ckpt')
    if trans_ckpt_dir is not None:
        trans_checkpoint_path = os.path.join(trans_ckpt_dir, 'model.ckpt')
    else:
        # Fall back so trans_checkpoint_path is always defined; it is
        # logged and used when restoring below.
        trans_checkpoint_path = checkpoint_path

    input_path = os.path.join(args.base_dir, args.input)
    log('Checkpoint path: %s' % trans_checkpoint_path)
    log('Loading training data from: %s' % input_path)
    log('Using model: %s' % args.model)
    log(hparams_debug_string())

    # Set up DataFeeder:
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = DataFeeder(coord, input_path, hparams)

    # Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    with tf.variable_scope('model') as scope:
        model = create_model(args.model, hparams)
        model.initialize(feeder.inputs, feeder.input_lengths,
                         feeder.mel_targets, feeder.linear_targets)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = add_stats(model)

    # Bookkeeping:
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2)

    # Train!
    with tf.Session() as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            if args.restore_step:
                # Restore from a checkpoint if the user requested it.
                restore_path = '%s-%d' % (trans_checkpoint_path,
                                          args.restore_step)
                saver.restore(sess, restore_path)
                log('Resuming from checkpoint: %s at commit: %s' %
                    (restore_path, commit),
                    slack=True)
            else:
                log('Starting new training run at commit: %s' % commit,
                    slack=True)

            feeder.start_in_session(sess)

            while not coord.should_stop():
                start_time = time.time()
                step, loss, opt = sess.run(
                    [global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step %-7d [%.03f sec/step, loss=%.05f, avg_loss=%.05f]' % (
                    step, time_window.average, loss, loss_window.average)
                log(message, slack=(step % args.checkpoint_interval == 0))

                if loss > 100 or math.isnan(loss):
                    log('Loss exploded to %.05f at step %d!' % (loss, step),
                        slack=True)
                    raise Exception('Loss Exploded')

                if step % args.summary_interval == 0:
                    log('Writing summary at step: %d' % step)
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.checkpoint_interval == 0:
                    log('Saving checkpoint to: %s-%d' %
                        (checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)
                    log('Saving audio and alignment...')
                    input_seq, spectrogram, alignment = sess.run([
                        model.inputs[0], model.linear_outputs[0],
                        model.alignments[0]
                    ])
                    waveform = audio.inv_spectrogram(spectrogram.T)
                    audio.save_wav(
                        waveform,
                        os.path.join(log_dir, 'step-%d-audio.wav' % step))
                    plot.plot_alignment(
                        alignment,
                        os.path.join(log_dir, 'step-%d-align.png' % step),
                        info='%s, %s, %s, step=%d, loss=%.5f' %
                        (args.model, commit, time_string(), step, loss))
                    log('Input: %s' % sequence_to_text(input_seq))

        except Exception as e:
            log('Exiting due to exception: %s' % e, slack=True)
            traceback.print_exc()
            coord.request_stop(e)