def synthesize(self, text, mel_targets=None, reference_mel=None):
     cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
     seq = text_to_sequence(text, cleaner_names)
     feed_dict = {
         self.model.inputs: [np.asarray(seq, dtype=np.int32)],
         self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
     }
     if mel_targets is not None:
         mel_targets = np.expand_dims(mel_targets, 0)
         feed_dict.update({
             self.model.mel_targets:
             np.asarray(mel_targets, dtype=np.float32)
         })
     if reference_mel is not None:
         reference_mel = np.expand_dims(reference_mel, 0)
         feed_dict.update({
             self.model.reference_mel:
             np.asarray(reference_mel, dtype=np.float32)
         })
     wav = self.session.run(self.wav_output, feed_dict=feed_dict)
     wav = audio.inv_preemphasis(wav)
     wav = wav[:audio.find_endpoint(wav)]
     out = io.BytesIO()
     audio.save_wav(wav, out)
     return out.getvalue()
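A minimal driver sketch for methods of this shape, assuming a Synthesizer class with a load(checkpoint) method as in the Tacotron demo code; the module name, checkpoint path and output filename below are placeholders:

from synthesizer import Synthesizer  # assumed module layout

synth = Synthesizer()
synth.load('logs-tacotron/model.ckpt-250000')   # placeholder checkpoint path
wav_bytes = synth.synthesize('Hello world.')    # returns the WAV file as bytes
with open('output.wav', 'wb') as f:
    f.write(wav_bytes)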
Example #2
  def synthesize(self, text):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    print('***cleaner_names:', cleaner_names)
    print('***text:', text)
    texts = tokenizer.tokenize(text)
    waves = []

    for text in texts:
      seq = text_to_sequence(text, cleaner_names)
      print('***seq:', seq)

      feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
      }
      wav = self.session.run(self.wav_output, feed_dict=feed_dict)
      wav = audio.inv_preemphasis(wav)
      wav = wav[:audio.find_endpoint(wav)]
      waves.append(wav)
    # join the per-sentence waveforms into one waveform
    wavestack = np.concatenate(waves)
    out = io.BytesIO()
    audio.save_wav(wavestack, out)
    return out.getvalue()
Example #3
    def synthesize(self,
                   text,
                   mel_targets=None,
                   reference_mel=None,
                   reference_weight=None,
                   alignment_path=None,
                   reference_path=None,
                   style_path=None,
                   weight_path=None):
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq = text_to_sequence(text, cleaner_names)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
        }
        if mel_targets is not None:
            mel_targets = np.expand_dims(mel_targets, 0)
            feed_dict.update({
                self.model.mel_targets:
                np.asarray(mel_targets, dtype=np.float32)
            })
        elif reference_mel is not None:
            reference_mel = np.expand_dims(reference_mel, 0)
            feed_dict.update({
                self.model.reference_mel:
                np.asarray(reference_mel, dtype=np.float32)
            })
        elif reference_weight is not None:
            feed_dict.update({
                self.model.reference_weight:
                np.asarray(reference_weight, dtype=np.float32)
            })

        wav, alignments, style_embeddings, style_weights = self.session.run(
            [
                self.wav_output, self.alignments, self.style_embeddings,
                self.style_weights
            ],
            feed_dict=feed_dict)
        wav = audio.inv_preemphasis(wav)
        end_point = audio.find_endpoint(wav)
        wav = wav[:end_point]
        out = io.BytesIO()
        audio.save_wav(wav, out)
        n_frame = int(
            end_point /
            (hparams.frame_shift_ms / 1000 * hparams.sample_rate)) + 1
        text = '\n'.join(textwrap.wrap(text, 70, break_long_words=False))
        plot.plot_alignment(alignments[:, :n_frame],
                            alignment_path,
                            info='%s' % (text))
        plot.plot_weight(style_weights, weight_path)
        # np.save(reference_path, refer_embeddings)
        np.save(style_path, style_embeddings)
        return out.getvalue()
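To make the n_frame computation above concrete: with frame_shift_ms = 12.5 and sample_rate = 22050 (illustrative values, not taken from this code), one spectrogram frame covers 12.5 / 1000 * 22050 ≈ 276 samples, so an endpoint at sample 55000 gives n_frame = int(55000 / 275.625) + 1 = 200, and the alignment is cropped to those 200 decoder frames.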
Example #4
 def synthesize(self, text, out):
     cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
     seq = text_to_sequence(text, cleaner_names)
     feed_dict = {
         self.model.inputs: [np.asarray(seq, dtype=np.int32)],
         self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
     }
     wav = self.session.run(self.wav_output, feed_dict=feed_dict)
     wav = audio.inv_preemphasis(wav)
     wav = wav[:audio.find_endpoint(wav)]
     audio.save_wav(wav, out)
     return
Example #5
 def synthesize(self, text):
   cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
   seq = text_to_sequence(text, cleaner_names)
   feed_dict = {
     self.model.inputs: [np.asarray(seq, dtype=np.int32)],
     self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
   }
   wav = self.session.run(self.wav_output, feed_dict=feed_dict)
   wav = audio.inv_preemphasis(wav)
   wav = wav[:audio.find_endpoint(wav)]
   out = io.BytesIO()
   audio.save_wav(wav, out)
   return out.getvalue()
Example #6
 def synthesize(self, lab_name):
     lab = np.load(lab_name)
     lab = np.expand_dims(lab, axis=0)
     feed_dict = {
         self.model.inputs:
         lab,
         self.model.input_lengths:
         np.asarray([lab.shape[1]], dtype=np.int32),
         # speaker id: change this index (e.g. 0, 1, 2) to select a different speaker
         self.model.speaker_ids:
         np.asarray([2], dtype=np.int32)
     }
     wav, mel_outputs = self.session.run(
         [self.wav_output, self.model.mel_outputs[0]], feed_dict=feed_dict)
     wav = audio.inv_preemphasis(wav)
     _len = audio.find_endpoint(wav)
     wav = wav[:_len]
     # trim the mel output to the frames covered by the trimmed waveform
     frames = int(_len / (hparams.frame_shift_ms / 1000 * hparams.sample_rate)) + 1
     mel_outputs = mel_outputs[:frames, :]
     out = io.BytesIO()
     audio.save_wav(wav, out)
     return out.getvalue(), mel_outputs
Example #7
 def synthesize(self, in_file):
   src_spectrogram = audio.spectrogram(in_file,
                                       num_src_freq=hparams.num_src_freq,
                                       frame_length_ms=hparams.src_frame_length_ms).astype(np.float32)
   feed_dict = {
     self.model.inputs: [np.asarray(src_spectrogram, dtype=np.float32)],
     self.model.input_lengths: np.asarray([len(src_spectrogram)], dtype=np.int32)
   }
   wav = self.session.run(self.wav_output, feed_dict=feed_dict)
   wav = audio.inv_preemphasis(wav)
   wav = wav[:audio.find_endpoint(wav)]
   out = io.BytesIO()
   audio.save_wav(wav, out)
   return out.getvalue()
Example #8
 def synthesize(self, text):
     cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
     seq = text_to_sequence(text, cleaner_names)
     feed_dict = {
         self.model.inputs: [np.asarray(seq, dtype=np.int32)],
         self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
     }
     linears = self.session.run(self.model.linear_outputs[0],
                                feed_dict=feed_dict)
     wav = audio.inv_spectrogram(linears.T)
     wav = wav[:audio.find_endpoint(wav)]
     out = io.BytesIO()
     audio.save_wav(wav, out)
     return out.getvalue()
Example #9
    def synthesize(self, text):  # for demo_server
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq = text_to_sequence(text, cleaner_names)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
        }
        wav = self.session.run(self.wav_output, feed_dict=feed_dict)
        wav = audio.inv_preemphasis(wav)
        wav = wav[:audio.find_endpoint(wav)]
        out = io.BytesIO()

        audio.save_wav(wav, out)
        # print(type(out))
        return out.getvalue()  # returns bytes obj
Example #10
    def synthesize(self, input_path):
        s, sr = sf.read(input_path)
        spec = audio.melspectrogram(s).astype(np.float32).T

        feed_dict = {
            self.model.inputs: [np.asarray(spec, dtype=np.float32)],
            self.model.input_lengths: np.asarray([spec.shape[0]],
                                                 dtype=np.int32)
        }
        wav = self.session.run(self.wav_output, feed_dict=feed_dict)
        wav = audio.inv_preemphasis(wav)
        wav = wav[:audio.find_endpoint(wav)]
        out = io.BytesIO()
        audio.save_wav(wav, out)
        return out.getvalue()
Example #11
 def synthesize(self, text):
   text = arpa.to_arpa(text)
   cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
   seq = text_to_sequence(text, cleaner_names)
   feed_dict = {
     self.model.inputs: [np.asarray(seq, dtype=np.int32)],
     self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
   }
   wav = self.session.run(self.wav_output, feed_dict=feed_dict)
   wav = audio.inv_preemphasis(wav)
   wav = wav[:audio.find_endpoint(wav)]
   out = io.BytesIO()
   audio.save_wav(wav, out)
   # also keep a copy on Drive for later inspection
   audio.save_wav(wav, "/content/drive/MyDrive/voice_cloning/out_sample")
   print("synthesis finished")
   return out.getvalue()
Example #12
 def synthesize(self, text, base_path, idx):
     seq = text_to_sequence(text)
     feed_dict = {
         self.model.inputs: [np.asarray(seq, dtype=np.int32)],
         self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
     }
     input_seq, wav, alignment = self.session.run(
         [self.inputs, self.wav_output, self.alignments],
         feed_dict=feed_dict)
     wav = audio.inv_preemphasis(wav)
     wav = wav[:audio.find_endpoint(wav)]
     out = io.BytesIO()
     audio.save_wav(wav, out)
     input_seq = sequence_to_text(input_seq)
     plot.plot_alignment(alignment, '%s-%d-align.png' % (base_path, idx),
                         input_seq)
     return out.getvalue()
Example #13
    def synthesize(self,
                   path_in,
                   path_re,
                   mel_targets=None,
                   reference_mel=None,
                   alignment_path=None):
        wav_in = audio.load_wav(path_in)
        wav_re = audio.load_wav(path_re)
        mel_in = audio.melspectrogram(wav_in).astype(np.float32)
        mel_re = audio.melspectrogram(wav_re).astype(np.float32)
        # print(mel_jp)
        feed_dict = {
            self.model.inputs: [mel_in.T],
            # length in frames (mel_in has shape (n_mels, frames))
            self.model.input_lengths: np.asarray([mel_in.shape[1]],
                                                 dtype=np.int32),
            self.model.inputs_jp: [mel_re.T],
        }
        # if mel_targets is not None:
        #   mel_targets = np.expand_dims(mel_targets, 0)
        #   print(reference_mel.shapex)
        #   feed_dict.update({self.model.mel_targets: np.asarray(mel_targets, dtype=np.float32)})
        # if reference_mel is not None:
        #   reference_mel = np.expand_dims(reference_mel, 0)
        #   print(reference_mel.shapex)
        #   feed_dict.update({self.model.reference_mel: np.asarray(reference_mel, dtype=np.float32)})

        wav_out, alignments = self.session.run(
            [self.wav_output, self.alignments], feed_dict=feed_dict)
        wav = audio.inv_preemphasis(wav_out)
        end_point = audio.find_endpoint(wav)
        wav = wav[:end_point]
        nowTime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")  # current timestamp
        randomNum = random.randint(0, 100)  # random integer n, 0 <= n <= 100
        if randomNum < 10:
            randomNum = '0' + str(randomNum)  # zero-pad to two digits
        uniqueNum = str(nowTime) + str(randomNum)
        out_dir = "static\\out\\" + uniqueNum + ".wav"
        out_name = uniqueNum + ".wav"

        audio.save_wav(wav, out_dir)
        out = io.BytesIO()
        audio.save_wav(wav, out)
        # n_frame = int(end_point / (hparams.frame_shift_ms / 1000* hparams.sample_rate)) + 1
        # plot.plot_alignment(alignments[:,:n_frame], alignment_path, info='%s' % (path))
        return out_dir, out_name
Example #14
 def synthesize(self, text):
     #print('synthesize:',text)
     cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
     #text=sentence_to_pinyin(text)
     #print('text:',text)
     #print('cleaner_names:',cleaner_names)
     seq = text_to_sequence_zh(text, cleaner_names)
     print(seq)
     feed_dict = {
         self.model.inputs: [np.asarray(seq, dtype=np.int32)],
         self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
     }
     wav = self.session.run(self.wav_output, feed_dict=feed_dict)
     wav = audio.inv_preemphasis(wav)
     wav = wav[:audio.find_endpoint(wav)]
     out = io.BytesIO()
     audio.save_wav(wav, out)
     return out.getvalue()
Example #15
 def synthesize(self, text, title):
     cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
     seq = text_to_sequence(text, cleaner_names)
     feed_dict = {
         self.model.inputs: [np.asarray(seq, dtype=np.int32)],
         self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
     }
     wav = self.session.run(self.wav_output, feed_dict=feed_dict)
     wav = audio.inv_preemphasis(wav)
     wav = wav[:audio.find_endpoint(wav)]
     out = io.BytesIO()
     audio.save_wav(wav, out)
     cwd = os.getcwd()
     audio_dir = cwd + "/narration/saved_audio/" + title + ".wav"
     print(audio_dir)
     with open(audio_dir, "wb") as f:
         f.write(out.getvalue())
     os.system("aplay " + audio_dir)
     return out.getvalue()
Example #16
 def synthesize(self, text):
     cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
     # g2p = G2p()
     c_text=text.split('|')[0]
     p_text=text.split('|')[1]
     c_seq = text_to_sequence(c_text, cleaner_names)
     p_seq = text_to_sequence(p_text, cleaner_names)
     feed_dict = {
         self.model.c_inputs: [np.asarray(c_seq, dtype=np.int32)],
         self.model.p_inputs: [np.asarray(p_seq, dtype=np.int32)],
         self.model.c_input_lengths: np.asarray([len(c_seq)], dtype=np.int32),
         self.model.p_input_lengths: np.asarray([len(p_seq)], dtype=np.int32)
     }
     wav = self.session.run(self.wav_output, feed_dict=feed_dict)
     wav = audio.inv_preemphasis(wav)
     wav = wav[:audio.find_endpoint(wav)]
     out = io.BytesIO()
     audio.save_wav(wav, out)
     return out.getvalue()
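This variant expects the text argument to carry a character string and its phoneme string separated by '|'. A hypothetical call (the ARPAbet transcription is only illustrative) could look like:

wav_bytes = synthesizer.synthesize('printing|P R IH1 N T IH0 NG')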
Example #17
 def synthesize(self, text):
     with Synthesizer.mutex:
         if not Synthesizer.processing:
             Synthesizer.processing = True
             cleaner_names = [
                 x.strip() for x in hparams.cleaners.split(',')
             ]
             seq = text_to_sequence(text, cleaner_names)
             feed_dict = {
                 self.model.inputs: [np.asarray(seq, dtype=np.int32)],
                 self.model.input_lengths:
                 np.asarray([len(seq)], dtype=np.int32)
             }
             wav = self.session.run(self.wav_output, feed_dict=feed_dict)
             wav = audio.inv_preemphasis(wav)
             wav = wav[:audio.find_endpoint(wav)]
             out = io.BytesIO()
             audio.save_wav(wav, out)
             Synthesizer.processing = False
             return out.getvalue()
         else:
             return None
Example #18
    def synthesize(self, text, return_wav=False):
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq = text_to_sequence(text, cleaner_names)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
        }
        wav, alignment = self.session.run([self.wav_output, self.alignment],
                                          feed_dict=feed_dict)

        audio_endpoint = audio.find_endpoint(wav)
        alignment_endpoint = find_alignment_endpoint(alignment.shape,
                                                     audio_endpoint / len(wav))

        wav = wav[:audio_endpoint]
        alignment = alignment[:, :alignment_endpoint]

        if return_wav:
            return wav, alignment

        out = io.BytesIO()
        audio.save_wav(wav, out)
        return out.getvalue(), alignment
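find_alignment_endpoint is not shown in this snippet; a plausible sketch, assuming it simply keeps the same fraction of decoder frames as was kept of the waveform, is:

def find_alignment_endpoint(alignment_shape, fraction):
    # alignment_shape is (encoder_steps, decoder_frames); crop the decoder axis
    # by the same fraction that audio.find_endpoint kept of the waveform
    return int(alignment_shape[1] * fraction)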
Example #19
    def synthesize(self, images_dir, output_wav_dir):
        for path, _, filenames in os.walk(images_dir):
            for i in trange(len(filenames)):
                test_file = filenames[i]
                if str.endswith(test_file, '.png'):
                    base_file_name, _ = os.path.splitext(test_file)
                    raw_image = imread(os.path.join(path, test_file),
                                       mode='RGB')
                    processed_image = imresize(raw_image, (224, 224, 3))

                    feed_dict = {
                        self.model.inputs:
                        [np.asarray(processed_image, dtype=np.float32)],
                    }
                    wav = self.session.run(self.wav_output,
                                           feed_dict=feed_dict)
                    wav = audio.inv_preemphasis(wav)
                    wav = wav[:audio.find_endpoint(wav)]
                    audio_out_path = os.path.join(
                        output_wav_dir, 'eval-{}.wav'.format(base_file_name))
                    audio.save_wav(wav, audio_out_path)
                    print('Wav - {} generated successfully!'.format(
                        audio_out_path))
Example #20
def _pml_to_wav(pml_features,
                cfg,
                shift=0.005,
                dftlen=4096,
                nm_cont=False,
                verbose_level=0,
                mean_norm=None,
                std_norm=None,
                spec_type='mcep',
                pp_mcep=False,
                find_endpoint=False,
                threshold_db=0):
    # get the mean and variance, and denormalise
    if mean_norm is not None and std_norm is not None:
        std_tiled = np.tile(std_norm, (pml_features.shape[0], 1))
        mean_tiled = np.tile(mean_norm, (pml_features.shape[0], 1))
        pml_features = pml_features * std_tiled + mean_tiled

    # f0s is from flf0
    f0 = pml_features[:, cfg.acoustic_start_index['lf0']:cfg.
                      acoustic_start_index['lf0'] +
                      cfg.acoustic_in_dimension_dict['lf0']]

    f0 = np.squeeze(f0)  # remove the extra 1 dimension here
    f0[f0 > 0] = np.exp(f0[f0 > 0])
    ts = shift * np.arange(len(f0))
    f0s = np.vstack((ts, f0)).T

    # spec comes from fmcep ('mcep') or from warped spectral bands ('fwbnd')
    if spec_type == 'mcep':
        mcep = pml_features[:, cfg.acoustic_start_index['mgc']:cfg.
                            acoustic_start_index['mgc'] +
                            cfg.acoustic_in_dimension_dict['mgc']]

        if pp_mcep:
            from lib.merlin import generate_pp
            mcep = generate_pp.mcep_postproc_sptk(mcep,
                                                  cfg.wav_sr,
                                                  dftlen=dftlen)

        spec = sp.mcep2spec(mcep, sp.bark_alpha(cfg.wav_sr), dftlen)
    elif spec_type == 'fwbnd':
        compspec = pml_features[:, cfg.acoustic_start_index['mgc']:cfg.
                                acoustic_start_index['mgc'] +
                                cfg.acoustic_in_dimension_dict['mgc']]
        spec = np.exp(sp.fwbnd2linbnd(compspec, cfg.wav_sr, dftlen))

        if pp_mcep:
            from lib.merlin import generate_pp
            mcep = sp.spec2mcep(spec * cfg.wav_sr, sp.bark_alpha(cfg.wav_sr),
                                256)
            mcep_pp = generate_pp.mcep_postproc_sptk(mcep,
                                                     cfg.wav_sr,
                                                     dftlen=dftlen)
            spec = sp.mcep2spec(
                mcep_pp, sp.bark_alpha(cfg.wav_sr), dftlen=dftlen) / cfg.wav_sr

    # NM comes from bap
    fwnm = pml_features[:, cfg.acoustic_start_index['bap']:cfg.
                        acoustic_start_index['bap'] +
                        cfg.acoustic_in_dimension_dict['bap']]

    nm = sp.fwbnd2linbnd(fwnm, cfg.wav_sr, dftlen)

    # use standard PML vocoder
    wav = synthesize(cfg.wav_sr,
                     f0s,
                     spec,
                     NM=nm,
                     nm_cont=nm_cont,
                     verbose=verbose_level)

    # clip the wav to the endpoint if required
    if find_endpoint:
        wav = wav[:audio.find_endpoint(wav, threshold_db=threshold_db)]

    # return the raw wav data
    return wav
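A self-contained illustration of the lf0 handling above: non-positive values mark unvoiced frames and are left at zero, voiced frames are exponentiated back from log-f0, and each value is paired with its timestamp (frame index times the shift) to form the (time, f0) rows the vocoder expects. The numbers are toy values for illustration:

import numpy as np

shift = 0.005                          # 5 ms frame shift, as in the default above
lf0 = np.array([0.0, 5.3, 5.4, 0.0])   # toy log-f0 track, 0 marks unvoiced frames
f0 = lf0.copy()
f0[f0 > 0] = np.exp(f0[f0 > 0])        # voiced frames back to Hz: e**5.3 ~ 200 Hz
ts = shift * np.arange(len(f0))        # frame timestamps: 0.000, 0.005, 0.010, ...
f0s = np.vstack((ts, f0)).T            # shape (frames, 2): [time, f0] rows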