def synthesize(self):
    self.ui.log("Generating the mel spectrogram...")
    self.ui.set_loading(1)

    # Synthesize the spectrogram
    if self.synthesizer is None:
        model_dir = self.ui.current_synthesizer_model_dir
        checkpoints_dir = model_dir.joinpath("taco_pretrained")
        self.synthesizer = Synthesizer(checkpoints_dir, low_mem=self.low_mem)
    if not self.synthesizer.is_loaded():
        self.ui.log("Loading the synthesizer %s" % self.synthesizer.checkpoint_fpath)

    # One line of the text prompt = one utterance to synthesize
    texts = self.ui.text_prompt.toPlainText().split("\n")
    print(texts)
    # Convert the raw text to phoneme sequences before feeding the synthesizer
    texts = g2p(texts)
    print(texts)
    embed = self.ui.selected_utterance.embed
    embeds = np.stack([embed] * len(texts))
    specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
    breaks = [spec.shape[1] for spec in specs]
    spec = np.concatenate(specs, axis=1)

    self.ui.draw_spec(spec, "generated")
    self.current_generated = (self.ui.selected_utterance.speaker_name, spec, breaks, None)
    self.ui.set_loading(0)
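Every example on this page passes a list of sentences through g2p() and gets back a list of phoneme strings of the same length. The sketch below only illustrates that list-in/list-out contract; the pypinyin backend, Mandarin input, and tone-numbered output are assumptions, not the repository's actual implementation.

# Minimal sketch of the assumed g2p() interface (pypinyin backend is an assumption)
from pypinyin import Style, lazy_pinyin

def g2p(texts):
    """Map each sentence to a space-separated, tone-numbered pinyin string."""
    results = []
    for text in texts:
        phones = lazy_pinyin(text, style=Style.TONE3)  # e.g. "你好" -> ["ni3", "hao3"]
        results.append(" ".join(phones))
    return results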
Example #2
def preprocess_speaker_sst(speaker_dir, out_dir: Path, skip_existing: bool, hparams):
    metadata = []
    lines = []
    texts = []
    index = 1

    with open(os.path.join(speaker_dir, 'metadata.csv'), encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(',')
            # Optional filter on parts[2], kept commented out as in the original:
            # if 2 < float(parts[2]) < 7:
            lines.append(parts[0])
            # parts[1] points to a text file whose last line holds the transcript
            with open(os.path.join(speaker_dir, parts[1]), encoding='utf-8') as f2:
                for line2 in f2:
                    text = line2.strip()
            texts.append(text)
        # Convert the whole batch of transcripts to phonemes at once
        texts = g2p(texts)
        for basename, text in zip(lines, texts):
            wav_path = os.path.join(speaker_dir, basename)
            wav, _ = librosa.load(wav_path, sr=hparams.sample_rate)
            if hparams.rescale:
                wav = wav / np.abs(wav).max() * hparams.rescaling_max
            # Flatten the relative wav path (e.g. "spk/book/utt.wav") into a single basename
            basename2 = basename.strip().split('/')
            basename3 = "sl_" + basename2[0] + "_" + basename2[1] + "_" + basename2[2]
            metadata.append(process_utterance(wav, text, out_dir, basename3,
                                              skip_existing, hparams))
            index += 1
    return [m for m in metadata if m is not None]
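The basename flattening above turns a relative wav path from metadata.csv into a single "sl_"-prefixed name. A quick illustration with a made-up path (the path itself is hypothetical):

# Hypothetical relative path, only to show the "sl_" naming scheme used above
rel_path = "S001/chapter03/utt_0007.wav"
parts = rel_path.strip().split('/')
flat_name = "sl_" + parts[0] + "_" + parts[1] + "_" + parts[2]
print(flat_name)  # -> sl_S001_chapter03_utt_0007.wav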
Example #3
def preprocess_speaker2(speaker_dir, out_dir: Path, skip_existing: bool, hparams):
    metadata = []
    wavs = []
    texts = []
    sub_basenames = []
    for book_dir in speaker_dir.glob("*"):
        # Gather the utterance audios and texts
        try:
            alignments_fpath = next(book_dir.glob("*.alignment.txt"))
            with alignments_fpath.open("r") as alignments_file:
                alignments = [line.rstrip().split(" ") for line in alignments_file]
        except StopIteration:
            # A few alignment files will be missing
            continue
        for wav_fname, words, end_times in alignments:
            wav_fpath = book_dir.joinpath(wav_fname + ".flac")
            assert wav_fpath.exists()
            words = words.replace("\"", "").split(",")
            end_times = list(map(float, end_times.replace("\"", "").split(",")))
            
            # Process each sub-utterance
            wav, text = split_on_silences(wav_fpath, words, end_times, hparams)
            texts.extend(text)
            wavs.extend(wav)
            # Record the names now so they stay aligned with the batched g2p() output below
            sub_basenames.extend("%s_%02d" % (wav_fname, i) for i in range(len(wav)))
    # Convert all transcripts to phonemes in one batch, then write out the utterances
    texts = g2p(texts)
    for sub_basename, wav, text in zip(sub_basenames, wavs, texts):
        metadata.append(process_utterance(wav, text, out_dir, sub_basename, 
                                          skip_existing, hparams))

    return [m for m in metadata if m is not None]
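The *.alignment.txt parsing above expects three space-separated fields per line: a wav basename, a quoted comma-separated word list, and quoted comma-separated end times. A made-up line just to show the layout (the ID, words, and times are invented):

# Made-up alignment line, only to illustrate the field layout parsed above
line = '1034-121119-0001 ",HELLO,,WORLD," "0.32,0.61,0.70,1.25,1.58"'
wav_fname, words, end_times = line.rstrip().split(" ")
words = words.replace('"', "").split(",")                            # ['', 'HELLO', '', 'WORLD', '']
end_times = list(map(float, end_times.replace('"', "").split(",")))  # [0.32, 0.61, 0.7, 1.25, 1.58]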
def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool,
                       hparams):
    metadata = []
    lines = []
    texts = []
    index = 1
    with open(os.path.join(speaker_dir, 'metadata.csv'),
              encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('|')
            basename = parts[0]
            text = parts[2]
            lines.append(basename)
            texts.append(text)
        texts = g2p(texts)
        for basename, text in zip(lines, texts):
            wav_path = os.path.join(speaker_dir, '{}.wav'.format(basename))
            wav, _ = librosa.load(wav_path, sr=hparams.sample_rate)
            if hparams.rescale:
                wav = wav / np.abs(wav).max() * hparams.rescaling_max
            metadata.append(
                process_utterance(wav, text, out_dir, basename, skip_existing,
                                  hparams))
            index += 1
    return [m for m in metadata if m is not None]
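preprocess_speaker reads a pipe-separated metadata.csv in the LJSpeech style and keeps the third column as the transcript to feed g2p(). The line below is made up purely to show the layout this code assumes:

# Made-up metadata.csv line (LJSpeech-style layout is an assumption)
line = "utt_0001|raw transcript here|normalized transcript here"
parts = line.strip().split('|')
basename, text = parts[0], parts[2]  # parts[1], the raw transcript, is ignored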
Example #5
preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
print("Loaded file successfully")

# Then we derive the embedding. The speaker encoder exposes many functions and
# parameters, mostly meant for in-depth research. You will typically only use
# this function (with its default parameters):
embed = encoder.embed_utterance(preprocessed_wav)
print("Created the embedding")


## Generating the spectrogram
# text = input("Write a sentence (+-20 words) to be synthesized (enter a sentence for synthesis):\n")

# The synthesizer works in batch, so you need to put your data in a list or numpy array
texts = [args.text]
# Convert the text to a phoneme sequence before synthesis
texts = g2p(texts)
print(texts)
embeds = [embed]
# If you know what the attention layer alignments are, you can retrieve them here by
# passing return_alignments=True
specs = synthesizer.synthesize_spectrograms(texts, embeds)
spec = specs[0]
print("Created the mel spectrogram")


## Generating the waveform
print("Synthesizing the waveform:")
# Synthesizing the waveform is fairly straightforward. Remember that the longer the
# spectrogram, the more time-efficient the vocoder.
generated_wav = vocoder.infer_waveform(spec)
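The snippet stops right after vocoding. A minimal, hedged continuation for writing the result to disk could look like the following; the output path and the 16 kHz rate are assumptions, so match the rate to the synthesizer's actual hparams:

import numpy as np
import soundfile as sf

# Assumed output path and sample rate; use the synthesizer's configured rate in practice
OUTPUT_PATH = "demo_output.wav"
SAMPLE_RATE = 16000

# Clip numerical overshoot from the vocoder and save as 32-bit float WAV
generated_wav = np.clip(generated_wav, -1.0, 1.0)
sf.write(OUTPUT_PATH, generated_wav.astype(np.float32), SAMPLE_RATE)
print("Saved output to %s" % OUTPUT_PATH)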