def preprocess(self):
    wav = self.ui.selected_utterance.wav
    # Denoise the selected utterance at the synthesizer's sample rate.
    out = aukit.remove_noise(wav, sr=Synthesizer.sample_rate)
    # VAD hyperparameters for silence trimming.
    hp = aukit.Dict2Obj({})
    hp["vad_window_length"] = 10  # milliseconds
    hp["vad_moving_average_width"] = 2
    hp["vad_max_silence_length"] = 2
    hp["audio_norm_target_dBFS"] = -32
    hp["sample_rate"] = 16000  # should match Synthesizer.sample_rate
    hp["int16_max"] = (2 ** 15) - 1
    out = trim_long_silences(out, hparams=hp)
    # Draw the cleaned spectrogram, flipped so low frequencies sit at the bottom.
    spec = Synthesizer.make_spectrogram(out)
    self.ui.draw_align(spec[::-1], "current")
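# Context: the hyperparameters above feed a webrtcvad-based silence trimmer.
# Below is a minimal, self-contained sketch of what such a trim_long_silences
# typically does (in the style of the Real-Time-Voice-Cloning implementation
# this project derives from); the function name and internals are illustrative,
# not the actual aukit API.
import struct

import numpy as np
import webrtcvad
from scipy.ndimage import binary_dilation


def trim_long_silences_sketch(wav, sample_rate=16000, vad_window_length=10,
                              vad_moving_average_width=2, vad_max_silence_length=2):
    # Work on whole VAD windows of `vad_window_length` milliseconds.
    samples_per_window = (vad_window_length * sample_rate) // 1000
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # webrtcvad expects 16-bit mono PCM bytes.
    int16_max = (2 ** 15) - 1
    pcm = struct.pack("%dh" % len(wav), *np.round(wav * int16_max).astype(np.int16))

    # Flag each window as speech / non-speech.
    vad = webrtcvad.Vad(mode=3)
    flags = np.array([
        vad.is_speech(pcm[2 * start:2 * (start + samples_per_window)],
                      sample_rate=sample_rate)
        for start in range(0, len(wav), samples_per_window)
    ], dtype=float)

    # Smooth the flags with a moving average, then re-binarize.
    kernel = np.ones(vad_moving_average_width) / vad_moving_average_width
    mask = np.round(np.convolve(flags, kernel, mode="same")).astype(bool)

    # Dilate so pauses of up to `vad_max_silence_length` windows are kept.
    mask = binary_dilation(mask, np.ones(vad_max_silence_length + 1))

    # Expand the window-level mask back to sample level and apply it.
    return wav[np.repeat(mask, samples_per_window)]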
def synthesize(self):
    self.ui.log("Generating the mel spectrogram...")
    self.ui.set_loading(1)

    # Lazily build the synthesizer the first time it is needed.
    if self.synthesizer is None:
        model_dir = Path(self.ui.current_synthesizer_model_dir)
        checkpoints_dir = model_dir.joinpath("checkpoints")
        # Load hparams saved alongside the trained model, if present.
        hp_path = model_dir.joinpath("metas", "hparams.json")
        if hp_path.exists():
            with open(hp_path, encoding="utf8") as fin:
                hparams = aukit.Dict2Obj(json.load(fin))
        else:
            hparams = None
        self.synthesizer = Synthesizer(checkpoints_dir,
                                       low_mem=self.low_mem,
                                       hparams=hparams)
    if not self.synthesizer.is_loaded():
        self.ui.log("Loading the synthesizer %s" % self.synthesizer.checkpoint_fpath)

    # Synthesize one spectrogram per line of input text, all conditioned on
    # the same speaker embedding.
    ptext = self.ui.text_prompt.toPlainText()
    texts = ptext.split("\n")
    embed = self.ui.selected_utterance.embed
    embeds = np.stack([embed] * len(texts))
    specs, aligns = self.synthesizer.synthesize_spectrograms(texts, embeds,
                                                             return_alignments=True)
    # Remember each line's frame count before concatenating.
    breaks = [spec.shape[1] for spec in specs]
    spec = np.concatenate(specs, axis=1)
    align = np.concatenate(aligns, axis=1)

    # Save the concatenated mel spectrogram to disk.
    fref = self.ui.selected_utterance.name
    ftext = '。'.join(texts)
    ftime = '{}'.format(time_formatter())
    fname = filename_formatter('{}_{}_{}zi_{}.npy'.format(fref, ftime, len(ftext), ftext))
    np.save(self._out_mel_dir.joinpath(fname), spec, allow_pickle=False)

    self.ui.draw_spec(spec, "generated")
    self.ui.draw_align(align, "generated")
    self.current_generated = (self.ui.selected_utterance.speaker_name, spec, breaks, None)
    self.ui.set_loading(0)
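# The `breaks` list above records each input line's frame count before the
# per-line spectrograms are concatenated, so the combined array can later be
# split back apart (e.g. to insert pauses between lines when vocoding).
# A minimal sketch; the helper name is hypothetical:
import numpy as np


def split_spectrogram(spec, breaks):
    """Undo np.concatenate(specs, axis=1): split a (n_mels, T_total) array
    back into the per-line spectrograms whose widths are given by `breaks`."""
    ends = np.cumsum(breaks)
    starts = np.concatenate(([0], ends[:-1]))
    return [spec[:, s:e] for s, e in zip(starts, ends)]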
print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
      "%.1fGb total memory.\n" %
      (torch.cuda.device_count(), device_id, gpu_properties.name,
       gpu_properties.major, gpu_properties.minor,
       gpu_properties.total_memory / 1e9))

## Load the models one by one.
print("Preparing the encoder, the synthesizer and the vocoder...")
encoder.load_model(args.enc_model_fpath, device='cpu')

# Load hparams from the trained model directory, if present.
hp_path = args.syn_model_dir.parent.joinpath("metas", "hparams.json")
if hp_path.exists():
    with open(hp_path, encoding="utf8") as fin:
        hparams = aukit.Dict2Obj(json.load(fin))
    print('hparams:')
    print(json.dumps({k: v for k, v in hparams.items()},
                     ensure_ascii=False, indent=4))
else:
    hparams = None
    print('hparams:', hparams)
synthesizer = Synthesizer(args.syn_model_dir, low_mem=args.low_mem, hparams=hparams)
# vocoder.load_model(args.voc_model_fpath)
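# For reference, the `device_id` and `gpu_properties` used in the banner above
# are typically obtained from PyTorch like so (a sketch, assuming CUDA is
# available):
import torch

if torch.cuda.is_available():
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    # gpu_properties exposes .name, .major / .minor (compute capability) and
    # .total_memory (in bytes), matching the fields formatted in the print above.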