def _initialize(self):
    """Initialize with the necessary elements."""
    self.tts_checkpoint_path = os.path.join(self.directory, "assets", "tts",
                                            "step-162000")
    self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
                                                 "vocoder", "step-2000000")
    self.waveflow_config_path = os.path.join(
        self.directory, "assets", "vocoder", "waveflow_ljspeech.yaml")

    tts_config_path = os.path.join(self.directory, "assets", "tts",
                                   "ljspeech.yaml")
    with open(tts_config_path) as f:
        self.tts_config = yaml.load(f, Loader=yaml.Loader)

    with fluid.dygraph.guard(fluid.CPUPlace()):
        # Build the FastSpeech text-to-mel model and restore its weights.
        self.tts_model = FastSpeechModel(
            self.tts_config['network'],
            num_mels=self.tts_config['audio']['num_mels'])
        io.load_parameters(
            model=self.tts_model, checkpoint_path=self.tts_checkpoint_path)

        # Build vocoder.
        args = AttrDict()
        args.config = self.waveflow_config_path
        args.use_fp16 = False
        self.waveflow_config = io.add_yaml_config_to_args(args)
        self.waveflow = WaveFlowModule(self.waveflow_config)
        io.load_parameters(
            model=self.waveflow,
            checkpoint_path=self.waveflow_checkpoint_path)
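# --- Usage sketch (not from the source): how a PaddleHub module that
# defines _initialize() above is typically driven. The module name
# "fastspeech_ljspeech" and the synthesize() signature are assumptions
# based on PaddleHub's TTS modules; verify against the module's docs.
import paddlehub as hub

module = hub.Module(name="fastspeech_ljspeech")  # _initialize() runs here
wavs, sample_rate = module.synthesize(
    texts=["Simple as this proposition is, it is necessary to be stated."],
    use_gpu=False,
    vocoder="waveflow")  # decode mels with the WaveFlow checkpoint above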
def _initialize(self):
    """Initialize with the necessary elements."""
    self.tts_checkpoint_path = os.path.join(self.directory, "assets", "tts",
                                            "step-120000")
    self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
                                                 "vocoder", "step-2000000")
    self.waveflow_config_path = os.path.join(
        self.directory, "assets", "vocoder", "waveflow_ljspeech.yaml")

    tts_config_path = os.path.join(self.directory, "assets", "tts",
                                   "ljspeech.yaml")
    with open(tts_config_path) as f:
        self.tts_config = yaml.load(f, Loader=yaml.Loader)

    # The maximum number of frames to generate during synthesis.
    self.max_len = 1000
    # The stop-token threshold: once the predicted stop probability at a
    # time step exceeds it, spectrum generation stops.
    self.stop_threshold = 0.5

    with fluid.dygraph.guard(fluid.CPUPlace()):
        # Build TTS.
        with fluid.unique_name.guard():
            network_cfg = self.tts_config['network']
            self.tts_model = TransformerTTSModel(
                network_cfg['embedding_size'], network_cfg['hidden_size'],
                network_cfg['encoder_num_head'],
                network_cfg['encoder_n_layers'],
                self.tts_config['audio']['num_mels'],
                network_cfg['outputs_per_step'],
                network_cfg['decoder_num_head'],
                network_cfg['decoder_n_layers'])
            io.load_parameters(
                model=self.tts_model,
                checkpoint_path=self.tts_checkpoint_path)

        # Build vocoder.
        args = AttrDict()
        args.config = self.waveflow_config_path
        args.use_fp16 = False
        self.waveflow_config = io.add_yaml_config_to_args(args)
        self.waveflow = WaveFlowModule(self.waveflow_config)
        io.load_parameters(
            model=self.waveflow,
            checkpoint_path=self.waveflow_checkpoint_path)
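# --- Sketch (assumption): how max_len and stop_threshold would gate
# autoregressive decoding in a TransformerTTS-style model. decoder_step is
# a hypothetical callable standing in for the module's real decoder; only
# the gating logic mirrors the fields set above.
import numpy as np

def decode_until_stop(decoder_step, encoder_out, start_frame,
                      max_len=1000, stop_threshold=0.5):
    """Generate mel frames until the stop token fires or max_len is hit."""
    frames = [start_frame]
    for _ in range(max_len):
        mel_frame, stop_logit = decoder_step(encoder_out, frames)
        frames.append(mel_frame)
        stop_prob = 1.0 / (1.0 + np.exp(-stop_logit))  # sigmoid
        if stop_prob > stop_threshold:
            break  # the model predicts the spectrum is complete
    return np.stack(frames, axis=0)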
def synthesis_with_waveflow(mel_output, args, checkpoint, place):
    fluid.enable_dygraph(place)
    args.config = args.config_vocoder
    args.use_fp16 = False
    config = io.add_yaml_config_to_args(args)

    # Transpose the TTS output from [batch, time, num_mels] to the
    # [batch, num_mels, time] layout that WaveFlow expects.
    mel_spectrogram = fluid.layers.transpose(mel_output, [0, 2, 1])

    # Build model.
    waveflow = WaveFlowModule(config)
    io.load_parameters(model=waveflow, checkpoint_path=checkpoint)

    # Remove weight normalization wrappers; they are only needed for training.
    for layer in waveflow.sublayers():
        if isinstance(layer, weight_norm.WeightNormWrapper):
            layer.remove_weight_norm()

    # Run model inference.
    wav = waveflow.synthesize(mel_spectrogram, sigma=config.sigma)
    return wav.numpy()[0]
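# --- Usage sketch (assumptions flagged): calling synthesis_with_waveflow
# with a mel spectrogram from the TTS front end. The config and checkpoint
# paths and the soundfile dependency are illustrative, and mel_output is
# assumed to be a dygraph Variable of shape [batch, time, num_mels].
import soundfile as sf

args = AttrDict()
args.config_vocoder = "assets/vocoder/waveflow_ljspeech.yaml"  # assumed path
wav = synthesis_with_waveflow(mel_output, args,
                              checkpoint="assets/vocoder/step-2000000",
                              place=fluid.CPUPlace())
sf.write("synthesis.wav", wav, samplerate=22050)  # LJSpeech sample rate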
        iteration += 1

        if iteration % config.test_every == 0:
            # Run validation step.
            model.valid_step(iteration)

        if rank == 0 and iteration % config.save_every == 0:
            # Save parameters.
            model.save(iteration)

    # Close the VisualDL log writer.
    if rank == 0:
        vdl.close()


if __name__ == "__main__":
    # Create parser.
    parser = argparse.ArgumentParser(description="Train WaveFlow model")
    add_options_to_parser(parser)
    utils.add_config_options_to_parser(parser)

    # Parse arguments from both the command line and the yaml config file.
    # For conflicting updates to the same field, the preceding update is
    # overwritten by the following one.
    config = parser.parse_args()
    config = io.add_yaml_config_to_args(config)

    # Force fp32 in model training.
    vars(config)["use_fp16"] = False
    train(config)
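# --- Sketch (hypothetical): a minimal stand-in for the yaml-to-args merge,
# illustrating the overwrite order noted above: values applied later win.
# The project's real io.add_yaml_config_to_args may differ in detail.
import yaml

def merge_yaml_into_args(config):
    """Overwrite parsed argparse values with fields from config.config."""
    with open(config.config) as f:
        yaml_config = yaml.load(f, Loader=yaml.Loader)
    for key, value in yaml_config.items():
        # The yaml update runs after parse_args(), so it overwrites the
        # preceding command-line value for the same field.
        vars(config)[key] = value
    return config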