def create_hparams(hparams_string=None, verbose=False, level=2):
    """Assemble the model hyperparameters and wrap them in a ``Dict2Obj``.

    Args:
        hparams_string: Optional override string. When truthy it is passed to
            ``hparams.parse`` (assumes ``Dict2Obj`` exposes ``parse`` --
            TODO confirm).
        verbose: Accepted for interface compatibility; not used here.
        level: Multiplier applied to the layer widths. The trailing
            "-> N" notes below give the resulting value at ``level=4``.

    Returns:
        The ``Dict2Obj`` holding every setting listed below.
    """

    def scaled(base):
        # Width of a layer whose base size is ``base``.
        return base * level

    # ---------------- Experiment parameters ----------------
    experiment = {
        "epochs": 50000,
        "iters_per_checkpoint": 1000,  # previously 500
        "seed": 1234,
        "dynamic_loss_scaling": True,
        "fp16_run": False,
        "distributed_run": False,
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54321",
        "cudnn_enabled": True,
        "cudnn_benchmark": False,
        "ignore_layers": ['speaker_embedding.weight'],
    }

    # ---------------- Data parameters ----------------
    # train_mode suffixes:
    #   f01: use F0 (fundamental frequency);
    #   f02: fill with the mean F0;
    #   f03: replace F0 with a zero vector;
    #   f04: do not use F0.
    # Modes f01/f02/f03 set prenet_f0_dim to 1, f04 sets it to 0.
    #
    # Each line of the file lists describes one utterance:
    #   data folder name \t audio source file \t text \t speaker name \n
    # e.g. "000000  Aibao/005397.mp3  <transcript>  0"
    data = {
        "train_mode": 'train-f04',
        "training_files":
            r"F:\github\zhrtvc\data\SV2TTS\mellotron\samples_ssml\train.txt",
        "validation_files":
            r"F:\github\zhrtvc\data\SV2TTS\mellotron\samples_ssml\validation.txt",
        "text_cleaners": 'ssml',  # alternative: ['chinese_cleaners']
        "p_arpabet": 1.0,
        "cmudict_path": None,  # alternative: "data/cmu_dictionary"
    }

    # ---------------- Audio parameters ----------------
    # Frame settings are taken from the griffinlim hparams object.
    audio = {
        "max_wav_value": 32768.0,
        "sampling_rate": hparams_griffinlim.sample_rate,
        "filter_length": hparams_griffinlim.n_fft,
        "hop_length": hparams_griffinlim.hop_size,
        "win_length": hparams_griffinlim.win_size,
        "n_mel_channels": 401,  # previously 80
        "mel_fmin": 0.0,
        "mel_fmax": 8000.0,
        "f0_min": 80,
        "f0_max": 880,
        "harm_thresh": 0.25,
    }

    # ---------------- Model parameters ----------------
    model = {
        "n_symbols": 145,  # alternative: len(symbols)
        "symbols_embedding_dim": scaled(128),  # -> 512
        # Encoder
        "encoder_kernel_size": 5,
        "encoder_n_convolutions": 3,
        "encoder_embedding_dim": scaled(128),  # -> 512
        # Decoder
        "n_frames_per_step": 1,  # currently only 1 is supported
        "decoder_rnn_dim": scaled(256),  # -> 1024
        "prenet_dim": scaled(64),  # -> 256
        "prenet_f0_n_layers": 1,
        "prenet_f0_dim": 0,  # set to 0 when F0 is disabled, otherwise 1
        "prenet_f0_kernel_size": 1,
        "prenet_rms_dim": 0,
        "prenet_rms_kernel_size": 1,
        "max_decoder_steps": 2000,  # previously 1000
        "gate_threshold": 0.5,
        "p_attention_dropout": 0.1,
        "p_decoder_dropout": 0.1,
        "p_teacher_forcing": 1.0,
        # Attention
        "attention_rnn_dim": scaled(256),  # -> 1024
        "attention_dim": scaled(32),  # -> 128
        # Location layer
        "attention_location_n_filters": scaled(8),  # -> 32
        "attention_location_kernel_size": 31,
        # Mel post-processing network
        "postnet_embedding_dim": scaled(128),  # -> 512
        "postnet_kernel_size": 5,
        "postnet_n_convolutions": 5,
        # Speaker embedding
        "n_speakers": 123,
        "speaker_embedding_dim": scaled(16),  # previously 32 * level
        # Reference encoder
        "with_gst": False,
        # -> [32, 32, 64, 64, 128, 128]
        "ref_enc_filters": [scaled(width) for width in (8, 8, 16, 16, 32, 32)],
        "ref_enc_size": [3, 3],
        "ref_enc_strides": [2, 2],
        "ref_enc_pad": [1, 1],
        "ref_enc_gru_size": scaled(32),  # -> 128
        # Style token layer
        # Must be changed to 0 by hand when with_gst is False
        # (otherwise 64 * level).
        "token_embedding_size": 0,
        "token_num": 10,
        "num_heads": 8,
    }

    # ---------------- Optimization hyperparameters ----------------
    optimization = {
        "use_saved_learning_rate": False,
        "learning_rate": 1e-3,
        "learning_rate_min": 1e-5,
        "learning_rate_anneal": 50000,
        "weight_decay": 1e-6,
        "grad_clip_thresh": 1.0,
        "batch_size": 32,
        "mask_padding": True,  # set model's padded outputs to padded values
    }

    # Merge the sections in declaration order so key order is unchanged.
    settings = {}
    for section in (experiment, data, audio, model, optimization):
        settings.update(section)

    hparams = Dict2Obj(settings)
    if not hparams_string:
        return hparams

    hparams.parse(hparams_string)
    return hparams
def create_hparams(hparams_string=None, verbose=False, level=2):
    """Assemble the model hyperparameters and wrap them in a ``Dict2Obj``.

    Args:
        hparams_string: Accepted for interface compatibility; this variant
            does not parse overrides, so the value is ignored.
        verbose: Accepted for interface compatibility; not used here.
        level: Multiplier applied to the layer widths. The trailing
            "-> N" notes below give the resulting value at ``level=4``.

    Returns:
        The ``Dict2Obj`` holding every setting listed below.
    """

    def scaled(base):
        # Width of a layer whose base size is ``base``.
        return base * level

    # ---------------- Experiment parameters ----------------
    experiment = {
        "epochs": 50000,
        "iters_per_checkpoint": 500,
        "seed": 1234,
        "dynamic_loss_scaling": True,
        "fp16_run": False,
        "distributed_run": False,
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54321",
        "cudnn_enabled": True,
        "cudnn_benchmark": False,
        "ignore_layers": ['speaker_embedding.weight'],
    }

    # ---------------- Data parameters ----------------
    data = {
        # alternative: 'filelists/ljs_audiopaths_text_sid_train_filelist.txt'
        "training_files": r"F:\github\zhrtvc\data\SV2TTS\mellotron\train.txt",
        # alternative: 'filelists/ljs_audiopaths_text_sid_val_filelist.txt'
        "validation_files":
            r"F:\github\zhrtvc\data\SV2TTS\mellotron\validation.txt",
        "text_cleaners": ['english_cleaners'],
        "p_arpabet": 1.0,
        "cmudict_path": None,  # alternative: "data/cmu_dictionary"
    }

    # ---------------- Audio parameters ----------------
    audio = {
        "max_wav_value": 32768.0,
        "sampling_rate": 22050,
        "filter_length": 1024,
        "hop_length": 256,
        "win_length": 1024,
        "n_mel_channels": 80,
        "mel_fmin": 0.0,
        "mel_fmax": 8000.0,
        "f0_min": 80,
        "f0_max": 880,
        "harm_thresh": 0.25,
    }

    # ---------------- Model parameters ----------------
    model = {
        "n_symbols": len(symbols),
        "symbols_embedding_dim": scaled(128),  # -> 512
        # Encoder
        "encoder_kernel_size": 5,
        "encoder_n_convolutions": 3,
        "encoder_embedding_dim": scaled(128),  # -> 512
        # Decoder
        "n_frames_per_step": 1,  # currently only 1 is supported
        "decoder_rnn_dim": scaled(256),  # -> 1024
        "prenet_dim": scaled(64),  # -> 256
        "prenet_f0_n_layers": 1,
        "prenet_f0_dim": 1,
        "prenet_f0_kernel_size": 1,
        "prenet_rms_dim": 0,
        "prenet_rms_kernel_size": 1,
        "max_decoder_steps": 1000,
        "gate_threshold": 0.5,
        "p_attention_dropout": 0.1,
        "p_decoder_dropout": 0.1,
        "p_teacher_forcing": 1.0,
        # Attention
        "attention_rnn_dim": scaled(256),  # -> 1024
        "attention_dim": scaled(32),  # -> 128
        # Location layer
        "attention_location_n_filters": scaled(8),  # -> 32
        "attention_location_kernel_size": 31,
        # Mel post-processing network
        "postnet_embedding_dim": scaled(128),  # -> 512
        "postnet_kernel_size": 5,
        "postnet_n_convolutions": 5,
        # Speaker embedding
        "n_speakers": 123,
        "speaker_embedding_dim": scaled(32),  # -> 128
        # Reference encoder
        "with_gst": True,
        # -> [32, 32, 64, 64, 128, 128]
        "ref_enc_filters": [scaled(width) for width in (8, 8, 16, 16, 32, 32)],
        "ref_enc_size": [3, 3],
        "ref_enc_strides": [2, 2],
        "ref_enc_pad": [1, 1],
        "ref_enc_gru_size": scaled(32),  # -> 128
        # Style token layer
        "token_embedding_size": scaled(64),  # -> 256
        "token_num": 10,
        "num_heads": 8,
    }

    # ---------------- Optimization hyperparameters ----------------
    optimization = {
        "use_saved_learning_rate": False,
        "learning_rate": 1e-3,
        "learning_rate_min": 1e-5,
        "learning_rate_anneal": 50000,
        "weight_decay": 1e-6,
        "grad_clip_thresh": 1.0,
        "batch_size": 32,
        "mask_padding": True,  # set model's padded outputs to padded values
    }

    # Merge the sections in declaration order so key order is unchanged.
    settings = {}
    for section in (experiment, data, audio, model, optimization):
        settings.update(section)

    return Dict2Obj(settings)
"win_size": 1024, # 800 "sample_rate": _sr, # 16000 "fmin": 0, # 55 "fmax": _sr // 2, # 7600 "preemphasize": False, # True 'symmetric_mels': True, # True 'signal_normalization': False, # True 'allow_clipping_in_normalization': False, # True 'ref_level_db': 0, # 20 'center': False, # True '__file__': __file__ } synthesizer_hparams = {k: v for k, v in default_hparams.items()} synthesizer_hparams = {**synthesizer_hparams, **my_hp} synthesizer_hparams = Dict2Obj(synthesizer_hparams) def audio2mel_synthesizer(src): """ 用aukit模块重现生成mel,和synthesizer的频谱适应。 :param src: :return: """ _pad_len = (synthesizer_hparams.n_fft - synthesizer_hparams.hop_size) // 2 wavs = src.cpu().numpy() mels = [] for wav in wavs: wav = np.pad(wav.flatten(), (_pad_len, _pad_len), mode="reflect") mel = mel_spectrogram(wav, synthesizer_hparams) mel = mel / 20
def create_hparams(hparams_string=None, verbose=False, level=2):
    """Assemble the model hyperparameters and wrap them in a ``Dict2Obj``.

    Args:
        hparams_string: Optional override string. When truthy it is passed to
            ``hparams.parse`` (assumes ``Dict2Obj`` exposes ``parse`` --
            TODO confirm).
        verbose: Accepted for interface compatibility; not used here.
        level: Multiplier applied to the layer widths. The trailing
            "-> N" notes below give the resulting value at ``level=4``.

    Returns:
        The ``Dict2Obj`` holding every setting listed below.
    """

    def scaled(base):
        # Width of a layer whose base size is ``base``.
        return base * level

    # ---------------- Experiment parameters ----------------
    experiment = {
        "dataloader_num_workers": 10,
        "epochs": 1000000,
        "iters_per_checkpoint": 1000,  # previously 500
        "seed": 1234,
        "dynamic_loss_scaling": True,
        "fp16_run": False,
        "distributed_run": False,
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54321",
        "cudnn_enabled": True,
        "cudnn_benchmark": False,
        "ignore_layers": ['speaker_embedding.weight'],
    }

    # ---------------- Data parameters ----------------
    # train_mode suffixes and the settings each one expects:
    #   f01:      use F0 (fundamental frequency); prenet_f0_dim=1.
    #   f02:      fill with the mean F0; prenet_f0_dim=1.
    #   f03:      replace F0 with a zero vector; prenet_f0_dim=1.
    #   f04:      do not use F0; prenet_f0_dim=0.
    #   f05s02:   replace F0 with evenly spaced values derived from
    #             speaker_id; speaker_id is represented as 0;
    #             prenet_f0_dim=0.
    #   f06s02:   replace F0 with the utterance embed vector; speaker_id is
    #             represented as 0; prenet_f0_dim=8.
    #   gst:      GST mode; speaker_id is represented as 0; prenet_f0_dim=0,
    #             token_embedding_size=64 * level, with_gst=True.
    #   tacotron: Tacotron mode; speaker_id is represented as 0;
    #             prenet_f0_dim=0, token_embedding_size=0, with_gst=False.
    #   mspk:     multispeaker; speakers are identified by the 32 hex digits
    #             of the md5 of the speaker name; no F0;
    #             encoder_model_fpath='mspk', speaker_embedding_dim=32,
    #             n_speakers=0, prenet_f0_dim=0.
    #   rtvc:     voice cloning from speech embeddings; a GE2E model turns
    #             the audio into a 256-dim vector used as the speaker input;
    #             no F0; encoder_model_fpath='fpath',
    #             speaker_embedding_dim=32, n_speakers=256, prenet_f0_dim=0.
    #
    # training_files / validation_files are intentionally not set here
    # (earlier values: ../../data/SV2TTS/mellotron/samples_ssml/train.txt and
    # .../validation.txt). Each line of such a file describes one utterance:
    #   data folder name \t audio source file \t text \t speaker name \n
    # e.g. "000000  Aibao/005397.mp3  <transcript>  0"
    data = {
        "train_mode": 'train-mspk',
        "encoder_model_fpath":
            r'/home/project/zhrtvc/models-gmw/models/encoder/saved_models/ge2e_pretrained.pt',
        "text_cleaners": 'hanzi',  # alternative: ['chinese_cleaners']
        "p_arpabet": 1.0,
        "cmudict_path": None,  # alternative: "data/cmu_dictionary"
    }

    # ---------------- Audio parameters ----------------
    # Fixed values; an earlier revision read them from hparams_griffinlim.
    audio = {
        "max_wav_value": 32768.0,
        "sampling_rate": 22050,
        "filter_length": 1024,
        "hop_length": 256,
        "win_length": 1024,
        "n_mel_channels": 80,  # previously 401
        "mel_fmin": 0.0,
        "mel_fmax": 8000.0,
        "f0_min": 80,
        "f0_max": 880,
        "harm_thresh": 0.25,
    }

    # ---------------- Model parameters ----------------
    model = {
        "n_symbols": 145,  # alternative: len(symbols)
        "symbols_embedding_dim": scaled(128),  # -> 512
        # Encoder
        "encoder_kernel_size": 5,
        "encoder_n_convolutions": 3,
        "encoder_embedding_dim": scaled(128),  # -> 512
        # Decoder
        "n_frames_per_step": 1,  # currently only 1 is supported
        "decoder_rnn_dim": scaled(256),  # -> 1024
        "prenet_dim": scaled(64),  # -> 256
        "prenet_f0_n_layers": 1,
        "prenet_f0_dim": 0,  # set to 0 when F0 is disabled, otherwise 1
        "prenet_f0_kernel_size": 1,
        "prenet_rms_dim": 0,
        "prenet_rms_kernel_size": 1,
        "max_decoder_steps": 1000,
        "gate_threshold": 0.5,
        "p_attention_dropout": 0.1,
        "p_decoder_dropout": 0.1,
        "p_teacher_forcing": 1.0,
        # Attention
        "attention_rnn_dim": scaled(256),  # -> 1024
        "attention_dim": scaled(32),  # -> 128
        # Location layer
        "attention_location_n_filters": scaled(8),  # -> 32
        "attention_location_kernel_size": 31,
        # Mel post-processing network
        "postnet_embedding_dim": scaled(128),  # -> 512
        "postnet_kernel_size": 5,
        "postnet_n_convolutions": 5,
        # Speaker embedding
        "n_speakers": 32,  # previously 1000, 123
        "speaker_embedding_dim": 32,  # previously 16 * level, 32 * level
        # Reference encoder
        "with_gst": False,
        # -> [32, 32, 64, 64, 128, 128]
        "ref_enc_filters": [scaled(width) for width in (8, 8, 16, 16, 32, 32)],
        "ref_enc_size": [3, 3],
        "ref_enc_strides": [2, 2],
        "ref_enc_pad": [1, 1],
        "ref_enc_gru_size": scaled(32),  # -> 128
        # Style token layer
        # Must be changed to 0 by hand when with_gst is False
        # (otherwise 64 * level).
        "token_embedding_size": 0,
        "token_num": 10,
        "num_heads": 8,
    }

    # ---------------- Optimization hyperparameters ----------------
    optimization = {
        "use_saved_learning_rate": False,
        "learning_rate": 1e-3,
        "learning_rate_min": 1e-5,
        "learning_rate_anneal": 50000,
        "weight_decay": 1e-6,
        "grad_clip_thresh": 1.0,
        "batch_size": 32,
        "mask_padding": True,  # set model's padded outputs to padded values
    }

    # Merge the sections in declaration order so key order is unchanged.
    settings = {}
    for section in (experiment, data, audio, model, optimization):
        settings.update(section)

    hparams = Dict2Obj(settings)
    if not hparams_string:
        return hparams

    hparams.parse(hparams_string)
    return hparams