Example #1
# Assumed imports: module paths follow the r9y9/wavenet_vocoder layout and
# may need adjusting for other checkouts of the repo.
import json
from warnings import warn

from wavenet_vocoder import WaveNet
from wavenet_vocoder.util import is_mulaw_quantize, is_scalar_input

from hparams import hparams as default_hparams
from hparams import HParams


def build_model(hparams_json=None):
    # Load overrides from a JSON dump if given; otherwise fall back to the
    # module-level defaults (assigning inside the branch alone would leave
    # `hparams` unbound when no JSON is passed).
    if hparams_json is not None:
        with open(hparams_json, 'r') as jf:
            hparams = HParams(**json.load(jf))
    else:
        hparams = default_hparams
    if is_mulaw_quantize(hparams.input_type):
        if hparams.out_channels != hparams.quantize_channels:
            raise RuntimeError(
                "out_channels must equal to quantize_chennels if input_type is 'mulaw-quantize'")
    if hparams.upsample_conditional_features and hparams.cin_channels < 0:
        s = "Upsample conv layers were specified while local conditioning disabled. "
        s += "Notice that upsample conv layers will never be used."
        warn(s)

    upsample_params = hparams.upsample_params
    upsample_params["cin_channels"] = hparams.cin_channels
    upsample_params["cin_pad"] = hparams.cin_pad
    use_speaker_embedding = hparams.gin_channels > 0
    model = WaveNet(
        out_channels=hparams.out_channels,
        layers=hparams.layers,
        stacks=hparams.stacks,
        residual_channels=hparams.residual_channels,
        gate_channels=hparams.gate_channels,
        skip_out_channels=hparams.skip_out_channels,
        cin_channels=hparams.cin_channels,
        gin_channels=hparams.gin_channels,
        n_speakers=hparams.n_speakers,
        dropout=hparams.dropout,
        kernel_size=hparams.kernel_size,
        cin_pad=hparams.cin_pad,
        upsample_conditional_features=hparams.upsample_conditional_features,
        upsample_net=hparams.upsample_net,
        upsample_params=upsample_params,
        scalar_input=is_scalar_input(hparams.input_type),
        use_speaker_embedding=use_speaker_embedding,
        output_distribution=hparams.output_distribution,
    )
    return model
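
A minimal usage sketch (hypothetical: the file name hparams.json and the printout are illustrative, not part of the repo):

# Build from a JSON dump of hyperparameters, or pass nothing to use the
# module-level defaults.
model = build_model(hparams_json="hparams.json")
print(model)  # inspect the constructed WaveNet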
Example #2
hparams = HParams(
    name="wavenet_vocoder",

    # Input type:
    # 1. raw [-1, 1]
    # 2. mulaw [-1, 1]
    # 3. mulaw-quantize [0, mu]
    # If input_type is raw or mulaw, network assumes scalar input and
    # discretized mixture of logistic distributions output, otherwise one-hot
    # input and softmax output are assumed.
    # **NOTE**: if you change one of the two parameters below, you need to
    # re-run preprocessing before training.
    input_type="raw",
    quantize_channels=65536,  # 65536 or 256

    # Audio:
    # time-domain pre/post-processing
    # e.g., preemphasis/inv_preemphasis
    # ref: LPCNet https://arxiv.org/abs/1810.11846
    preprocess="",
    postprocess="",
    # waveform domain scaling
    global_gain_scale=1.0,

    sample_rate=22050,
    # only valid when a mu-law input type is used
    silence_threshold=2,
    num_mels=80,
    fmin=125,
    fmax=7600,
    fft_size=1024,
    # shift can be specified by either hop_size or frame_shift_ms
    hop_size=256,
    frame_shift_ms=None,
    win_length=1024,
    win_length_ms=-1.0,
    window="hann",

    # DC removal
    highpass_cutoff=70.0,

    # Parametric output distribution type for scalar input
    # 1) Logistic or 2) Normal
    output_distribution="Logistic",
    log_scale_min=-16.0,

    # Model:
    # This should equal `quantize_channels` if mu-law quantization is enabled,
    # otherwise num_mixture * 3 (pi, mean, log_scale)
    # single mixture case: 2
    out_channels=10 * 3,
    layers=24,
    stacks=4,
    residual_channels=128,
    gate_channels=256,  # split into 2 groups internally for gated activation
    skip_out_channels=128,
    dropout=0.0,
    kernel_size=3,

    # Local conditioning (set a negative value to disable)
    cin_channels=80,
    cin_pad=2,
    # If True, use transposed convolutions to upsample conditional features,
    # otherwise repeat features to adjust time resolution
    upsample_conditional_features=True,
    upsample_net="ConvInUpsampleNetwork",
    upsample_params={
        "upsample_scales": [4, 4, 4, 4],  # should np.prod(upsample_scales) == hop_size
    },

    # Global conditioning (set negative value to disable)
    # currently limited for speaker embedding
    # this should only be enabled for multi-speaker dataset
    gin_channels=-1,  # i.e., speaker embedding dim
    n_speakers=7,  # 7 for CMU ARCTIC

    # Data loader
    pin_memory=True,
    num_workers=2,

    # Loss

    # Training:
    batch_size=8,
    optimizer="Adam",
    optimizer_params={
        "lr": 1e-3,
        "eps": 1e-8,
        "weight_decay": 0.0,
    },

    # see lrschedule.py for available lr_schedule
    lr_schedule="step_learning_rate_decay",
    lr_schedule_kwargs={"anneal_rate": 0.5, "anneal_interval": 200000},

    max_train_steps=1000000,
    nepochs=2000,

    clip_thresh=-1,

    # max time steps can either be specified as sec or steps
    # if both are None, then full audio samples are used in a batch
    max_time_sec=None,
    max_time_steps=10240,  # 256 * 40

    # Hold moving averaged parameters and use them for evaluation
    exponential_moving_average=True,
    # averaged = decay * averaged + (1 - decay) * x
    ema_decay=0.9999,

    # Save
    # per-step intervals
    checkpoint_interval=100000,
    train_eval_interval=100000,
    # per-epoch interval
    test_eval_epoch_interval=50,
    save_optimizer_state=True,

    # Eval:
)
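
A quick consistency check for the constraints noted in the comments above (values copied from this config; the check itself is illustrative, not part of the library):

import numpy as np

# Values from the config above.
hop_size = 256
upsample_scales = [4, 4, 4, 4]
out_channels = 10 * 3

# The upsampling network stretches frame-rate features to sample rate,
# so the product of the scales must match hop_size.
assert np.prod(upsample_scales) == hop_size  # 4 * 4 * 4 * 4 == 256

# For scalar input with a mixture of logistics, out_channels is
# num_mixtures * 3 (mixture weight pi, mean, log scale per component).
num_mixtures = out_channels // 3
assert num_mixtures == 10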
Example #3
hparams = HParams(
	# Comma-separated list of cleaners to run on text prior to training and eval. For non-English
	# text, you may want to use "basic_cleaners" or "transliteration_cleaners".
	cleaners='english_cleaners',


	#Audio
	num_mels = 80, 
	num_freq = 513, #only used when adding linear spectrograms post processing network
	rescale = True, 
	rescaling_max = 0.999,
	trim_silence = True,

	#Mel spectrogram
	fft_size = 1024,
	hop_size = 256,
	sample_rate = 22050, #22050 Hz (corresponding to ljspeech dataset)
	frame_shift_ms = None,

	#Mel and Linear spectrograms normalization/scaling and clipping
	signal_normalization = True,
	allow_clipping_in_normalization = True, #Only relevant if signal_normalization = True
	symmetric_mels = True, #Whether to scale the data to be symmetric around 0
	max_abs_value = 4., #max absolute value of data. If symmetric, data will be [-max, max] else [0, max] 

	#Limits
	min_level_db = -100,
	ref_level_db = 20,
	fmin = 125,
	fmax = 7600,

	#Griffin Lim
	power = 1.55,
	griffin_lim_iters = 60,


	#Tacotron
	outputs_per_step = 1, #number of frames to generate at each decoding step (speeds up computation and allows for higher batch size)
	stop_at_any = True, #Determines whether the decoder should stop when predicting <stop> to any frame or to all of them

	embedding_dim = 512, #dimension of embedding space

	enc_conv_num_layers = 3, #number of encoder convolutional layers
	enc_conv_kernel_size = (5, ), #size of encoder convolution filters for each layer
	enc_conv_channels = 512, #number of encoder convolutions filters for each layer
	encoder_lstm_units = 256, #number of lstm units for each direction (forward and backward)

	smoothing = False, #Whether to smooth the attention normalization function 
	attention_dim = 128, #dimension of attention space
	attention_filters = 32, #number of attention convolution filters
	attention_kernel = (31, ), #kernel size of attention convolution
	cumulative_weights = True, #Whether to cumulate (sum) all previous attention weights or simply feed previous weights (Recommended: True)

	prenet_layers = [256, 256], #number of layers and number of units of prenet
	decoder_layers = 2, #number of decoder lstm layers
	decoder_lstm_units = 1024, #number of decoder lstm units on each layer
	max_iters = 2500, #Max decoder steps during inference (Just for safety from infinite loop cases)

	postnet_num_layers = 5, #number of postnet convolutional layers
	postnet_kernel_size = (5, ), #size of postnet convolution filters for each layer
	postnet_channels = 512, #number of postnet convolution filters for each layer

	mask_encoder = False, #whether to mask encoder padding while computing attention
	impute_finished = False, #Whether to use loss mask for padded sequences
	mask_finished = False, #Whether to mask alignments beyond the <stop_token> (False for debug, True for style)

	predict_linear = False, #Whether to add a post-processing network to the Tacotron to predict linear spectrograms (True mode Not tested!!)


	#Wavenet
	# Input type:
	# 1. raw [-1, 1]
	# 2. mulaw [-1, 1]
	# 3. mulaw-quantize [0, mu]
	# If input_type is raw or mulaw, network assumes scalar input and
	# discretized mixture of logistic distributions output, otherwise one-hot
	# input and softmax output are assumed.
	# **NOTE**: if you change one of the two parameters below, you need to
	# re-run preprocessing before training.
	# **NOTE**: scalar input (raw or mulaw) is experimental. Use it at your own risk.
	input_type="mulaw-quantize",
	quantize_channels=256,  # 65536 or 256

	silence_threshold=2,

	# Mixture of logistic distributions:
	log_scale_min=float(np.log(1e-14)),

	#TODO model params


	#Tacotron Training
	tacotron_batch_size = 32, #number of training samples on each training steps
	tacotron_reg_weight = 1e-6, #regularization weight (for l2 regularization)
	tacotron_scale_regularization = True, #Whether to rescale regularization weight to adapt for outputs range (used when reg_weight is high and biasing the model)

	tacotron_decay_learning_rate = True, #boolean, determines if the learning rate will follow an exponential decay
	tacotron_start_decay = 50000, #Step at which learning decay starts
	tacotron_decay_steps = 50000, #decay period in steps, determines the decay slope (UNDER TEST)
	tacotron_decay_rate = 0.4, #learning rate decay rate (UNDER TEST)
	tacotron_initial_learning_rate = 1e-3, #starting learning rate
	tacotron_final_learning_rate = 1e-5, #minimal learning rate

	tacotron_adam_beta1 = 0.9, #AdamOptimizer beta1 parameter
	tacotron_adam_beta2 = 0.999, #AdamOptimizer beta2 parameter
	tacotron_adam_epsilon = 1e-6, #AdamOptimizer epsilon parameter

	tacotron_zoneout_rate = 0.1, #zoneout rate for all LSTM cells in the network
	tacotron_dropout_rate = 0.5, #dropout rate for all convolutional layers + prenet

	tacotron_teacher_forcing_ratio = 1., #Value from [0., 1.], 0.=0%, 1.=100%, determines the % of times we force next decoder inputs
	

	#Wavenet Training TODO



	#Eval sentences
	sentences = [
	# From July 8, 2017 New York Times:
	'Scientists at the CERN laboratory say they have discovered a new particle.',
	'There\'s a way to measure the acute emotional intelligence that has never gone out of style.',
	'President Trump met with other leaders at the Group of 20 conference.',
	'The Senate\'s bill to repeal and replace the Affordable Care Act is now imperiled.',
	# From Google's Tacotron example page:
	'Generative adversarial network or variational auto-encoder.',
	'Basilar membrane and otolaryngology are not auto-correlations.',
	'He has read the whole thing.',
	'He reads books.',
	"Don't desert me here in the desert!",
	'He thought it was time to present the present.',
	'Thisss isrealy awhsome.',
	'Punctuation sensitivity, is working.',
	'Punctuation sensitivity is working.',
	"The buses aren't the problem, they actually provide a solution.",
	"The buses aren't the PROBLEM, they actually provide a SOLUTION.",
	"The quick brown fox jumps over the lazy dog.",
	"Does the quick brown fox jump over the lazy dog?",
	"Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?",
	"She sells sea-shells on the sea-shore. The shells she sells are sea-shells I'm sure.",
	"The blue lagoon is a nineteen eighty American romance adventure film.",
	"Tajima Airport serves Toyooka.",
	'Talib Kweli confirmed to AllHipHop that he will be releasing an album in the next year.',
	#From Training data:
	'the rest being provided with barrack beds, and in dimensions varying from thirty feet by fifteen to fifteen feet by ten.',
	'in giltspur street compter, where he was first lodged.',
	'a man named burnett came with his wife and took up his residence at whitchurch, hampshire, at no great distance from laverstock,',
	'it appears that oswald had only one caller in response to all of his fpcc activities,',
	'he relied on the absence of the strychnia.',
	'scoggins thought it was lighter.',
	'''would, it is probable, have eventually overcome the reluctance of some of the prisoners at least, 
	and would have possessed so much moral dignity''',
	'''the only purpose of this whole sentence is to evaluate the scalability of the model for very long sentences. 
	This is not even a long sentence anymore, it has become an entire paragraph. 
	Should I stop now? Let\'s add this last sentence in which we talk about nothing special.''',
	'Thank you so much for your support!!'
	]

	)
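
Since this config uses input_type="mulaw-quantize" with quantize_channels=256, waveforms are companded and discretized to [0, mu]. A hand-rolled sketch of that transform, assuming the standard mu-law formula (the companion libraries ship equivalent helpers; this version is only illustrative):

import numpy as np

def mulaw_quantize(x, mu=255):
    """Mu-law compand a float waveform in [-1, 1], then quantize to [0, mu]."""
    # Compress dynamic range so quiet samples get more quantization levels.
    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
    # Map [-1, 1] -> [0, mu] and round to integer bins.
    return ((y + 1) / 2 * mu + 0.5).astype(np.int64)

# With quantize_channels=256, mu = 255:
x = np.array([-1.0, 0.0, 1.0])
print(mulaw_quantize(x))  # [0, 128, 255]: zero maps to mid-scale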
Example #4
hparams = HParams(
    name="wavenet_vocoder",
    input_type="mulaw-quantize",
    quantize_channels=256,  # 65536 or 256
    sample_rate=16000,
    num_mels=80,
    # DC removal
    highpass_cutoff=70.0,

    # Parametric output distribution type for scalar input
    # 1) Logistic or 2) Normal
    output_distribution="Logistic",
    log_scale_min=-16.0,

    # Model:
    # This should equal `quantize_channels` if mu-law quantization is enabled,
    # otherwise num_mixture * 3 (pi, mean, log_scale)
    # single mixture case: 2
    #     out_channels=10 * 3,
    out_channels=256,
    layers=18 * 2,
    stacks=2 * 2,
    residual_channels=128,
    gate_channels=256,  # split into 2 groups internally for gated activation
    skip_out_channels=128,
    dropout=0.0,
    kernel_size=3,

    # Local conditioning (set a negative value to disable)
    cin_channels=80,
    cin_pad=0,
    # If True, use transposed convolutions to upsample conditional features,
    # otherwise repeat features to adjust time resolution
    upsample_conditional_features=True,
    upsample_net="ConvInUpsampleNetwork",
    upsample_params={
        "upsample_scales":
        [4, 4, 4, 2],  # should np.prod(upsample_scales) == hop_size (128)
    },

    # Global conditioning (set negative value to disable)
    # currently limited for speaker embedding
    # this should only be enabled for multi-speaker dataset
    gin_channels=-1,  # i.e., speaker embedding dim
    n_speakers=7,  # 7 for CMU ARCTIC

    # initial learning rate
    lr=1e-3,
    n_epoches=10,
    n_split=8,
    max_time_steps=None,
)
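
A short sketch of what the layer/stack choice above implies for the receptive field, assuming the usual per-stack doubling dilation pattern (the dilation resets at each stack; this scheme is an assumption, not read from the config itself):

# Values from the config above.
layers, stacks, kernel_size = 18 * 2, 2 * 2, 3

layers_per_stack = layers // stacks  # 9 layers per stack
dilations = [2 ** (i % layers_per_stack) for i in range(layers)]

# Each dilated conv adds (kernel_size - 1) * dilation samples of context.
receptive_field = (kernel_size - 1) * sum(dilations) + 1
print(receptive_field)  # 4089 samples, ~0.26 s at sample_rate=16000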