def tacotron_synthesize(args, hparams, checkpoint, text=None, cwd=None):
    """Resolve the latest checkpoint and dispatch to the requested synthesis mode.

    Args:
        args: parsed command-line namespace; ``output_dir`` and ``mode`` are read here.
        hparams: hyper-parameters; ``tacotron_synthesis_batch_size`` and
            ``tacotron_num_gpus`` are validated here.
        checkpoint: directory handed to ``tf.train.get_checkpoint_state``.
        text: forwarded to ``run_eval`` (only used when ``args.mode == 'eval'``).
        cwd: forwarded to ``run_eval`` (only used when ``args.mode == 'eval'``).

    Returns:
        Whatever ``run_eval`` / ``run_synthesis`` return for the 'eval' and
        'synthesis' modes; ``None`` for any other mode (live session).

    Raises:
        RuntimeError: the checkpoint state could not be read or parsed.
        ValueError: the synthesis batch size is smaller than, or not a multiple
            of, the number of GPUs.
    """
    output_dir = 'tacotron_' + args.output_dir

    try:
        checkpoint_path = tf.train.get_checkpoint_state(
            checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
        # The step is the suffix after the last '-' of the checkpoint file name,
        # e.g. ".../tacotron_model.ckpt-61000" -> "61000".
        step = checkpoint_path.split('/')[-1].split('-')[-1].strip()
    except Exception as err:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate;
        # chain the cause so the real failure stays visible in the traceback.
        raise RuntimeError(
            'Failed to load checkpoint at {}'.format(checkpoint)) from err

    batch_size = hparams.tacotron_synthesis_batch_size
    num_gpus = hparams.tacotron_num_gpus
    if batch_size < num_gpus:
        raise ValueError(
            'Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! Please verify your synthesis batch size choice.'
            .format(batch_size, num_gpus))
    if batch_size % num_gpus != 0:
        raise ValueError(
            'Defined synthesis batch size {} is not a multiple of {} (num_gpus)! Please verify your synthesis batch size choice!'
            .format(batch_size, num_gpus))

    if args.mode == 'eval':
        return run_eval(args, checkpoint_path, output_dir, hparams, text, step, cwd)
    if args.mode == 'synthesis':
        return run_synthesis(args, checkpoint_path, output_dir, hparams)
    # Any other mode falls back to the interactive session, which returns nothing.
    run_live(args, checkpoint_path, hparams)
def _enqueue_next_group(self):
    """Build one group of batches and push each batch onto the input queue.

    When ``self.static_batches`` is set it is used as-is. Otherwise examples
    are drawn from every entry of ``self.data_dirs``, sorted by their last
    element, cut into chunks of ``self.batch_size`` and shuffled with
    ``self.rng``. ``self._step`` is incremented once per enqueued batch.
    """
    began = time.time()

    batch_size = self.batch_size
    reduction = self._hp.reduction_factor

    if self.static_batches is None:
        pool = []
        for data_dir in self.data_dirs:
            in_initial_phase = self._step < self._hp.initial_phase_step

            # During the greedy initial phase every draw is redirected to the
            # first directory whose name contains "krbook", if there is one.
            if self._hp.initial_data_greedy and in_initial_phase:
                krbook_dirs = [d for d in self.data_dirs if "krbook" in d]
                if krbook_dirs:
                    data_dir = krbook_dirs[0]

            if in_initial_phase:
                # Equal share per data directory.
                count = int(batch_size * self._batches_per_group // len(self.data_dirs))
            else:
                # Share weighted by the per-directory ratio.
                count = int(batch_size * self._batches_per_group * self.data_ratio[data_dir])

            drawn = [self._get_next_example(data_dir) for _ in range(count)]
            pool.extend(drawn)

        # Group examples with a similar last field into the same batch.
        pool.sort(key=lambda example: example[-1])

        batches = [pool[lo:lo + batch_size] for lo in range(0, len(pool), batch_size)]
        self.rng.shuffle(batches)
    else:
        batches = self.static_batches

    log('Generated %d batches of size %d in %.03f sec' % (len(batches), batch_size, time.time() - began))

    for batch in batches:
        prepared = _prepare_batch(batch, reduction, self.rng, self.data_type)
        feed_dict = dict(zip(self._placeholders, prepared))
        self._session.run(self._enqueue_op, feed_dict=feed_dict)
        self._step += 1
def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
    """Construct the synthesis graph and restore its weights.

    Args:
        checkpoint_path: checkpoint handed to ``tf.train.Saver.restore``.
        hparams: hyper-parameters used to size placeholders and build the model.
        gta: when true, the mel-target placeholder is passed to the model and
            ``gta`` is forwarded to ``initialize``.
        model_name: name forwarded to ``create_model``.
    """
    log('Constructing model: %s' % model_name)

    # The batch dimension is pinned to 1 so that it is statically known
    # (required for attention masking, per the original author's note).
    inputs = tf.placeholder(tf.int32, (1, None), name='inputs')
    input_lengths = tf.placeholder(tf.int32, (1), name='input_lengths')
    targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name='mel_targets')
    split_infos = tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name='split_infos')

    with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE):
        self.model = create_model(model_name, hparams)
        if not gta:
            self.model.initialize(inputs, input_lengths)
        else:
            self.model.initialize(inputs, input_lengths, targets, gta=gta)

        # Expose the model outputs that callers fetch from the session.
        self.mel_outputs = self.model.mel_outputs
        if hparams.predict_linear:
            self.linear_outputs = self.model.linear_outputs
        self.alignments = self.model.alignments
        self.stop_token_prediction = self.model.stop_token_prediction

    self.gta = gta
    self._hparams = hparams

    # Input sequences are padded with the <pad_token> id 0 ( _ ).
    self._pad = 0
    # Target padding uses a value outside the spectrogram range when the mels
    # are symmetric, and plain zero otherwise.
    self._target_pad = -hparams.max_abs_value if hparams.symmetric_mels else 0.

    self.inputs = inputs
    self.input_lengths = input_lengths
    self.targets = targets
    self.split_infos = split_infos

    log('Loading checkpoint: %s' % checkpoint_path)

    # Let TensorFlow grow GPU memory on demand and fall back to another
    # device when an op has no kernel on the requested one.
    session_config = tf.ConfigProto()
    session_config.allow_soft_placement = True
    session_config.gpu_options.allow_growth = True

    self.session = tf.Session(config=session_config)
    self.session.run(tf.global_variables_initializer())

    restorer = tf.train.Saver()
    restorer.restore(self.session, checkpoint_path)
def __init__(self, coordinator, metadata_filename, hparams):
    """Load the metadata file and create the placeholders and input queue.

    Args:
        coordinator: stored as ``self._coord``.
        metadata_filename: '|'-separated UTF-8 metadata file; the 'mels' and
            'linear' directories are looked up next to it.
        hparams: hyper-parameters (``cleaners``, ``hop_size``, ``sample_rate``,
            ``num_mels`` and ``num_freq`` are read here).
    """
    super(Feeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._cleaner_names = [name.strip() for name in hparams.cleaners.split(',')]
    self._offset = 0

    # Feature directories live beside the metadata file.
    data_root = os.path.dirname(metadata_filename)
    self._mel_dir = os.path.join(data_root, 'mels')
    self._linear_dir = os.path.join(data_root, 'linear')

    with open(metadata_filename, encoding='utf-8') as f:
        self._metadata = [row.strip().split('|') for row in f]

    # Column 4 is summed and scaled by hop_size / sample_rate to report hours.
    frame_shift_ms = hparams.hop_size / hparams.sample_rate
    hours = sum([int(row[4]) for row in self._metadata]) * frame_shift_ms / (3600)
    log('Loaded metadata for {} examples ({:.2f} hours)'.format(
        len(self._metadata), hours))

    # The batch dimension is left unspecified so that different batch sizes
    # can be fed at eval time.
    placeholder_specs = [
        (tf.int32, (None, None), 'inputs'),
        (tf.int32, (None, ), 'input_lengths'),
        (tf.float32, (None, None, hparams.num_mels), 'mel_targets'),
        (tf.int32, [None], 'mel_lengths'),
        (tf.float32, (None, None), 'token_targets'),
        (tf.float32, (None, None, hparams.num_freq), 'linear_targets'),
    ]
    self._placeholders = [
        tf.placeholder(dtype, shape=shape, name=name)
        for dtype, shape, name in placeholder_specs
    ]

    # FIFO queue that buffers up to 8 prepared batches.
    queue = tf.FIFOQueue(
        8, [dtype for dtype, _, _ in placeholder_specs], name='input_queue')
    self._enqueue_op = queue.enqueue(self._placeholders)

    dequeued = queue.dequeue()
    (self.inputs, self.input_lengths, self.mel_targets, self.mel_lengths,
     self.token_targets, self.linear_targets) = dequeued

    # Dequeued tensors lose their static shape; copy it from the placeholders.
    for tensor, placeholder in zip(dequeued, self._placeholders):
        tensor.set_shape(placeholder.shape)
def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
    """Construct the teacher-forced evaluation graph and restore its weights.

    NOTE: the ``gta`` argument is ignored; the model is always initialised
    with ``gta=True`` and ``is_evaluating=True``.

    Args:
        checkpoint_path: checkpoint handed to ``tf.train.Saver.restore``.
        hparams: hyper-parameters used to size placeholders and build the model.
        gta: accepted for interface compatibility, overridden below.
        model_name: name forwarded to ``create_model``.
    """
    log('Constructing model: %s' % model_name)

    # The batch dimension is pinned to 1 so that it is statically known
    # (required for attention masking, per the original author's note).
    inputs = tf.placeholder(tf.int32, (1, None), name='inputs')
    input_lengths = tf.placeholder(tf.int32, (1), name='input_lengths')
    targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name='mel_targets')
    target_lengths = tf.placeholder(tf.int32, (1), name='target_length')

    # The caller-supplied value is deliberately discarded.
    gta = True

    init_kwargs = dict(
        inputs=inputs,
        input_lengths=input_lengths,
        mel_targets=targets,
        targets_lengths=target_lengths,
        gta=gta,
        is_evaluating=True,
    )
    with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE):
        self.model = create_model(model_name, hparams)
        self.model.initialize(**init_kwargs)
        self.mel_outputs = self.model.mel_outputs
        self.alignments = self.model.alignments

    self._hparams = hparams
    self.inputs = inputs
    self.input_lengths = input_lengths
    self.targets = targets
    self.target_lengths = target_lengths

    log('Loading checkpoint: %s' % checkpoint_path)

    # Let TensorFlow grow GPU memory on demand and fall back to another
    # device when an op has no kernel on the requested one.
    session_config = tf.ConfigProto()
    session_config.allow_soft_placement = True
    session_config.gpu_options.allow_growth = True

    self.session = tf.Session(config=session_config)
    self.session.run(tf.global_variables_initializer())

    restorer = tf.train.Saver()
    restorer.restore(self.session, checkpoint_path)
def _enqueue_next_group(self):
    """Read one group of examples, batch them and push them onto the queue.

    ``tacotron_batch_size * _batches_per_group`` examples are fetched, sorted
    by their last element, cut into batches and shuffled before enqueueing.
    """
    began = time.time()

    batch_size = self._hparams.tacotron_batch_size
    outputs_per_step = self._hparams.outputs_per_step

    group = []
    for _ in range(batch_size * _batches_per_group):
        group.append(self._get_next_example())

    # Bucket examples with a similar last field (output length, per the
    # original author's note) into the same batch for efficiency.
    group.sort(key=lambda example: example[-1])

    batches = []
    for lo in range(0, len(group), batch_size):
        batches.append(group[lo:lo + batch_size])
    np.random.shuffle(batches)

    log('\nGenerated {} batches of size {} in {:.3f} sec'.format(
        len(batches), batch_size, time.time() - began))

    for batch in batches:
        prepared = _prepare_batch(batch, outputs_per_step)
        self._session.run(self._enqueue_op,
                          feed_dict=dict(zip(self._placeholders, prepared)))
def run_synthesis(args, checkpoint_path, output_dir, hparams):
    """Synthesize mels for every entry of ``train.txt`` and write a map file.

    Args:
        args: parsed command-line namespace; ``GTA`` (the string 'True' enables
            ground-truth-aligned mode) and ``input_dir`` are read here.
        checkpoint_path: checkpoint forwarded to ``Synthesizer.load``.
        output_dir: parent directory; results go to ``<output_dir>/gta`` or
            ``<output_dir>/natural``.
        hparams: hyper-parameters (``hop_size``, ``sample_rate`` and
            ``tacotron_synthesis_batch_size`` are read here).

    Returns:
        Path of the written ``map.txt``; each line holds
        ``wav|mel|synthesized mel|speaker id|text``.
    """
    GTA = (args.GTA == 'True')
    synth_dir = os.path.join(output_dir, 'gta' if GTA else 'natural')
    # Create output path if it doesn't exist
    os.makedirs(synth_dir, exist_ok=True)

    metadata_filename = os.path.join(args.input_dir, 'train.txt')
    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams, gta=GTA)

    with open(metadata_filename, encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline) are skipped so that the column
        # lookups below do not fail on them.
        metadata = [line.strip().split('|') for line in f if line.strip()]

    frame_shift_ms = hparams.hop_size / hparams.sample_rate
    hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / (3600)
    log('Loaded metadata for {} examples ({:.2f} hours)'.format(
        len(metadata), hours))

    # Set inputs batch wise
    batch_size = hparams.tacotron_synthesis_batch_size
    metadata = [
        metadata[i:i + batch_size]
        for i in range(0, len(metadata), batch_size)
    ]

    log('Starting Synthesis')
    mel_dir = os.path.join(args.input_dir, 'mels')
    wav_dir = os.path.join(args.input_dir, 'audio')
    map_path = os.path.join(synth_dir, 'map.txt')

    # The texts were read as UTF-8, so write them back as UTF-8 instead of
    # relying on the platform's default encoding.
    with open(map_path, 'w', encoding='utf-8') as map_file:
        for meta in tqdm(metadata):
            texts = [m[5] for m in meta]
            mel_filenames = [os.path.join(mel_dir, m[1]) for m in meta]
            wav_filenames = [os.path.join(wav_dir, m[0]) for m in meta]
            basenames = [
                os.path.basename(m).replace('.npy', '').replace('mel-', '')
                for m in mel_filenames
            ]
            mel_output_filenames, speaker_ids = synth.synthesize(
                texts, basenames, synth_dir, None, mel_filenames)

            for elems in zip(wav_filenames, mel_filenames,
                             mel_output_filenames, speaker_ids, texts):
                map_file.write('|'.join([str(x) for x in elems]) + '\n')

    log('synthesized mel spectrograms at {}'.format(synth_dir))
    return map_path
def get_path_dict(
        data_dirs, hparams, config,
        data_type, n_test=None,
        rng=np.random.RandomState(123)):
    """Collect the ``*.npz`` paths of each data directory for a train or test split.

    Args:
        data_dirs: iterable of directories to scan.
        hparams: hyper-parameters; ``reduction_factor``, ``min_iters``,
            ``max_iters`` and ``min_tokens`` are read when filtering is enabled.
        config: object whose ``skip_path_filter`` flag disables the filtering.
        data_type: 'train' (paths are shuffled, the last ``n_test`` are dropped)
            or 'test' (only the last ``n_test`` paths are kept).
        n_test: number of paths held out for testing. ``None`` or ``0`` means
            nothing is held out: 'train' keeps every path, 'test' gets none.
        rng: random state used to shuffle training paths.
            NOTE: the default instance is created once at definition time and
            is therefore shared by every call that does not pass its own.

    Returns:
        dict mapping each data directory to its list of selected paths.

    Raises:
        Exception: ``data_type`` is neither 'train' nor 'test'.
    """
    path_dict = {}
    for data_dir in data_dirs:
        paths = glob("{}/*.npz".format(data_dir))

        if data_type == 'train':
            rng.shuffle(paths)

        if not config.skip_path_filter:
            items = parallel_run(
                get_frame, paths,
                desc="filter_by_min_max_frame_batch", parallel=True)

            min_n_frame = hparams.reduction_factor * hparams.min_iters
            max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor

            new_items = [
                (path, n) for path, n, n_tokens in items
                if min_n_frame <= n <= max_n_frame and n_tokens >= hparams.min_tokens
            ]

            if any(check in data_dir for check in ["son", "yuinna"]):
                blacklists = [".0000.", ".0001.", "NB11479580.0001"]
                # Drop a path as soon as it contains ANY blacklisted fragment.
                # (The previous `any(check not in ...)` kept nearly everything.)
                new_items = [
                    item for item in new_items
                    if not any(check in item[0] for check in blacklists)
                ]

            new_paths = [path for path, n in new_items]
            new_n_frames = [n for path, n in new_items]

            hours = frames_to_hours(new_n_frames)

            log(' [{}] Loaded metadata for {} examples ({:.2f} hours)'.
                format(data_dir, len(new_n_frames), hours))
            # max()/min() raise on an empty sequence; skip the stats when the
            # filter removed every example.
            if new_n_frames:
                log(' [{}] Max length: {}'.format(data_dir, max(new_n_frames)))
                log(' [{}] Min length: {}'.format(data_dir, min(new_n_frames)))
        else:
            new_paths = paths

        # `seq[:-0]` is empty and `seq[-0:]` is the whole list, so a zero or
        # missing n_test has to be handled explicitly.
        if data_type == 'train':
            if n_test:
                new_paths = new_paths[:-n_test]
        elif data_type == 'test':
            new_paths = new_paths[-n_test:] if n_test else []
        else:
            raise Exception(" [!] Unkown data_type: {}".format(data_type))

        path_dict[data_dir] = new_paths
    return path_dict
def run_eval(args, checkpoint_path, output_dir, hparams, text, step, cwd):
    """Synthesize ``text`` with a freshly loaded model and return the eval directory.

    Args:
        args: parsed command-line namespace; ``model`` and (for 'Tacotron-2')
            ``mels_dir`` are read here.
        checkpoint_path: checkpoint forwarded to ``Synthesizer.load``.
        output_dir: parent of the 'eval' and 'logs-eval' directories.
        hparams: hyper-parameters forwarded to ``Synthesizer.load``.
        text, step, cwd: forwarded unchanged to ``Synthesizer.synthesize``.

    Returns:
        ``<output_dir>/eval``. Note that this function creates only the log
        directories, not the eval directory itself.
    """
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    # In the combined pipeline the vocoder reads its mels from args.mels_dir,
    # so the two locations have to agree.
    if args.model == 'Tacotron-2':
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)

    # Create the log directories if they don't exist.
    for folder in (log_dir, os.path.join(log_dir, 'plots')):
        os.makedirs(folder, exist_ok=True)

    log(hparams_debug_string())
    synthesizer = Synthesizer()
    synthesizer.load(checkpoint_path, hparams)

    log('Starting Synthesis')
    synthesizer.synthesize(text, step, eval_dir, log_dir, None, cwd)
    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
def run_live(args, checkpoint_path, hparams):
    """Run an interactive loop that synthesizes every line typed on stdin.

    Nothing is written to log files. The loop ends on KeyboardInterrupt,
    after a farewell message has been synthesized.

    Args:
        args: unused here; kept for a uniform signature with the other modes.
        checkpoint_path: checkpoint forwarded to ``Synthesizer.load``.
        hparams: hyper-parameters forwarded to ``Synthesizer.load``.
    """
    log(hparams_debug_string())
    synthesizer = Synthesizer()
    synthesizer.load(checkpoint_path, hparams)

    # Greet the user, in text and in audio.
    welcome = 'Hello, Welcome to the Live testing tool. Please type a message and I will try to read it!'
    log(welcome)
    generate_fast(synthesizer, welcome)

    # Interaction loop: an interrupt while waiting for input or while
    # synthesizing leaves the loop through the handler below.
    try:
        while True:
            typed = input()
            generate_fast(synthesizer, typed)
    except KeyboardInterrupt:
        farewell = 'Thank you for testing our features. see you soon.'
        log(farewell)
        generate_fast(synthesizer, farewell)
        sleep(2)
def train(args, log_dir, hparams):
    """Log a banner, then hand all arguments over to ``tacotron_train``."""
    banner = (
        '\n#############################################################\n',
        'Tacotron Train\n',
        '###########################################################\n',
    )
    for line in banner:
        log(line)
    tacotron_train(args, log_dir, hparams)
def initialize(self, inputs, input_lengths, mel_targets=None, mel_lengths=None, stop_token_targets=None, linear_targets=None, gta=False, reference_mel=None):
    """
    Builds the Tacotron graph (optionally with a VAE style encoder) and stores
    its tensors on the instance ("mel_outputs", "alignments",
    "stop_token_prediction", ...).

    The graph is in training mode when mel targets are given and gta is false.

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
          steps in the input time series, and values are character IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
          of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
          of steps in the output time series, M is num_mels, and values are entries in the mel
          spectrogram. Only needed for training.
        - mel_lengths: passed to the VAE as the lengths of its input
          (only used when hp.use_vae is set).
        - stop_token_targets: stop-token targets handed to the training helper.
        - linear_targets: linear spectrogram targets; required when
          hp.predict_linear is set and gta is false, forbidden in GTA mode.
        - gta: ground-truth-aligned mode; teacher forcing without training mode.
        - reference_mel: input of the VAE outside training (in training the
          mel targets are used instead).

    Raises:
        ValueError: on any inconsistent combination of targets / flags checked below.
    """
    if mel_targets is None and stop_token_targets is not None:
        raise ValueError(
            'no mel targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None and not gta:
        raise ValueError(
            'Mel targets are provided without corresponding token_targets')
    if gta == False and self._hparams.predict_linear == True and linear_targets is None:
        raise ValueError(
            'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
        )
    if gta and linear_targets is not None:
        raise ValueError(
            'Linear spectrogram prediction is not supported in GTA mode!')

    with tf.variable_scope('inference') as scope:
        # Training mode is inferred from the arguments rather than passed in.
        is_training = mel_targets is not None and not gta
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams
        #GTA is only used for predicting mels to train Wavenet vocoder, so we omit post processing when doing GTA synthesis
        post_condition = hp.predict_linear and not gta

        # Embeddings ==> [batch_size, sequence_length, embedding_dim]
        embedding_table = tf.get_variable('inputs_embedding',
                                          [len(symbols), hp.embedding_dim],
                                          dtype=tf.float32)
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

        #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
        encoder_cell = TacotronEncoderCell(
            EncoderConvolutions(is_training,
                                kernel_size=hp.enc_conv_kernel_size,
                                channels=hp.enc_conv_channels,
                                scope='encoder_convolutions'),
            EncoderRNN(is_training,
                       size=hp.encoder_lstm_units,
                       zoneout=hp.tacotron_zoneout_rate,
                       scope='encoder_LSTM'))

        encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

        if hp.use_vae:
            # In training the style reference is the target mel itself;
            # otherwise the caller-supplied reference_mel is used.
            if is_training:
                reference_mel = mel_targets

            style_embeddings, mu, log_var = VAE(inputs=reference_mel,
                                                input_lengths=mel_lengths,
                                                filters=hp.filters,
                                                kernel_size=(3, 3),
                                                strides=(2, 2),
                                                num_units=hp.vae_dim,
                                                is_training=is_training,
                                                scope='vae')

            # Kept on the instance; presumably consumed by the loss code -- TODO confirm.
            self.mu = mu
            self.log_var = log_var
            # Project the style embedding to hp.encoder_depth, then repeat it
            # along the encoder time axis so it can be added to every step.
            style_embeddings = tf.layers.dense(style_embeddings,
                                               hp.encoder_depth)
            style_embeddings = tf.expand_dims(style_embeddings, axis=1)
            style_embeddings = tf.tile(
                style_embeddings,
                [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 256]
            encoder_outputs = encoder_outputs + style_embeddings

        #For shape visualization purpose
        enc_conv_output_shape = encoder_cell.conv_output_shape

        #Decoder Parts
        #Attention Decoder Prenet
        prenet = Prenet(is_training,
                        layer_sizes=hp.prenet_layers,
                        scope='decoder_prenet')
        #Attention Mechanism
        attention_mechanism = LocationSensitiveAttention(
            hp.attention_dim,
            encoder_outputs,
            mask_encoder=hp.mask_encoder,
            memory_sequence_length=input_lengths,
            smoothing=hp.smoothing,
            cumulate_weights=hp.cumulative_weights)
        #Decoder LSTM Cells
        decoder_lstm = DecoderRNN(is_training,
                                  layers=hp.decoder_layers,
                                  size=hp.decoder_lstm_units,
                                  zoneout=hp.tacotron_zoneout_rate,
                                  scope='decoder_lstm')
        #Frames Projection layer
        frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step,
                                           scope='linear_transform')
        #<stop_token> projection layer
        stop_projection = StopProjection(is_training,
                                         scope='stop_token_projection')

        #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
        decoder_cell = TacotronDecoderCell(prenet,
                                           attention_mechanism,
                                           decoder_lstm,
                                           frame_projection,
                                           stop_projection,
                                           mask_finished=hp.mask_finished)

        #Define the helper for our decoder
        # Teacher forcing (training helper) is used both for training and GTA.
        if (is_training or gta) == True:
            self.helper = TacoTrainingHelper(
                batch_size, mel_targets, stop_token_targets, hp.num_mels,
                hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio, gta)
        else:
            self.helper = TacoTestHelper(batch_size, hp.num_mels,
                                         hp.outputs_per_step)

        #initial decoder state
        decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                     dtype=tf.float32)

        #Only use max iterations at synthesis time
        max_iters = hp.max_iters if not is_training else None

        #Decode
        (frames_prediction, stop_token_prediction,
         _), final_decoder_state, _ = dynamic_decode(
             CustomDecoder(decoder_cell, self.helper, decoder_init_state),
             impute_finished=hp.impute_finished,
             maximum_iterations=max_iters)

        # Reshape outputs to be one output per entry
        #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
        decoder_output = tf.reshape(frames_prediction,
                                    [batch_size, -1, hp.num_mels])
        stop_token_prediction = tf.reshape(stop_token_prediction,
                                           [batch_size, -1])

        #Postnet
        postnet = Postnet(is_training,
                          kernel_size=hp.postnet_kernel_size,
                          channels=hp.postnet_channels,
                          scope='postnet_convolutions')

        #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
        residual = postnet(decoder_output)

        #Project residual to same dimension as mel spectrogram
        #==> [batch_size, decoder_steps * r, num_mels]
        residual_projection = FrameProjection(hp.num_mels,
                                              scope='postnet_projection')
        projected_residual = residual_projection(residual)

        #Compute the mel spectrogram
        mel_outputs = decoder_output + projected_residual

        if post_condition:
            #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
            #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder
            post_processing_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    kernel_size=hp.enc_conv_kernel_size,
                                    channels=hp.enc_conv_channels,
                                    scope='post_processing_convolutions'),
                EncoderRNN(is_training,
                           size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate,
                           scope='post_processing_LSTM'))

            expand_outputs = post_processing_cell(mel_outputs)
            linear_outputs = FrameProjection(
                hp.num_freq,
                scope='post_processing_projection')(expand_outputs)

        #Grab alignments from the final decoder state
        alignments = tf.transpose(
            final_decoder_state.alignment_history.stack(), [1, 2, 0])

        # Publish the graph tensors on the instance.
        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_output = decoder_output
        self.alignments = alignments
        self.stop_token_prediction = stop_token_prediction
        self.stop_token_targets = stop_token_targets
        self.mel_outputs = mel_outputs
        self.reference_mel = reference_mel
        # Linear attributes only exist when the post-processing net was built.
        if post_condition:
            self.linear_outputs = linear_outputs
            self.linear_targets = linear_targets
        self.mel_targets = mel_targets
        self.mel_lengths = mel_lengths
        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log(' embedding: {}'.format(embedded_inputs.shape))
        log(' enc conv out: {}'.format(enc_conv_output_shape))
        log(' encoder out: {}'.format(encoder_outputs.shape))
        log(' decoder out: {}'.format(decoder_output.shape))
        log(' residual out: {}'.format(residual.shape))
        log(' projected residual out: {}'.format(
            projected_residual.shape))
        log(' mel out: {}'.format(mel_outputs.shape))
        if post_condition:
            log(' linear out: {}'.format(
                linear_outputs.shape))
        log(' <stop_token> out: {}'.format(
            stop_token_prediction.shape))
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False, global_step=None, is_training=False, is_evaluating=False):
    """
    Builds the Tacotron graph and stores its tensors on the instance
    ("mel_outputs", "alignments", "stop_token_prediction", ...).

    NOTE: the ``gta`` argument is overwritten with False on entry, so GTA
    mode is effectively disabled in this implementation.

    Args:
        - inputs: tensor of input ids; its first dimension is the batch size.
        - input_lengths: lengths of the input sequences (passed to the encoder,
          the attention mechanism and the test helper).
        - mel_targets: mel targets handed to the training helper.
        - stop_token_targets, linear_targets, targets_lengths: only stored on
          the instance here.
        - gta: ignored (see note above).
        - global_step: must not be None when teacher forcing is 'scheduled'
          and is_training is set.
        - is_training: training mode flag.
        - is_evaluating: evaluation mode flag; like training it selects the
          training helper and removes the decoder iteration limit.
    """
    hp = self._hparams
    batch_size = tf.shape(inputs)[0]
    # The caller-supplied gta value is deliberately discarded.
    gta = False
    self.num_atten = 5

    # Range used to clip the outputs when hp.clip_outputs is set.
    T2_output_range = (-hp.max_abs_value,
                       hp.max_abs_value) if hp.symmetric_mels else (
                           0, hp.max_abs_value)

    with tf.variable_scope('inference') as scope:
        assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
        if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
            assert global_step is not None

        # Embeddings ==> [batch_size, sequence_length, embedding_dim]
        self.embedding_table = tf.get_variable(
            'inputs_embedding', [len(symbols), hp.embedding_dim],
            dtype=tf.float32)
        embedded_inputs = tf.nn.embedding_lookup(self.embedding_table,
                                                 inputs)

        #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
        encoder_cell = TacotronEncoderCell(
            EncoderConvolutions(is_training,
                                hparams=hp,
                                scope='encoder_convolutions'),
            EncoderRNN(is_training,
                       size=hp.encoder_lstm_units,
                       zoneout=hp.tacotron_zoneout_rate,
                       scope='encoder_LSTM'))

        self.encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

        #For shape visualization purpose
        self.enc_conv_output_shape = encoder_cell.conv_output_shape

        #Decoder Parts
        #Attention Decoder Prenet
        prenet = Prenet(is_training,
                        layers_sizes=hp.prenet_layers,
                        drop_rate=hp.tacotron_dropout_rate,
                        scope='decoder_prenet')
        #Attention Mechanism
        attention_mechanism = ForwardLocationSensitiveAttention(
            hp.attention_dim,
            self.encoder_outputs,
            hparams=hp,
            is_training=is_training or is_evaluating,
            memory_sequence_length=input_lengths,
            smoothing=hp.smoothing)
        #Decoder LSTM Cells
        decoder_lstm = DecoderRNN(is_training,
                                  layers=hp.decoder_layers,
                                  size=hp.decoder_lstm_units,
                                  zoneout=hp.tacotron_zoneout_rate,
                                  scope='decoder_LSTM')
        #Frames Projection layer
        frame_projection = FrameProjection(
            hp.num_mels * hp.outputs_per_step,
            scope='linear_transform_projection')
        #<stop_token> projection layer
        stop_projection = StopProjection(is_training or is_evaluating,
                                         shape=hp.outputs_per_step,
                                         scope='stop_token_projection')

        #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
        decoder_cell = TacotronDecoderCell(prenet, attention_mechanism,
                                           decoder_lstm, frame_projection,
                                           stop_projection)

        #Define the helper for our decoder
        # gta is always False here, so only the training/evaluation flags matter.
        if is_training or is_evaluating or gta:
            self.helper = TacoTrainingHelper(batch_size, mel_targets, hp, gta,
                                             is_evaluating, global_step)
        else:
            self.helper = TacoTestHelper(batch_size, hp, input_lengths)

        #initial decoder state
        decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                     dtype=tf.float32)

        #Only use max iterations at synthesis time
        max_iters = hp.max_iters if not (is_training
                                         or is_evaluating) else None

        #Decode
        (frames_prediction, stop_token_prediction,
         _), final_decoder_state, _ = dynamic_decode(
             CustomDecoder(decoder_cell, self.helper, decoder_init_state),
             impute_finished=False,
             maximum_iterations=max_iters,
             swap_memory=hp.tacotron_swap_with_cpu)

        # Reshape outputs to be one output per entry
        #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
        self.decoder_output = tf.reshape(frames_prediction,
                                         [batch_size, -1, hp.num_mels])
        self.stop_token_prediction = tf.reshape(stop_token_prediction,
                                                [batch_size, -1])

        # Clip to [range_min - lower_bound_decay, range_max].
        if hp.clip_outputs:
            self.decoder_output = tf.minimum(
                tf.maximum(self.decoder_output,
                           T2_output_range[0] - hp.lower_bound_decay),
                T2_output_range[1])

        #Postnet
        postnet = Postnet(is_training,
                          hparams=hp,
                          scope='postnet_convolutions')

        #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
        residual = postnet(self.decoder_output)

        #Project residual to same dimension as mel spectrogram
        #==> [batch_size, decoder_steps * r, num_mels]
        residual_projection = FrameProjection(hp.num_mels,
                                              scope='postnet_projection')
        self.projected_residual = residual_projection(residual)

        #Compute the mel spectrogram
        self.mel_outputs = self.decoder_output + self.projected_residual

        if hp.clip_outputs:
            self.mel_outputs = tf.minimum(
                tf.maximum(self.mel_outputs,
                           T2_output_range[0] - hp.lower_bound_decay),
                T2_output_range[1])

        if hp.predict_linear:
            # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs.
            post_cbhg = CBHG(hp.cbhg_kernels,
                             hp.cbhg_conv_channels,
                             hp.cbhg_pool_size,
                             [hp.cbhg_projection, hp.num_mels],
                             hp.cbhg_projection_kernel_size,
                             hp.cbhg_highwaynet_layers,
                             hp.cbhg_highway_units,
                             hp.cbhg_rnn_units,
                             hp.batch_norm_position,
                             is_training,
                             name='CBHG_postnet')

            #[batch_size, decoder_steps(mel_frames), cbhg_channels]
            self.post_outputs = post_cbhg(self.mel_outputs, None)

            #Linear projection of extracted features to make linear spectrogram
            linear_specs_projection = FrameProjection(
                hp.num_freq, scope='cbhg_linear_specs_projection')

            #[batch_size, decoder_steps(linear_frames), num_freq]
            self.linear_outputs = linear_specs_projection(
                self.post_outputs)

            if hp.clip_outputs:
                self.linear_outputs = tf.minimum(
                    tf.maximum(self.linear_outputs,
                               T2_output_range[0] - hp.lower_bound_decay),
                    T2_output_range[1])

        #Grab alignments from the final decoder state
        self.alignments = tf.transpose(
            final_decoder_state.alignment_history.stack(), [1, 2, 0])

        log('initialisation done.')

    # The teacher-forcing ratio is only read from the helper in training mode.
    if is_training:
        self.ratio = self.helper._ratio
    self.inputs = inputs
    self.input_lengths = input_lengths
    self.mel_targets = mel_targets
    self.linear_targets = linear_targets
    self.targets_lengths = targets_lengths
    self.stop_token_targets = stop_token_targets
    self.gta = gta

    self.all_vars = tf.trainable_variables()
    self.is_training = is_training
    self.is_evaluating = is_evaluating
    # Fine-tuning excludes the input embedding and every variable whose name
    # contains 'encoder_'.
    self.fine_tune_params = [
        v for v in self.all_vars
        if not ('inputs_embedding' in v.name or 'encoder_' in v.name)
    ]
    self.final_params = self.all_vars if not hp.tacotron_fine_tuning else self.fine_tune_params

    log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
    log(' Train mode: {}'.format(is_training))
    log(' Eval mode: {}'.format(is_evaluating))
    log(' GTA mode: {}'.format(gta))
    log(' Synthesis mode: {}'.format(not (
        is_training or is_evaluating)))
    log(' Input: {}'.format(inputs.shape))
    log(' embedding: {}'.format(embedded_inputs.shape))
    log(' enc conv out: {}'.format(
        self.enc_conv_output_shape))
    log(' encoder out: {}'.format(
        self.encoder_outputs.shape))
    log(' decoder out: {}'.format(self.decoder_output.shape))
    log(' residual out: {}'.format(residual.shape))
    log(' projected residual out: {}'.format(
        self.projected_residual.shape))
    log(' mel out: {}'.format(self.mel_outputs.shape))
    if hp.predict_linear:
        log(' linear out: {}'.format(
            self.linear_outputs.shape))
    log(' <stop_token> out: {}'.format(
        self.stop_token_prediction.shape))

    #1_000_000 is causing syntax problems for some people?! Python please :)
    log(' Tacotron Parameters {:.3f} Million.'.format(
        np.sum([np.prod(v.get_shape().as_list())
                for v in self.all_vars]) / 1000000))
    log(' fine tune paarmaters: {:.3f} Million.'.format(
        np.sum([
            np.prod(v.get_shape().as_list())
            for v in self.fine_tune_params
        ]) / 1000000))
    log(' final paarmaters: {:.3f} Million.'.format(
        np.sum(
            [np.prod(v.get_shape().as_list())
             for v in self.final_params]) / 1000000))
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, gta=False):
    """
    Builds the Tacotron graph and stores its tensors on the instance
    ("mel_outputs", "alignments", "stop_token_prediction", ...).

    The graph is in training mode when mel targets are given and gta is false.

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
          steps in the input time series, and values are character IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
          of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
          of steps in the output time series, M is num_mels, and values are entries in the mel
          spectrogram. Only needed for training.
        - stop_token_targets: stop-token targets handed to the training helper.
        - gta: ground-truth-aligned mode; teacher forcing without training mode.

    Raises:
        ValueError: stop-token targets were given without mel targets, or mel
            targets without stop-token targets outside GTA mode.
    """
    if mel_targets is None and stop_token_targets is not None:
        raise ValueError(
            'no mel targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None and not gta:
        raise ValueError(
            'Mel targets are provided without corresponding token_targets')

    with tf.variable_scope('inference') as scope:
        # Training mode is inferred from the arguments rather than passed in.
        is_training = mel_targets is not None and not gta
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings ==> [batch_size, sequence_length, embedding_dim]
        embedding_table = tf.get_variable('inputs_embedding',
                                          [len(symbols), hp.embedding_dim],
                                          dtype=tf.float32)
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

        #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
        encoder_cell = TacotronEncoderCell(
            EncoderConvolutions(is_training,
                                kernel_size=hp.enc_conv_kernel_size,
                                channels=hp.enc_conv_channels,
                                scope='encoder_convolutions'),
            EncoderRNN(is_training,
                       size=hp.encoder_lstm_units,
                       zoneout=hp.tacotron_zoneout_rate,
                       scope='encoder_LSTM'))

        encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

        #For shape visualization purpose
        enc_conv_output_shape = encoder_cell.conv_output_shape

        #Decoder Parts
        #Attention Decoder Prenet
        prenet = Prenet(is_training,
                        layer_sizes=hp.prenet_layers,
                        scope='decoder_prenet')
        #Attention Mechanism
        attention_mechanism = LocationSensitiveAttention(
            hp.attention_dim,
            encoder_outputs,
            mask_encoder=hp.mask_encoder,
            memory_sequence_length=input_lengths,
            smoothing=hp.smoothing)
        #Decoder LSTM Cells
        decoder_lstm = DecoderRNN(is_training,
                                  layers=hp.decoder_layers,
                                  size=hp.decoder_lstm_units,
                                  zoneout=hp.tacotron_zoneout_rate,
                                  scope='decoder_lstm')
        #Frames Projection layer
        frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step,
                                           scope='linear_transform')
        #<stop_token> projection layer
        stop_projection = StopProjection(is_training,
                                         scope='stop_token_projection')

        #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
        decoder_cell = TacotronDecoderCell(prenet,
                                           attention_mechanism,
                                           decoder_lstm,
                                           frame_projection,
                                           stop_projection,
                                           mask_finished=hp.mask_finished)

        #Define the helper for our decoder
        # Teacher forcing (training helper) is used both for training and GTA.
        if (is_training or gta) == True:
            self.helper = TacoTrainingHelper(
                batch_size, mel_targets, stop_token_targets, hp.num_mels,
                hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio, gta)
        else:
            self.helper = TacoTestHelper(batch_size, hp.num_mels,
                                         hp.outputs_per_step)

        #initial decoder state
        decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                     dtype=tf.float32)

        #Only use max iterations at synthesis time
        max_iters = hp.max_iters if not is_training else None

        #Decode
        (frames_prediction, stop_token_prediction,
         _), final_decoder_state, _ = dynamic_decode(
             CustomDecoder(decoder_cell, self.helper, decoder_init_state),
             impute_finished=hp.impute_finished,
             maximum_iterations=max_iters)

        # Reshape outputs to be one output per entry
        #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
        decoder_output = tf.reshape(frames_prediction,
                                    [batch_size, -1, hp.num_mels])
        stop_token_prediction = tf.reshape(stop_token_prediction,
                                           [batch_size, -1])

        #Postnet
        postnet = Postnet(is_training,
                          kernel_size=hp.postnet_kernel_size,
                          channels=hp.postnet_channels,
                          scope='postnet_convolutions')

        #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
        residual = postnet(decoder_output)

        #Project residual to same dimension as mel spectrogram
        #==> [batch_size, decoder_steps * r, num_mels]
        residual_projection = FrameProjection(hp.num_mels,
                                              scope='postnet_projection')
        projected_residual = residual_projection(residual)

        #Compute the mel spectrogram
        mel_outputs = decoder_output + projected_residual

        #Grab alignments from the final decoder state
        alignments = tf.transpose(
            final_decoder_state.alignment_history.stack(), [1, 2, 0])

        # Publish the graph tensors on the instance.
        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_output = decoder_output
        self.alignments = alignments
        self.stop_token_prediction = stop_token_prediction
        self.stop_token_targets = stop_token_targets
        self.mel_outputs = mel_outputs
        self.mel_targets = mel_targets
        log('Initialized Tacotron model. Dimensions: ')
        log(' embedding: {}'.format(embedded_inputs.shape))
        log(' enc conv out: {}'.format(enc_conv_output_shape))
        log(' encoder out: {}'.format(encoder_outputs.shape))
        log(' decoder out: {}'.format(decoder_output.shape))
        log(' residual out: {}'.format(residual.shape))
        log(' projected residual out: {}'.format(
            projected_residual.shape))
        log(' mel out: {}'.format(mel_outputs.shape))
        log(' <stop_token> out: {}'.format(
            stop_token_prediction.shape))
def __init__(self, coordinator, metadata_filename, hparams):
    """Load the training metadata and build the TensorFlow input queue.

    Args:
        coordinator: stored on ``self._coord``; not otherwise used here.
        metadata_filename: path to a UTF-8, pipe-separated metadata file.
            Field 1 is read as the mel file name, field 3 as an integer
            frame count, and the last field as the phoneme/pinyin string.
            Blank lines are ignored.
        hparams: hyper-parameter object. This method reads ``hop_size``,
            ``sample_rate``, ``symmetric_mels``, ``max_abs_value`` and
            ``num_mels`` from it.
    """
    super(Feeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._train_offset = 0
    self._test_offset = 0

    # Mel and linear features are both looked up next to the metadata file.
    metadata_dir = os.path.dirname(metadata_filename)
    self._mel_dir = metadata_dir
    self._linear_dir = metadata_dir

    # Load metadata. Example row:
    #   audio-000001.npy|mel-000001.npy|46200|168|<text>|k a3 er3 p u3 ...
    total_frames = 0
    self._metadata = []
    with open(metadata_filename, encoding='utf-8') as f:
        for raw_line in f:
            stripped = raw_line.strip()
            if not stripped:
                # A blank (often trailing) line would otherwise fail with
                # an IndexError on fields[1] below.
                continue
            fields = stripped.split('|')
            mel = fields[1].strip()
            total_frames += int(fields[3])
            pyin = fields[-1].strip()
            self._metadata.append([mel, pyin])

    # hop_size / sample_rate is the duration of one frame in seconds.
    seconds_per_frame = hparams.hop_size / hparams.sample_rate
    hours = total_frames * seconds_per_frame / 3600
    log('Loaded metadata for {} examples ({:.2f} hours)'.format(
        len(self._metadata), hours))

    # Every loaded example is used for training (no held-out split here).
    self._train_meta = self._metadata

    # Pad input sequences with the <pad_token> 0 ( _ )
    self._pad = 0
    # Explicitly set the target padding to a value that doesn't originally
    # exist in the spectrogram, to avoid any possible conflicts without
    # affecting the output range of the model too much.
    if hparams.symmetric_mels:
        self._target_pad = -hparams.max_abs_value
    else:
        self._target_pad = 0.
    # Mark finished sequences with 1s
    self._token_pad = 1.

    # Create placeholders for inputs and targets. Don't specify batch size
    # because we want to be able to feed different batch sizes at eval time.
    self._placeholders = [
        tf.placeholder(tf.int32, shape=(None, None), name='inputs'),
        tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'),
        tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels),
                       name='mel_targets'),
        tf.placeholder(tf.float32, shape=(None, None), name='token_targets'),
        tf.placeholder(tf.int32, shape=(None, ), name='targets_lengths'),
    ]

    # Create queue for buffering data (capacity of 8 enqueued batches).
    queue = tf.FIFOQueue(
        8, [tf.int32, tf.int32, tf.float32, tf.float32, tf.int32],
        name='input_queue')
    self._enqueue_op = queue.enqueue(self._placeholders)
    (self.inputs, self.input_lengths, self.mel_targets,
     self.token_targets, self.targets_lengths) = queue.dequeue()

    # Copy the placeholder shapes onto the dequeued tensors.
    self.inputs.set_shape(self._placeholders[0].shape)
    self.input_lengths.set_shape(self._placeholders[1].shape)
    self.mel_targets.set_shape(self._placeholders[2].shape)
    self.token_targets.set_shape(self._placeholders[3].shape)
    self.targets_lengths.set_shape(self._placeholders[4].shape)
def __init__(self, coordinator, data_dirs, hparams, config,
             batches_per_group, data_type, batch_size):
    """Collect dataset paths, compute sampling ratios and build the input queue.

    Args:
        coordinator: stored on ``self._coord``; not otherwise used here.
        data_dirs: dataset locations, forwarded to ``get_path_dict``.
        hparams: hyper-parameter object. This method reads ``cleaners``,
            ``min_tokens``, ``reduction_factor``, ``min_iters``,
            ``max_iters``, ``main_data_greedy_factor``, ``main_data``,
            ``num_mels`` and ``num_freq`` from it.
        config: run configuration. This method reads ``random_seed`` and
            ``skip_path_filter`` from it and forwards it to
            ``get_path_dict``.
        batches_per_group: number of batches per generated group; in
            ``'test'`` mode it is the number of copies of the static batch.
        data_type: ``'train'`` or ``'test'``. Controls the queue capacity
            and whether a fixed static batch is pre-built.
        batch_size: number of examples per batch.

    Raises:
        ValueError: if ``data_type`` is ``'test'`` and no data directory
            was found, since no static batch could ever be assembled.
    """
    super(DataFeeder, self).__init__()

    self._coord = coordinator
    self._hp = hparams
    self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    self._step = 0
    self._offset = defaultdict(lambda: 2)
    self._batches_per_group = batches_per_group

    self.rng = np.random.RandomState(config.random_seed)
    self.data_type = data_type
    self.batch_size = batch_size

    self.min_tokens = hparams.min_tokens
    self.min_n_frame = hparams.reduction_factor * hparams.min_iters
    self.max_n_frame = (hparams.reduction_factor * hparams.max_iters
                        - hparams.reduction_factor)
    self.skip_path_filter = config.skip_path_filter

    # Load metadata:
    self.path_dict = get_path_dict(
        data_dirs, self._hp, config, self.data_type,
        n_test=self.batch_size, rng=self.rng)

    self.data_dirs = list(self.path_dict.keys())
    self.data_dir_to_id = {
        data_dir: idx for idx, data_dir in enumerate(self.data_dirs)}

    # Every directory starts with weight 1; directories whose path contains
    # one of the `main_data` substrings get an extra greedy factor per match.
    data_weight = {data_dir: 1. for data_dir in self.data_dirs}
    if self._hp.main_data_greedy_factor > 0:
        for main_data in self._hp.main_data:
            for data_dir in self.data_dirs:
                if main_data in data_dir:
                    data_weight[data_dir] += self._hp.main_data_greedy_factor

    # Normalise the weights so the ratios sum to 1.
    weight_Z = sum(data_weight.values())
    self.data_ratio = {
        data_dir: weight / weight_Z
        for data_dir, weight in data_weight.items()}

    log("="*40)
    log(pprint.pformat(self.data_ratio, indent=4))
    log("="*40)

    # Create placeholders for inputs and targets. Don't specify batch size
    # because we want to be able to feed different sized batches at eval time.
    self._placeholders = [
        tf.placeholder(tf.int32, [None, None], 'inputs'),
        tf.placeholder(tf.int32, [None], 'input_lengths'),
        tf.placeholder(tf.float32, [None], 'loss_coeff'),
        tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'),
        tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets'),
    ]

    # Create queue for buffering data:
    dtypes = [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32]

    # More than one data directory is treated as a multi-speaker setup and
    # adds a sixth (speaker id) component to the queue.
    self.is_multi_speaker = len(self.data_dirs) > 1
    if self.is_multi_speaker:
        # NOTE(review): this placeholder reuses the name 'inputs' although it
        # is dequeued as speaker_id; the name is kept unchanged so graph
        # tensor names stay as they were -- confirm before renaming.
        self._placeholders.append(
            tf.placeholder(tf.int32, [None], 'inputs'),
        )
        dtypes.append(tf.int32)

    # The first FIFOQueue argument is its capacity (in enqueued batches).
    queue_capacity = 8 if self.data_type == 'train' else 1
    queue = tf.FIFOQueue(queue_capacity, dtypes, name='input_queue')
    self._enqueue_op = queue.enqueue(self._placeholders)

    if self.is_multi_speaker:
        (self.inputs, self.input_lengths, self.loss_coeff,
         self.mel_targets, self.linear_targets,
         self.speaker_id) = queue.dequeue()
    else:
        (self.inputs, self.input_lengths, self.loss_coeff,
         self.mel_targets, self.linear_targets) = queue.dequeue()

    # Copy the placeholder shapes onto the dequeued tensors.
    self.inputs.set_shape(self._placeholders[0].shape)
    self.input_lengths.set_shape(self._placeholders[1].shape)
    self.loss_coeff.set_shape(self._placeholders[2].shape)
    self.mel_targets.set_shape(self._placeholders[3].shape)
    self.linear_targets.set_shape(self._placeholders[4].shape)

    if self.is_multi_speaker:
        self.speaker_id.set_shape(self._placeholders[5].shape)
    else:
        self.speaker_id = None

    if self.data_type == 'test':
        if not self.data_dirs:
            # Without this guard the fill loop below would never append
            # anything and would spin forever.
            raise ValueError(
                'Cannot build a static test batch: no data directories found')

        # Take examples round-robin across the directories until one batch
        # is filled (at least one example is always taken).
        examples = []
        filled = False
        while not filled:
            for data_dir in self.data_dirs:
                examples.append(self._get_next_example(data_dir))
                if len(examples) >= self.batch_size:
                    filled = True
                    break

        # The same example list is reused for every batch of the group.
        self.static_batches = [
            examples for _ in range(self._batches_per_group)]
    else:
        self.static_batches = None
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None,
               linear_targets=None, targets_lengths=None, gta=False, global_step=None,
               is_training=False, is_evaluating=False):
    """Build the Tacotron graph and expose its main tensors as attributes.

    Sets, among others, the "mel_outputs" and "alignments" fields.

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
          steps in the input time series, and values are character IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
          lengths of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
          number of steps in the output time series, M is num_mels, and values are entries in
          the mel spectrogram. Only needed for training.
        - stop_token_targets: stop-token targets passed to the training helper together with
          mel_targets. Must not be given without mel_targets.
        - linear_targets: linear spectrogram targets. Required when training with
          hparams.predict_linear set (and not in GTA mode); rejected in GTA mode.
        - targets_lengths: target lengths; required when training with hparams.mask_decoder.
        - gta: ground-truth-aligned synthesis flag. Disables the linear post-processing net.
        - global_step: required when curriculum dropout/zoneout is enabled, or when training
          with 'scheduled' teacher forcing; also forwarded to the training helper.
        - is_training: build the graph in training mode.
        - is_evaluating: build the graph in evaluation mode (exclusive with is_training).

    Raises:
        ValueError: on inconsistent combinations of targets / gta (see checks below).
        RuntimeError: if decoder masking is requested without targets_lengths while
          training, or if both is_training and is_evaluating are set.
        AssertionError: if global_step is missing where required, or the teacher forcing
          mode is not 'constant' or 'scheduled'.
    """
    if mel_targets is None and stop_token_targets is not None:
        raise ValueError('no mel targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None and not gta:
        raise ValueError('Mel targets are provided without corresponding token_targets')
    if not gta and self._hparams.predict_linear and linear_targets is None and is_training:
        raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!')
    if gta and linear_targets is not None:
        raise ValueError('Linear spectrogram prediction is not supported in GTA mode!')
    if is_training and self._hparams.mask_decoder and targets_lengths is None:
        raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
    if is_training and is_evaluating:
        raise RuntimeError('Model can not be in training and evaluation modes at the same time!')

    with tf.variable_scope('inference'):
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Dropout rate: scheduled from global_step when curriculum is enabled,
        # otherwise the constant hparams value.
        if hp.tacotron_curriculum_dropout_rate:
            assert global_step is not None, \
                'global_step is required for curriculum dropout'
            self.dropout_rate = self._curriculum_dropout(
                hp.tacotron_dropout_rate, hp.tacotron_curriculum_dropout_gamma,
                global_step)
        else:
            self.dropout_rate = tf.convert_to_tensor(hp.tacotron_dropout_rate)

        # Zoneout rate: same scheme as dropout above.
        if hp.tacotron_curriculum_zoneout_rate:
            assert global_step is not None, \
                'global_step is required for curriculum zoneout'
            self.zoneout_rate = self._curriculum_dropout(
                hp.tacotron_zoneout_rate, hp.tacotron_curriculum_zoneout_gamma,
                global_step)
        else:
            self.zoneout_rate = tf.convert_to_tensor(hp.tacotron_zoneout_rate)

        assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled'), \
            "tacotron_teacher_forcing_mode must be 'constant' or 'scheduled'"
        if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
            assert global_step is not None, \
                'global_step is required for scheduled teacher forcing'

        #GTA is only used for predicting mels to train Wavenet vocoder, so we omit post processing when doing GTA synthesis
        post_condition = hp.predict_linear and not gta

        # Embeddings ==> [batch_size, sequence_length, embedding_dim]
        embedding_table = tf.get_variable(
            'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

        #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
        encoder_cell = TacotronEncoderCell(
            EncoderConvolutions(is_training, hp.enc_conv_kernel_size,
                hp.enc_conv_channels, hp.enc_conv_num_layers, self.dropout_rate,
                scope='encoder_convolutions'),
            EncoderRNN(is_training, size=hp.encoder_lstm_units,
                zoneout=self.zoneout_rate, scope='encoder_LSTM'))

        encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

        #For shape visualization purpose
        enc_conv_output_shape = encoder_cell.conv_output_shape

        #Decoder Parts
        #Attention Decoder Prenet
        prenet = Prenet(is_training, layers_sizes=hp.prenet_layers,
            drop_rate=self.dropout_rate, scope='decoder_prenet')
        #Attention Mechanism
        attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs,
            hparams=hp, mask_encoder=hp.mask_encoder,
            memory_sequence_length=input_lengths, smoothing=hp.smoothing,
            cumulate_weights=hp.cumulative_weights)
        #Decoder LSTM Cells
        decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
            size=hp.decoder_lstm_units, zoneout=self.zoneout_rate, scope='decoder_lstm')
        #Frames Projection layer
        frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step,
            scope='linear_transform')
        #<stop_token> projection layer
        stop_projection = StopProjection(is_training or is_evaluating,
            shape=hp.outputs_per_step, scope='stop_token_projection')

        #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
        decoder_cell = TacotronDecoderCell(
            prenet,
            attention_mechanism,
            decoder_lstm,
            frame_projection,
            stop_projection)

        #Define the helper for our decoder
        # The training helper is only usable when a target-driven mode is
        # requested AND both mel and stop-token targets are available.
        wants_targets = is_training or is_evaluating or gta
        has_targets = mel_targets is not None and stop_token_targets is not None
        if wants_targets and has_targets:
            self.helper = TacoTrainingHelper(
                batch_size, mel_targets, stop_token_targets, hp, gta,
                is_evaluating, global_step)
        else:
            if gta:
                # gta implies wants_targets, so reaching here means targets are missing.
                log('Warning: gta set to True but mel_targets or '
                    'stop_token_targets not provided'
                    ', falling back to natural inference')
            self.helper = TacoTestHelper(batch_size, hp)

        #initial decoder state
        decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        #Only use max iterations at synthesis time
        max_iters = hp.max_iters if not (is_training or is_evaluating) else None

        #Decode
        (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
            CustomDecoder(decoder_cell, self.helper, decoder_init_state),
            impute_finished=False,
            maximum_iterations=max_iters,
            swap_memory=hp.tacotron_swap_with_cpu)

        # Reshape outputs to be one output per entry
        #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
        decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
        stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

        #Postnet
        postnet = Postnet(is_training, hp.postnet_kernel_size, hp.postnet_channels,
            hp.postnet_num_layers, self.dropout_rate, scope='postnet_convolutions')

        #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
        residual = postnet(decoder_output)

        #Project residual to same dimension as mel spectrogram
        #==> [batch_size, decoder_steps * r, num_mels]
        residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
        projected_residual = residual_projection(residual)

        #Compute the mel spectrogram
        mel_outputs = tf.add(decoder_output, projected_residual, name='mel_outputs')

        if post_condition:
            #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
            #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder
            post_processing_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training, hp.enc_conv_kernel_size,
                    hp.enc_conv_channels, hp.enc_conv_num_layers, self.dropout_rate,
                    scope='post_processing_convolutions'),
                EncoderRNN(is_training, size=hp.encoder_lstm_units,
                    zoneout=self.zoneout_rate, scope='post_processing_LSTM'))

            expand_outputs = post_processing_cell(mel_outputs)
            linear_outputs = FrameProjection(
                hp.num_freq, scope='post_processing_projection')(expand_outputs)

        #Grab alignments from the final decoder state
        alignments = tf.transpose(
            final_decoder_state.alignment_history.stack(), [1, 2, 0], name='alignments')

        self.optimize = None
        self.loss = None
        if is_training:
            # NOTE(review): reads a private attribute of the helper; presumably only
            # the training helper defines it -- confirm for the fallback helper.
            self.ratio = self.helper._ratio
        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_output = decoder_output
        self.alignments = alignments
        self.stop_token_prediction = stop_token_prediction
        self.stop_token_targets = stop_token_targets
        self.mel_outputs = mel_outputs
        if post_condition:
            self.linear_outputs = linear_outputs
            self.linear_targets = linear_targets
        self.mel_targets = mel_targets
        self.targets_lengths = targets_lengths

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log(' Train mode: {}'.format(is_training))
        log(' Eval mode: {}'.format(is_evaluating))
        log(' GTA mode: {}'.format(gta))
        log(' Synthesis mode: {}'.format(not (is_training or is_evaluating)))
        log(' embedding: {}'.format(embedded_inputs.shape))
        log(' enc conv out: {}'.format(enc_conv_output_shape))
        log(' encoder out: {}'.format(encoder_outputs.shape))
        log(' decoder out: {}'.format(decoder_output.shape))
        log(' residual out: {}'.format(residual.shape))
        log(' projected residual out: {}'.format(projected_residual.shape))
        log(' mel out: {}'.format(mel_outputs.shape))
        if post_condition:
            log(' linear out: {}'.format(linear_outputs.shape))
        log(' <stop_token> out: {}'.format(stop_token_prediction.shape))
import tensorflow as tf from tacotron.utils.symbols import symbols from tacotron.utils.infolog import log from tacotron.models.helpers import TacoTrainingHelper, TacoTestHelper from tacotron.models.modules import * from tacotron.models.zoneout_LSTM import ZoneoutLSTMCell from tensorflow.contrib.seq2seq import dynamic_decode from tacotron.models.Architecture_wrappers import TacotronEncoderCell, TacotronDecoderCell from tacotron.models.custom_decoder import CustomDecoder if int(tf.__version__.replace('.', '')) < 160: log('using old attention Tensorflow structure (1.5.0 and earlier)') from tacotron.models.attention_old import LocationSensitiveAttention else: log('using new attention Tensorflow structure (1.6.0 and later)') from tacotron.models.attention import LocationSensitiveAttention class Tacotron(): """Tacotron-2 Feature prediction Model. """ def __init__(self, hparams): self._hparams = hparams def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, gta=False): """