def tacotron_synthesize(args, hparams, checkpoint, sentences=None):
    output_dir = args.output_dir

    try:
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
        log("loaded model at {}".format(checkpoint_path))
    except Exception:
        raise RuntimeError("Failed to load checkpoint at {}".format(checkpoint))

    if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
        raise ValueError(
            "Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! "
            "Please verify your synthesis batch size choice.".format(
                hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

    if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
        raise ValueError(
            "Defined synthesis batch size {} is not a multiple of {} (num_gpus)! "
            "Please verify your synthesis batch size choice!".format(
                hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

    if args.mode == "eval":
        return run_eval(args, checkpoint_path, output_dir, hparams, sentences)
    elif args.mode == "synthesis":
        return run_synthesis(args, checkpoint_path, output_dir, hparams)
    else:
        run_live(args, checkpoint_path, hparams)
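# A hedged sketch of how tacotron_synthesize might be invoked. The `args` namespace
# fields (output_dir, mode) follow their usage above; the checkpoint path and sentences
# are illustrative placeholders, not the project's actual defaults.
from types import SimpleNamespace

args = SimpleNamespace(output_dir="output", mode="eval")
sentences = ["Hello world.", "This is a synthesis test."]
# eval_dir = tacotron_synthesize(args, hparams, "checkpoints/tacotron", sentences)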
def __init__(self, checkpoint_path, hparams, gta=False, model_name="Tacotron", seed=None): log("Constructing model: %s" % model_name) # Initialize tensorflow random number seed for deterministic operation if provided if seed is not None: tf.compat.v1.set_random_seed(seed) #Force the batch size to be known in order to use attention masking in batch synthesis inputs = tf.compat.v1.placeholder(tf.int32, (None, None), name="inputs") input_lengths = tf.compat.v1.placeholder(tf.int32, (None,), name="input_lengths") speaker_embeddings = tf.compat.v1.placeholder(tf.float32, (None, hparams.speaker_embedding_size), name="speaker_embeddings") targets = tf.compat.v1.placeholder(tf.float32, (None, None, hparams.num_mels), name="mel_targets") split_infos = tf.compat.v1.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos") with tf.compat.v1.variable_scope("Tacotron_model") as scope: self.model = create_model(model_name, hparams) if gta: self.model.initialize(inputs, input_lengths, speaker_embeddings, targets, gta=gta, split_infos=split_infos) else: self.model.initialize(inputs, input_lengths, speaker_embeddings, split_infos=split_infos) self.mel_outputs = self.model.tower_mel_outputs self.linear_outputs = self.model.tower_linear_outputs if (hparams.predict_linear and not gta) else None self.alignments = self.model.tower_alignments self.stop_token_prediction = self.model.tower_stop_token_prediction self.targets = targets self.gta = gta self._hparams = hparams #pad input sequences with the <pad_token> 0 ( _ ) self._pad = 0 #explicitely setting the padding to a value that doesn"t originally exist in the spectogram #to avoid any possible conflicts, without affecting the output range of the model too much if hparams.symmetric_mels: self._target_pad = -hparams.max_abs_value else: self._target_pad = 0. self.inputs = inputs self.input_lengths = input_lengths self.speaker_embeddings = speaker_embeddings self.targets = targets self.split_infos = split_infos log("Loading checkpoint: %s" % checkpoint_path) #Memory allocation on the GPUs as needed config = tf.compat.v1.ConfigProto() config.gpu_options.allow_growth = True config.allow_soft_placement = True self.session = tf.compat.v1.Session(config=config) self.session.run(tf.compat.v1.global_variables_initializer()) saver = tf.compat.v1.train.Saver() saver.restore(self.session, checkpoint_path)
def make_test_batches(self):
    start = time.time()

    # Read a group of examples
    n = self._hparams.tacotron_batch_size
    r = self._hparams.outputs_per_step

    # Read a single group of test examples (the comment in the original source claimed
    # the entire test set, but only one group is drawn here)
    examples = [self._get_test_groups() for i in range(1)]

    # Bucket examples based on similar output sequence length for efficiency
    batches = [examples[i:i + n] for i in range(0, len(examples), n)]
    np.random.shuffle(batches)

    log("\nGenerated %d test batches of size %d in %.3f sec"
        % (len(batches), n, time.time() - start))
    return batches, r
def _enqueue_next_train_group(self):
    while not self._coord.should_stop():
        start = time.time()

        # Read a group of examples
        n = self._hparams.tacotron_batch_size
        r = self._hparams.outputs_per_step
        examples = [self._get_next_example() for i in range(n * _batches_per_group)]

        # Bucket examples based on similar output sequence length for efficiency
        examples.sort(key=lambda x: x[-1])
        batches = [examples[i:i + n] for i in range(0, len(examples), n)]
        np.random.shuffle(batches)

        log("\nGenerated {} train batches of size {} in {:.3f} sec".format(
            len(batches), n, time.time() - start))
        for batch in batches:
            feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r)))
            self._session.run(self._enqueue_op, feed_dict=feed_dict)
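# `_prepare_batch` is referenced above but not shown. A minimal sketch of the padding it
# typically performs, under these assumptions: inputs are 1-D id sequences padded with the
# feeder's `_pad` value, mel targets are [T, num_mels] arrays padded with `_target_pad`,
# and target lengths are rounded up to a multiple of `outputs_per_step` (the `r` above).
# Helper names are illustrative, not the repository's exact implementation.
import numpy as np

def _round_up(x, multiple):
    remainder = x % multiple
    return x if remainder == 0 else x + multiple - remainder

def _pad_input(x, max_len, pad=0):
    # Right-pad a 1-D id sequence to max_len with the <pad_token>
    return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad)

def _pad_target(t, max_len, pad):
    # Right-pad a [T, num_mels] spectrogram along the time axis
    return np.pad(t, [(0, max_len - len(t)), (0, 0)], mode="constant", constant_values=pad)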
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    eval_dir = os.path.join(output_dir, "eval")
    log_dir = os.path.join(output_dir, "logs-eval")

    # Create output paths if they don't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, "wavs"), exist_ok=True)
    os.makedirs(os.path.join(log_dir, "plots"), exist_ok=True)

    log(hparams_debug_string())
    synth = Tacotron2(checkpoint_path, hparams)

    # Set inputs batch-wise
    sentences = [sentences[i: i + hparams.tacotron_synthesis_batch_size]
                 for i in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)]

    log("Starting Synthesis")
    with open(os.path.join(eval_dir, "map.txt"), "w") as file:
        for i, texts in enumerate(tqdm(sentences)):
            start = time.time()
            basenames = ["batch_{}_sentence_{}".format(i, j) for j in range(len(texts))]
            mel_filenames, speaker_ids = synth.synthesize(texts, basenames, eval_dir,
                                                          log_dir, None)
            for elems in zip(texts, mel_filenames, speaker_ids):
                file.write("|".join([str(x) for x in elems]) + "\n")
    log("synthesized mel spectrograms at {}".format(eval_dir))
    return eval_dir
def run_live(args, checkpoint_path, hparams):
    # Log to the terminal without keeping any records in files
    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    # Generate a fast greeting message
    greetings = "Hello, welcome to the live testing tool. Please type a message and I will " \
                "try to read it!"
    log(greetings)
    generate_fast(synth, greetings)

    # Interaction loop
    while True:
        try:
            text = input()
            generate_fast(synth, text)
        except KeyboardInterrupt:
            leave = "Thank you for testing our features. See you soon."
            log(leave)
            generate_fast(synth, leave)
            sleep(2)
            break
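# `generate_fast` is called above but not shown. A hypothetical minimal version,
# mirroring the synthesize(texts, basenames, out_dir, log_dir, mel_filenames) call
# signature used in run_eval: passing None for the output locations so nothing is
# written to disk while testing live. The real helper may differ.
def generate_fast(model, text):
    model.synthesize([text], None, None, None, None)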
def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None,
               stop_token_targets=None, linear_targets=None, targets_lengths=None,
               gta=False, global_step=None, is_training=False, is_evaluating=False,
               split_infos=None):
    """
    Initializes the model for inference. Sets "mel_outputs" and "alignments" fields.

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is the
          number of steps in the input time series, and values are character IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values
          are the lengths of each sequence in inputs.
        - embed_targets: float32 Tensor with shape [N, E] where E is the speaker
          embedding size.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size,
          T_out is the number of steps in the output time series, M is num_mels, and
          values are entries in the mel spectrogram. Only needed for training.
    """
    if mel_targets is None and stop_token_targets is not None:
        raise ValueError("No mel targets were provided but token_targets were given")
    if mel_targets is not None and stop_token_targets is None and not gta:
        raise ValueError("Mel targets are provided without corresponding token_targets")
    if not gta and self._hparams.predict_linear and linear_targets is None and is_training:
        raise ValueError(
            "Model is set to use post processing to predict linear spectrograms in "
            "training but no linear targets given!")
    if gta and linear_targets is not None:
        raise ValueError("Linear spectrogram prediction is not supported in GTA mode!")
    if is_training and self._hparams.mask_decoder and targets_lengths is None:
        raise RuntimeError(
            "Model set to mask paddings but no targets lengths provided for the mask!")
    if is_training and is_evaluating:
        raise RuntimeError(
            "Model can not be in training and evaluation modes at the same time!")

    split_device = "/cpu:0" if self._hparams.tacotron_num_gpus > 1 or \
        self._hparams.split_on_cpu else \
        "/gpu:{}".format(self._hparams.tacotron_gpu_start_idx)
    with tf.device(split_device):
        hp = self._hparams
        lout_int = [tf.int32] * hp.tacotron_num_gpus
        lout_float = [tf.float32] * hp.tacotron_num_gpus

        tower_input_lengths = tf.split(
            input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
        tower_targets_lengths = tf.split(
            targets_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) \
            if targets_lengths is not None else targets_lengths

        ### SV2TTS ###
        tower_embed_targets = tf.split(
            embed_targets, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
        ##############

        p_inputs = tf.numpy_function(split_func, [inputs, split_infos[:, 0]], lout_int)
        p_mel_targets = tf.numpy_function(
            split_func, [mel_targets, split_infos[:, 1]], lout_float) \
            if mel_targets is not None else mel_targets
        p_stop_token_targets = tf.numpy_function(
            split_func, [stop_token_targets, split_infos[:, 2]], lout_float) \
            if stop_token_targets is not None else stop_token_targets

        tower_inputs = []
        tower_mel_targets = []
        tower_stop_token_targets = []

        batch_size = tf.shape(inputs)[0]
        mel_channels = hp.num_mels
        for i in range(hp.tacotron_num_gpus):
            tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
            if p_mel_targets is not None:
                tower_mel_targets.append(
                    tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels]))
            if p_stop_token_targets is not None:
                tower_stop_token_targets.append(
                    tf.reshape(p_stop_token_targets[i], [batch_size, -1]))

    self.tower_decoder_output = []
    self.tower_alignments = []
    self.tower_stop_token_prediction = []
    self.tower_mel_outputs = []
    # This list was never initialized in the original source although it is appended to
    # below whenever post_condition holds
    self.tower_linear_outputs = []

    tower_embedded_inputs = []
    tower_enc_conv_output_shape = []
    tower_encoder_cond_outputs = []
    tower_residual = []
    tower_projected_residual = []

    # 1. Declare GPU devices
    gpus = ["/gpu:{}".format(i) for i in range(
        hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)]
    for i in range(hp.tacotron_num_gpus):
        with tf.device(tf.compat.v1.train.replica_device_setter(
                ps_tasks=1, ps_device="/cpu:0", worker_device=gpus[i])):
            with tf.compat.v1.variable_scope("inference") as scope:
                assert hp.tacotron_teacher_forcing_mode in ("constant", "scheduled")
                if hp.tacotron_teacher_forcing_mode == "scheduled" and is_training:
                    assert global_step is not None

                # GTA is only used for predicting mels to train the WaveNet vocoder,
                # so we omit post processing when doing GTA synthesis
                post_condition = hp.predict_linear and not gta

                # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                self.embedding_table = tf.compat.v1.get_variable(
                    "inputs_embedding", [len(symbols), hp.embedding_dim],
                    dtype=tf.float32)
                embedded_inputs = tf.nn.embedding_lookup(
                    self.embedding_table, tower_inputs[i])

                # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                encoder_cell = TacotronEncoderCell(
                    EncoderConvolutions(is_training, hparams=hp,
                                        scope="encoder_convolutions"),
                    EncoderRNN(is_training, size=hp.encoder_lstm_units,
                               zoneout=hp.tacotron_zoneout_rate, scope="encoder_LSTM"))
                encoder_outputs = encoder_cell(embedded_inputs, tower_input_lengths[i])

                # For shape visualization purposes
                enc_conv_output_shape = encoder_cell.conv_output_shape

                ### SV2TTS ###
                # Append the speaker embedding to the encoder output at each timestep
                tileable_shape = [-1, 1, self._hparams.speaker_embedding_size]
                tileable_embed_targets = tf.reshape(tower_embed_targets[i],
                                                    tileable_shape)
                tiled_embed_targets = tf.tile(tileable_embed_targets,
                                              [1, tf.shape(encoder_outputs)[1], 1])
                encoder_cond_outputs = tf.concat(
                    (encoder_outputs, tiled_embed_targets), 2)
                ##############

                # Decoder parts
                # Attention decoder prenet
                prenet = Prenet(is_training, layers_sizes=hp.prenet_layers,
                                drop_rate=hp.tacotron_dropout_rate,
                                scope="decoder_prenet")
                # Attention mechanism
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_dim, encoder_cond_outputs, hparams=hp,
                    mask_encoder=hp.mask_encoder,
                    memory_sequence_length=tf.reshape(tower_input_lengths[i], [-1]),
                    smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights)
                # Decoder LSTM cells
                decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
                                          size=hp.decoder_lstm_units,
                                          zoneout=hp.tacotron_zoneout_rate,
                                          scope="decoder_LSTM")
                # Frames projection layer
                frame_projection = FrameProjection(
                    hp.num_mels * hp.outputs_per_step,
                    scope="linear_transform_projection")
                # <stop_token> projection layer
                stop_projection = StopProjection(is_training or is_evaluating,
                                                 shape=hp.outputs_per_step,
                                                 scope="stop_token_projection")

                # Decoder cell ==> [batch_size, decoder_steps, num_mels * r]
                # (after decoding)
                decoder_cell = TacotronDecoderCell(prenet, attention_mechanism,
                                                   decoder_lstm, frame_projection,
                                                   stop_projection)

                # Define the helper for our decoder
                if is_training or is_evaluating or gta:
                    self.helper = TacoTrainingHelper(batch_size, tower_mel_targets[i],
                                                     hp, gta, is_evaluating, global_step)
                else:
                    self.helper = TacoTestHelper(batch_size, hp)

                # Initial decoder state
                decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                             dtype=tf.float32)

                # Only use max iterations at synthesis time
                max_iters = hp.max_iters if not (is_training or is_evaluating) else None

                # Decode
                (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = \
                    dynamic_decode(
                        CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                        impute_finished=False,
                        maximum_iterations=max_iters,
                        swap_memory=hp.tacotron_swap_with_cpu)

                # Reshape outputs to be one output per entry
                # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                decoder_output = tf.reshape(frames_prediction,
                                            [batch_size, -1, hp.num_mels])
                stop_token_prediction = tf.reshape(stop_token_prediction,
                                                   [batch_size, -1])

                # Postnet
                postnet = Postnet(is_training, hparams=hp,
                                  scope="postnet_convolutions")

                # Compute residual using post-net
                # ==> [batch_size, decoder_steps * r, postnet_channels]
                residual = postnet(decoder_output)

                # Project residual to same dimension as mel spectrogram
                # ==> [batch_size, decoder_steps * r, num_mels]
                residual_projection = FrameProjection(hp.num_mels,
                                                      scope="postnet_projection")
                projected_residual = residual_projection(residual)

                # Compute the mel spectrogram
                mel_outputs = decoder_output + projected_residual

                if post_condition:
                    # Add post-processing CBHG. This does a great job at extracting
                    # features from mels before projection to linear specs.
                    post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels,
                                     hp.cbhg_pool_size,
                                     [hp.cbhg_projection, hp.num_mels],
                                     hp.cbhg_projection_kernel_size,
                                     hp.cbhg_highwaynet_layers, hp.cbhg_highway_units,
                                     hp.cbhg_rnn_units, is_training,
                                     name="CBHG_postnet")

                    # [batch_size, decoder_steps(mel_frames), cbhg_channels]
                    post_outputs = post_cbhg(mel_outputs, None)

                    # Linear projection of extracted features to make linear spectrogram
                    linear_specs_projection = FrameProjection(
                        hp.num_freq, scope="cbhg_linear_specs_projection")

                    # [batch_size, decoder_steps(linear_frames), num_freq]
                    linear_outputs = linear_specs_projection(post_outputs)

                # Grab alignments from the final decoder state
                alignments = tf.transpose(
                    final_decoder_state.alignment_history.stack(), [1, 2, 0])

                self.tower_decoder_output.append(decoder_output)
                self.tower_alignments.append(alignments)
                self.tower_stop_token_prediction.append(stop_token_prediction)
                self.tower_mel_outputs.append(mel_outputs)
                tower_embedded_inputs.append(embedded_inputs)
                tower_enc_conv_output_shape.append(enc_conv_output_shape)
                tower_encoder_cond_outputs.append(encoder_cond_outputs)
                tower_residual.append(residual)
                tower_projected_residual.append(projected_residual)

                if post_condition:
                    self.tower_linear_outputs.append(linear_outputs)
        log("initialisation done {}".format(gpus[i]))

    if is_training:
        self.ratio = self.helper._ratio
    self.tower_inputs = tower_inputs
    self.tower_input_lengths = tower_input_lengths
    self.tower_mel_targets = tower_mel_targets
    # self.tower_linear_targets = tower_linear_targets
    self.tower_targets_lengths = tower_targets_lengths
    self.tower_stop_token_targets = tower_stop_token_targets

    self.all_vars = tf.compat.v1.trainable_variables()

    log("Initialized Tacotron model. Dimensions (? = dynamic shape): ")
    log("  Train mode: {}".format(is_training))
    log("  Eval mode: {}".format(is_evaluating))
    log("  GTA mode: {}".format(gta))
    log("  Synthesis mode: {}".format(not (is_training or is_evaluating)))
    log("  Input: {}".format(inputs.shape))
    # The original loop ranged over num_gpus + gpu_start_idx, which indexes past the end
    # of the per-tower lists whenever gpu_start_idx > 0; iterate towers instead
    for i in range(hp.tacotron_num_gpus):
        log("  device: {}".format(i + hp.tacotron_gpu_start_idx))
        log("  embedding: {}".format(tower_embedded_inputs[i].shape))
        log("  enc conv out: {}".format(tower_enc_conv_output_shape[i]))
        log("  encoder out (cond): {}".format(tower_encoder_cond_outputs[i].shape))
        log("  decoder out: {}".format(self.tower_decoder_output[i].shape))
        log("  residual out: {}".format(tower_residual[i].shape))
        log("  projected residual out: {}".format(tower_projected_residual[i].shape))
        log("  mel out: {}".format(self.tower_mel_outputs[i].shape))
        if post_condition:
            log("  linear out: {}".format(self.tower_linear_outputs[i].shape))
        log("  <stop_token> out: {}".format(self.tower_stop_token_prediction[i].shape))

    # 1_000_000 is causing syntax problems for some people?! Python please :)
    log("  Tacotron Parameters {:.3f} Million.".format(
        np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
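# `split_func` is passed to tf.numpy_function above but not shown. A minimal sketch
# consistent with its usage: split a padded batch along the time axis into one chunk per
# GPU, where `split_pos` (a row of `split_infos`) holds each chunk's padded length. This
# is an assumption inferred from how the outputs are reshaped per tower.
def split_func(x, split_pos):
    rst = []
    start = 0
    # x arrives as a numpy array; carve out split_pos[i] frames for tower i
    for i in range(split_pos.shape[0]):
        rst.append(x[:, start:start + split_pos[i]])
        start += split_pos[i]
    return rst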
def __init__(self, coordinator, metadata_filename, hparams):
    super(Feeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._cleaner_names = [x.strip() for x in hparams.cleaners.split(",")]
    self._train_offset = 0
    self._test_offset = 0

    # Load metadata
    self._ppg_dir = os.path.join(os.path.dirname(metadata_filename), "ppgs")
    self._mel_dir = os.path.join(os.path.dirname(metadata_filename), "mels")
    self._embed_dir = os.path.join(os.path.dirname(metadata_filename), "embeds")
    with open(metadata_filename, encoding="utf-8") as f:
        self._metadata = [line.strip().split("|") for line in f]
        frame_shift_ms = hparams.hop_size / hparams.sample_rate
        hours = sum([int(x[5]) for x in self._metadata]) * frame_shift_ms / 3600
        log("Loaded metadata for {} examples ({:.2f} hours)".format(
            len(self._metadata), hours))

    # Train/test split
    if hparams.tacotron_test_size is None:
        assert hparams.tacotron_test_batches is not None
    test_size = (hparams.tacotron_test_size
                 if hparams.tacotron_test_size is not None
                 else hparams.tacotron_test_batches * hparams.tacotron_batch_size)
    indices = np.arange(len(self._metadata))
    train_indices, test_indices = train_test_split(
        indices, test_size=test_size,
        random_state=hparams.tacotron_data_random_state)

    # Make sure the test size is a multiple of batch_size, else round down and move the
    # extra test examples back into the training set
    len_test_indices = self._round_down(len(test_indices), hparams.tacotron_batch_size)
    extra_test = test_indices[len_test_indices:]
    test_indices = test_indices[:len_test_indices]
    train_indices = np.concatenate([train_indices, extra_test])

    self._train_meta = list(np.array(self._metadata)[train_indices])
    self._test_meta = list(np.array(self._metadata)[test_indices])

    self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size

    if hparams.tacotron_test_size is None:
        assert hparams.tacotron_test_batches == self.test_steps

    # Pad input sequences with the <pad_token> 0 ( _ )
    self._pad = 0
    # Explicitly set the padding to a value that doesn't originally exist in the
    # spectrogram, to avoid any possible conflicts without affecting the output range
    # of the model too much
    if hparams.symmetric_mels:
        self._target_pad = -hparams.max_abs_value
    else:
        self._target_pad = 0.
    # Mark finished sequences with 1s
    self._token_pad = 1.

    with tf.device("/cpu:0"):
        # Create placeholders for inputs and targets. Don't specify batch size because
        # we want to be able to feed different batch sizes at eval time.
        self._placeholders = [
            tf.placeholder(tf.float32, shape=(None, None, hparams.num_ppgs),
                           name="inputs"),
            tf.placeholder(tf.int32, shape=(None,), name="input_lengths"),
            tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels),
                           name="mel_targets"),
            tf.placeholder(tf.float32, shape=(None, None), name="token_targets"),
            tf.placeholder(tf.int32, shape=(None,), name="targets_lengths"),
            tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None),
                           name="split_infos"),
            # SV2TTS
            tf.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size),
                           name="speaker_embeddings"),
            # Adversarial speaker classifier
            tf.placeholder(tf.int32, shape=(None,), name="speaker_labels")
        ]

        # Create queue for buffering data
        queue = tf.FIFOQueue(8, [tf.float32, tf.int32, tf.float32, tf.float32, tf.int32,
                                 tf.int32, tf.float32, tf.int32], name="input_queue")
        self._enqueue_op = queue.enqueue(self._placeholders)
        self.inputs, self.input_lengths, self.mel_targets, self.token_targets, \
            self.targets_lengths, self.split_infos, self.speaker_embeddings, \
            self.speaker_labels = queue.dequeue()

        self.inputs.set_shape(self._placeholders[0].shape)
        self.input_lengths.set_shape(self._placeholders[1].shape)
        self.mel_targets.set_shape(self._placeholders[2].shape)
        self.token_targets.set_shape(self._placeholders[3].shape)
        self.targets_lengths.set_shape(self._placeholders[4].shape)
        self.split_infos.set_shape(self._placeholders[5].shape)
        self.speaker_embeddings.set_shape(self._placeholders[6].shape)
        self.speaker_labels.set_shape(self._placeholders[7].shape)

        # Create eval queue for buffering eval data
        eval_queue = tf.FIFOQueue(1, [tf.float32, tf.int32, tf.float32, tf.float32,
                                      tf.int32, tf.int32, tf.float32, tf.int32],
                                  name="eval_queue")
        self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
        self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, \
            self.eval_token_targets, self.eval_targets_lengths, \
            self.eval_split_infos, self.eval_speaker_embeddings, \
            self.eval_speaker_labels = eval_queue.dequeue()

        self.eval_inputs.set_shape(self._placeholders[0].shape)
        self.eval_input_lengths.set_shape(self._placeholders[1].shape)
        self.eval_mel_targets.set_shape(self._placeholders[2].shape)
        self.eval_token_targets.set_shape(self._placeholders[3].shape)
        self.eval_targets_lengths.set_shape(self._placeholders[4].shape)
        self.eval_split_infos.set_shape(self._placeholders[5].shape)
        self.eval_speaker_embeddings.set_shape(self._placeholders[6].shape)
        self.eval_speaker_labels.set_shape(self._placeholders[7].shape)
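# How a queue-based feeder like the one above is typically started: a daemon thread
# fills the FIFOQueue via _enqueue_next_train_group (shown earlier) while the main
# thread trains. A hedged sketch in common TF1 style; the repository's actual start
# method may be named or structured differently.
import threading

def start_feeder_thread(feeder, session):
    feeder._session = session
    thread = threading.Thread(name="background",
                              target=feeder._enqueue_next_train_group)
    thread.daemon = True  # the thread closes when the parent process exits
    thread.start()
    return thread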
def __init__(self, checkpoint_path, hparams, gta=False, model_name="Tacotron"): log("Constructing model: %s" % model_name) #Force the batch size to be known in order to use attention masking in batch synthesis inputs = tf.placeholder(tf.int32, (None, None), name="inputs") input_lengths = tf.placeholder(tf.int32, (None, ), name="input_lengths") speaker_embeddings = tf.placeholder( tf.float32, (None, hparams.speaker_embedding_size), name="speaker_embeddings") text_embeddings = tf.placeholder( tf.float32, (None, hparams.speaker_embedding_size), name="text_embeddings") targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name="mel_targets") split_infos = tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos") global_step = tf.Variable(200000, name="global_step", trainable=False) with tf.variable_scope("Tacotron_model", reuse=tf.AUTO_REUSE) as scope: # 200000 is the step of the ckpt being loaded self.model = create_model_ph(model_name, hparams) if gta: self.model.initialize(inputs, input_lengths, speaker_embeddings, text_embeddings, targets, gta=gta, split_infos=split_infos, global_step=global_step) else: self.model.initialize(inputs, input_lengths, speaker_embeddings, text_embeddings, split_infos=split_infos, global_step=global_step) self.mel_outputs = self.model.tower_mel_outputs self.linear_outputs = self.model.tower_linear_outputs if ( hparams.predict_linear and not gta) else None self.alignments = self.model.tower_alignments self.stop_token_prediction = self.model.tower_stop_token_prediction self.targets = targets self.gta = gta self._hparams = hparams self._pad = 0 if hparams.symmetric_mels: self._target_pad = -hparams.max_abs_value else: self._target_pad = 0. self.inputs = inputs self.input_lengths = input_lengths self.speaker_embeddings = speaker_embeddings self.text_embeddings = text_embeddings self.targets = targets self.split_infos = split_infos log("Loading checkpoint: %s" % checkpoint_path) #Memory allocation on the GPUs as needed config = tf.ConfigProto() config.gpu_options.allow_growth = True config.allow_soft_placement = True self.session = tf.Session(config=config) self.session.run(tf.global_variables_initializer()) saver = tf.train.Saver() saver.restore(self.session, checkpoint_path)
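# A hedged sketch of how a synthesizer like the one above might be driven for a single
# batch on one GPU. The sequence/embedding arrays and the helper name are illustrative;
# the real synthesize() method also handles multi-GPU splitting and output saving.
import numpy as np

def synthesize_one_batch(synth, sequences, speaker_embeds, text_embeds):
    # Pad character-id sequences to a common length with the <pad_token> 0
    max_len = max(len(s) for s in sequences)
    inputs = np.array([s + [synth._pad] * (max_len - len(s)) for s in sequences],
                      dtype=np.int32)
    feed_dict = {
        synth.inputs: inputs,
        synth.input_lengths: np.asarray([len(s) for s in sequences], dtype=np.int32),
        synth.speaker_embeddings: speaker_embeds,
        synth.text_embeddings: text_embeds,
        # One row per GPU: [input_len, mel_len, token_len] per the split logic above
        synth.split_infos: np.asarray([[max_len, 0, 0]], dtype=np.int32),
    }
    return synth.session.run(synth.mel_outputs, feed_dict=feed_dict)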
def __init__(self, coordinator, hparams):
    super(Feeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._train_offset = 0
    self._test_offset = 0

    gparent_dir = os.path.dirname(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    data_dir = os.path.join(gparent_dir, 'data')
    clf_data_path = os.path.join(data_dir, 'clf_data.pkl')
    clf_data = load_pkl(clf_data_path)
    texts = clf_data['texts']
    mel_paths = clf_data['paths']
    emb_paths = [p.replace('mel-', 'mbed-') for p in mel_paths]
    emb_paths = [p.replace('stft', 'speaker_emb') for p in emb_paths]
    text_emb_paths = [p.replace('mel-', '') for p in mel_paths]
    text_emb_paths = [p.replace('stft', 'devise') for p in text_emb_paths]

    # Load metadata: each element is a size-4 list,
    # i.e. [mel, speaker_embed, text_embed, text]
    self._metadata = [[m, e, te, t] for m, e, te, t in
                      zip(mel_paths, emb_paths, text_emb_paths, texts)]
    log("Loaded metadata for %d examples" % len(self._metadata))

    # Train/test split
    if hparams.tacotron_test_size is None:
        assert hparams.tacotron_test_batches is not None
    test_size = (hparams.tacotron_test_size
                 if hparams.tacotron_test_size is not None
                 else hparams.tacotron_test_batches * hparams.tacotron_batch_size)
    indices = np.arange(len(self._metadata))
    train_indices, test_indices = train_test_split(
        indices, test_size=test_size,
        random_state=hparams.tacotron_data_random_state)

    # Make sure the test size is a multiple of batch_size, else round down and move the
    # extra test examples back into the training set
    len_test_indices = self._round_down(len(test_indices), hparams.tacotron_batch_size)
    extra_test = test_indices[len_test_indices:]
    test_indices = test_indices[:len_test_indices]
    train_indices = np.concatenate([train_indices, extra_test])

    self._train_meta = list(np.array(self._metadata)[train_indices])
    self._test_meta = list(np.array(self._metadata)[test_indices])

    self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size

    if hparams.tacotron_test_size is None:
        assert hparams.tacotron_test_batches == self.test_steps

    # Pad input sequences with the <pad_token> 0 ( _ )
    self._pad = 0
    if hparams.symmetric_mels:
        self._target_pad = -hparams.max_abs_value
    else:
        self._target_pad = 0.
    self._token_pad = 1.

    with tf.device("/cpu:0"):
        # Create placeholders for inputs and targets. Don't specify batch size because
        # we want to be able to feed different batch sizes at eval time.
        self._placeholders = [
            tf.placeholder(tf.int32, shape=(None, None), name="inputs"),
            tf.placeholder(tf.int32, shape=(None,), name="input_lengths"),
            tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels),
                           name="mel_targets"),
            tf.placeholder(tf.float32, shape=(None, None), name="token_targets"),
            tf.placeholder(tf.int32, shape=(None,), name="targets_lengths"),
            tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None),
                           name="split_infos"),
            # SV2TTS
            tf.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size),
                           name="speaker_embeddings"),
            tf.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size),
                           name="text_embeddings")
        ]

        # Create queue for buffering data
        queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32, tf.int32,
                                 tf.int32, tf.float32, tf.float32], name="input_queue")
        self._enqueue_op = queue.enqueue(self._placeholders)
        self.inputs, self.input_lengths, self.mel_targets, self.token_targets, \
            self.targets_lengths, self.split_infos, self.speaker_embeddings, \
            self.text_embeddings = queue.dequeue()

        self.inputs.set_shape(self._placeholders[0].shape)
        self.input_lengths.set_shape(self._placeholders[1].shape)
        self.mel_targets.set_shape(self._placeholders[2].shape)
        self.token_targets.set_shape(self._placeholders[3].shape)
        self.targets_lengths.set_shape(self._placeholders[4].shape)
        self.split_infos.set_shape(self._placeholders[5].shape)
        self.speaker_embeddings.set_shape(self._placeholders[6].shape)
        self.text_embeddings.set_shape(self._placeholders[7].shape)

        # Create eval queue for buffering eval data
        eval_queue = tf.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32,
                                      tf.int32, tf.int32, tf.float32, tf.float32],
                                  name="eval_queue")
        self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
        self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, \
            self.eval_token_targets, self.eval_targets_lengths, \
            self.eval_split_infos, self.eval_speaker_embeddings, \
            self.eval_text_embeddings = eval_queue.dequeue()

        self.eval_inputs.set_shape(self._placeholders[0].shape)
        self.eval_input_lengths.set_shape(self._placeholders[1].shape)
        self.eval_mel_targets.set_shape(self._placeholders[2].shape)
        self.eval_token_targets.set_shape(self._placeholders[3].shape)
        self.eval_targets_lengths.set_shape(self._placeholders[4].shape)
        self.eval_split_infos.set_shape(self._placeholders[5].shape)
        self.eval_speaker_embeddings.set_shape(self._placeholders[6].shape)
        self.eval_text_embeddings.set_shape(self._placeholders[7].shape)
def __init__(self, coordinator, metadata_filename, hparams):
    super(Feeder, self).__init__()
    self.encoder_path = Path(hparams.encoder_path)
    self._coord = coordinator
    self._hparams = hparams
    self._cleaner_names = hparams.cleaners
    self._train_offset = 0
    self._test_offset = 0

    # Load metadata
    self._audio_dir = os.path.join(os.path.dirname(metadata_filename), "audio")
    self._mel_dir = os.path.join(os.path.dirname(metadata_filename), "mels")
    self._embed_dir = os.path.join(os.path.dirname(metadata_filename), "embeds")
    with open(metadata_filename, encoding="utf8") as fin:
        self._metadata = []
        for line in tqdm(fin, ncols=50, mininterval=2):
            # Both relative and absolute paths are supported, e.g.:
            # ../data/samples/aliaudio/Aibao/005397.mp3|mel-aliaudio-Aibao-005397.mp3.npy|embed-aliaudio-Aibao-005397.mp3.npy|64403|254|他走近钢琴并开始演奏“祖国从哪里开始”。
            audio_path, mel_path, embed_path, audio_size, mel_size, text = \
                line.strip().split("|")
            if not os.path.exists(audio_path):
                audio_path = os.path.join(self._audio_dir, audio_path)
            if not os.path.exists(mel_path):
                mel_path = os.path.join(self._mel_dir, mel_path)
            if not os.path.exists(embed_path):
                embed_path = os.path.join(self._embed_dir, embed_path)
            if os.path.exists(audio_path) and os.path.exists(mel_path) \
                    and os.path.exists(embed_path):
                self._metadata.append([audio_path, mel_path, embed_path,
                                       audio_size, mel_size, text])
            else:
                print("Failed to load data!")
                print("data:", line)

    frame_shift_ms = hparams.hop_size / hparams.sample_rate
    hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / 3600
    log("Loaded metadata for {} examples ({:.2f} hours)".format(
        len(self._metadata), hours))

    # Train/test split
    if hparams.tacotron_test_size is None:
        assert hparams.tacotron_test_batches is not None
    test_size = (hparams.tacotron_test_size
                 if hparams.tacotron_test_size is not None
                 else hparams.tacotron_test_batches * hparams.tacotron_batch_size)
    indices = np.arange(len(self._metadata))
    train_indices, test_indices = train_test_split(
        indices, test_size=test_size,
        random_state=hparams.tacotron_data_random_state)

    # Make sure the test size is a multiple of batch_size, else round down and move the
    # extra test examples back into the training set
    len_test_indices = self._round_down(len(test_indices), hparams.tacotron_batch_size)
    extra_test = test_indices[len_test_indices:]
    test_indices = test_indices[:len_test_indices]
    train_indices = np.concatenate([train_indices, extra_test])

    self._train_meta = list(np.array(self._metadata)[train_indices])
    self._test_meta = list(np.array(self._metadata)[test_indices])
    np.random.shuffle(self._train_meta)
    np.random.shuffle(self._test_meta)

    self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size

    if hparams.tacotron_test_size is None:
        assert hparams.tacotron_test_batches == self.test_steps

    # Pad input sequences with the <pad_token> 0 ( _ )
    self._pad = 0
    # Explicitly set the padding to a value that doesn't originally exist in the
    # spectrogram, to avoid any possible conflicts without affecting the output range
    # of the model too much
    if hparams.symmetric_mels:
        self._target_pad = -hparams.max_abs_value
    else:
        self._target_pad = 0.
    # Mark finished sequences with 1s
    self._token_pad = 1.

    with tf.device("/cpu:0"):
        # Create placeholders for inputs and targets. Don't specify batch size because
        # we want to be able to feed different batch sizes at eval time.
        self._placeholders = [
            tf.placeholder(tf.int32, shape=(None, None), name="inputs"),
            tf.placeholder(tf.int32, shape=(None,), name="input_lengths"),
            tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels),
                           name="mel_targets"),
            tf.placeholder(tf.float32, shape=(None, None), name="token_targets"),
            tf.placeholder(tf.int32, shape=(None,), name="targets_lengths"),
            tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None),
                           name="split_infos"),
            # SV2TTS
            tf.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size),
                           name="speaker_embeddings")
        ]

        # Create queue for buffering data
        queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32, tf.int32,
                                 tf.int32, tf.float32], name="input_queue")
        self._enqueue_op = queue.enqueue(self._placeholders)
        self.inputs, self.input_lengths, self.mel_targets, self.token_targets, \
            self.targets_lengths, self.split_infos, \
            self.speaker_embeddings = queue.dequeue()

        self.inputs.set_shape(self._placeholders[0].shape)
        self.input_lengths.set_shape(self._placeholders[1].shape)
        self.mel_targets.set_shape(self._placeholders[2].shape)
        self.token_targets.set_shape(self._placeholders[3].shape)
        self.targets_lengths.set_shape(self._placeholders[4].shape)
        self.split_infos.set_shape(self._placeholders[5].shape)
        self.speaker_embeddings.set_shape(self._placeholders[6].shape)

        # Create eval queue for buffering eval data
        eval_queue = tf.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32,
                                      tf.int32, tf.int32, tf.float32],
                                  name="eval_queue")
        self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
        self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, \
            self.eval_token_targets, self.eval_targets_lengths, \
            self.eval_split_infos, self.eval_speaker_embeddings = eval_queue.dequeue()

        self.eval_inputs.set_shape(self._placeholders[0].shape)
        self.eval_input_lengths.set_shape(self._placeholders[1].shape)
        self.eval_mel_targets.set_shape(self._placeholders[2].shape)
        self.eval_token_targets.set_shape(self._placeholders[3].shape)
        self.eval_targets_lengths.set_shape(self._placeholders[4].shape)
        self.eval_split_infos.set_shape(self._placeholders[5].shape)
        self.eval_speaker_embeddings.set_shape(self._placeholders[6].shape)
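# `_round_down` is used by the feeders above but not shown. A minimal sketch of the
# obvious implementation (round x down to the nearest multiple), consistent with how the
# extra test examples are moved back into the training set; in the classes above it
# would be a method or staticmethod.
def _round_down(x, multiple):
    remainder = x % multiple
    return x if remainder == 0 else x - remainder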